In [1]:
from huggingface_hub import snapshot_download

# Download only the parquet shards under data/ of the dataset repository into a
# local folder. The call is the last expression of the cell, so its return
# value is shown as the cell output.
snapshot_download(
    repo_id="mesolitica/Malaysian-UltraChat-Speech-Multiturn-Instructions",
    repo_type='dataset',
    allow_patterns="data/*.parquet",
    local_dir="./Malaysian-UltraChat-Speech-Multiturn-Instructions",
)

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

'/home/mesolitica/stt/Malaysian-UltraChat-Speech-Multiturn-Instructions'

In [2]:
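# Optional: download the audio archives (ultrachat-speech-*.zip, excluding the
# *alignment.zip files) into the working directory, which is where the
# extraction cell below looks for them. Left commented out -- presumably the
# archives had already been downloaded; uncomment on a fresh machine.
# snapshot_download(
#     repo_id="mesolitica/Malaysian-UltraChat-Speech-Multiturn-Instructions",
#     repo_type='dataset',
#     allow_patterns="ultrachat-speech-*.zip",
#     ignore_patterns=["*alignment.zip"],
#     local_dir="./",
# )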

In [3]:
from glob import glob
from tqdm import tqdm
from multiprocess import Pool
import itertools
import zipfile
import os

def chunks(l, n):
    """Lazily split ``l`` into consecutive slices of at most ``n`` items.

    Yields ``(slice, position)`` tuples, where ``position`` is the 0-based
    index of the slice within the sequence of slices.
    """
    starts = range(0, len(l), n)
    for position, start in enumerate(starts):
        yield l[start: start + n], position


def multiprocessing(strings, function, cores=6, returned=True):
    """Apply ``function`` to slices of ``strings`` in a pool of worker processes.

    ``strings`` is cut into consecutive slices by ``chunks`` and every worker
    call receives one ``(slice, slice_index)`` tuple.

    Parameters
    ----------
    strings : sequence
        Items to process; must support ``len`` and slicing.
    function : callable
        Called as ``function((slice, slice_index))``. When ``returned`` is true
        it must return an iterable, because the per-slice results are
        concatenated.
    cores : int
        Number of worker processes. The slice size is ``len(strings) // cores``
        (at least 1), so one extra, shorter slice is produced when the length
        is not a multiple of ``cores``.
    returned : bool
        If true, return the flattened list of results; otherwise return
        ``None``.
    """
    # Nothing to do: avoid spawning a pool (and a zero step inside ``chunks``).
    if len(strings) == 0:
        return [] if returned else None

    # ``len(strings) // cores`` is 0 when there are fewer items than workers,
    # which would make ``range`` inside ``chunks`` raise ValueError.
    chunk_size = max(1, len(strings) // cores)
    df_split = chunks(strings, chunk_size)

    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
    finally:
        # Always release the worker processes, even if a worker raised.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain.from_iterable(pooled))

def loop(files):
    """Extract each zip archive of a chunk into the current directory, then delete it.

    ``files`` is a ``(paths, chunk_index)`` tuple as produced by ``chunks``;
    the index is ignored.
    """
    archive_paths, _ = files
    target_dir = './'
    for archive_path in tqdm(archive_paths):
        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(target_dir)
        # Only reached when the extraction above did not raise.
        os.remove(archive_path)

# Extract every audio archive found in the working directory in parallel,
# using one worker per archive up to a maximum of 20 workers.
files = glob('ultrachat-speech*.zip')
if files:
    multiprocessing(files, loop, cores=min(len(files), 20), returned=False)

In [4]:
from glob import glob
import pandas as pd
import os
import json
from transformers import AutoProcessor



In [15]:
# Load the Qwen2-Audio processor; its chat template is used further down to
# render each conversation to text.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
# NOTE(review): `tokenizer` is not referenced by any cell visible here.
tokenizer = processor.tokenizer

In [6]:
# List the downloaded parquet shards.
!ls Malaysian-UltraChat-Speech-Multiturn-Instructions/data

train-00000-of-00002.parquet  voice_assistant-00000-of-00001.parquet
train-00001-of-00002.parquet


In [7]:
# Read every parquet shard into one flat list of row dicts. The paths are
# sorted because glob returns them in arbitrary order, while later cells index
# into `data` by position (data[0], data[-100:]) and the saved output keeps
# this order.
data = []
for f in sorted(glob('Malaysian-UltraChat-Speech-Multiturn-Instructions/data/*.parquet')):
    data.extend(pd.read_parquet(f).to_dict(orient = 'records'))

len(data)

192821

In [11]:
# Parse the first row's JSON-encoded conversation for manual inspection.
conversation = json.loads(data[0]['conversation'])

In [16]:
def loop(rows):
    """Render a chunk of dataset rows into chat-template text plus audio paths.

    Note: this reuses the name ``loop`` of the zip-extraction helper defined
    earlier in the notebook and replaces it once this cell has run.

    Parameters
    ----------
    rows : tuple
        ``(rows, chunk_index)`` as produced by ``chunks``; the index is ignored.
        Each row is expected to hold a JSON string under ``'conversation'``.

    Returns
    -------
    list of dict
        One ``{'text': ..., 'audio': ...}`` per row that could be processed, in
        input order. ``text`` is the output of
        ``processor.apply_chat_template(..., tokenize=False)`` and ``audio``
        lists the ``audio_url`` of every ``'audio'`` part found in user turns.
        Rows that raise during processing are skipped (best effort); the number
        of skipped rows is reported once per chunk.
    """
    rows, _ = rows
    # Named `results` so the notebook-level `data` list is not shadowed.
    results = []
    skipped = 0
    for row in tqdm(rows):
        try:
            conversation = json.loads(row['conversation'])
            text = processor.apply_chat_template(conversation, tokenize=False)
            audio = [
                part['audio_url']
                for turn in conversation
                if turn['role'] == 'user'
                for part in turn['content']
                if part['type'] == 'audio'
            ]
        except Exception:
            # Best effort: drop rows that cannot be processed, but keep count
            # so that the data loss does not go unnoticed.
            skipped += 1
            continue

        results.append({
            'text': text,
            'audio': audio,
        })

    if skipped:
        tqdm.write(f'skipped {skipped} of {skipped + len(results)} rows that failed to process')
    return results

In [20]:
# Smoke-test the row processor in-process on the last 100 rows before running
# it over the whole dataset.
processed = loop((data[-100:], 0))

100%|███████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 3488.89it/s]


In [21]:
# Number of sample rows that were processed without being skipped.
len(processed)

100

In [22]:
# Inspect the rendered chat-template text of the last sample row.
print(processed[-1]['text'])

<|im_start|>system
You are a voice-based assistant designed to be brief and precise. All replies must be under 300 characters. Speak in a friendly, natural tone suitable for spoken output. Avoid technical jargon and long replies. Ask short questions if more info is needed.<|im_end|>
<|im_start|>user
learners. The advantage 








International Journal Languages and Education (Vol. 1, No 1)                                   



 
89 



©The Author(s) (2021). Published by USIM Press on behalf of the Universiti Sains Islam Malaysia.  This is an Open 



Access article  distributed  under the  terms of the Creative Commons Attribution 4.0 International (CC BY 4.0) license. 



as per the analysis is concerned with the improvement in engagement of learners. For instance, 



when the flipped classroom method is considered and used, it plays a critical role in the estab-



lishment or development of an environment in which learners can engage with each other. In 



fact, in the classroo

In [23]:
# Audio paths collected from the user turns of the same row.
processed[-1]['audio']

['ultrachat-speech/195824.mp3',
 'ultrachat-speech/195825.mp3',
 'ultrachat-speech/195826.mp3']

In [26]:
# Process the full dataset in a pool of 30 worker processes.
processed = multiprocessing(data, loop, cores = 30)

100%|█████████████████████████████████████████████████████████████████████████████████| 6427/6427 [00:01<00:00, 4159.67it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 6427/6427 [00:01<00:00, 4542.31it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 6427/6427 [00:01<00:00, 4509.73it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 6427/6427 [00:01<00:00, 4493.10it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 6427/6427 [00:01<00:00, 4531.37it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 6427/6427 [00:01<00:00, 4250.47it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 6427/6427 [00:01<00:00, 4519.10it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 6427/6427 [00:01<00:00, 4305.02it/s]


In [27]:
# Compare against len(data) above to see whether any rows were skipped.
len(processed)

192821

In [28]:
# Persist all processed rows as a single JSON file in the working directory.
with open('prepare-Malaysian-UltraChat-Speech-Multiturn-Instructions.json', 'w') as out_file:
    json.dump(processed, out_file)