In [1]:
from huggingface_hub import snapshot_download

# Pull only the metadata parquet shards (the `without-audio/` folder) of the
# dataset repo into the current working directory.
snapshot_download(
    repo_id="mesolitica/Malaysian-Speech-Instructions",
    repo_type='dataset',
    allow_patterns="without-audio/*.parquet",
    local_dir='./',
)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

'/home/mesolitica/stt'

In [2]:
# from huggingface_hub import snapshot_download

# snapshot_download(
#     repo_type='dataset',
#     repo_id="mesolitica/Malaysian-STT-Whisper-Stage2", 
#     allow_patterns='prepared-mixed-malaysian-instructions*.zip', local_dir = './')

In [3]:
from glob import glob
from tqdm import tqdm
from multiprocess import Pool
import itertools
import zipfile
import os

def chunks(l, n):
    """
    Lazily cut `l` into consecutive slices of at most `n` items.

    Yields `(slice, position)` tuples, where `position` is the zero-based
    ordinal of the slice. The final slice may be shorter than `n`.
    """
    starts = range(0, len(l), n)
    for position, start in enumerate(starts):
        piece = l[start: start + n]
        yield (piece, position)


def multiprocessing(strings, function, cores=6, returned=True):
    """
    Run `function` over slices of `strings` in a pool of worker processes.

    Parameters
    ----------
    strings : sequence
        Items to process; must support `len()` and slicing.
    function : callable
        Called once per slice with a single `(slice, slice_index)` tuple, as
        yielded by `chunks`. When `returned` is True it must return an
        iterable, because the per-slice results are concatenated.
    cores : int
        Number of worker processes; also used to derive the slice size.
    returned : bool
        When True, return the flattened results; otherwise return None.

    Returns
    -------
    list or None
        Concatenation of every worker's result in slice order, or None when
        `returned` is False.
    """
    # Nothing to do: avoid spawning a pool (and a zero-sized slice) for empty input.
    if not len(strings):
        return [] if returned else None

    # Floor division gives 0 when there are fewer items than workers, which
    # would make the range() inside `chunks` fail with a zero step.
    chunk_size = max(1, len(strings) // cores)
    df_split = chunks(strings, chunk_size)

    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
    finally:
        # Always release the worker processes, even when a worker raised.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain.from_iterable(pooled))

def loop(files):
    """
    Unpack a batch of zip archives into the current directory.

    `files` is a `(paths, index)` pair; the index is ignored. Each archive is
    fully extracted into './' and then removed from disk.
    """
    archive_paths, _ = files
    target_dir = './'
    for archive_path in tqdm(archive_paths):
        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(target_dir)
        # Reached only after a successful extraction; the archive is no longer needed.
        os.remove(archive_path)

# files = glob('*.zip')
# if len(files):
#     multiprocessing(files, loop, cores = min(len(files), 20), returned = False)

In [4]:
from transformers import AutoProcessor



In [5]:
# Load the Qwen2-Audio processor; `loop` below calls its apply_chat_template
# to render each conversation into a single training string.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
# Shortcut to the tokenizer bundled with the processor.
tokenizer = processor.tokenizer

In [6]:
# Display the chat template string attached to the processor, for inspection.
processor.chat_template

"{% set audio_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if 'audio' in content or 'audio_url' in content or message['type'] == 'audio' %}{% set audio_count.value = audio_count.value + 1 %}Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\n{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"

In [7]:
import json
import pandas as pd

def loop(rows):
    """
    Turn one slice of dataset rows into `{'text', 'audio'}` training records.

    Parameters
    ----------
    rows : tuple
        `(records, index)` pair; the index is ignored. Each record must be a
        mapping with a JSON-encoded conversation under 'prompt' and an audio
        file path under 'audio_filename'.

    Returns
    -------
    list of dict
        One `{'text': rendered_chat, 'audio': path}` entry per kept row.

    A row is skipped (never raises) when:
    - its prompt cannot be parsed or rendered,
    - no turns remain after truncation,
    - the rendered text contains no '<|AUDIO|>' placeholder,
    - its audio path is missing, not a string, or not present on disk.

    Relies on the module-level `processor` for `apply_chat_template`.
    """
    rows, _ = rows
    data = []
    for r in tqdm(rows):
        try:
            conversation = json.loads(r['prompt'])
            messages = []
            for c in conversation:
                # Keep the turns up to, but excluding, the first one without content.
                if c['content'] is None:
                    break
                messages.append(c)
            # Drop a trailing user turn so the sample does not end on a user message.
            if messages and messages[-1]['role'] == 'user':
                messages = messages[:-1]
            # Explicit guard instead of relying on an IndexError being swallowed below.
            if not messages:
                continue
            text = processor.apply_chat_template(messages, tokenize=False)
            if '<|AUDIO|>' not in text:
                continue
        except Exception:
            # Deliberate best-effort: malformed rows are dropped, not fatal.
            continue

        # A missing or null filename must skip the row rather than crash the
        # worker (os.path.exists raises TypeError on None/NaN).
        f = r.get('audio_filename')
        if not isinstance(f, str) or not os.path.exists(f):
            continue

        data.append({
            'text': text,
            'audio': f,
        })
    return data

In [8]:
# glob() returns paths in arbitrary, filesystem-dependent order. Sorting the
# shard paths keeps the row order, and therefore slices such as data[-10:]
# and the saved output, reproducible across runs and machines.
data = []
for f in sorted(glob('without-audio/*.parquet')):
    data.extend(pd.read_parquet(f).to_dict(orient = 'records'))

len(data)

722004

In [9]:
# Dry run in the current process on the last 10 rows; the 0 is a placeholder
# slice index, which `loop` ignores.
processed = loop((data[-10:], 0))
len(processed)

100%|█████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 1201.53it/s]


10

In [10]:
# Full run: process every row across 20 worker processes. This rebinds
# `processed` to the result for the whole dataset.
processed = multiprocessing(data, loop, cores = 20)

100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:06<00:00, 5510.31it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:07<00:00, 4678.51it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:06<00:00, 5333.67it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:07<00:00, 4878.42it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:07<00:00, 4910.16it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:06<00:00, 5284.19it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:07<00:00, 4864.85it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:07<00:00, 4920.43it/s]


In [11]:
# Number of rows that survived the filtering done in `loop`.
len(processed)

720969

In [12]:
# Spot-check one rendered example (the second-to-last record).
processed[-2]

{'text': '<|im_start|>system\nAct as a voice assistant chatbot. Keep every response under 300 characters. Be accurate, brief, and easy to understand when spoken aloud. Don’t overexplain or repeat. Ask for clarification only when needed. Prioritize clarity and brevity at all times.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\n<|im_end|>\n<|im_start|>assistant\nD. menyeberangi<|im_end|>\n',
 'audio': 'mallm-v3/6562.mp3'}

In [13]:
# Persist the prepared records as a single JSON array.
output_path = 'prepare-Malaysian-Speech-Instructions.json'
with open(output_path, 'w') as sink:
    json.dump(processed, sink)