In [1]:
from huggingface_hub import snapshot_download

# Download only the parquet shards matching data/*.parquet from the dataset
# repo into a local directory next to this notebook.
snapshot_download(
    repo_id="mesolitica/Malaysian-Reasoning-Speech-Instructions",
    repo_type='dataset',
    allow_patterns="data/*.parquet",
    local_dir="./Malaysian-Reasoning-Speech-Instructions",
)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

data/train-00001-of-00005.parquet:   0%|          | 0.00/202M [00:00<?, ?B/s]

data/train-00003-of-00005.parquet:   0%|          | 0.00/454M [00:00<?, ?B/s]

data/train-00000-of-00005.parquet:   0%|          | 0.00/354M [00:00<?, ?B/s]

data/train-00004-of-00005.parquet:   0%|          | 0.00/307M [00:00<?, ?B/s]

data/train-00002-of-00005.parquet:   0%|          | 0.00/439M [00:00<?, ?B/s]

'/home/mesolitica/stt/Malaysian-Reasoning-Speech-Instructions'

In [2]:
from glob import glob
from tqdm import tqdm
from multiprocess import Pool
import itertools
import zipfile
import os

def chunks(l, n):
    """Yield ``(batch, batch_index)`` pairs, slicing ``l`` into runs of ``n`` items.

    The last batch is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    starts = range(0, len(l), n)
    for batch_no, start in enumerate(starts):
        yield l[start: start + n], batch_no


def multiprocessing(strings, function, cores=6, returned=True):
    """Run ``function`` over ``strings`` in parallel and optionally flatten the results.

    ``strings`` is cut into consecutive batches by ``chunks`` and every batch is
    handed to ``function`` as a ``(batch, batch_index)`` tuple on a pool of
    ``cores`` worker processes.

    Parameters
    ----------
    strings : sequence
        Items to process; must support ``len`` and slicing.
    function : callable
        Receives one ``(batch, batch_index)`` tuple and returns an iterable.
    cores : int
        Number of worker processes; also used to derive the batch size.
    returned : bool
        When true, return the per-batch results concatenated in batch order;
        otherwise return ``None``.
    """
    if len(strings) == 0:
        # Nothing to do; avoid starting worker processes for no work.
        return [] if returned else None

    # Floor division yields 0 when there are fewer items than cores, which
    # makes range() inside chunks raise "arg 3 must not be zero"; clamp to 1.
    batch_size = max(1, len(strings) // cores)
    df_split = chunks(strings, batch_size)

    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
    finally:
        # Always release the workers, even when a task raised.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain(*pooled))

In [3]:
from glob import glob
import pandas as pd
import os
import json
from transformers import AutoProcessor



In [4]:
# Load the Qwen2-Audio processor; loop() uses its chat template to render each
# conversation into a single training string.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
# NOTE(review): tokenizer is not referenced by any other visible cell — confirm it is still needed.
tokenizer = processor.tokenizer

In [5]:
# Collect the downloaded parquet shards in sorted order and display the list.
files = sorted(glob('Malaysian-Reasoning-Speech-Instructions/data/*.parquet'))
files

['Malaysian-Reasoning-Speech-Instructions/data/train-00000-of-00005.parquet',
 'Malaysian-Reasoning-Speech-Instructions/data/train-00001-of-00005.parquet',
 'Malaysian-Reasoning-Speech-Instructions/data/train-00002-of-00005.parquet',
 'Malaysian-Reasoning-Speech-Instructions/data/train-00003-of-00005.parquet',
 'Malaysian-Reasoning-Speech-Instructions/data/train-00004-of-00005.parquet']

In [6]:
# Create the audio output directory; exist_ok keeps the cell re-runnable and
# avoids depending on a shell `mkdir` being available.
os.makedirs('Malaysian-Reasoning-Speech-Instructions-audio', exist_ok=True)

In [17]:
def loop(files):
    """Extract audio and build chat-formatted training text for a batch of parquet shards.

    Parameters
    ----------
    files : tuple
        ``(paths, batch_index)`` as produced by ``chunks``; only ``paths`` (a
        list of parquet file paths) is used.

    Returns
    -------
    list of dict
        One ``{'text': ..., 'audio': ...}`` record per successfully converted
        row, where ``audio`` is the path of the audio file written for it.

    Rows whose prompt cannot be parsed or rendered are skipped with a warning,
    and no audio file is written for them.
    """
    import warnings

    out_dir = 'Malaysian-Reasoning-Speech-Instructions-audio'
    system_prompt = 'You are going to enter reasoning mode. First, you try to think step-by-step in Malay. After that, put your final answer.'

    files, _ = files
    if files:
        # Do not rely on an earlier notebook cell having created the folder;
        # exist_ok also makes this safe when several workers start at once.
        os.makedirs(out_dir, exist_ok=True)

    data = []
    for f in files:
        f_only = os.path.split(f)[1].replace('.parquet', '')
        df = pd.read_parquet(f)
        # Look the columns up once per shard instead of once per row.
        rows = zip(df['audio_filename'], df['prompt'])
        for i, (audio, raw_prompt) in enumerate(tqdm(rows, total=len(df))):
            new_f = os.path.join(out_dir, f'{f_only}-{i}.mp3')

            # Build the text first so a bad row never leaves an orphan audio
            # file on disk without a matching record.
            try:
                prompt = json.loads(raw_prompt)
                conversation = [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": [
                        # Placeholder URL: only the audio slot in the rendered
                        # template matters here, not the file name.
                        {"type": "audio", "audio_url": "audio.wav"},
                    ]},
                    # The last message of the stored prompt is used as the target.
                    {"role": "assistant", "content": prompt[-1]['content']},
                ]
                text = processor.apply_chat_template(conversation, tokenize=False)
            except Exception as e:
                # Best-effort: keep going, but make the skipped row visible.
                warnings.warn(f'skipping row {i} of {f}: {e!r}')
                continue

            with open(new_f, 'wb') as fopen:
                fopen.write(audio['bytes'])

            data.append({
                'text': text,
                'audio': new_f,
            })
    return data

In [18]:
# Dry run on the first shard only, in the current process, before running
# every shard in parallel.
processed = loop((files[:1], 0))

100%|█████████████████████████████████████████████████████████████████████████████████| 5038/5038 [00:01<00:00, 3092.16it/s]


In [19]:
# Number of records produced from the first shard.
len(processed)

5038

In [22]:
# Inspect the rendered chat template of the last record.
print(processed[-1]['text'])

<|im_start|>system
You are going to enter reasoning mode. First, you try to think step-by-step in Malay. After that, put your final answer.<|im_end|>
<|im_start|>user
Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>
<|im_end|>
<|im_start|>assistant
Saya akan melakukan analisis **panjang dan terperinci** seperti yang anda minta. Pertama sekali, mari kita lihat ayat dalam dialek Pahang ini dan cuba kita memahami komponen-komponennya satu persatu.

## **Teks Dialek:**

**"Dia dengkat lepaih jatoh pokok."**

## **Langkah 1: Memahami Konteks dan Nada Ayat**

Sebelum kita pecahkan ayat ini, penting untuk mendapatkan idea tentang **konteks dan nada** di sebalik perkataan-perkataan ini. Ayat ini kelihatan seperti satu pernyataan yang menggambarkan keadaan seseorang selepas kejadian tertentu, iaitu jatuh dari pokok.

Kita boleh meneka bahawa **nada** ayat ini **netral atau informatif**, tanpa menunjukkan emosi yang kuat seperti kemarahan atau kegembiraan. Namun, **kesan perkataan seperti "dengkat" 

In [23]:
# Process all shards with one worker per shard, capped at 30 processes;
# this replaces the dry-run result held in `processed`.
processed = multiprocessing(files, loop, cores = min(len(files), 30))

100%|█████████████████████████████████████████████████████████████████████████████████| 5038/5038 [00:01<00:00, 3455.49it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 5038/5038 [00:01<00:00, 3201.08it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 5038/5038 [00:01<00:00, 2960.37it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 5038/5038 [00:01<00:00, 2997.03it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 5038/5038 [00:01<00:00, 2980.57it/s]


In [24]:
# Total number of records across all shards.
len(processed)

25190

In [25]:
# Persist the text/audio records as a single JSON array.
serialized = json.dumps(processed)
with open('prepare-Malaysian-Reasoning-Speech-Instructions.json', 'w') as out_file:
    out_file.write(serialized)