In [1]:
from huggingface_hub import snapshot_download

# Fetch the dataset repository from the Hugging Face Hub into a local folder.
# `allow_patterns` restricts the download to the parquet shards under data/.
# `f` holds whatever snapshot_download returns (presumably the local snapshot
# path -- confirm against the huggingface_hub docs); note that the name `f` is
# reused as a loop variable in a later cell.
f = snapshot_download(
    repo_id="mesolitica/Malaysian-Speech-Description-Timestamp-Instructions",
    repo_type='dataset',
    allow_patterns="data/*.parquet",
    local_dir="./Malaysian-Speech-Description-Timestamp-Instructions")

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

data/train-00000-of-00002.parquet:   0%|          | 0.00/150M [00:00<?, ?B/s]

data/train-00001-of-00002.parquet:   0%|          | 0.00/158M [00:00<?, ?B/s]

In [2]:
from glob import glob
from tqdm import tqdm
from multiprocess import Pool
import itertools
import zipfile
import os

def chunks(l, n):
    """
    Lazily split a sliceable sequence into consecutive pieces of length `n`.

    Yields `(piece, index)` tuples, where `piece` is `l[start:start + n]` and
    `index` is the zero-based position of that piece. The final piece is
    shorter when `len(l)` is not a multiple of `n`.
    """
    starts = range(0, len(l), n)
    for index, start in enumerate(starts):
        stop = start + n
        yield (l[start:stop], index)


def multiprocessing(strings, function, cores=6, returned=True):
    """
    Split `strings` into chunks and map `function` over them in a process pool.

    Parameters
    ----------
    strings : sequence
        Items to process; must support `len()` and slicing.
    function : callable
        Called once per chunk with a `(chunk, chunk_index)` tuple as produced
        by `chunks`. When `returned` is true, each call must return an
        iterable so the results can be concatenated.
    cores : int
        Number of worker processes, and the target number of chunks.
    returned : bool
        When true, flatten the per-chunk results into a single list.

    Returns
    -------
    list or None
        The flattened results when `returned` is true, otherwise None.
    """
    # Nothing to do: avoid starting a pool (and a zero-sized chunk) for
    # empty input.
    if len(strings) == 0:
        return [] if returned else None

    # `len(strings) // cores` is 0 whenever there are fewer items than cores,
    # which would make `range()` inside `chunks` raise ValueError; clamp to 1.
    chunk_size = max(1, len(strings) // cores)
    batches = list(chunks(strings, chunk_size))

    # Never start more workers than there are chunks to hand out.
    pool = Pool(min(cores, len(batches)))
    try:
        pooled = pool.map(function, batches)
    finally:
        # Release the worker processes even if a worker raised.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain(*pooled))

def loop(files):
    """
    Unpack every zip archive of one chunk into the current directory.

    `files` is a `(paths, chunk_index)` tuple as produced by `chunks`; the
    index is ignored. Each archive is deleted from disk once it has been
    extracted. Returns None.

    Note: a later cell in this notebook defines another function that is also
    called `loop`, which replaces this one once that cell has run.
    """
    archive_paths, _chunk_index = files
    target_dir = './'
    for archive_path in tqdm(archive_paths):
        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(target_dir)
        # Only reached when extraction succeeded without raising.
        os.remove(archive_path)

# files = glob('*.zip')
# if len(files):
#     multiprocessing(files, loop, cores = min(len(files), 20), returned = False)

In [3]:
from transformers import AutoProcessor



In [4]:
# Load the processor for Qwen2-Audio-7B-Instruct; its chat template is what a
# later cell uses to render each question/answer pair into training text.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
# Handle on the processor's tokenizer (not referenced by the cells visible here).
tokenizer = processor.tokenizer

In [6]:
from glob import glob
import pandas as pd

# Load every downloaded parquet shard into one flat list of row dicts.
rows = []

# glob() returns paths in arbitrary order, so sort them to keep the row order
# (and therefore the order of the processed output) reproducible across runs.
files = sorted(glob('Malaysian-Speech-Description-Timestamp-Instructions/data/*.parquet'))
for f in files:
    # NOTE: despite its name, `df` is a list of per-row dicts, not a DataFrame.
    df = pd.read_parquet(f).to_dict(orient = 'records')
    rows.extend(df)

In [7]:
# Total number of rows loaded across all parquet shards.
len(rows)

445224

In [8]:
import json
import pandas as pd

def loop(rows):
    """
    Render one chunk of dataset rows into chat-template training examples.

    `rows` is a `(rows, chunk_index)` tuple as produced by `chunks`; the index
    is ignored. Each row is read through its 'audio_filename', 'question' and
    'answer' keys.

    A row is skipped (best effort) when its audio file does not exist on disk
    or when building/rendering the conversation raises. Skipped rows are
    counted and reported once per chunk through `logging` instead of being
    dropped silently.

    Returns a list of dicts with two keys: 'text' (the rendered conversation)
    and 'audio' (the row's audio file path).

    Note: this redefines the `loop` function from the earlier zip-extraction
    cell of this notebook.
    """
    import logging  # function-scope import so the cell needs no new imports

    rows, _ = rows
    data = []
    missing_audio = 0
    template_errors = 0
    first_error = None

    for r in tqdm(rows):
        f = r['audio_filename']
        if not os.path.exists(f):
            missing_audio += 1
            continue

        try:
            conversation = [
                {"role": "user", "content": [
                    # Fixed placeholder string; the row's real path is stored
                    # separately under 'audio' below.
                    {"type": "audio", "audio_url": "audio.wav"},
                    {"type": "text", "text": r['question']},
                ]},
                {"role": "assistant", "content": r['answer']},
            ]
            text = processor.apply_chat_template(conversation, tokenize=False)
        except Exception as e:
            # Keep the best-effort behaviour, but remember what went wrong.
            template_errors += 1
            if first_error is None:
                first_error = repr(e)
            continue

        data.append({
            'text': text,
            'audio': f,
        })

    if missing_audio or template_errors:
        logging.getLogger(__name__).warning(
            'skipped %d row(s) with missing audio and %d row(s) that failed '
            'templating (first error: %s)',
            missing_audio, template_errors, first_error,
        )
    return data

In [9]:
# Render all rows in parallel with 30 worker processes; the per-chunk lists
# returned by `loop` are flattened into a single list.
processed = multiprocessing(rows, loop, cores = 30)

 26%|█████████████████████                                                           | 3902/14840 [00:00<00:01, 5745.42it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 14840/14840 [00:02<00:00, 5659.13it/s]
 65%|████████████████████████████████████████████████████▎                           | 9699/14840 [00:01<00:00, 5659.57it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 14840/14840 [00:02<00:00, 5608.69it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 14840/14840 [00:02<00:00, 5660.16it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 14840/14840 [00:02<00:00, 5656.23it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 14840/14840 [00:02<00:00, 5640.86it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 14840/14840 [00:02<00:00, 5573.01it/s]


In [10]:
# Number of examples that were kept; compare with len(rows) to see how many
# rows were skipped (missing audio file or templating error).
len(processed)

445224

In [11]:
# Persist the processed examples (a list of {'text', 'audio'} dicts) as one
# JSON array in the current working directory.
with open('prepare-Malaysian-Speech-Description-Timestamp-Instructions.json', 'w') as fopen:
    json.dump(processed, fopen)

In [12]:
# Inspect the first rendered example as a spot check of the template output.
processed[0]

{'text': '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\ndescribe the audio include timestamps<|im_end|>\n<|im_start|>assistant\nAudio ini memulakan dengan nada mengajak dan menggalakkan. Pada cap masa 0.00-1.80, penutur mengajak warga Sabah, terutamanya masyarakat Tionghoa, untuk berganding bahu menjaga keamanan dan kemakmuran di kawasan mereka. Ini menunjukkan semangat kerjasama antara etnik dan komuniti.\n\nPada 7.10-12.08, penutur memberikan butiran tentang program yang diadakan di Dewan Sekolah Tinggi Cina Sabah, yang dihadiri oleh kira-kira 3,000 orang. Acara ini dimeriahkan dengan pelbagai persembahan kebudayaan, menunjukkan perpaduan budaya dan sokongan komuniti.\n\nNada keseluruhan audio ini adalah positif dan inklusif, dengan nada mengajak dan merayakan kepelbagaian budaya. Ia mencerminkan usaha untuk memupuk hubungan baik dan keharmonian antara komuniti yang berbeza di Sabah.<|im_end|>\n',
 'audio':