In [1]:
import json
import pandas as pd
import os
from tqdm import tqdm
from glob import glob
from datasets import Dataset, Audio

audio = Audio(sampling_rate = 16000)

In [2]:
with open('instructions-keys.json') as fopen:
    instructions = json.load(fopen)

In [24]:
files = glob('partition-instructions-part-*.json')
files = [f for f in files if os.path.exists(f.replace('.json', ''))]
files

['partition-instructions-part-0.json',
 'partition-instructions-part-2.json',
 'partition-instructions-part-4.json',
 'partition-instructions-part-5.json',
 'partition-instructions-part-3.json',
 'partition-instructions-part-1.json',
 'partition-instructions-part-17.json']

In [25]:
filtered = []
for f in files:
    folder = f.replace('.json', '')
    with open(f) as fopen:
        data = json.load(fopen)
        
    for i, d in tqdm(enumerate(data)):
        filename = os.path.join(folder, f'{i}.mp3')
        if not os.path.exists(filename):
            continue
        
        d['prompt'] = json.dumps(d['prompt'])
        d['audio_filename'] = filename
        d['dataset'] = instructions.get(d['prompt'])
        
        filtered.append(d)
        
len(filtered)

30000it [00:00, 65275.58it/s]
30000it [00:00, 34045.39it/s]
30000it [00:00, 115190.79it/s]
30000it [00:00, 82779.81it/s]
30000it [00:00, 63943.60it/s]
30000it [00:00, 82808.36it/s] 
30000it [00:00, 143301.76it/s]


133388

In [26]:
with open('tatabahasa.json') as fopen:
    tatabahasa = json.load(fopen)
    
for i, row in tqdm(enumerate(tatabahasa)):
    filename = os.path.join('tatabahasa', f'{i}.mp3')
    if not os.path.exists(filename):
        continue
    q = row['question']
    if 'IV' in q or 'II' in q:
        continue
    d = {
        'prompt': json.dumps([
            {"role": "user", "content": [{"type": "audio", "audio_url": "audio.wav"}]},
            {'role': 'assistant', 'content': row['answer']},
        ]),
        'question': row['question'],
        'audio_filename': filename,
        'dataset': 'tatabahasa',
        'speaker': row['speaker']
    }
    filtered.append(d)

1284it [00:00, 97458.99it/s]


In [27]:
filtered[-1]

{'prompt': '[{"role": "user", "content": [{"type": "audio", "audio_url": "audio.wav"}]}, {"role": "assistant", "content": "C. dengan, ke"}]',
 'question': 'Isi tempat kosong dalam ayat-ayat di bawah dengan jawapan yang paling sesuai.\nKami dijangka __________ terlewat sampai __________ Pulau Langkawi kerana kereta mengalami kerosakan.\n\nA. dari, di\nB. akan, di\nC. akan, dari\nD. dengan, ke',
 'audio_filename': 'tatabahasa/1283.mp3',
 'dataset': 'tatabahasa',
 'speaker': {'audio': 'dedup-parliament/parlimen-24k-LANGSUNG： Persidangan Dewan Rakyat ｜ Mesyuarat Pertama Penggal Ketiga 7 Mac 2024 ｜ Sesi Petang [RvL2ZIBGkzM]_000_278.mp3',
  'transcription': 'Malah, impact-nya turut disedari sendiri oleh yang amat berhormat Perdana Menteri yang menadakan lawatan ke negeri Pulau Pinang pada bulan yang lalu.'}}

In [28]:
with open('mallm.json') as fopen:
    mallm = json.load(fopen)
    
for i, row in tqdm(enumerate(mallm)):
    filename = os.path.join('mallm', f'{i}.mp3')
    if not os.path.exists(filename):
        continue
    q = row['question']
    if 'IV' in q or 'II' in q:
        continue
    d = {
        'prompt': json.dumps([
            {"role": "user", "content": [{"type": "audio", "audio_url": "audio.wav"}]},
            {'role': 'assistant', 'content': row['answer']},
        ]),
        'question': row['question'],
        'audio_filename': filename,
        'dataset': 'mallm',
        'speaker': row['speaker']
    }
    filtered.append(d)

6564it [00:00, 144149.55it/s]


In [29]:
len(filtered)

136257

In [30]:
filtered[-1]

{'prompt': '[{"role": "user", "content": [{"type": "audio", "audio_url": "audio.wav"}]}, {"role": "assistant", "content": "A. jerebu"}]',
 'question': 'Berikut adalah kegiatan ekonomi di Malaysia. Perindustrian, Pengangkutan. Kegiatan ekonomi tersebut boleh menyebabkan berlakunya\n\nA. jerebu\nB. banjir kilat\nC. kemarau\nD. perubahan angin',
 'audio_filename': 'mallm/3933.mp3',
 'dataset': 'mallm',
 'speaker': {'audio': 'dedup-parliament/parlimen-24k-LANGSUNG ： Persidangan Dewan Negara 16 Ogos 2022 ｜ Sesi Petang [pSD6QGx8GL4]_001_335.mp3',
  'transcription': 'Manakala, manakala kapal hilang masih lagi belum siap-siap. Ada, tapi belum siap.'}}

In [31]:
dataset = Dataset.from_list(filtered)

In [32]:
dataset = dataset.cast_column("audio_filename", audio)
dataset

Dataset({
    features: ['prompt', 'question', 'speaker', 'audio_filename', 'dataset'],
    num_rows: 136257
})

In [None]:
dataset.push_to_hub('malaysia-ai/Speech-Instructions')

Uploading the dataset shards:   0%|          | 0/14 [00:00<?, ?it/s]

Map:   0%|          | 0/9733 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/98 [00:00<?, ?ba/s]

Map:   0%|          | 0/9733 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/98 [00:00<?, ?ba/s]

Map:   0%|          | 0/9733 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/98 [00:00<?, ?ba/s]

Map:   0%|          | 0/9733 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/98 [00:00<?, ?ba/s]

Map:   0%|          | 0/9733 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/98 [00:00<?, ?ba/s]

Map:   0%|          | 0/9733 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/98 [00:00<?, ?ba/s]

Map:   0%|          | 0/9733 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/98 [00:00<?, ?ba/s]

Map:   0%|          | 0/9733 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/98 [00:00<?, ?ba/s]

Map:   0%|          | 0/9733 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/98 [00:00<?, ?ba/s]

Map:   0%|          | 0/9732 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/98 [00:00<?, ?ba/s]

Map:   0%|          | 0/9732 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/98 [00:00<?, ?ba/s]

Map:   0%|          | 0/9732 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/98 [00:00<?, ?ba/s]

Map:   0%|          | 0/9732 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/98 [00:00<?, ?ba/s]

In [None]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj="partition-instructions-part-3.json",
    path_in_repo="text/partition-instructions-part-3.json",
    repo_id="malaysia-ai/Speech-Instructions",
    repo_type="dataset",
)

In [None]:
api.upload_file(
    path_or_fileobj="partition-instructions-part-4.json",
    path_in_repo="text/partition-instructions-part-4.json",
    repo_id="malaysia-ai/Speech-Instructions",
    repo_type="dataset",
)

In [None]:
api.upload_file(
    path_or_fileobj="partition-instructions-part-5.json",
    path_in_repo="text/partition-instructions-part-5.json",
    repo_id="malaysia-ai/Speech-Instructions",
    repo_type="dataset",
)

In [None]:
api.upload_file(
    path_or_fileobj="partition-instructions-part-6.json",
    path_in_repo="text/partition-instructions-part-6.json",
    repo_id="malaysia-ai/Speech-Instructions",
    repo_type="dataset",
)