In [1]:
import json
import pandas as pd
import os
from tqdm import tqdm
from glob import glob
from datasets import Dataset, Audio

audio = Audio(sampling_rate = 16000)

In [2]:
with open('instructions-keys.json') as fopen:
    instructions = json.load(fopen)

In [3]:
with open('fix-instructions-mixtral-multiturn.json') as fopen:
    mixtral_multiturn = json.load(fopen)

In [13]:
files = glob('short-coding-*.json')
files = [f for f in files if os.path.exists(f.replace('.json', ''))]
files

['short-coding-2.json', 'short-coding-0.json', 'short-coding-1.json']

In [14]:
filtered = []
for f in files:
    folder = f.replace('.json', '')
    with open(f) as fopen:
        data = json.load(fopen)
    
    for i, d in tqdm(enumerate(data)):
        filename = os.path.join(folder, f'{i}.mp3')
        if not os.path.exists(filename):
            continue
        
        d = {
            'prompt': json.dumps([
                {"role": "user", "content": [{"type": "audio", "audio_url": "audio.wav"}]},
                {'role': 'assistant', 'content': d['answer']},
            ]),
            'question': d['question'],
            'audio_filename': filename,
            'dataset': 'short-coding',
            'speaker': d['speaker']
        }
        filtered.append(d)
len(filtered)

18990it [00:00, 75352.20it/s] 
18991it [00:00, 63408.36it/s]
18991it [00:00, 95039.50it/s] 


45415

In [15]:
files = glob('partition-instructions-part-*.json')
files = [f for f in files if os.path.exists(f.replace('.json', ''))]
files

['partition-instructions-part-7.json',
 'partition-instructions-part-16.json',
 'partition-instructions-part-8.json',
 'partition-instructions-part-0.json',
 'partition-instructions-part-2.json',
 'partition-instructions-part-4.json',
 'partition-instructions-part-5.json',
 'partition-instructions-part-6.json',
 'partition-instructions-part-9.json',
 'partition-instructions-part-10.json',
 'partition-instructions-part-3.json',
 'partition-instructions-part-1.json',
 'partition-instructions-part-17.json']

In [16]:
from collections import defaultdict

selected = []
already = defaultdict(set)
count = defaultdict(int)
for f in files:
    folder = f.replace('.json', '')
    with open(f) as fopen:
        data = json.load(fopen)
        
    for i, d in tqdm(enumerate(data)):
        filename = os.path.join(folder, f'{i}.mp3')
        if not os.path.exists(filename):
            continue
        
        d['prompt'] = json.dumps(d['prompt'])
        d['audio_filename'] = filename
        d['dataset'] = instructions.get(d['prompt'], 'unknown')
        if 'mixtral' in d['dataset'] and d['question'] in mixtral_multiturn:
            d['prompt'] = mixtral_multiturn[d['question']]
        
        count[d['dataset']] += 1
        
        q = d['question'].lower()
        if q in already[d['dataset']]:
            continue
        
        already[d['dataset']].add(q)
        filtered.append(d)
        
len(filtered)

30000it [00:00, 61380.93it/s]
30000it [00:00, 51500.33it/s]
30000it [00:00, 81298.72it/s]
30000it [00:00, 81128.52it/s]
30000it [00:00, 47330.94it/s]
30000it [00:00, 47902.29it/s]
30000it [00:00, 81769.45it/s] 
30000it [00:00, 140412.50it/s]
30000it [00:00, 85810.47it/s]
30000it [00:00, 318692.70it/s]
30000it [00:00, 62060.82it/s]
30000it [00:00, 105149.83it/s]
30000it [00:01, 26093.31it/s]


316719

In [17]:
for k, v in already.items():
    print(k, len(v), count[k])

mixtral_critis_malaysia 70312 70342
unknown 482 488
force_jawi 22150 30699
mixtral_critis_politician 5 5
chatgpt4_malaysian_general_qa 26505 26573
malaysian_ultrachat 87266 89992
malaysian_alpaca 18243 18245
synthetic_coding 3282 3282
mixtral_conversation_stupid 43059 43506


In [18]:
with open('tatabahasa.json') as fopen:
    tatabahasa = json.load(fopen)
    
for i, row in tqdm(enumerate(tatabahasa)):
    filename = os.path.join('tatabahasa', f'{i}.mp3')
    if not os.path.exists(filename):
        continue
    q = row['question']
    if 'IV' in q or 'II' in q:
        continue
    d = {
        'prompt': json.dumps([
            {"role": "user", "content": [{"type": "audio", "audio_url": "audio.wav"}]},
            {'role': 'assistant', 'content': row['answer']},
        ]),
        'question': row['question'],
        'audio_filename': filename,
        'dataset': 'tatabahasa',
        'speaker': row['speaker']
    }
    filtered.append(d)

1284it [00:00, 169739.23it/s]


In [19]:
filtered[-1]

{'prompt': '[{"role": "user", "content": [{"type": "audio", "audio_url": "audio.wav"}]}, {"role": "assistant", "content": "C. dengan, ke"}]',
 'question': 'Isi tempat kosong dalam ayat-ayat di bawah dengan jawapan yang paling sesuai.\nKami dijangka __________ terlewat sampai __________ Pulau Langkawi kerana kereta mengalami kerosakan.\n\nA. dari, di\nB. akan, di\nC. akan, dari\nD. dengan, ke',
 'audio_filename': 'tatabahasa/1283.mp3',
 'dataset': 'tatabahasa',
 'speaker': {'audio': 'dedup-parliament/parlimen-24k-LANGSUNG： Persidangan Dewan Rakyat ｜ Mesyuarat Pertama Penggal Ketiga 7 Mac 2024 ｜ Sesi Petang [RvL2ZIBGkzM]_000_278.mp3',
  'transcription': 'Malah, impact-nya turut disedari sendiri oleh yang amat berhormat Perdana Menteri yang menadakan lawatan ke negeri Pulau Pinang pada bulan yang lalu.'}}

In [20]:
with open('mallm.json') as fopen:
    mallm = json.load(fopen)
    
for i, row in tqdm(enumerate(mallm)):
    filename = os.path.join('mallm', f'{i}.mp3')
    if not os.path.exists(filename):
        continue
    q = row['question']
    if 'IV' in q or 'II' in q:
        continue
    d = {
        'prompt': json.dumps([
            {"role": "user", "content": [{"type": "audio", "audio_url": "audio.wav"}]},
            {'role': 'assistant', 'content': row['answer']},
        ]),
        'question': row['question'],
        'audio_filename': filename,
        'dataset': 'mallm',
        'speaker': row['speaker']
    }
    filtered.append(d)

6564it [00:00, 201229.47it/s]


In [21]:
len(filtered)

321225

In [22]:
dataset = Dataset.from_list(filtered)

In [23]:
dataset = dataset.cast_column("audio_filename", audio)
dataset

Dataset({
    features: ['prompt', 'question', 'audio_filename', 'dataset', 'speaker'],
    num_rows: 321225
})

In [24]:
dataset.push_to_hub('malaysia-ai/Speech-Instructions')

Uploading the dataset shards:   0%|          | 0/35 [00:00<?, ?it/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9178 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9177 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9177 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9177 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9177 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Map:   0%|          | 0/9177 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/malaysia-ai/Speech-Instructions/commit/566a9ab90ffd06a16cbe37fa406ab14bc8ee6208', commit_message='Upload dataset', commit_description='', oid='566a9ab90ffd06a16cbe37fa406ab14bc8ee6208', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/malaysia-ai/Speech-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='malaysia-ai/Speech-Instructions'), pr_revision=None, pr_num=None)

In [None]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj="partition-instructions-part-3.json",
    path_in_repo="text/partition-instructions-part-3.json",
    repo_id="malaysia-ai/Speech-Instructions",
    repo_type="dataset",
)

In [None]:
api.upload_file(
    path_or_fileobj="partition-instructions-part-4.json",
    path_in_repo="text/partition-instructions-part-4.json",
    repo_id="malaysia-ai/Speech-Instructions",
    repo_type="dataset",
)

In [None]:
api.upload_file(
    path_or_fileobj="partition-instructions-part-5.json",
    path_in_repo="text/partition-instructions-part-5.json",
    repo_id="malaysia-ai/Speech-Instructions",
    repo_type="dataset",
)

In [None]:
api.upload_file(
    path_or_fileobj="partition-instructions-part-6.json",
    path_in_repo="text/partition-instructions-part-6.json",
    repo_id="malaysia-ai/Speech-Instructions",
    repo_type="dataset",
)