In [1]:
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-SFT/resolve/main/data/ayat_aktif_pasif-00000-of-00001.parquet

In [2]:
# !wget https://huggingface.co/datasets/azrilhafizi/tatabahasa/resolve/main/tatabahasa.jsonl
# !wget https://huggingface.co/datasets/azrilhafizi/MaLLM-Bench/resolve/main/all.json

In [5]:
import pandas as pd
import json
import re
import random

In [6]:
# Read the parliament speaker table and keep it as a plain list with one
# dict per row, so rows can be sampled and JSON-serialised later on.
speakers = (
    pd.read_parquet('dedup-malaysia-parliament.parquet')
    .to_dict('records')
)
len(speakers)

610804

In [7]:
# Accumulators for the converted records of the two question sources.
tatabahasa, mallm = [], []

# Option letters in positional order; index i gives the letter of option i.
ABCDE = 'ABCDE'

# Spoken spelling used for each option letter in the pronunciation text.
sound = dict(zip(ABCDE, ('Ae', 'Bee', 'See', 'Dee', 'Eee')))

# Convert every line of tatabahasa.jsonl into a multiple-choice record and
# append it to `tatabahasa`. Each record carries:
#   'question'      - instruction (+ optional question) followed by "K. text" lines
#   'pronunciation' - single-line spoken form of the same content
#   'answer'        - "<key>. <text>" of the choice flagged as correct
with open('tatabahasa.jsonl') as fopen:
    for l in fopen:
        l = json.loads(l)
        answer = None
        answer_text = None
        min_len = 999
        for k, v in l['choices'].items():
            choice_text = v['text'].strip()
            min_len = min(min_len, len(choice_text))
            if v['answer']:
                # Capture the text together with the key. Assigning the text
                # on every iteration would leave the *last* choice's text in
                # `answer_text`, pairing the right key with the wrong text.
                answer = k
                answer_text = choice_text

        # Skip questions where any choice is shorter than 3 characters.
        if min_len < 3:
            continue
        # Skip questions where no choice is flagged as the answer.
        if answer is None:
            continue

        # Written form: one "K. text" line per choice.
        choices = []
        for k, v in l['choices'].items():
            t = v['text']
            choices.append(f'{k}. {t}')

        q = l['instruction']
        if l['question'] is not None:
            q = q + '\n' + l['question']

        question = q + '\n\n' + '\n'.join(choices)

        # Spoken form: the letter is replaced by its spelled-out sound and
        # commas inside the choice text are dropped.
        choices = []
        for k, v in l['choices'].items():
            t = v['text'].replace(',', '')
            choices.append(f'{sound[k]} , {t} .')

        pronunciation = q + '\n\n' + ' '.join(choices)
        # Runs of underscores (fill-in blanks) become the spoken placeholder,
        # newlines become spaces, and repeated spaces are collapsed.
        pronunciation = re.sub(r'_+', ', Tempat Kosong , ', pronunciation).replace('\n', ' ')
        pronunciation = re.sub(r'[ ]+', ' ', pronunciation).strip()

        tatabahasa.append({
            'question': question,
            'pronunciation': pronunciation,
            'answer': f'{answer}. {answer_text}'
        })
        
# Load the questions from all.json and convert each one into the same record
# layout ('question', 'pronunciation', 'answer'), appending to `mallm`.
with open('all.json') as fopen:
    d = json.load(fopen)

for d_ in d:
    options = d_['options']

    # Written form: options are lettered by position (A, B, C, ...).
    lettered = [f'{ABCDE[i]}. {opt}' for i, opt in enumerate(options)]

    # Drop the whole question if any option is shorter than 3 characters.
    if any(len(opt) < 3 for opt in options):
        continue

    question = d_['question'] + '\n\n' + '\n'.join(lettered)

    # Spoken form: each letter is replaced by its spelled-out sound.
    spoken = [f'{sound[ABCDE[i]]} , {opt} .' for i, opt in enumerate(options)]

    pronunciation = d_['question'] + '\n\n' + '\n'.join(spoken)
    # Underscore runs become the spoken blank placeholder; newlines become
    # spaces; '%' is spelled out; finally repeated spaces are collapsed.
    pronunciation = re.sub(r'_+', ', Tempat Kosong , ', pronunciation)
    pronunciation = pronunciation.replace('\n', ' ').replace('%', ' peratus ')
    pronunciation = re.sub(r'[ ]+', ' ', pronunciation).strip()

    # d_['answer'] is used as a 1-based position into the options list.
    correct = d_['answer'] - 1
    record = {
        'question': question,
        'pronunciation': pronunciation,
        'answer': f"{ABCDE[correct]}. {options[correct]}",
    }
    mallm.append(record)

len(tatabahasa), len(mallm)

(1284, 6564)

In [8]:
# Spot-check one converted record (the second-to-last entry of `mallm`).
mallm[-2]

{'question': 'Isi tempat kosong dalam ayat-ayat di bawah dengan jawapan yang paling sesuai. Sang Kancil yang cerdik itu berjaya __________ sungai yang dalam dengan tipu helahnya.\n\nA. melepasi\nB. melangkahi\nC. mengharungi\nD. menyeberangi',
 'pronunciation': 'Isi tempat kosong dalam ayat-ayat di bawah dengan jawapan yang paling sesuai. Sang Kancil yang cerdik itu berjaya , Tempat Kosong , sungai yang dalam dengan tipu helahnya. Ae , melepasi . Bee , melangkahi . See , mengharungi . Dee , menyeberangi .',
 'answer': 'D. menyeberangi'}

In [19]:
# Attach one distinct, randomly drawn entry of `speakers` to every tatabahasa
# record, then write the records to tatabahasa.json.
# A dedicated seeded generator is used so that re-running this cell produces
# the same assignment (and therefore the same output file) every time.
rng = random.Random(0)
s = rng.sample(speakers, k = len(tatabahasa))

for record, speaker in zip(tatabahasa, s):
    record['speaker'] = speaker

with open('tatabahasa.json', 'w') as fopen:
    json.dump(tatabahasa, fopen)

In [12]:
# Attach one distinct, randomly drawn entry of `speakers` to every mallm
# record, then write the records to mallm.json.
# A dedicated seeded generator is used so that re-running this cell produces
# the same assignment (and therefore the same output file) every time.
rng = random.Random(1)
s = rng.sample(speakers, k = len(mallm))

for record, speaker in zip(mallm, s):
    record['speaker'] = speaker

with open('mallm.json', 'w') as fopen:
    json.dump(mallm, fopen)

In [13]:
# Inspect a single mallm record, including its attached 'speaker' entry.
# NOTE(review): 1284 equals the tatabahasa record count printed earlier;
# presumably just an arbitrary index here - confirm.
mallm[1284]

{'question': 'Bahan-bahan manakah adalah bukan elektrolit? I. Etanol, II. Asetamida, III. Plumbum(II), bromida IV. Natrium klorida\n\nA. I dan II sahaja\nB. I dan III sahaja\nC. II dan IV sahaja\nD. III dan IV sahaja',
 'pronunciation': 'Bahan-bahan manakah adalah bukan elektrolit? I. Etanol, II. Asetamida, III. Plumbum(II), bromida IV. Natrium klorida Ae , I dan II sahaja . Bee , I dan III sahaja . See , II dan IV sahaja . Dee , III dan IV sahaja .',
 'answer': 'A. I dan II sahaja',
 'speaker': {'audio': 'dedup-parliament/parlimen-24k-chunk_processed_parlimen-24k-LANGSUNG ： Sesi Kamar Khas ｜ Persidangan Dewan Rakyat ｜ 21 Mac 2024 [9KNfI8pn3n4]_000_parlimen-24k-LANGSUNG ： Sesi Kamar Khas ｜ Persidangan Dewan Rakyat ｜ 21 Mac 2024 [9KNfI8pn3n4]_000_103_0.mp3',
  'transcription': 'Sindang Bahat. Paket kerja kosong lapan, Singkong Fleet, JV, Sindang Bahat. Paket kerja, sepuluh, ah, satu kosong, eh, sepuluh.'}}