In [57]:
import json
import re
import pandas as pd
import random
import string
import os

In [3]:
speakers = pd.read_parquet('dedup-malaysian-podcasts.parquet').to_dict(orient = 'records')
parliaments = pd.read_parquet('dedup-malaysia-parliament.parquet').to_dict(orient = 'records')
parliaments = random.sample(parliaments, 100000)
speakers.extend(parliaments)
len(speakers)

173384

In [39]:
random.shuffle(speakers)

In [35]:
questions = []
with open('combined-malaysian-reasoning.jsonl') as fopen:
    for l in fopen:
        l = json.loads(l)
        if len(l[1]['content']) > 350:
            continue
        questions.append(l)
        
len(questions)

33785

In [36]:
ends = set()

def should_expand(word):
    # Skip if word contains non-letters before digit (e.g., hyphen, slash)
    if re.search(r'[^a-zA-Z](\d+)$', word):
        return False
    # Skip if starts with known currency or capital letter (optional)
    if word.startswith(("RM", "USD", "SGD", 'rm')):
        return False
    if re.fullmatch(r'[A-Z]+\d+', word):  # e.g. A1, B3
        return False
    if word.isdigit():  # e.g. 10, 156
        return False
    return True

def expand_repeated_words(text):
    def replacer(match):
        word = match.group(1)
        if len(word) < 2:
            return match.group(0)
        num = int(match.group(2))
        if not should_expand(match.group(0)):
            return match.group(0)
        return ' '.join([word] * num)

    # Match words like 'satu2', 'kawan2' but not 'RM50' or 'ke-3'
    pattern = r'\b([a-zA-Z]+)(\d+)\b'
    return re.sub(pattern, replacer, text)

for i in range(len(questions)):
    q = questions[i][1]['content']
    pronunciation = re.sub(r'_+', ', Tempat Kosong , ', q).replace('\n', ' ')
    pronunciation = re.sub(r'[ ]+', ' ', pronunciation).strip()
    questions[i][1]['pronunciation'] = pronunciation
    if 'terjemah' in pronunciation:
        before = pronunciation
        pronunciation = expand_repeated_words(pronunciation)
        if before != pronunciation:
            print(pronunciation)
        
        questions[i][1]['pronunciation'] = pronunciation
        words = pronunciation.split()
        for w in words:
            if w[-1] in string.digits:
                ends.add(w)

Aku teringin nak cantik macam kawan kawan perempuan aku. terjemah ke inggeris baku
Aku sentap jugak kadang kadang, yela effort aku orang tak nampak lepas tu suka suka cakap yang negatif pasal aku. terjemah ke standard english
Bila keluar dengan kawan kawan aku yang flawless, aku dah tak malu lagi. terjemah ke standard inggeris
Abang Abang aku adalah hasil perkahwinan abah dan isteri pertama manakala kakak dan aku anak abah dan mak aku. terjemah ke inggeris baku
Adalah lebih baik jika mengeluarkan peluh dari liang liang pori kulit muka, muka akan nampak glowing dan pinkish akibat peredaran darah yang baik. terjemah ke standard inggeris
Bagitau lebih lebih hampa tak kenai pun hehe. terjemah ke inggeris baku
Kwan Kwan perempuan aku pun tak ada masalah kerja dalam bidang ni. terjemah ke standard inggeris
Takkan dah tua tua pun nak panjat panjat berpanas. terjemah ke standard inggeris
So aku nak tanye pendapat para engineer wanita atau engineer lelaki atau siapa siapa yang dalam bidang cons

In [37]:
ends

{'(20',
 '(4-5',
 '(5-6',
 '(7',
 '000',
 '1',
 '10',
 '11',
 '12',
 '14',
 '15',
 '156',
 '2',
 '2-3',
 '20',
 '2011',
 '2016',
 '2018',
 '2019',
 '24',
 '24/7',
 '27',
 '2A2',
 '3',
 '3.00',
 '38',
 '4',
 '4.5',
 '45',
 '5',
 '500',
 '55',
 '6',
 '60',
 '7',
 '7A1',
 '8',
 '8000',
 '85',
 '9',
 'A1',
 'A2',
 'B3',
 'B5',
 'RM34',
 'RM50',
 'RM500',
 'RM600',
 'b4',
 'ke-3',
 'ke-5'}

In [59]:
!mkdir prepare-malaysian-reasoning

mkdir: cannot create directory ‘prepare-malaysian-reasoning’: File exists


In [62]:
import shutil

prepared = []
for no, q in enumerate(questions):
    splitted = os.path.split(speakers[no]['audio'])
    new_f = os.path.join('prepare-malaysian-reasoning', splitted[1])
    shutil.copyfile(speakers[no]['audio'], new_f)
    prepared.append({
        'speaker': speakers[no],
        'question': q,
        'new_audio_filename': new_f,
    })

In [63]:
prepared[-1]

{'speaker': {'audio': 'dedup-parliament/parlimen-24k-LANGSUNG ： Sesi Kamar Khas ｜ Persidangan Dewan Rakyat ｜ 12 November 2024 [f_LN9qnukT8]_000_112.mp3',
  'transcription': 'Dan sepuluh peratus ini biasanya daripada hasil kutipan jawatankuasa Kariah Masjid.'},
 'question': [{'role': 'system',
   'content': 'You are going to enter reasoning mode, first you try to think step-by-step in malay after that give the final answer.'},
  {'role': 'user',
   'content': 'Mungkin bagi mak ayah aku biasa je.\n\nterjemah ke standard english',
   'pronunciation': 'Mungkin bagi mak ayah aku biasa je. terjemah ke standard english'},
  {'role': 'assistant',
   'content': 'Okay, mari kita pecahkan ayat Melayu tempatan ini dan ubahnya menjadi bahasa Inggeris standard langkah demi langkah.\n\n## Langkah Demi Langkah: Terjemahan Ayat Melayu Tempatan ke Bahasa Inggeris Standard\n\nAyat Melayu tempatan:\n**"Mungkin bagi mak ayah aku biasa je."**\n\nMari kita fahami dan pecahkan ayat ini untuk diterjemah dengan

In [64]:
!du -hs prepare-malaysian-reasoning

1.3G	prepare-malaysian-reasoning


In [65]:
pd.DataFrame(prepared).to_parquet('prepared-malaysian-reasoning.parquet')

In [66]:
!zip -rq prepare-malaysian-reasoning.zip prepare-malaysian-reasoning

In [68]:
!huggingface-cli upload mesolitica/Malaysian-Reasoning-Speech-Instruction prepared-malaysian-reasoning.parquet prepare/prepared-malaysian-reasoning.parquet --repo-type dataset

Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
prepared-malaysian-reasoning.parquet: 100%|█| 94.0M/94.0M [00:11<00:00, 8.22MB/s
https://huggingface.co/datasets/mesolitica/Malaysian-Reasoning-Speech-Instruction/blob/main/prepare/prepared-malaysian-reasoning.parquet


In [69]:
!HF_HUB_ENABLE_HF_TRANSFER="1" huggingface-cli upload mesolitica/Malaysian-Reasoning-Speech-Instruction prepare-malaysian-reasoning.zip prepare/prepare-malaysian-reasoning.zip --repo-type dataset

  0%|                                                     | 0/1 [00:00<?, ?it/s]
prepare/prepare-malaysian-reasoning.zip:   0%|      | 0.00/1.28G [00:00<?, ?B/s][A
prepare/prepare-malaysian-reasoning.zip:   1%| | 16.0M/1.28G [00:02<03:01, 6.97M[A
prepare/prepare-malaysian-reasoning.zip:   2%| | 32.0M/1.28G [00:09<06:46, 3.07M[A
prepare/prepare-malaysian-reasoning.zip:   4%| | 48.0M/1.28G [00:10<04:03, 5.05M[A
prepare/prepare-malaysian-reasoning.zip:   5%| | 64.0M/1.28G [00:10<02:39, 7.61M[A
prepare/prepare-malaysian-reasoning.zip:   6%| | 80.0M/1.28G [00:10<01:44, 11.5M[A
prepare/prepare-malaysian-reasoning.zip:   7%| | 96.0M/1.28G [00:11<01:20, 14.7M[A
prepare/prepare-malaysian-reasoning.zip:   9%| | 112M/1.28G [00:11<01:07, 17.3MB[A
prepare/prepare-malaysian-reasoning.zip:  10%| | 128M/1.28G [00:13<01:10, 16.4MB[A
prepare/prepare-malaysian-reasoning.zip:  11%| | 144M/1.28G [00:13<00:59, 19.0MB[A
prepare/prepare-malaysian-reasoning.zip:  12%| | 160M/1.28G [00:13<00:46, 24.3M