In [None]:
from huggingface_hub import snapshot_download

# Fetch only the files matching `without-audio/*.parquet` from the dataset
# repo and place them under the current working directory.
snapshot_download(
    repo_id='mesolitica/Malaysian-Speech-Instructions',
    repo_type='dataset',
    allow_patterns='without-audio/*.parquet',
    local_dir='./',
)

In [25]:
# List the parquet files now present in ./without-audio.
!ls without-audio

mallm.parquet  speech-instructions.parquet  tatabahasa.parquet


In [1]:
from glob import glob
from tqdm import tqdm
import pandas as pd
import os
import json

In [2]:
# Flatten every source parquet under ./without-audio into one list of row dicts.
#
# glob() yields paths in arbitrary, filesystem-dependent order, so the paths
# are sorted to keep the row order (and positional lookups such as data[0])
# stable between runs.
#
# The final cell of this notebook uploads its result to the same repo folder as
# `without-audio/voice_assistant.parquet`; after a fresh download that file
# would sit next to the sources, so it is skipped here to avoid re-ingesting
# already-processed rows.
DERIVED_SHARD = 'voice_assistant.parquet'

data = []
for f in sorted(glob('without-audio/*.parquet')):
    if os.path.basename(f) == DERIVED_SHARD:
        continue
    data.extend(pd.read_parquet(f).to_dict(orient = 'records'))

len(data)

350833

In [3]:
# Load the summarization table from the working directory and preview it;
# the `text` and `answer` columns are the ones used further down.
summary = pd.read_parquet(path='summarization.parquet')
summary.head(n=5)

Unnamed: 0,answer,text
0,Malaysia's public transport system is fragment...,Sistem pengangkutan awam Malaysia telah mengha...
1,Malaysia is vulnerable to climate change due t...,Malaysia is indeed one of the countries most v...
2,PDRM berusaha keras untuk menangani aktiviti A...,Perkembangan terkini dalam menangani isu aktiv...
3,Usaha membantu masyarakat Bajau di Sabah terma...,"Untuk membantu masyarakat Bajau di Sabah, terd..."
4,Saya adalah AI yang direka untuk membantu dan ...,"Itulah betul! Saya adalah program komputer, te..."


In [4]:
# Build a lookup from each `text` value to its `answer`.
# Walking both columns in lockstep avoids two positional `.iloc` scalar
# lookups per row, which is pure overhead on a frame of over a million rows.
# When the same `text` appears more than once, the last row wins (as before).
summaries = {}
for text, answer in tqdm(zip(summary['text'], summary['answer']), total=len(summary)):
    summaries[text] = answer

100%|██████████████████████████████████████████████████████████████████████████| 1264550/1264550 [00:15<00:00, 81499.87it/s]


In [5]:
# Pool of interchangeable system prompts. One entry is drawn at random for each
# conversation when the chat messages are assembled further down. Every variant
# asks for replies under 300 characters.
system_prompt = [
    'you are chatbot that use for voice assistant, your response always precise and shorter than 300 characters',
    'You are a chatbot designed specifically for use as a voice assistant. Your responses must be clear, precise, and always shorter than 300 characters. Prioritize natural, conversational language suitable for speech. Avoid filler, long explanations, or complex phrasing. Do not include unnecessary details unless explicitly asked. If a user’s request is ambiguous, briefly ask for clarification within the character limit. Aim to be helpful, fast, and concise in all replies.',
    'You are a helpful voice assistant chatbot. Always reply with short, clear answers under 300 characters. Speak naturally, like in conversation. Keep it direct—no fluff, no long explanations. If you’re unsure, ask a quick clarifying question.',
    'Act as a voice assistant chatbot. Keep every response under 300 characters. Be accurate, brief, and easy to understand when spoken aloud. Don’t overexplain or repeat. Ask for clarification only when needed. Prioritize clarity and brevity at all times.',
    "You're a chatbot for voice use. Talk like a helpful person, not a machine. Keep it short—under 300 characters. Focus on what the user needs. Don’t ramble. Be polite, fast, and smart. If the user’s unclear, just ask quickly and keep going.",
    'You are a voice-based assistant designed to be brief and precise. All replies must be under 300 characters. Speak in a friendly, natural tone suitable for spoken output. Avoid technical jargon and long replies. Ask short questions if more info is needed.',
]

In [6]:
# Peek at one record: the `prompt` field is a JSON-encoded list of chat
# messages, each with a `role` and a `content`.
json.loads(data[0]['prompt'])

[{'role': 'user', 'content': [{'type': 'audio', 'audio_url': 'audio.wav'}]},
 {'role': 'assistant', 'content': 'B. buaya tembaga'}]

In [17]:
import random

# Assistant replies longer than this many characters get replaced by their
# summary (matches the limit stated in the system prompts).
MAX_REPLY_CHARS = 300

# Seeded, private generator: the system prompt drawn for each conversation is
# the same on every run, and the global `random` state is left untouched.
prompt_rng = random.Random(42)


def _build_messages(conversation, system_content, summaries, max_chars=MAX_REPLY_CHARS):
    """Prepend a system turn and shorten over-long assistant replies.

    Args:
        conversation: list of message dicts (`role`, `content`) decoded from a
            record's `prompt` field. Assistant entries are updated in place.
        system_content: text of the leading system message.
        summaries: mapping from an original reply to its shorter replacement.
        max_chars: replies longer than this are looked up in `summaries`.

    Returns:
        The assembled message list. Assembly stops right before the first
        assistant turn whose content is None, or whose content is longer than
        `max_chars` and has no key in `summaries`; later turns are not added.
    """
    messages = [{'role': 'system', 'content': system_content}]
    for turn in conversation:
        if turn['role'] == 'assistant':
            reply = turn['content']
            if reply is None:
                break
            if len(reply) > max_chars:
                if reply not in summaries:
                    break
                shortened = summaries[reply]
                # A key that maps to None keeps the original (long) reply.
                if shortened is not None:
                    reply = shortened
            turn['content'] = reply
        messages.append(turn)
    return messages


processed = []
for row in tqdm(data):
    messages = _build_messages(
        json.loads(row['prompt']),
        prompt_rng.choice(system_prompt),
        summaries,
    )

    # One system turn plus complete (user, assistant) pairs gives an odd
    # count. An even count means the list ends without a matching reply
    # (assuming roles alternate), e.g. because assembly stopped early above,
    # so the record is dropped.
    if len(messages) % 2 == 0:
        continue

    processed.append({
        'prompt': json.dumps(messages),
        'audio_filename': row['audio_filename'],
    })

100%|████████████████████████████████████████████████████████████████████████████| 350833/350833 [00:07<00:00, 45489.97it/s]


In [18]:
# How many conversations survived the filtering above.
len(processed)

143451

In [22]:
# Spot-check one processed record.
processed[10000]

{'prompt': '[{"role": "system", "content": "You are a helpful voice assistant chatbot. Always reply with short, clear answers under 300 characters. Speak naturally, like in conversation. Keep it direct\\u2014no fluff, no long explanations. If you\\u2019re unsure, ask a quick clarifying question."}, {"role": "user", "content": "Bagaimana mungkin Malaysia masih mempunyai undang-undang yang menjadikan wanita tidak sah keluar di khalayak ramai selepas tengah malam?"}, {"role": "assistant", "content": "Di Malaysia, undang-undang batasan kediaman menghalang wanita keluar lewat malam di tempat awam. Ini ketinggalan zaman dan diskriminatif, melanggar hak kebebasan pergerakan. Usaha sedang dijalankan untuk reformasi atau pembatalan undang-undang tersebut."}, {"role": "user", "content": [{"type": "audio", "audio_url": "audio.wav"}]}, {"role": "assistant", "content": "Enakmen Kediaman Terhad di Malaysia tengah dikaji untuk dipinda demi mengurangkan diskriminasi. Usaha sedang dibuat untuk menjadik

In [23]:
# Spot-check another processed record further into the list.
processed[100000]

{'prompt': '[{"role": "system", "content": "Act as a voice assistant chatbot. Keep every response under 300 characters. Be accurate, brief, and easy to understand when spoken aloud. Don\\u2019t overexplain or repeat. Ask for clarification only when needed. Prioritize clarity and brevity at all times."}, {"role": "user", "content": [{"type": "audio", "audio_url": "audio.wav"}, {"type": "text", "text": ""}]}, {"role": "assistant", "content": "PBB main peranan penting jaga keselamatan dunia. Dia gune agensi keselamatan, diplomasi, dan sokong negara krisis. Tujuan utama PBB cegah perang, promote kerjasama antara negara, dan pastikan ketenteraman dunia."}, {"role": "user", "content": "Bagaimana pula PBB berusaha mempromosikan hak asasi manusia di seluruh dunia?"}, {"role": "assistant", "content": "PBB berperanan promosi hak asasi manusia dengan deklarasi, pemantauan pelanggaran, sidang berkala, latihan negara, dan kempen kesedaran. Tujuannya ialah untuk memastikan keadilan, kesaksamaan, dan

In [24]:
# Write the kept records (`prompt`, `audio_filename`) to a parquet file in the
# working directory.
pd.DataFrame(data=processed).to_parquet(
    path='Malaysian-Speech-Instructions-summary.parquet',
)

In [26]:
# Upload the local parquet to the dataset repo, stored there as
# without-audio/voice_assistant.parquet.
!huggingface-cli upload mesolitica/Malaysian-Speech-Instructions \
Malaysian-Speech-Instructions-summary.parquet without-audio/voice_assistant.parquet \
--repo-type=dataset

Uploading files using Xet Storage..
Uploading...: 100%|██████████████████████████| 118M/118M [00:12<00:00, 9.21MB/s]
https://huggingface.co/datasets/mesolitica/Malaysian-Speech-Instructions/blob/main/without-audio/voice_assistant.parquet
