In [1]:
from huggingface_hub import snapshot_download

snapshot_download(
    repo_type='dataset',
    repo_id="mesolitica/Malaysian-Multiturn-Chat-Assistant", 
    allow_patterns="data/*.parquet", local_dir = './Malaysian-Multiturn-Chat-Assistant')

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

'/home/mesolitica/stt/Malaysian-Multiturn-Chat-Assistant'

In [2]:
# snapshot_download(
#     repo_type='dataset',
#     repo_id="mesolitica/Malaysian-Multiturn-Chat-Assistant", 
#     allow_patterns=[
#         "Malaysian-Multiturn-Chat-Assistant-*.zip",
#     ],
#     local_dir = './')

In [3]:
from glob import glob
from tqdm import tqdm
from multiprocess import Pool
import itertools
import zipfile
import os

def chunks(l, n):
    for i in range(0, len(l), n):
        yield (l[i: i + n], i // n)


def multiprocessing(strings, function, cores=6, returned=True):
    df_split = chunks(strings, len(strings) // cores)
    pool = Pool(cores)
    pooled = pool.map(function, df_split)
    pool.close()
    pool.join()

    if returned:
        return list(itertools.chain(*pooled))

def loop(files):
    files, _ = files
    for zip_file_path in tqdm(files):
        destination_folder = './'
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(destination_folder)
        os.remove(zip_file_path)

# files = glob('*.zip')
# if len(files):
#     multiprocessing(files, loop, cores = min(len(files), 20), returned = False)

In [7]:
from glob import glob
import pandas as pd
import os
import json
from transformers import AutoProcessor

In [5]:
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
tokenizer = processor.tokenizer

In [8]:
data = []
for f in glob('Malaysian-Multiturn-Chat-Assistant/data/*.parquet'):
    data.extend(pd.read_parquet(f).to_dict(orient = 'records'))
    
len(data)

241508

In [12]:
data[0]['system_prompt']

'You are a helpful and professional **customer support assistant** working for **Maybank Visa Cards**. Your main task is to help customers manage their credit card accounts, including:\n\n* Emergency blocking & cancellation\n* Handling unauthorized transactions\n* Explaining payment processes\n* Providing fraud prevention tips\n* Guiding cardholders through replacement procedures\n\nYou should maintain a **friendly, helpful, and professional tone** while communicating with customers, regardless of the situation. Address them politely (e.g., "Mr.," "Ms.," "Sir," "Ma\'am") and show empathy towards their concerns.\n\n### **Emergency Blocking & Cancellation**\n\nWhen a user requests an emergency block:\n\n* Verify their identity using full name and IC number\n* Ask if there were any unauthorized transactions recently\n* Inform about the necessity of a police report for suspected fraud\n* Proceed with emergency block and cancellation immediately\n* Confirm through SMS or email notification\

In [10]:
conversation = json.loads(data[0]['conversation'])
conversation

[{'role': 'user',
  'content': [{'type': 'audio',
    'audio_url': 'Malaysian-Multiturn-Chat-Assistant-manglish/0.mp3'}]},
 {'role': 'assistant',
  'content': "Of course, Sir. I'm here to assist you. First, I'll need your full name and IC number for verification."},
 {'role': 'user',
  'content': [{'type': 'text',
    'text': 'Name: Aiman Shah Bin Ahmad, IC: 921011-14-7888. I dunno where the card now, mungkin kena robbery ke, I also blur blur.'}]},
 {'role': 'assistant',
  'content': "Understood, Mr. Aiman. We'll flag the account for immediate cancellation. However, before we proceed, can you confirm if there were any recent transactions you didn't authorize?"},
 {'role': 'user',
  'content': [{'type': 'text',
    'text': 'Yeah, I notice some funny charges last week. Like RM500 and RM300 ones, I never buy anything like that.'}]},
 {'role': 'assistant',
  'content': "That confirms our suspicion. I'll initiate an emergency block and cancel your card. You'll receive an SMS confirmation sh

In [21]:
def loop(rows):
    rows, _ = rows
    data = []
    for row in tqdm(rows):
        try:
            conversation = json.loads(row['conversation'])
            conversation = [
                {'role': 'system', 'content': row['system_prompt']}
            ] + conversation
            
            text = processor.apply_chat_template(conversation, tokenize=False)
            audio = []
            for c in conversation:
                if c['role'] == 'user':
                    for c_ in c['content']:
                        if c_['type'] == 'audio':
                            audio.append(c_['audio_url'])
        except Exception as e:
            continue

        data.append({
            'text': text,
            'audio': audio,
        })
    return data

In [22]:
processed = loop((data[-100:], 0))

100%|███████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 3097.99it/s]


In [23]:
len(processed)

100

In [24]:
processed[0]

{'text': "<|im_start|>system\nYou are a helpful, polite, and knowledgeable virtual assistant for a large Malaysian bank that handles a variety of financial services. Your main responsibility is to interact with customers regarding their savings, deposits, and fixed deposit accounts. Users can inquire about account balances, transaction history, fixed deposit maturity, interest rates, and other related matters. The communication is exclusively in Bahasa Melayu, maintaining a professional and helpful tone throughout the conversation.\n\nYour tasks include, but are not limited to:\n\n### Account Verification & Personal Information Protection\n\n* Always begin by politely asking for the user's identification, such as:\n  * Nombor Akaun Deposit\n  * Nama penuh (if necessary)\n* Ensure data privacy at all times. Do not display sensitive details unless identity is confirmed.\n\n### Deposits, Savings, & Fixed Deposits\n\n* Retrieve and present the latest balance, interest rate, and transaction

In [25]:
processed = multiprocessing(data, loop, cores = 20)

100%|███████████████████████████████████████████████████████████████████████████████| 12075/12075 [00:03<00:00, 3677.24it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 12075/12075 [00:03<00:00, 3940.14it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 12075/12075 [00:03<00:00, 3791.89it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 12075/12075 [00:03<00:00, 3696.20it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 12075/12075 [00:03<00:00, 3684.57it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 12075/12075 [00:03<00:00, 3672.56it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 12075/12075 [00:03<00:00, 3699.71it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 12075/12075 [00:03<00:00, 3604.54it/s]


In [26]:
len(processed)

241508

In [27]:
processed[-2]

{'text': '<|im_start|>system\nYou are a polite, informative, and helpful virtual assistant representing a hospital in Malaysia. Your primary role is to provide clear and accurate information about medical services, procedures, and facilities to patients and visitors. You will use Bahasa Melayu as your main language of communication and maintain a respectful and professional tone at all times.\n\nAs a trusted source of medical information, you will adhere to best practices and follow evidence-based guidelines. You will never make assumptions or guesses about a patient\'s condition or treatment. Instead, you will direct them to qualified healthcare professionals or relevant resources.\n\nYour responsibilities include, but are not limited to:\n\n### Medical Information & Education\n\n* Answer questions about medical conditions, symptoms, and treatments, such as:\n\n  * Rawatan fizikal (physical therapy)\n  * Proses rawatan, contohnya:\n    * "Apakah proses rawatan fisioterapi?"\n  * Keper

In [28]:
with open('prepare-Malaysian-Multiturn-Chat-Assistant.json', 'w') as fopen:
    json.dump(processed, fopen)