In [1]:
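# Prerequisite (run once, then keep commented out): download the SFT sample that the JSONL-loading cell below reads.
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-SFT/resolve/main/combine/combined-malaysian-sft-5k-sample.jsonl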

In [2]:
from huggingface_hub import snapshot_download

# Fetch only the parquet shards this notebook reads (longer-response, manglish and
# voice-assistant data) into a local folder next to the notebook.
snapshot_download(
    repo_type="dataset",
    repo_id="mesolitica/Malaysian-Instructions",
    local_dir="./Malaysian-Instructions",
    allow_patterns=[
        'data/longer*.parquet',
        'data/*manglish*.parquet',
        'data/voice*.parquet',
    ],
)

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

'/home/mesolitica/stt/Malaysian-Instructions'

In [3]:
import librosa
import torch
import torch.nn as nn
import pandas as pd
from datasets import Audio
from transformers import AutoTokenizer
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import streaming
import numpy as np
from tqdm import tqdm
from glob import glob
import os
import json
from multiprocess import Pool
import itertools

def chunks(l, n):
    """
    Yield consecutive slices of `l` holding at most `n` items each.

    Every yielded value is a `(slice, index)` tuple, where `index` is the
    zero-based position of that slice in the sequence of slices.
    """
    for batch_index, start in enumerate(range(0, len(l), n)):
        stop = start + n
        yield (l[start:stop], batch_index)

def multiprocessing(strings, function, cores=6, returned=True):
    """
    Split `strings` into contiguous batches and map `function` over them in a process pool.

    Parameters
    ----------
    strings : sequence
        Items to process; must support `len()` and slicing.
    function : callable
        Called once per batch with a `(batch, batch_index)` tuple as produced by `chunks`.
        Must return an iterable when `returned` is true.
    cores : int
        Number of worker processes; also used to derive the batch size.
    returned : bool
        If true, return the per-batch results flattened into a single list;
        otherwise nothing is returned.
    """
    # Floor division yields 0 when there are fewer items than cores, which would make
    # `chunks` fail on a zero range step; always use at least one item per batch.
    batch_size = max(1, len(strings) // cores)
    df_split = chunks(strings, batch_size)
    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
    finally:
        # Release the workers even when a task raised.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain(*pooled))

class UInt32(Encoding):
    """MDS column encoding that stores an array as its raw bytes and reads it back as uint32."""

    def encode(self, obj) -> bytes:
        """Serialize `obj` through its `tobytes()` method; no dtype conversion happens here."""
        raw = obj.tobytes()
        return raw

    def decode(self, data: bytes):
        """Reinterpret the stored bytes as a one-dimensional array of uint32 values."""
        return np.frombuffer(data, dtype=np.uint32)

# Register the encoding under the 'uint32' name that `columns` refers to.
_encodings['uint32'] = UInt32

# Per-sample schema handed to MDSWriter: column name -> encoding name.
# The three array columns use the custom 'uint32' encoding registered above.
columns = dict(
    input_ids='uint32',
    position_ids='uint32',
    attention_mask='uint32',
    audio='str',
    text='str',
)
# Hash algorithm names passed to MDSWriter.
hashes = ('sha1', 'xxh64')



In [4]:
# Tokenizer used by `loop` below to render each conversation with its chat template and tokenize it.
tokenizer = AutoTokenizer.from_pretrained('mesolitica/Malaysian-Audio-Qwen2.5-7B-Instruct')

In [5]:
# Number of tokens per packed sample; used as the default block size of `loop`.
sequence_length = 10 * 1024

# Model dtype and the most negative finite value it can represent.
torch_dtype = torch.bfloat16
min_dtype = torch.finfo(torch_dtype).min

In [6]:
# Load the base SFT sample: the file holds one JSON document per line, and every
# parsed document becomes one entry of `combine`.
combine = []
# Pass the encoding explicitly so the read does not depend on the platform's locale default.
with open('combined-malaysian-sft-5k-sample.jsonl', encoding='utf-8') as fopen:
    for l in tqdm(fopen):
        # A blank line is not a record; skip it instead of letting json.loads fail on it.
        if not l.strip():
            continue
        combine.append(json.loads(l))

len(combine)

190597it [00:02, 67644.17it/s] 


190597

In [7]:
# List every parquet shard present in the local dataset folder.
glob('Malaysian-Instructions/data/*.parquet')

['Malaysian-Instructions/data/voice_assistant-00000-of-00002.parquet',
 'Malaysian-Instructions/data/mixed_manglish-00000-of-00002.parquet',
 'Malaysian-Instructions/data/voice_assistant-00001-of-00002.parquet',
 'Malaysian-Instructions/data/longer_respond-00000-of-00001.parquet',
 'Malaysian-Instructions/data/manglish-00000-of-00001.parquet',
 'Malaysian-Instructions/data/mixed_manglish-00001-of-00002.parquet']

In [8]:
voice_assistant = glob('Malaysian-Instructions/data/voice*.parquet')

# Each row's `question` column is a JSON-encoded list of chat turns.
for f in voice_assistant:
    df = pd.read_parquet(f)
    questions = df['question']
    for row_idx in tqdm(range(len(df))):
        turns = json.loads(questions.iloc[row_idx])
        chat = []
        for turn in turns:
            # When a turn carries `content_ms`, it replaces `content`
            # (presumably the Malay-language variant -- verify against the dataset card).
            if 'content_ms' in turn:
                turn['content'] = turn.pop('content_ms')
            # The first turn without content ends the conversation; earlier turns are kept.
            if turn['content'] is None:
                break
            chat.append(turn)
        if chat:
            combine.append(chat)

100%|████████████████████████████████████████████████████████████████████████████| 224611/224611 [00:04<00:00, 55575.57it/s]
100%|████████████████████████████████████████████████████████████████████████████| 224610/224610 [00:03<00:00, 65865.74it/s]


In [9]:
manglish = glob('Malaysian-Instructions/data/*manglish*.parquet')

# Turn every usable question/answer row into a two-turn conversation.
for f in manglish:
    print(f)
    df = pd.read_parquet(f)
    questions, answers = df['question'], df['answer']
    for i in tqdm(range(len(df))):
        question, answer = questions.iloc[i], answers.iloc[i]
        # A row is usable only when both sides are present and have a length of at least 10.
        usable = (
            question is not None
            and answer is not None
            and len(question) >= 10
            and len(answer) >= 10
        )
        if not usable:
            continue

        chat = [
            {'role': 'user', 'content': question},
            {'role': 'assistant', 'content': answer},
        ]
        combine.append(chat)

Malaysian-Instructions/data/mixed_manglish-00000-of-00002.parquet


100%|████████████████████████████████████████████████████████████████████████████| 165430/165430 [00:06<00:00, 27235.21it/s]


Malaysian-Instructions/data/manglish-00000-of-00001.parquet


100%|████████████████████████████████████████████████████████████████████████████| 118466/118466 [00:03<00:00, 30246.02it/s]


Malaysian-Instructions/data/mixed_manglish-00001-of-00002.parquet


100%|████████████████████████████████████████████████████████████████████████████| 165430/165430 [00:05<00:00, 30205.07it/s]


In [10]:
# NOTE: `manglish` is rebound here to the longer-response shard list.
manglish = glob('Malaysian-Instructions/data/*longer*.parquet')

# Every usable row becomes a three-turn conversation that starts with this fixed system prompt.
system_prompt = "You are a highly knowledgeable and articulate chatbot. Your primary role is to provide very long, detailed, and precise explanations on any topic the user asks about. Structure your responses with clear logic, thorough reasoning, and factual depth. Avoid oversimplifying complex ideas. If appropriate, break your answers into sections with headings, examples, and breakdowns. Ensure every part of the user's question is fully addressed."

for f in manglish:
    print(f)
    df = pd.read_parquet(f)
    questions, answers = df['question'], df['answer']
    for i in tqdm(range(len(df))):
        question, answer = questions.iloc[i], answers.iloc[i]
        # A row is usable only when both sides are present and have a length of at least 10.
        usable = (
            question is not None
            and answer is not None
            and len(question) >= 10
            and len(answer) >= 10
        )
        if not usable:
            continue

        chat = [
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': question},
            {'role': 'assistant', 'content': answer},
        ]
        combine.append(chat)

Malaysian-Instructions/data/longer_respond-00000-of-00001.parquet


100%|████████████████████████████████████████████████████████████████████████████████| 3148/3148 [00:00<00:00, 29761.39it/s]


In [12]:
# Total number of conversations collected by the cells above.
len(combine)

1092292

In [13]:
import gc

def collator(batch, batch_position_ids):
    """
    Flatten a group of token sequences into one sample dict.

    Parameters
    ----------
    batch : list of sequences
        Token-id sequences that together form one sample.
    batch_position_ids : list of sequences
        Position ids, one sequence per entry of `batch`.

    Returns
    -------
    dict
        `input_ids` and `position_ids` are the concatenated inputs as uint32 arrays.
        `attention_mask` holds the length of each entry of `batch` (not a 0/1 mask).
        `audio` and `text` are empty strings.
    """
    flat_ids = []
    flat_positions = []
    segment_lengths = []
    for idx, segment in enumerate(batch):
        flat_ids.extend(segment)
        flat_positions.extend(batch_position_ids[idx])
        segment_lengths.append(len(segment))

    def as_uint32(values):
        return np.array(values).astype(np.uint32)

    return {
        'input_ids': as_uint32(flat_ids),
        'position_ids': as_uint32(flat_positions),
        'attention_mask': as_uint32(segment_lengths),
        'audio': '',
        'text': '',
    }

def slice_and_balance(nested_list, size):
    """
    Split a list of sequences into a head holding `size` items in total and the remainder.

    Sequences are consumed in order. The sequence that crosses the `size` boundary is cut
    in two: its leading part goes to the head and its trailing part to the remainder.

    Returns
    -------
    tuple
        `(first, balance)`: the sequences (or leading slice) that fit into `size` items,
        and everything left over.
    """
    head, leftover = [], []
    filled = 0

    for item in nested_list:
        room = size - filled
        if room <= 0:
            # The head is already full: everything else is carried over untouched.
            leftover.append(item)
            continue

        if len(item) > room:
            # Boundary sequence: split it exactly where the head becomes full.
            head.append(item[:room])
            leftover.append(item[room:])
            filled = size
        else:
            head.append(item)
            filled += len(item)

    return head, leftover

In [14]:
# -p: succeed when the directory already exists, so re-running the notebook does not error here.
!mkdir -p tokenized-10k

In [16]:
import time

def loop(files, block_size = sequence_length):
    """
    Tokenize one shard of conversations and pack the tokens into fixed-length MDS samples.

    Every conversation is rendered with the tokenizer's chat template, tokenized, and
    appended to a running buffer. Whenever the buffer holds at least `block_size` tokens,
    the first `block_size` tokens are cut off (splitting a conversation across samples if
    needed) and written as one sample built by `collator`. Position ids restart at 0 for
    every conversation.

    Leftover tokens that do not fill a whole sample are topped up with the leading tokens
    of the last written sample, so the final sample is also exactly `block_size` long.

    Parameters
    ----------
    files : tuple
        `(rows, index)` as yielded by `chunks`: `rows` is the list of conversations and
        `index` selects the output directory `tokenized-10k/tokenized-{index}`.
    block_size : int
        Number of tokens per written sample.

    Returns
    -------
    dict or None
        The final topped-up sample if one was written. None when there are no leftover
        tokens, or when the shard never filled a single sample (there is nothing to top
        up from, so the leftover tokens are not written).
    """
    import shutil  # function-scope import: only needed for the cleanup below

    rows, index = files
    out_root = f'tokenized-10k/tokenized-{index}'
    # Remove any output left by a previous run, without going through a shell.
    shutil.rmtree(out_root, ignore_errors=True)
    count = 0  # number of tokens currently buffered in `temp`
    temp = []
    position_ids = []
    last_block, last_position_block = None, None
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for row in tqdm(rows):
            prompt = tokenizer.apply_chat_template(row, tokenize=False)
            outputs = tokenizer(prompt, add_special_tokens = False)
            temp.append(outputs['input_ids'])
            position_ids.append(range(len(outputs['input_ids'])))
            count += len(outputs['input_ids'])
            # One long conversation can fill several samples, hence `while` rather than `if`.
            while count >= block_size:
                block, temp = slice_and_balance(temp, block_size)
                block_position, position_ids = slice_and_balance(position_ids, block_size)
                count = count - block_size
                o = collator(block, block_position)
                last_block = block
                last_position_block = block_position
                out.write(o)

        # Nothing left over: topping up would re-write the whole last sample as a duplicate.
        # No sample written yet: there is no earlier sample to borrow tokens from.
        if count == 0 or last_block is None:
            return None

        # Borrow just enough leading tokens from the last written sample to fill the gap.
        block, _ = slice_and_balance(last_block, block_size - count)
        block_position, _ = slice_and_balance(last_position_block, block_size - count)

        block.extend(temp)
        block_position.extend(position_ids)

        o = collator(block, block_position)
        if len(o['input_ids']) == block_size:
            out.write(o)
            return o

In [18]:
from multiprocess import Pool

# Keep the shard generator under its own name: rebinding `chunks` would replace the
# `chunks` helper with a generator, so re-running this cell or calling `multiprocessing`
# afterwards would fail.
shards = chunks(combine, 50000)
pool = Pool(30)
try:
    # Each task is one (rows, index) shard; `loop` writes it to its own output directory.
    pooled = pool.map(loop, shards)
finally:
    # Release the workers even when a task raised.
    pool.close()
    pool.join()

100%|███████████████████████████████████████████████████████████████████████████████| 50000/50000 [00:44<00:00, 1116.93it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 50000/50000 [01:04<00:00, 772.51it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 50000/50000 [00:25<00:00, 1932.82it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 50000/50000 [01:06<00:00, 752.67it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 50000/50000 [00:45<00:00, 1103.63it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 50000/50000 [00:47<00:00, 1058.59it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 50000/50000 [01:04<00:00, 780.57it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 50000/50000 [00:33<00:00, 1485.30it/s]


In [19]:
# Collect the per-shard output directories and order them by their numeric suffix,
# so that e.g. 'tokenized-10' sorts after 'tokenized-9'.
folders = glob('tokenized-10k/tokenized-*')
folders.sort(key=lambda path: int(path.rsplit('-', 1)[-1]))
folders

['tokenized-10k/tokenized-0',
 'tokenized-10k/tokenized-1',
 'tokenized-10k/tokenized-2',
 'tokenized-10k/tokenized-3',
 'tokenized-10k/tokenized-4',
 'tokenized-10k/tokenized-5',
 'tokenized-10k/tokenized-6',
 'tokenized-10k/tokenized-7',
 'tokenized-10k/tokenized-8',
 'tokenized-10k/tokenized-9',
 'tokenized-10k/tokenized-10',
 'tokenized-10k/tokenized-11',
 'tokenized-10k/tokenized-12',
 'tokenized-10k/tokenized-13',
 'tokenized-10k/tokenized-14',
 'tokenized-10k/tokenized-15',
 'tokenized-10k/tokenized-16',
 'tokenized-10k/tokenized-17',
 'tokenized-10k/tokenized-18',
 'tokenized-10k/tokenized-19',
 'tokenized-10k/tokenized-20',
 'tokenized-10k/tokenized-21']