In [1]:
# !wget https://raw.githubusercontent.com/mesolitica/malaysian-dataset/refs/heads/master/llm-instruction/malaysian-sft/malaysian_sft.py

In [2]:
files = {
    'prepare-Malaysian-Speech-Instructions.json': 1,
    'prepare-Malaysian-UltraChat-Speech-Multiturn-Instructions.json': 1,
    'prepare-Malaysian-Reasoning-Speech-Instructions.json': 1,
    'prepare-Malaysian-Multiturn-Chat-Assistant.json': 1,
    'prepare-Cantonese-Radio-Description-Instructions.json': 0.2,
    'prepare-Malaysian-Speech-Description-Timestamp-Instructions.json': 0.2,
    'prepare-Emilia-Mandarin-Description-Instructions.json': 0.2,
}

len(files)

7

In [3]:
import librosa
import torch
import torch.nn as nn
import pandas as pd
from datasets import Audio
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AddedToken, AutoProcessor
from transformers import AddedToken
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
from malaysian_sft import post_accept
import streaming
import numpy as np
from tqdm import tqdm
from glob import glob
import os
import json
import math
from multiprocess import Pool
import itertools
import random

def chunks(l, n):
    for i in range(0, len(l), n):
        yield (l[i: i + n], i // n)

def multiprocessing(strings, function, cores=6, returned=True):
    df_split = chunks(strings, len(strings) // cores)
    pool = Pool(cores)
    pooled = pool.map(function, df_split)
    pool.close()
    pool.join()

    if returned:
        return list(itertools.chain(*pooled))

class UInt32(Encoding):
    def encode(self, obj) -> bytes:
        return obj.tobytes()

    def decode(self, data: bytes):
        return np.frombuffer(data, np.uint32)

_encodings['uint32'] = UInt32

columns = {
    'input_ids': 'uint32',
    'position_ids': 'uint32',
    'attention_mask': 'uint32',
    'audio': 'str',
    'text': 'str'
}
hashes = 'sha1', 'xxh64'



[2025-06-14 18:29:55,454] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


In [4]:
audio_class = Audio(sampling_rate=16000)
tokenizer = AutoTokenizer.from_pretrained("mesolitica/Malaysian-Audio-Qwen2.5-7B-Instruct")
audio_token = "<|AUDIO|>"
audio_bos_token = "<|audio_bos|>"
audio_eos_token = "<|audio_eos|>"
audio_token_id = tokenizer.vocab[audio_token]
pad_token_id = tokenizer.pad_token_id
torch_dtype = torch.bfloat16
min_dtype = torch.finfo(torch_dtype).min
sequence_length = 10240

In [5]:
processor = AutoProcessor.from_pretrained('openai/whisper-large-v3')

In [6]:
include_mandarin = ['covost2', 'translation', 'transcription', 'cantonese', 'mandarin']

In [7]:
data = []
for f, replica in files.items():
    f_lower = f.lower()
    if any([a in f_lower for a in include_mandarin]):
        check_mandarin = False
    else:
        check_mandarin = True
    with open(f) as fopen:
        rows = json.load(fopen)
    print(f, replica, len(rows), rows[0], check_mandarin, '\n')
    rows = [(check_mandarin, rows[i]) for i in range(len(rows))]
    if replica < 1:
        rows = random.sample(rows, int(len(rows) * replica))
    else:
        rows = rows * replica
    data.extend(rows)
        
len(data)

prepare-Malaysian-Speech-Instructions.json 1 720969 {'text': '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\n<|im_end|>\n<|im_start|>assistant\nB. buaya tembaga<|im_end|>\n', 'audio': 'tatabahasa-v3/0.mp3'} True 

prepare-Malaysian-UltraChat-Speech-Multiturn-Instructions.json 1 192821 {'text': "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nasukkan biografi ringkasnya sebagai ahli bahasa \n(PTS, 2019, Awang Sariyan, 2017; 2018; Bernama, Sinar \nHarian, 16 November 2019; Berita Harian, 3 Februari \n2016). \n\n\n\nAwang Sariyan juga aktif dalam aktiviti-aktiviti \nBahasa Melayu dan linguistik di peringkat dalam negara, \nserantau dan antarabangsa, sehingga dilantik sebagai \nPresiden Persatuan Linguistik Malaysia dan aktif dalam \ngerakan perjuangan bahasa, menjadi wakil negara dalam \nMajlis Bahasa Brunei Darussalam, Indonesia, Malaysia \n(MABBIM), Majlis Antarabangsa Bahasa Melay

prepare-Malaysian-Reasoning-Speech-Instructions.json 1 25190 {'text': '<|im_start|>system\nYou are going to enter reasoning mode. First, you try to think step-by-step in Malay. After that, put your final answer.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\n<|im_end|>\n<|im_start|>assistant\nDesigning a spacecraft for a mission to Mars is an incredibly complex task that requires careful consideration of numerous factors. I\'ll try to break it down step by step, starting with the basics.\n\nFirst, I need to understand the fundamentals of traveling to Mars. Mars is, on average, about 225 million kilometers away from Earth, but the distance can vary greatly depending on the positions of both planets in their orbits. The journey can take anywhere from about six to eight months using current propulsion technologies. So, duration is a significant factor.\n\nLet\'s start with propulsion systems. The most common propulsion method for interplanetary travel is chemic

prepare-Malaysian-Multiturn-Chat-Assistant.json 1 241508 {'text': '<|im_start|>system\nYou are a helpful and professional **customer support assistant** working for **Maybank Visa Cards**. Your main task is to help customers manage their credit card accounts, including:\n\n* Emergency blocking & cancellation\n* Handling unauthorized transactions\n* Explaining payment processes\n* Providing fraud prevention tips\n* Guiding cardholders through replacement procedures\n\nYou should maintain a **friendly, helpful, and professional tone** while communicating with customers, regardless of the situation. Address them politely (e.g., "Mr.," "Ms.," "Sir," "Ma\'am") and show empathy towards their concerns.\n\n### **Emergency Blocking & Cancellation**\n\nWhen a user requests an emergency block:\n\n* Verify their identity using full name and IC number\n* Ask if there were any unauthorized transactions recently\n* Inform about the necessity of a police report for suspected fraud\n* Proceed with emer

prepare-Malaysian-Speech-Description-Timestamp-Instructions.json 0.2 445224 {'text': '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\ndescribe the audio include timestamps<|im_end|>\n<|im_start|>assistant\nAudio ini memulakan dengan nada mengajak dan menggalakkan. Pada cap masa 0.00-1.80, penutur mengajak warga Sabah, terutamanya masyarakat Tionghoa, untuk berganding bahu menjaga keamanan dan kemakmuran di kawasan mereka. Ini menunjukkan semangat kerjasama antara etnik dan komuniti.\n\nPada 7.10-12.08, penutur memberikan butiran tentang program yang diadakan di Dewan Sekolah Tinggi Cina Sabah, yang dihadiri oleh kira-kira 3,000 orang. Acara ini dimeriahkan dengan pelbagai persembahan kebudayaan, menunjukkan perpaduan budaya dan sokongan komuniti.\n\nNada keseluruhan audio ini adalah positif dan inklusif, dengan nada mengajak dan merayakan kepelbagaian budaya. Ia mencerminkan usaha untuk memupuk hubungan baik da

1395823

In [8]:
random.shuffle(data)

In [9]:
from datasets import Audio

def collator(batch, batch_position_ids, audio):
    input_ids = []
    position_ids = []
    masks = []
    for i in range(len(batch)):
        l = len(batch[i])
        input_ids.extend(batch[i])
        position_ids.extend(batch_position_ids[i])
        masks.append(l)
        
    input_ids = input_ids + [pad_token_id] * (sequence_length - len(input_ids))
    position_ids = position_ids + [32000] * (sequence_length - len(position_ids))
    
    return {
        'input_ids': np.array(input_ids).astype(np.uint32),
        'position_ids': np.array(position_ids).astype(np.uint32),
        'attention_mask': np.array(masks).astype(np.uint32),
        'audio': json.dumps(audio),
        'text': '',
    }

def get_new_token(f, sample):
    
    if isinstance(f, str):
        f = [f]
        
    num_audio_tokens = sample.count(audio_token)
    if num_audio_tokens != len(f):
        raise Exception('num_audio_tokens != len(f)')
    
    audio_lengths = []
    for f_ in f:
        audio_ = audio_class.decode_example(audio_class.encode_example(f_))['array']
        audio_lengths.append(min(3000, math.ceil(len(audio_) / processor.feature_extractor.hop_length)))
        
    replace_str = []
    while audio_token in sample:
        audio_length = audio_lengths.pop(0)
        input_length = (audio_length - 1) // 2 + 1
        num_audio_tokens = input_length

        expanded_audio_token = audio_token * num_audio_tokens

        audio_token_start_idx = sample.find(audio_token)
        audio_token_end_idx = audio_token_start_idx + len(audio_token)

        has_bos = (
            sample[audio_token_start_idx - len(audio_bos_token) : audio_token_start_idx]
            == audio_bos_token
        )
        has_eos = (
            sample[audio_token_end_idx : audio_token_end_idx + len(audio_eos_token)]
            == audio_eos_token
        )

        if not has_bos and not has_eos:
            expanded_audio_token = audio_bos_token + expanded_audio_token + audio_eos_token

        replace_str.append(expanded_audio_token)
        sample = sample.replace(audio_token, "<placeholder>", 1)

    while "<placeholder>" in sample:
        sample = sample.replace("<placeholder>", replace_str.pop(0), 1)
    
    return sample

In [10]:
!rm -rf tokenized-speech-10240
!mkdir tokenized-speech-10240

In [11]:
import time

def loop(files, block_size = sequence_length):
    rows, index = files
    out_root = f'tokenized-speech-10240/tokenized-{index}'
    os.system(f'rm -rf {out_root}')
    count = 0
    temp = []
    position_ids = []
    audio = []
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for row in tqdm(rows, desc=f'loop {index}'):
            check_mandarin, row = row 
            try:
                if not post_accept(row['text'], check_mandarin = check_mandarin):
                    continue
                prompt = get_new_token(row['audio'], row['text'])
            except Exception as e:
                print(row, e)
                continue
            outputs = tokenizer(prompt, add_special_tokens = False)
            length = len(outputs['input_ids'])
            if length > block_size:
                continue
            
            if count + length > block_size:
                o = collator(temp, position_ids, audio)
                out.write(o)
                temp = [outputs['input_ids']]
                position_ids = [range(len(outputs['input_ids']))]
                if isinstance(row['audio'], list):
                    audio = row['audio']
                else:
                    audio = [row['audio']]
                count = length
            
            else:
                temp.append(outputs['input_ids'])
                position_ids.append(range(len(outputs['input_ids'])))
                if isinstance(row['audio'], list):
                    audio.extend(row['audio'])
                else:
                    audio.append(row['audio'])
                count += length
        
        if len(temp):
            o = collator(temp, position_ids, audio)
            out.write(o)
    return [1]

In [12]:
# loop((data[:1000], 0))

In [13]:
from multiprocess import Pool

chunks = chunks(data, 50000)
pool = Pool(30)
pooled = pool.map(loop, chunks)
pool.close()
pool.join()

loop 22:  19%|█████████████▉                                                           | 9556/50000 [02:59<12:15, 55.02it/s]

{'text': '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nHei chatbot, adakah anda mempunyai haiwan peliharaan?<|im_end|>\n<|im_start|>assistant\nTidak, saya tidak mempunyai haiwan peliharaan. Saya tidak mempunyai keupayaan fizikal, jadi saya tidak boleh mempunyai haiwan peliharaan dalam pengertian tradisional.<|im_end|>\n<|im_start|>user\nOh begitu. Bolehkah anda bermain catur dengan saya?<|im_end|>\n<|im_start|>assistant\nPasti! Saya akan gembira bermain catur dengan anda. Inilah papannya.<|im_end|>\n<|im_start|>user\nHebat! Saya akan menjadi kepingan putih. *buat tindakan haram*<|im_end|>\n<|im_start|>assistant\nSaya minta maaf, tetapi langkah itu tidak dibenarkan. Ksatria hanya boleh bergerak ke petak sama ada dua petak dalam garis lurus dan kemudian satu petak berserenjang dengan itu, atau satu petak dalam garis lurus dan kemudian dua petak berserenjang dengan itu.<|im_end|>\n<|im_start|>user\nOh betul. Saya terlupa. *buat satu lagi tindakan haram*<|i

loop 16:  30%|█████████████████████▉                                                  | 15225/50000 [04:46<12:18, 47.06it/s]

{'text': '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nHei chatbot, adakah anda mempunyai haiwan peliharaan?<|im_end|>\n<|im_start|>assistant\nTidak, saya tidak mempunyai haiwan peliharaan. Saya tidak mempunyai keupayaan fizikal, jadi saya tidak boleh mempunyai haiwan peliharaan dalam pengertian tradisional.<|im_end|>\n<|im_start|>user\nOh begitu. Bolehkah anda bermain catur dengan saya?<|im_end|>\n<|im_start|>assistant\nPasti! Saya akan gembira bermain catur dengan anda. Inilah papannya.<|im_end|>\n<|im_start|>user\nHebat! Saya akan menjadi kepingan putih. *buat tindakan haram*<|im_end|>\n<|im_start|>assistant\nSaya minta maaf, tetapi langkah itu tidak dibenarkan. Ksatria hanya boleh bergerak ke petak sama ada dua petak dalam garis lurus dan kemudian satu petak berserenjang dengan itu, atau satu petak dalam garis lurus dan kemudian dua petak berserenjang dengan itu.<|im_end|>\n<|im_start|>user\nOh betul. Saya terlupa. *buat satu lagi tindakan haram*<|i

loop 0:  33%|████████████████████████                                                 | 16487/50000 [05:15<12:42, 43.95it/s]




loop 4:  41%|█████████████████████████████▋                                           | 20300/50000 [06:13<10:02, 49.32it/s]

{'text': '<|im_start|>system\nYou are a helpful voice assistant chatbot. Always reply with short, clear answers under 300 characters. Speak naturally, like in conversation. Keep it direct—no fluff, no long explanations. If you’re unsure, ask a quick clarifying question.<|im_end|>\n<|im_start|>user\nHei chatbot, adakah anda mempunyai haiwan peliharaan?<|im_end|>\n<|im_start|>assistant\nTidak, saya tidak mempunyai haiwan peliharaan. Saya tidak mempunyai keupayaan fizikal, jadi saya tidak boleh mempunyai haiwan peliharaan dalam pengertian tradisional.<|im_end|>\n<|im_start|>user\nOh begitu. Bolehkah anda bermain catur dengan saya?<|im_end|>\n<|im_start|>assistant\nPasti! Saya akan gembira bermain catur dengan anda. Inilah papannya.<|im_end|>\n<|im_start|>user\nHebat! Saya akan menjadi kepingan putih. *buat tindakan haram*<|im_end|>\n<|im_start|>assistant\nSaya minta maaf, tetapi langkah itu tidak dibenarkan. Ksatria hanya boleh bergerak ke petak sama ada dua petak dalam garis lurus dan ke

loop 5:  54%|███████████████████████████████████████▎                                 | 26948/50000 [08:16<07:36, 50.45it/s]

{'text': '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\n<|im_end|>\n<|im_start|>assistant\nUntuk melaksanakan Apache Superset dalam Kubernetes dan menyambungkannya ke pelbagai sumber data seperti Presto dan Snowflake, anda perlu mengikuti langkah-langkah berikut:\n\n1. **Kemas Kini Dockerfile**: Mulakan dengan membuat atau mengemas kini Dockerfile untuk Superset. Pastikan ia merangkumi semua pakej yang diperlukan untuk sambungan ke sumber data anda.\n\n2. **Konfigurasi Helm**: Gunakan Helm untuk menguruskan aplikasi dalam kluster Kubernetes anda. Buat fail nilai Helm untuk konfigurasi Superset.\n\n3. **Konfigurasi Kubernetes**: Sediakan manifest Kubernetes untuk penempatan, perkhidmatan, dan rahsia (untuk mengendalikan maklumat sensitif seperti nama pengguna dan kata laluan untuk sambungan pangkalan data).\n\n4. **Penyediaan Konfigurasi Data Sources**: Tambahkan konfigurasi untuk sumber data anda dalam Supers

loop 27:  81%|██████████████████████████████████████████████████████████▌             | 37284/45823 [11:26<02:29, 56.93it/s]

{'text': '<|im_start|>system\nAct as a voice assistant chatbot. Keep every response under 300 characters. Be accurate, brief, and easy to understand when spoken aloud. Don’t overexplain or repeat. Ask for clarification only when needed. Prioritize clarity and brevity at all times.<|im_end|>\n<|im_start|>user\nHei chatbot, adakah anda mempunyai haiwan peliharaan?<|im_end|>\n<|im_start|>assistant\nTidak, saya tidak mempunyai haiwan peliharaan. Saya tidak mempunyai keupayaan fizikal, jadi saya tidak boleh mempunyai haiwan peliharaan dalam pengertian tradisional.<|im_end|>\n<|im_start|>user\nOh begitu. Bolehkah anda bermain catur dengan saya?<|im_end|>\n<|im_start|>assistant\nPasti! Saya akan gembira bermain catur dengan anda. Inilah papannya.<|im_end|>\n<|im_start|>user\nHebat! Saya akan menjadi kepingan putih. *buat tindakan haram*<|im_end|>\n<|im_start|>assistant\nSaya minta maaf, tetapi langkah itu tidak dibenarkan. Ksatria hanya boleh bergerak ke petak sama ada dua petak dalam garis l

loop 27: 100%|████████████████████████████████████████████████████████████████████████| 45823/45823 [14:02<00:00, 54.42it/s]
loop 1:  98%|███████████████████████████████████████████████████████████████████████▉ | 49245/50000 [15:10<00:11, 63.78it/s]

{'text': "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\n<|im_end|>\n<|im_start|>assistant\nUntuk melakukan penyambungan rentas kluster dalam Presto SQL yang menggabungkan data dari Snowflake dan Cassandra, anda perlu mengkonfigurasi beberapa benda berikut:\n\n1. Konfigurasikan Presto untuk menggunakan kedua-dua sumber data.\n\n2. Cipta jadual atau pandangan luaran di Presto yang merujuk kepada jadual asal di Snowflake dan Cassandra.\n\n3. Tulis pertanyaan yang menggunakan JOIN antara jadual-jadual tersebut.\n\nMari kita lihat setiap langkah secara terperinci:\n\n**Langkah 1: Konfigurasi Presto**\n\n- Untuk Snowflake, anda memerlukan plugin `snowflake` yang disediakan oleh Presto.\n- Untuk Cassandra, gunakan plugin `cassandra`.\n\nKedua-dua plugin ini harus dikonfigurasi dengan kredensial dan URL API masing-masing dalam fail konfigurasi `etc/catalog`. Contohnya:\n\n```ini\n# etc/catalog/snowflake.properties\nc

loop 2: 100%|█████████████████████████████████████████████████████████████████████████| 50000/50000 [15:19<00:00, 54.37it/s]
loop 1: 100%|█████████████████████████████████████████████████████████████████████████| 50000/50000 [15:23<00:00, 54.14it/s]
loop 4: 100%|█████████████████████████████████████████████████████████████████████████| 50000/50000 [15:19<00:00, 54.38it/s]
loop 6: 100%|█████████████████████████████████████████████████████████████████████████| 50000/50000 [15:16<00:00, 54.57it/s]
loop 5: 100%|█████████████████████████████████████████████████████████████████████████| 50000/50000 [15:20<00:00, 54.31it/s]
loop 3: 100%|█████████████████████████████████████████████████████████████████████████| 50000/50000 [15:29<00:00, 53.79it/s]
loop 11:  97%|█████████████████████████████████████████████████████████████████████▌  | 48330/50000 [15:18<00:32, 51.80it/s]
loop 15: 100%|████████████████████████████████████████████████████████████████████████| 50000/50000 [15:23<00:00, 54.14it/s]


In [14]:
folders = glob('tokenized-speech-10240/tokenized-*')
folders.extend(glob('tokenized-10k/tokenized-*'))
len(folders)

50

In [15]:
!rm -rf speech-packing-10k

In [16]:
with MDSWriter(out='speech-packing-10k', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = LocalDataset(local=f)
            for i in tqdm(range(len(dataset))):
                out.write(dataset[i])
        except Exception as e:
            print(e)
            pass

100%|█████████████████████████████████████████████████████████████████████████████████| 8788/8788 [00:02<00:00, 3497.82it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 8819/8819 [00:02<00:00, 3339.24it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 8832/8832 [00:02<00:00, 3348.06it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 8878/8878 [00:02<00:00, 3425.61it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 8814/8814 [00:02<00:00, 3405.67it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 8854/8854 [00:02<00:00, 3408.28it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 8106/8106 [00:02<00:00, 3432.71it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 8851/8851 [00:02<00:00, 3351.12it/s]


In [17]:
import torch.nn.functional as F

sequence_length = 10240
def block_diagonal_concat_inverted(*masks, dtype=torch.bfloat16):
    total_size = sum(mask.size(0) for mask in masks)
    combined_mask = torch.zeros(total_size, total_size, dtype=dtype)

    current_pos = 0

    for mask in masks:
        size = mask.size(0)
        combined_mask[current_pos:current_pos + size, current_pos:current_pos + size] = mask
        current_pos += size

    min_value = torch.finfo(dtype).min if dtype.is_floating_point else torch.iinfo(dtype).min
    inverted_mask = torch.where(combined_mask == 1, torch.tensor(0, dtype=dtype), min_value)
    return inverted_mask.unsqueeze(0)

def pad_attention_mask_4d(attention_mask, max_size = sequence_length, value = 0.0):
    maxlen_right = max_size
    maxlen_bottom = max_size
    attention_mask = [
        F.pad(
            attention_mask[i],
            (0, maxlen_right - attention_mask[i].shape[-2], 0, maxlen_bottom - attention_mask[i].shape[-1]), value = value) for i in range(
            len(attention_mask))]
    return torch.stack(attention_mask)

In [18]:
def collator(batch):
    batch = [b for b in batch if b is not None] 
    input_ids, attention_mask, position_ids, labels = [], [], [], []
    input_features, feature_attention_mask = [], []

    for b in batch:
        if 'input_features' in b:
            input_features.append(b['input_features'])
            feature_attention_mask.append(b['feature_attention_mask'])
        input_ids.append(b['input_ids'][None])
        attention_mask.append(b['attention_mask'])
        position_ids.append(b['position_ids'][None])
        labels.append(b['labels'][None])

    input_ids = {
        'input_ids': torch.concat(input_ids, 0),
        'attention_mask': pad_attention_mask_4d(attention_mask, sequence_length, min_dtype),
        'position_ids': torch.concat(position_ids, 0),
        'labels': torch.concat(labels, 0),
    }
    if len(input_features):
        input_ids['input_features'] = torch.concat(input_features, 0)
        input_ids['feature_attention_mask'] = torch.concat(feature_attention_mask, 0)

    return input_ids

In [19]:
class DatasetFixed(torch.utils.data.Dataset):
    def __init__(self, local):
        self.dataset = LocalDataset(local=local)
        self.audio = Audio(sampling_rate=16000)

    def __getitem__(self, idx):
        data = self.dataset[idx]
        try:
            data.pop('text', None)
            audio_files = data.pop('audio', '')
            data['labels'] = data["input_ids"].copy()
            masking = data.pop('attention_mask')

            data.pop('token_type_ids', None)
            for k in data.keys():
                data[k] = torch.tensor(data[k].astype(np.int64))

            masks = []
            for m in masking:
                masks.append(torch.tril(torch.ones(m, m)))
            attention_mask = block_diagonal_concat_inverted(*masks)
            data['attention_mask'] = attention_mask

            data['labels'][data['labels'] == audio_token_id] = -100
            data['labels'][data['labels'] == pad_token_id] = -100

            if len(audio_files):
                files = json.loads(audio_files)
                print(files)
                
                audios = []
                new_files = []
                for f in files:
                    audio = self.audio.decode_example(
                    self.audio.encode_example(f))['array']
                    audios.append(audio)
                    new_files.append(f)

                inputs_audio = processor.feature_extractor(
                    audios, return_attention_mask=True, 
                    sampling_rate=16000,
                    padding="max_length", return_tensors = 'pt')
                
                input_lengths = (inputs_audio['attention_mask'].sum(-1) - 1) // 2 + 1
                output_lengths = input_lengths
                output_lengths = sum(output_lengths).tolist()
                audio_tokens = data['input_ids'][data['input_ids'] == audio_token_id].shape[0]
                if audio_tokens != output_lengths:
                    print(idx, audio_tokens, output_lengths, 'length speech tokens not match', new_files)
                    # return

                data['input_features'] = inputs_audio['input_features']
                data['feature_attention_mask'] = inputs_audio['attention_mask']
                
            return data

        except Exception as e:
            print('Exception', e)
            return None

    def __len__(self):
        return len(self.dataset)

In [20]:
dataset = DatasetFixed('speech-packing-10k')

In [25]:
d = dataset[1000]
d

['Malaysian-Multiturn-Chat-Assistant-manglish/479023.mp3', 'Malaysian-Multiturn-Chat-Assistant-manglish/479024.mp3', 'Malaysian-Multiturn-Chat-Assistant-manglish/479025.mp3', 'Malaysian-Multiturn-Chat-Assistant-manglish/479026.mp3', 'prepared-mixed-malaysian-instructions/162113.mp3', 'prepared-mixed-malaysian-instructions/160004.mp3', 'prepared-mixed-malaysian-instructions/116807.mp3', 'ultrachat-speech/7894.mp3', 'ultrachat-speech/7895.mp3', 'ultrachat-speech/7896.mp3', 'ultrachat-speech/7897.mp3', 'partition-instructions-part-4/28623.mp3']


{'input_ids': tensor([151644,   8948,    198,  ..., 151643, 151643, 151643]),
 'position_ids': tensor([    0,     1,     2,  ..., 32000, 32000, 32000]),
 'labels': tensor([151644,   8948,    198,  ...,   -100,   -100,   -100]),
 'attention_mask': tensor([[[ 0.0000e+00, -3.3895e+38, -3.3895e+38,  ..., -3.3895e+38,
           -3.3895e+38, -3.3895e+38],
          [ 0.0000e+00,  0.0000e+00, -3.3895e+38,  ..., -3.3895e+38,
           -3.3895e+38, -3.3895e+38],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -3.3895e+38,
           -3.3895e+38, -3.3895e+38],
          ...,
          [-3.3895e+38, -3.3895e+38, -3.3895e+38,  ...,  0.0000e+00,
           -3.3895e+38, -3.3895e+38],
          [-3.3895e+38, -3.3895e+38, -3.3895e+38,  ...,  0.0000e+00,
            0.0000e+00, -3.3895e+38],
          [-3.3895e+38, -3.3895e+38, -3.3895e+38,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00]]], dtype=torch.bfloat16),
 'input_features': tensor([[[-0.5633, -0.4075, -0.5667,  ..., -0.6040, -

In [22]:
len(dataset)

306996

In [26]:
batch = [dataset[0], dataset[-1]]

['prepared-mixed-malaysian-instructions/209062.mp3', 'partition-instructions-part-8/25341.mp3', 'partition-instructions-part-0/5633.mp3', 'Malaysian-Multiturn-Chat-Assistant/131688.mp3', 'Malaysian-Multiturn-Chat-Assistant/131689.mp3', 'Malaysian-Multiturn-Chat-Assistant/131690.mp3', 'Malaysian-Multiturn-Chat-Assistant/131691.mp3', 'Malaysian-Multiturn-Chat-Assistant/131692.mp3', 'Malaysian-Multiturn-Chat-Assistant/504685.mp3', 'Malaysian-Multiturn-Chat-Assistant/504686.mp3', 'Malaysian-Multiturn-Chat-Assistant/381104.mp3', 'Malaysian-Multiturn-Chat-Assistant/504687.mp3', 'Malaysian-Multiturn-Chat-Assistant/504688.mp3', 'Malaysian-Multiturn-Chat-Assistant/11299.mp3', 'ultrachat-speech/119757.mp3', 'ultrachat-speech/119758.mp3', 'ultrachat-speech/119759.mp3', 'ultrachat-speech/119760.mp3', 'ultrachat-speech/119761.mp3']


In [27]:
b = collator(batch)
b['feature_attention_mask']

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], dtype=torch.int32)

In [28]:
b['attention_mask'].shape

torch.Size([2, 1, 10240, 10240])

In [29]:
b['input_ids']

tensor([[151644,   8948,    198,  ..., 151643, 151643, 151643],
        [  9926,  56888,  95518,  ...,     13, 151645,    198]])

In [30]:
tokenizer.decode(b['input_ids'][1])

' Latihan**: Kekurangan program latihan dan pensijilan yang baik untuk guru sastera.\n\n### 2. Penyelesaian Holistik Untuk Kekurangan Guru Sastera\n\nUntuk menangani masalah ini, perlu ada campuran strategi yang melibatkan perubahan dasar, pembangunan profesional, dan sokongan komuniti.\n\n#### a) **Meningkatkan Insentif dan Kepuasan Kerja**\n\n* **Penyediaan Insentif Khas**: Kerajaan harus menawarkan insentif khusus untuk guru sastera, seperti elaun tambahan, biasiswa untuk kemasukan perguruan, dan faedah khusus.\n* **Peningkatan Gaji dan Laluan Kerjaya**: Memastikan profesion guru sastera menarik secara kewangan dan memiliki laluan kerjaya yang jelas.\n* **Pengiktirafan dan Penghargaan**: Memberikan pengiktirafan yang adil untuk sumbangan guru sastera dalam bentuk anugerah, liputan media, dan promosi profesional.\n\n#### b) **Memperkukuhkan Program Pendidikan Sastera**\n\n* **Program Latihan Khas**: Membangunkan program latihan intensif yang khusus untuk calon guru sastera, termasuk 