In [1]:
# !wget https://raw.githubusercontent.com/mesolitica/malaysian-dataset/refs/heads/master/llm-instruction/malaysian-sft/malaysian_sft.py

In [2]:
files = {
    'prepare-Animal-Sound-Instructions.json': 3,
    'prepare-AudioSet-Audio-Adversarial-Instructions.json': 1,
    'prepare-AudioSet-Audio-Instructions.json': 1,
    'prepare-Cantonese-Radio-Description-Instructions.json': 1,
    'prepare-Classification-Speech-Adversarial-Instructions.json': 1,
    'prepare-Classification-Speech-Instructions.json': 3,
    'prepare-CoVoST2-Instruction.json': 1,
    'prepare-CompA-R-Instructions.json': 1,
    'prepare-Malaysian-Speech-Description-Timestamp-Instructions.json': 1,
    'prepare-MusicBench-Instructions.json': 2,
    'prepare-Sampling-Multitask-National-Speech-Corpus-v1.json': 1,
    'prepare-Speaker-Diarization-Instructions.json': 4,
    'prepare-Speech-Translation-Instructions.json': 2,
    'prepare-Transcription-Instructions.json': 1,
    'prepare-Zeroshot-Audio-Classification-Instructions.json': 1,
    'prepare-Emilia-Mandarin-Description-Instructions.json': 1,
}

len(files)

16

In [3]:
import librosa
import torch
import torch.nn as nn
import pandas as pd
from datasets import Audio
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AddedToken, AutoProcessor
from transformers import AddedToken
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
from malaysian_sft import post_accept
import streaming
import numpy as np
from tqdm import tqdm
from glob import glob
import os
import json
import math
from multiprocess import Pool
import itertools

def chunks(l, n):
    for i in range(0, len(l), n):
        yield (l[i: i + n], i // n)

def multiprocessing(strings, function, cores=6, returned=True):
    df_split = chunks(strings, len(strings) // cores)
    pool = Pool(cores)
    pooled = pool.map(function, df_split)
    pool.close()
    pool.join()

    if returned:
        return list(itertools.chain(*pooled))

class UInt32(Encoding):
    def encode(self, obj) -> bytes:
        return obj.tobytes()

    def decode(self, data: bytes):
        return np.frombuffer(data, np.uint32)

_encodings['uint32'] = UInt32

columns = {
    'input_ids': 'uint32',
    'position_ids': 'uint32',
    'attention_mask': 'uint32',
    'audio': 'str',
    'text': 'str'
}
hashes = 'sha1', 'xxh64'



[2025-06-09 14:51:05,628] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


In [4]:
audio_class = Audio(sampling_rate=16000)
tokenizer = AutoTokenizer.from_pretrained("mesolitica/Malaysian-Qwen2.5-7B-Instruct")
audio_token = "<|AUDIO|>"
audio_bos_token = "<|audio_bos|>"
audio_eos_token = "<|audio_eos|>"
new_tokens = [AddedToken(audio_token), AddedToken(audio_bos_token), AddedToken(audio_eos_token)]
tokenizer.add_tokens(new_tokens)
audio_token_id = tokenizer.vocab[audio_token]
pad_token_id = tokenizer.pad_token_id
torch_dtype = torch.bfloat16
min_dtype = torch.finfo(torch_dtype).min
sequence_length = 8192

In [5]:
processor = AutoProcessor.from_pretrained('openai/whisper-large-v3')

In [6]:
include_mandarin = ['covost2', 'translation', 'transcription', 'cantonese', 'mandarin']

In [7]:
data = []
for f, replica in files.items():
    f_lower = f.lower()
    if any([a in f_lower for a in include_mandarin]):
        check_mandarin = False
    else:
        check_mandarin = True
    with open(f) as fopen:
        rows = json.load(fopen)
    print(f, replica, len(rows), rows[0], check_mandarin, '\n')
    rows = [(check_mandarin, rows[i]) for i in range(len(rows))] * replica
    data.extend(rows)
        
len(data)


prepare-AudioSet-Audio-Adversarial-Instructions.json 1 312883 {'text': '<|im_start|>system\nyou are an assistant to classify an audio, only reply Yes / No only.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nis there any bell sound in the audio<|im_end|>\n<|im_start|>assistant\nYes<|im_end|>\n', 'audio': 'AudioSet-Audio-Instructions-audio/500k_part1_nonspeech-00000-of-00009-0.mp3'} True 

prepare-AudioSet-Audio-Instructions.json 1 312883 {'text': "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat are the key elements you hear in the audio?<|im_end|>\n<|im_start|>assistant\nThe audio you've provided features the clear and resonant sound of a bell. It has a distinct ringing tone that echoes with a pleasant and often soothing quality.<|im_end|>\n", 'audio': 'AudioSet-Audio-Instructions-audio/500k_part1_nonspeech-00000-of-00009-0.mp3'} True 

prepare-Cantonese-Radio-Description-Instr

prepare-Sampling-Multitask-National-Speech-Corpus-v1.json 1 142028 {'text': '<|im_start|>system\nYou are AI assistant from Infocomm Media Development Authority (IMDA) Singapore.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat was the reason behind Speaker1 crying during the parent-teacher meeting?<|im_end|>\n<|im_start|>assistant\nSpeaker1 cried because their dad scolded them for not reading up, as instructed by their teachers.<|im_end|>\n', 'audio': 'sampling-audio/SQA-PART3-Train-audio_train-00153-of-00171-0.mp3'} True 

prepare-Speaker-Diarization-Instructions.json 4 98568 {'text': '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\ndiarize the audio using whisper format<|im_end|>\n<|im_start|>assistant\n<|0.00|> speaker A<|1.14|><|4.76|> speaker A<|6.66|><|7.32|> speaker B<|7.92|><|8.52|> speaker A<|11.72|><|15.44|> speaker A<|15.92|><|16.30|> speaker B<|18.88|><|20.22|> speaker

prepare-Emilia-Mandarin-Description-Instructions.json 1 292649 {'text': '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\ncan you describe the audio<|im_end|>\n<|im_start|>assistant\nThe speaker is discussing the role of Baidu, a prominent Chinese internet company, in collecting user data through its products. The statement highlights how Baidu can directly access and utilize user information and habits through the various products it offers.\n\nHere’s a breakdown of the key points:\n\n1. **Products as Data Collection Tools**: The phrase "依靠这些产品" (relying on these products) suggests that Baidu\'s products are the primary means through which the company gathers user data. These products could include search engines, maps, news apps, and other services that users interact with regularly.\n\n2. **Direct Access to User Information**: "可以直接掌握用户资讯" (can directly grasp user information) indicates that Baidu has immedia

8225744

In [8]:
import random

random.shuffle(data)

In [9]:
from datasets import Audio

def collator(batch, batch_position_ids, audio):
    input_ids = []
    position_ids = []
    masks = []
    for i in range(len(batch)):
        l = len(batch[i])
        input_ids.extend(batch[i])
        position_ids.extend(batch_position_ids[i])
        masks.append(l)
        
    input_ids = input_ids + [pad_token_id] * (sequence_length - len(input_ids))
    position_ids = position_ids + [32000] * (sequence_length - len(position_ids))
    
    return {
        'input_ids': np.array(input_ids).astype(np.uint32),
        'position_ids': np.array(position_ids).astype(np.uint32),
        'attention_mask': np.array(masks).astype(np.uint32),
        'audio': json.dumps(audio),
        'text': '',
    }

def get_new_token(f, sample):
    
    if isinstance(f, str):
        f = [f]
    
    audio_lengths = []
    for f_ in f:
        audio_ = audio_class.decode_example(audio_class.encode_example(f_))['array']
        audio_lengths.append(min(3000, math.ceil(len(audio_) / processor.feature_extractor.hop_length)))

    num_audio_tokens = sample.count(audio_token)
    replace_str = []
    while audio_token in sample:
        audio_length = audio_lengths.pop(0)
        input_length = (audio_length - 1) // 2 + 1
        num_audio_tokens = input_length

        expanded_audio_token = audio_token * num_audio_tokens

        audio_token_start_idx = sample.find(audio_token)
        audio_token_end_idx = audio_token_start_idx + len(audio_token)

        has_bos = (
            sample[audio_token_start_idx - len(audio_bos_token) : audio_token_start_idx]
            == audio_bos_token
        )
        has_eos = (
            sample[audio_token_end_idx : audio_token_end_idx + len(audio_eos_token)]
            == audio_eos_token
        )

        if not has_bos and not has_eos:
            expanded_audio_token = audio_bos_token + expanded_audio_token + audio_eos_token

        replace_str.append(expanded_audio_token)
        sample = sample.replace(audio_token, "<placeholder>", 1)

    while "<placeholder>" in sample:
        sample = sample.replace("<placeholder>", replace_str.pop(0), 1)
    
    return sample

In [10]:
!rm -rf tokenized-audio-8192
!mkdir tokenized-audio-8192

In [11]:
import time

def loop(files, block_size = sequence_length):
    rows, index = files
    out_root = f'tokenized-audio-8192/tokenized-{index}'
    os.system(f'rm -rf {out_root}')
    count = 0
    temp = []
    position_ids = []
    audio = []
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for row in tqdm(rows, desc=f'loop {index}'):
            check_mandarin, row = row 
            try:
                if not post_accept(row['text'], check_mandarin = check_mandarin):
                    continue
                prompt = get_new_token(row['audio'], row['text'])
            except Exception as e:
                print(row)
                continue
            outputs = tokenizer(prompt, add_special_tokens = False)
            length = len(outputs['input_ids'])
            if length > block_size:
                continue
            
            if count + length > block_size:
                o = collator(temp, position_ids, audio)
                out.write(o)
                temp = [outputs['input_ids']]
                position_ids = [range(len(outputs['input_ids']))]
                if isinstance(row['audio'], list):
                    audio = row['audio']
                else:
                    audio = [row['audio']]
                count = length
            
            else:
                temp.append(outputs['input_ids'])
                position_ids.append(range(len(outputs['input_ids'])))
                if isinstance(row['audio'], list):
                    audio.extend(row['audio'])
                else:
                    audio.append(row['audio'])
                count += length
        
        if len(temp):
            o = collator(temp, position_ids, audio)
            out.write(o)
    return [1]

In [12]:
# loop((data[:1000], 0))

In [31]:
from multiprocess import Pool

chunks = chunks(data, 50000)
pool = Pool(30)
pooled = pool.map(loop, chunks)
pool.close()
pool.join()

In [15]:
folders = glob('tokenized-audio-8192/tokenized-*')
len(folders)

165

In [16]:
!rm -rf audio-packing-8k

In [32]:
with MDSWriter(out='audio-packing-8k', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = LocalDataset(local=f)
            for i in tqdm(range(len(dataset))):
                out.write(dataset[i])
        except Exception as e:
            print(e)
            pass

In [18]:
import torch.nn.functional as F

def block_diagonal_concat_inverted(*masks, dtype=torch.bfloat16):
    total_size = sum(mask.size(0) for mask in masks)
    combined_mask = torch.zeros(total_size, total_size, dtype=dtype)

    current_pos = 0

    for mask in masks:
        size = mask.size(0)
        combined_mask[current_pos:current_pos + size, current_pos:current_pos + size] = mask
        current_pos += size

    min_value = torch.finfo(dtype).min if dtype.is_floating_point else torch.iinfo(dtype).min
    inverted_mask = torch.where(combined_mask == 1, torch.tensor(0, dtype=dtype), min_value)
    return inverted_mask.unsqueeze(0)

def pad_attention_mask_4d(attention_mask, max_size = sequence_length, value = 0.0):
    maxlen_right = max_size
    maxlen_bottom = max_size
    attention_mask = [
        F.pad(
            attention_mask[i],
            (0, maxlen_right - attention_mask[i].shape[-2], 0, maxlen_bottom - attention_mask[i].shape[-1]), value = value) for i in range(
            len(attention_mask))]
    return torch.stack(attention_mask)

In [19]:
def collator(batch):
    batch = [b for b in batch if b is not None] 
    input_ids, attention_mask, position_ids, labels = [], [], [], []
    input_features, feature_attention_mask = [], []

    for b in batch:
        if 'input_features' in b:
            input_features.append(b['input_features'])
            feature_attention_mask.append(b['feature_attention_mask'])
        input_ids.append(b['input_ids'][None])
        attention_mask.append(b['attention_mask'])
        position_ids.append(b['position_ids'][None])
        labels.append(b['labels'][None])

    input_ids = {
        'input_ids': torch.concat(input_ids, 0),
        'attention_mask': pad_attention_mask_4d(attention_mask, sequence_length, min_dtype),
        'position_ids': torch.concat(position_ids, 0),
        'labels': torch.concat(labels, 0),
    }
    if len(input_features):
        input_ids['input_features'] = torch.concat(input_features, 0)
        input_ids['feature_attention_mask'] = torch.concat(feature_attention_mask, 0)

    return input_ids

In [20]:
class DatasetFixed(torch.utils.data.Dataset):
    def __init__(self, local):
        self.dataset = LocalDataset(local=local)
        self.audio = Audio(sampling_rate=16000)

    def __getitem__(self, idx):
        data = self.dataset[idx]
        try:
            data.pop('text', None)
            audio_files = data.pop('audio', '')
            data['labels'] = data["input_ids"].copy()
            masking = data.pop('attention_mask')

            data.pop('token_type_ids', None)
            for k in data.keys():
                data[k] = torch.tensor(data[k].astype(np.int64))

            masks = []
            for m in masking:
                masks.append(torch.tril(torch.ones(m, m)))
            attention_mask = block_diagonal_concat_inverted(*masks)
            data['attention_mask'] = attention_mask

            data['labels'][data['labels'] == audio_token_id] = -100
            data['labels'][data['labels'] == pad_token_id] = -100

            if len(audio_files):
                files = json.loads(audio_files)
                print(files)
                audios = []
                for f in files:
                    audio = self.audio.decode_example(
                    self.audio.encode_example(f))['array']
                    audios.append(audio)

                inputs_audio = processor.feature_extractor(
                    audios, return_attention_mask=True, 
                    sampling_rate=16000,
                    padding="max_length", return_tensors = 'pt')

                data['input_features'] = inputs_audio['input_features']
                data['feature_attention_mask'] = inputs_audio['attention_mask']
                
            return data

        except Exception as e:
            print('Exception', e)
            return None

    def __len__(self):
        return len(self.dataset)

In [21]:
dataset = DatasetFixed('audio-packing-8k')

In [22]:
(len(dataset) * 8192) / 1e9

6.706642944

In [23]:
(len(dataset) * 8192) / 1e9

6.706642944

In [24]:
a = dataset[0]['input_features'].numpy()

['covost-mp3/common_voice_de_18344997.mp3', 'urbansound8k/100263-2-0-36.wav', 'Classification-Speech-Instructions-audio/gender_age-00003-of-00005-5870.mp3', 'AudioSet-Audio-Instructions-audio/500k_part1_speech-00003-of-00007-1421.mp3', 'ZH/ZH_B00001_S05750_W000146.mp3', 'Speech-Translation-Instructions-audio/longer-00005-of-00006-5751.mp3', 'stt-instructions/-15 (Sesi Pagi) ｜ 25 Jun 2024 [Saf9sBgzBgc]_51.mp3', 'stt-instructions/partition-instructions-part-0_26715.mp3', 'synthetic-speaker-diarization-dataset/train-1033-0.mp3', 'covost-mp3/common_voice_en_18985975.mp3', 'esd-emotion/Neutral_0001_000225.mp3', 'covost-mp3/common_voice_fa_19480580.mp3']


In [26]:
batch = [dataset[0], dataset[-1]]

['covost-mp3/common_voice_de_18344997.mp3', 'urbansound8k/100263-2-0-36.wav', 'Classification-Speech-Instructions-audio/gender_age-00003-of-00005-5870.mp3', 'AudioSet-Audio-Instructions-audio/500k_part1_speech-00003-of-00007-1421.mp3', 'ZH/ZH_B00001_S05750_W000146.mp3', 'Speech-Translation-Instructions-audio/longer-00005-of-00006-5751.mp3', 'stt-instructions/-15 (Sesi Pagi) ｜ 25 Jun 2024 [Saf9sBgzBgc]_51.mp3', 'stt-instructions/partition-instructions-part-0_26715.mp3', 'synthetic-speaker-diarization-dataset/train-1033-0.mp3', 'covost-mp3/common_voice_en_18985975.mp3', 'esd-emotion/Neutral_0001_000225.mp3', 'covost-mp3/common_voice_fa_19480580.mp3']
['ZH/ZH_B00002_S01130_W000020.mp3', 'stt-instructions/partition-instructions-part-3_7805.mp3', 'cantonese-radio-mp3-16k/Scifi20190630F_13.mp3', 'filtered-24k_processed/00225-21/00225-21_8.mp3', 'cantonese-radio-mp3-16k/rthk2_20190807_0000-0200_12.mp3', 'birdsound/scatan/XC466879.ogg']


In [27]:
b = collator(batch)
b['feature_attention_mask']

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], dtype=torch.int32)

In [28]:
b['attention_mask'].shape

torch.Size([2, 1, 8192, 8192])

In [29]:
b['input_ids'].shape

torch.Size([2, 8192])

In [30]:
tokenizer.decode(b['input_ids'][0])

'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|

In [None]:
b['feature_attention_mask'].shape

In [33]:
30 / 1500

0.02

In [35]:
dataset[0]['input_ids']

['covost-mp3/common_voice_de_18344997.mp3', 'urbansound8k/100263-2-0-36.wav', 'Classification-Speech-Instructions-audio/gender_age-00003-of-00005-5870.mp3', 'AudioSet-Audio-Instructions-audio/500k_part1_speech-00003-of-00007-1421.mp3', 'ZH/ZH_B00001_S05750_W000146.mp3', 'Speech-Translation-Instructions-audio/longer-00005-of-00006-5751.mp3', 'stt-instructions/-15 (Sesi Pagi) ｜ 25 Jun 2024 [Saf9sBgzBgc]_51.mp3', 'stt-instructions/partition-instructions-part-0_26715.mp3', 'synthetic-speaker-diarization-dataset/train-1033-0.mp3', 'covost-mp3/common_voice_en_18985975.mp3', 'esd-emotion/Neutral_0001_000225.mp3', 'covost-mp3/common_voice_fa_19480580.mp3']


tensor([151644,   8948,    198,  ..., 151643, 151643, 151643])

In [36]:
po = dataset = LocalDataset(local='audio-packing-8k')

In [44]:
audio_token_id

151665

In [43]:
(po[0]['input_ids'] == audio_token_id).sum() * 0.02

111.14