In [33]:
import torch

torch.set_grad_enabled(False)

import torch.nn as nn
import pandas as pd
from datasets import Audio
from transformers import AutoTokenizer
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import streaming
import numpy as np
from tqdm import tqdm
from glob import glob
import os
import json
from multiprocess import Pool
import itertools

def chunks(l, n):
    for i in range(0, len(l), n):
        yield (l[i: i + n], i // n)

def multiprocessing(strings, function, cores=6, returned=True):
    df_split = chunks(strings, len(strings) // cores)
    pool = Pool(cores)
    pooled = pool.map(function, df_split)
    pool.close()
    pool.join()

    if returned:
        return list(itertools.chain(*pooled))

class UInt32(Encoding):
    def encode(self, obj) -> bytes:
        return obj.tobytes()

    def decode(self, data: bytes):
        return np.frombuffer(data, np.uint32)

_encodings['uint32'] = UInt32

columns = {
    'input_ids': 'uint32',
    'position_ids': 'uint32',
    'attention_mask': 'uint32',
    'audio': 'str',
    'text': 'str'
}
hashes = 'sha1', 'xxh64'

In [2]:
with open('accept-chunk-streaming-flatten.json') as fopen:
    accepted = set(json.load(fopen))
len(accepted)

2014337

In [3]:
df = pd.read_parquet('chunk-streaming-flatten.parquet')

In [5]:
df

Unnamed: 0,filename_audio,text
0,chunk-streaming/prepare-dataset-normalizer-tex...,"Menurutnya, kejadian itu dipercayai"
1,chunk-streaming/prepare-dataset-normalizer-tex...,berlaku
2,chunk-streaming/prepare-dataset-normalizer-tex...,di Kilometer tiga puluh empat lebuh raya berke...
3,chunk-streaming/prepare-dataset-normalizer-tex...,kelmarin.
4,chunk-streaming/introduction-husein-v3_97142_0...,"Encik,"
...,...,...
5327564,chunk-streaming/en_chatbot-idayu_9187_1.mp3,am afraid I do not have a physical body to dan...
5327565,chunk-streaming/en_chatbot-idayu_9187_2.mp3,or music online.
5327566,chunk-streaming/response-husein-v2_156365_0.mp3,"Encik, kita ada jual alat panen buah yang cekap."
5327567,chunk-streaming/response-husein-v2_156365_1.mp3,Nak tanya harga tak?


In [17]:
data = []
for i in tqdm(range(len(df))):
    if i not in accepted:
        continue
    speaker = 'husein' if 'husein' in df.iloc[i]['filename_audio'] else 'idayu'
    data.append({
        'speaker': speaker,
        'text': df.iloc[i]['text'],
        'token': f'chunk-streaming-flatten/{i}.json',
    })
    

100%|██████████| 5327569/5327569 [00:54<00:00, 98367.13it/s] 


In [18]:
len(data)

2014337

In [20]:
tokenizer = AutoTokenizer.from_pretrained('mesolitica/Malaysian-TTS-1.7B')

In [21]:
import gc

def collator(batch, batch_position_ids):
    input_ids = []
    position_ids = []
    masks = []
    for i in range(len(batch)):
        l = len(batch[i])
        input_ids.extend(batch[i])
        position_ids.extend(batch_position_ids[i])
        masks.append(l)
    
    return {
        'input_ids': np.array(input_ids).astype(np.uint32),
        'position_ids': np.array(position_ids).astype(np.uint32),
        'attention_mask': np.array(masks).astype(np.uint32),
        'audio': '',
        'text': '',
    }

def slice_and_balance(nested_list, size):
    first = []
    balance = []
    current_size = 0

    for sublist in nested_list:
        if current_size < size:
            remaining_space = size - current_size
            if len(sublist) <= remaining_space:
                first.append(sublist)
                current_size += len(sublist)
            else:
                first.append(sublist[:remaining_space])
                balance.append(sublist[remaining_space:])
                current_size = size
        else:
            balance.append(sublist)
    
    return first, balance

In [22]:
!rm -rf tokenized-4k-qwen3-chunk
!mkdir tokenized-4k-qwen3-chunk

In [23]:
import time

sequence_length = 4096
def loop(files, block_size = sequence_length):
    rows, index = files
    out_root = f'tokenized-4k-qwen3-chunk/tokenized-{index}'
    os.system(f'rm -rf {out_root}')
    count = 0
    temp = []
    position_ids = []
    last_block, last_position_block = None, None
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for row in tqdm(rows):

            speaker = row['speaker']
            with open(row['token']) as fopen:
                token = json.load(fopen)
            token = ''.join([f'<|speech_{t}|>' for t in token])
            t = row['text']
            prompt = f'{t}<|speech_start|>{token}<|im_end|>'
            prompt = f'{speaker}: {prompt}'
            
            outputs = tokenizer(prompt, add_special_tokens = False)
            position = range(len(outputs['input_ids']))
            length = len(outputs['input_ids'])
            
            if count + length > block_size:
                o = collator(temp, position_ids)
                out.write(o)
                temp = [outputs['input_ids']]
                position_ids = [position]
                count = length
                
            else:
                temp.append(outputs['input_ids'])
                position_ids.append(range(len(outputs['input_ids'])))
                count += len(outputs['input_ids'])
        
        if len(temp):
            o = collator(temp, position_ids)
            out.write(o)
            

In [26]:
loop((data[:100], 0))

100%|██████████| 100/100 [00:00<00:00, 1335.74it/s]


In [27]:
dataset = LocalDataset('tokenized-4k-qwen3-chunk/tokenized-0')

In [30]:
len(data)

2014337

In [34]:
from multiprocess import Pool

chunks = chunks(data, 100000)
pool = Pool(20)
pooled = pool.map(loop, chunks)
pool.close()
pool.join()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [35]:
print('a')

a
