In [2]:
import torch

torch.set_grad_enabled(False)

import torch.nn as nn
import pandas as pd
from datasets import Audio
from transformers import AutoTokenizer
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import streaming
import numpy as np
from tqdm import tqdm
from glob import glob
import os
import json
from multiprocess import Pool
import itertools

def chunks(l, n):
    for i in range(0, len(l), n):
        yield (l[i: i + n], i // n)

def multiprocessing(strings, function, cores=6, returned=True):
    df_split = chunks(strings, len(strings) // cores)
    pool = Pool(cores)
    pooled = pool.map(function, df_split)
    pool.close()
    pool.join()

    if returned:
        return list(itertools.chain(*pooled))

class UInt32(Encoding):
    def encode(self, obj) -> bytes:
        return obj.tobytes()

    def decode(self, data: bytes):
        return np.frombuffer(data, np.uint32)

_encodings['uint32'] = UInt32

columns = {
    'input_ids': 'uint32',
    'position_ids': 'uint32',
    'attention_mask': 'uint32',
    'audio': 'str',
    'text': 'str'
}
hashes = 'sha1', 'xxh64'

In [3]:
with open('accept-period.json') as fopen:
    period = set(json.load(fopen))

In [5]:
processed = pd.read_parquet('processed.parquet')

In [7]:
processed.iloc[0]

reference_text              Uhm, hello, selamat pagi ye, saya dari custome...
generate_text               Encik, bolehkah Encik memberikan maklum balas ...
normalized_generate_text    Encik, bolehkah Encik memberikan maklum balas ...
reference_audio                                          husein-assistant.mp3
filename_audio                                   response-husein-v3/55420.mp3
speaker                                                                husein
similarity                                                           0.800952
audio_length                                                         4.400181
index                                                                 1367694
alignment                   [{'end': 0.38, 'score': -3.89, 'start': 0.12, ...
averaged_pitch              [279.365, 104.309, 97.318, 94.798, 95.233, 91....
distances                   [0.073, 0.007, 0.012, 0.006, 0.007, 0.012, 0.0...
Name: 0, dtype: object

In [38]:
data = []
for i in tqdm(range(len(processed))):
    if i not in period:
        continue
    index = processed.iloc[i]['index']
    speaker = 'husein' if 'husein' in processed.iloc[i]['filename_audio'] else 'idayu'
    data.append({
        'speaker': speaker,
        'text': processed.iloc[i]['normalized_generate_text'],
        'token': f'distilcodec/{index}.json',
    })

100%|██████████| 1645455/1645455 [01:22<00:00, 19934.81it/s]


In [39]:
len(data)

977091

In [40]:
tokenizer = AutoTokenizer.from_pretrained('mesolitica/Malaysian-TTS-1.7B')

In [41]:
import gc

def collator(batch, batch_position_ids):
    input_ids = []
    position_ids = []
    masks = []
    for i in range(len(batch)):
        l = len(batch[i])
        input_ids.extend(batch[i])
        position_ids.extend(batch_position_ids[i])
        masks.append(l)
    
    return {
        'input_ids': np.array(input_ids).astype(np.uint32),
        'position_ids': np.array(position_ids).astype(np.uint32),
        'attention_mask': np.array(masks).astype(np.uint32),
        'audio': '',
        'text': '',
    }

def slice_and_balance(nested_list, size):
    first = []
    balance = []
    current_size = 0

    for sublist in nested_list:
        if current_size < size:
            remaining_space = size - current_size
            if len(sublist) <= remaining_space:
                first.append(sublist)
                current_size += len(sublist)
            else:
                first.append(sublist[:remaining_space])
                balance.append(sublist[remaining_space:])
                current_size = size
        else:
            balance.append(sublist)
    
    return first, balance

In [42]:
!rm -rf tokenized-4k-qwen3
!mkdir tokenized-4k-qwen3

In [43]:
import time

sequence_length = 4096
def loop(files, block_size = sequence_length):
    rows, index = files
    out_root = f'tokenized-4k-qwen3/tokenized-{index}'
    os.system(f'rm -rf {out_root}')
    count = 0
    temp = []
    position_ids = []
    last_block, last_position_block = None, None
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for row in tqdm(rows):

            speaker = row['speaker']
            with open(row['token']) as fopen:
                token = json.load(fopen)
            token = ''.join([f'<|speech_{t}|>' for t in token])
            t = row['text']
            prompt = f'{t}<|speech_start|>{token}<|im_end|>'
            prompt = f'{speaker}: {prompt}'
            
            outputs = tokenizer(prompt, add_special_tokens = False)
            position = range(len(outputs['input_ids']))
            length = len(outputs['input_ids'])
            
            if count + length > block_size:
                o = collator(temp, position_ids)
                out.write(o)
                temp = [outputs['input_ids']]
                position_ids = [position]
                count = length
                
            else:
                temp.append(outputs['input_ids'])
                position_ids.append(range(len(outputs['input_ids'])))
                count += len(outputs['input_ids'])
        
        if len(temp):
            o = collator(temp, position_ids)
            out.write(o)
            

In [44]:
loop((data[:100], 0))

100%|██████████| 100/100 [00:00<00:00, 459.21it/s]


In [46]:
dataset = LocalDataset('tokenized-4k-qwen3/tokenized-0')
len(dataset)

22

In [47]:
from multiprocess import Pool

chunks = chunks(data, 20000)
pool = Pool(20)
pooled = pool.map(loop, chunks)
pool.close()
pool.join()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
folders = glob('tokenized-4k-qwen3-streaming/tokenized-*')
folders.extend(glob('tokenized-4k-qwen3-chunk/tokenized-*'))
folders.extend(glob('tokenized-4k-qwen3/tokenized-*'))
folders

In [None]:
!rm -rf packing-qwen3-combine

In [None]:
with MDSWriter(out='packing-qwen3-combine', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = LocalDataset(local=f)
            for i in tqdm(range(len(dataset))):
                out.write(dataset[i])
        except Exception as e:
            print(e)
            pass