In [4]:
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="malaysia-ai/Multilingual-TTS", 
    repo_type="dataset",
    allow_patterns="*/*.parquet",
    local_dir="./Multilingual-TTS",
)

Fetching 23 files: 100%|██████████| 23/23 [00:01<00:00, 12.05it/s]


'/home/ubuntu/Multilingual-TTS'

In [55]:
from glob import glob
import pandas as pd
import json
import os
import torch
import IPython.display as ipd

torch.set_grad_enabled(False)

from transformers import AutoTokenizer, AddedToken
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import numpy as np
from tqdm import tqdm
from multiprocess import Pool
import itertools

def chunks(l, n):
    for i in range(0, len(l), n):
        yield (l[i: i + n], i // n)

def multiprocessing(strings, function, cores=6, returned=True):
    df_split = chunks(strings, len(strings) // cores)
    pool = Pool(cores)
    pooled = pool.map(function, df_split)
    pool.close()
    pool.join()

    if returned:
        return list(itertools.chain(*pooled))

class UInt32(Encoding):
    def encode(self, obj) -> bytes:
        return obj.tobytes()

    def decode(self, data: bytes):
        return np.frombuffer(data, np.uint32)

_encodings['uint32'] = UInt32

columns = {
    'input_ids': 'uint32',
    'position_ids': 'uint32',
    'attention_mask': 'uint32',
    'audio': 'str',
    'text': 'str'
}
hashes = 'sha1', 'xxh64'

def new_path(f):
    splitted = f.split('/')
    folder = f.split('/')[0]
    folder = folder + '_neucodec'
    new_f = os.path.join(folder, '/'.join(splitted[1:]))
    new_f = new_f.replace('.mp3', '.json').replace('.wav', '.json')
    return new_f

In [30]:
from neucodec import NeuCodec
 
model = NeuCodec.from_pretrained("neuphonic/neucodec")
_ = model.eval().cuda()   

In [19]:
files = glob('Multilingual-TTS/*/*.parquet')
data = []
for f in tqdm(files):
    df = pd.read_parquet(f).to_dict(orient = 'records')
    for i in range(len(df)):
        token_filename = new_path(df[i]['audio_filename'])
        if not os.path.exists(token_filename):
            continue
        df[i]['token_filename'] = token_filename
        data.append(df[i])

100%|██████████| 23/23 [00:17<00:00,  1.32it/s]


In [18]:
data[0]

{'audio_filename': 'libritts_r_filtered_other/libritts_r_filtered-other-train.other.500-00055-of-00102_0.mp3',
 'text': 'Where could he be?',
 'speaker': 'libritts_r_filtered_other_428',
 'token_filename': 'libritts_r_filtered_other_neucodec/libritts_r_filtered-other-train.other.500-00055-of-00102_0.json'}

In [20]:
len(data)

2114833

In [25]:
with open('prepared-Malaysian-TTS-v2.json') as fopen:
    d = json.load(fopen)

malaysian = []
for d_ in tqdm(d):
    token_filename = os.path.join('neucodec', f"{d_['index']}.json")
    if not os.path.exists(token_filename):
        continue
    malaysian.append({
        'speaker': d_['speaker'],
        'token_filename': token_filename,
        'text': d_['normalized_generate_text']
    })
len(malaysian)

100%|██████████| 659690/659690 [00:06<00:00, 98460.62it/s] 


659690

In [37]:
malaysian[0]

{'speaker': 'husein',
 'token_filename': 'neucodec/0.json',
 'text': 'Encik, bolehkah Encik memberikan maklum balas tentang pengalaman membeli-belah dengan kami?'}

In [40]:
with open(malaysian[0]['token_filename']) as fopen:
    d = json.load(fopen)
    
recon = model.decode_code(torch.tensor(d)[None][None].cuda()).cpu()
ipd.Audio(recon[0, 0].numpy(), rate = 24000)

In [59]:
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-1.7B-Base')

In [60]:
extra = [AddedToken('<|speech_start|>')]
for i in range(65536):
    extra.append(AddedToken(f'<|s_{i}|>'))
tokenizer.add_tokens(extra)

65537

In [62]:
import gc

def collator(batch, batch_position_ids):
    input_ids = []
    position_ids = []
    masks = []
    for i in range(len(batch)):
        l = len(batch[i])
        input_ids.extend(batch[i])
        position_ids.extend(batch_position_ids[i])
        masks.append(l)
    
    return {
        'input_ids': np.array(input_ids).astype(np.uint32),
        'position_ids': np.array(position_ids).astype(np.uint32),
        'attention_mask': np.array(masks).astype(np.uint32),
        'audio': '',
        'text': '',
    }

def slice_and_balance(nested_list, size):
    first = []
    balance = []
    current_size = 0

    for sublist in nested_list:
        if current_size < size:
            remaining_space = size - current_size
            if len(sublist) <= remaining_space:
                first.append(sublist)
                current_size += len(sublist)
            else:
                first.append(sublist[:remaining_space])
                balance.append(sublist[remaining_space:])
                current_size = size
        else:
            balance.append(sublist)
    
    return first, balance

In [63]:
import time

sequence_length = 1024 * 10
def loop(files, block_size = sequence_length):
    rows, index = files
    out_root = f'tokenized-4k-qwen3/tokenized-{index}'
    os.system(f'rm -rf {out_root}')
    count = 0
    temp = []
    position_ids = []
    last_block, last_position_block = None, None
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for row in tqdm(rows):
            
            try:
                with open(row['token_filename']) as fopen:
                    token = json.load(fopen)
            except:
                continue

            left = row['speaker'] +': ' + row['text']
            
            token = ''.join([f'<|s_{t}|>' for t in token])
            prompt = f'<|im_start|>{left}<|speech_start|>{token}<|im_end|>'
            
            outputs = tokenizer(prompt, add_special_tokens = False)
            position = range(len(outputs['input_ids']))
            length = len(outputs['input_ids'])
            
            if count + length > block_size:
                o = collator(temp, position_ids)
                if o['input_ids'].shape[0] > 0:
                    out.write(o)
                temp = [outputs['input_ids']]
                position_ids = [position]
                count = length
                
            else:
                temp.append(outputs['input_ids'])
                position_ids.append(range(len(outputs['input_ids'])))
                count += len(outputs['input_ids'])
        
        if len(temp):
            o = collator(temp, position_ids)
            if o['input_ids'].shape[0] > 0:
                out.write(o)
            

In [69]:
# loop((data[:10], 0))

In [70]:
combined = data + malaysian

In [71]:
multiprocessing(combined, loop, cores = 20, returned = False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [72]:
folders = sorted(glob('tokenized-4k-qwen3/tokenized-*'), key = lambda x: int(x.split('-')[-1]))
folders

['tokenized-4k-qwen3/tokenized-0',
 'tokenized-4k-qwen3/tokenized-1',
 'tokenized-4k-qwen3/tokenized-2',
 'tokenized-4k-qwen3/tokenized-3',
 'tokenized-4k-qwen3/tokenized-4',
 'tokenized-4k-qwen3/tokenized-5',
 'tokenized-4k-qwen3/tokenized-6',
 'tokenized-4k-qwen3/tokenized-7',
 'tokenized-4k-qwen3/tokenized-8',
 'tokenized-4k-qwen3/tokenized-9',
 'tokenized-4k-qwen3/tokenized-10',
 'tokenized-4k-qwen3/tokenized-11',
 'tokenized-4k-qwen3/tokenized-12',
 'tokenized-4k-qwen3/tokenized-13',
 'tokenized-4k-qwen3/tokenized-14',
 'tokenized-4k-qwen3/tokenized-15',
 'tokenized-4k-qwen3/tokenized-16',
 'tokenized-4k-qwen3/tokenized-17',
 'tokenized-4k-qwen3/tokenized-18',
 'tokenized-4k-qwen3/tokenized-19',
 'tokenized-4k-qwen3/tokenized-20']

In [73]:
!rm -rf multipacking

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [74]:
with MDSWriter(out='multipacking', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = LocalDataset(local=f)
            for i in tqdm(range(len(dataset))):
                out.write(dataset[i])
        except Exception as e:
            print(e)
            pass

100%|██████████| 4178/4178 [00:00<00:00, 8928.05it/s]
100%|██████████| 4388/4388 [00:00<00:00, 7858.06it/s]
100%|██████████| 4992/4992 [00:00<00:00, 8162.63it/s]
100%|██████████| 5055/5055 [00:00<00:00, 8628.82it/s]
100%|██████████| 7295/7295 [00:00<00:00, 9122.79it/s]
100%|██████████| 7374/7374 [00:00<00:00, 8666.13it/s]
100%|██████████| 7360/7360 [00:00<00:00, 8568.50it/s]
100%|██████████| 7370/7370 [00:00<00:00, 8651.45it/s]
100%|██████████| 7168/7168 [00:00<00:00, 9120.94it/s]
100%|██████████| 7035/7035 [00:00<00:00, 9070.77it/s]
100%|██████████| 7062/7062 [00:00<00:00, 8443.13it/s]
100%|██████████| 6856/6856 [00:00<00:00, 8887.29it/s]
100%|██████████| 7024/7024 [00:00<00:00, 8989.13it/s]
100%|██████████| 7426/7426 [00:00<00:00, 8695.56it/s]
100%|██████████| 4866/4866 [00:00<00:00, 8609.05it/s]
100%|██████████| 4575/4575 [00:00<00:00, 9220.23it/s]
100%|██████████| 5118/5118 [00:00<00:00, 8854.93it/s]
100%|██████████| 5113/5113 [00:00<00:00, 8019.76it/s]
100%|██████████| 5104/5104 [

In [75]:
dataset = LocalDataset('multipacking')
len(dataset)

120458