In [1]:
import soundfile as sf
import parselmouth
import json
import os
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict, Tuple
from collections import defaultdict
from multiprocess import Pool
import itertools
from tqdm import tqdm

def chunks(l, n):
    for i in range(0, len(l), n):
        yield (l[i: i + n], i // n)

def multiprocessing(strings, function, cores=6, returned=True):
    df_split = chunks(strings, len(strings) // cores)
    pool = Pool(cores)
    pooled = pool.map(function, df_split)
    pool.close()
    pool.join()

    if returned:
        return list(itertools.chain(*pooled))
        
pronunciation = {
    'A': 'ay', 'B': 'bee', 'C': 'see', 'D': 'dee', 'E': 'ee',
    'F': 'eff', 'G': 'ge', 'H': 'aitch', 'I': 'eye', 'J': 'jay',
    'K': 'kay', 'L': 'el', 'M': 'em', 'N': 'en', 'O': 'oh',
    'P': 'pee', 'Q': 'cue', 'R': 'ar', 'S': 'ess', 'T': 'tee',
    'U': 'you', 'V': 'vee', 'W': 'double you', 'X': 'ex', 'Y': 'why', 'Z': 'zee'
}

rejected = {'eh', 'erm'}
mapping = {' ?': '?', ' :': ':', ' !': '!', ' ;': ';'}

def clean(string):
    string = re.sub('[^A-Za-z ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string.lower()).strip()
    return string

def post(string):
    for k, v in mapping.items():
        string = string.replace(k, v)
    string = re.sub(r'[ ]+', ' ', string).strip()
    return string

def moving_average(x, window_size):
    return np.convolve(x, np.ones(window_size)/window_size, mode='valid')

def normalize_text(text: str) -> str:
    return text.lower().strip().replace('.', '').replace(',', '')

def unnormalize_alignment(alignment, normalizer = None, pronunciation = None):
    reverse_map = {}

    if normalizer is not None:
        for written, spoken in normalizer.items():
            key = tuple(normalize_text(t) for t in spoken.split())
            reverse_map[key] = written

    if pronunciation is not None:
        for letter, pron in pronunciation.items():
            key = tuple(normalize_text(t) for t in pron.split())
            reverse_map[key] = letter
        
    aligned_tokens = [(normalize_text(item['text']), item) for item in alignment]

    max_window = max(len(k) for k in reverse_map)

    output = []
    i = 0
    while i < len(aligned_tokens):
        matched = False
        for window in range(max_window, 0, -1):
            token_seq = tuple(token for token, _ in aligned_tokens[i:i+window])
            if token_seq in reverse_map:
                items = [item for _, item in aligned_tokens[i:i+window]]
                output.append({
                    'text': reverse_map[token_seq],
                    'start': items[0]['start'],
                    'end': items[-1]['end'],
                    'score': sum(item['score'] for item in items) / len(items),
                })
                i += window
                matched = True
                break
        if not matched:
            output.append(aligned_tokens[i][1])  # keep original item
            i += 1

    return output

def merge_capital_sequence(alignment: List[Dict]) -> List[Dict]:
    merged = []
    buffer = []

    def flush_buffer():
        if not buffer:
            return
        text = ''.join(item['text'] for item in buffer)
        merged.append({
            'text': text,
            'start': buffer[0]['start'],
            'end': buffer[-1]['end'],
            'score': sum(item['score'] for item in buffer) / len(buffer),
        })
        buffer.clear()

    for item in alignment:
        if item['text'].isalpha() and item['text'].isupper() and len(item['text']) == 1:
            buffer.append(item)
        else:
            flush_buffer()
            merged.append(item)
    flush_buffer()

    return merged



In [2]:
import pandas as pd

df = pd.read_parquet('tts.parquet').to_dict(orient = 'records')
df = [(i, df[i]) for i in range(len(df))]

In [3]:
len(df)

2208550

In [4]:
df[-1]

(2208549,
 {'reference_text': 'Hi, saya adalah pembantu AI anda, selamat berkenalan. Apa yang saya boleh tolong untuk buatkan hari anda lebih ceria.',
  'generate_text': 'media selepas menyempurnakan Sambutan Ulang Tahun Askar Wataniah Ke enam puluh lima di Rejimen lima ratus lima Askar Wataniah, Kem Force satu tiga enam di sini hari ini.',
  'normalized_generate_text': 'media selepas menyempurnakan Sambutan Ulang Tahun Askar Wataniah Ke enam puluh lima di Rejimen lima ratus lima Askar Wataniah, Kem Force satu tiga enam di sini hari ini.',
  'reference_audio': None,
  'filename_audio': 'prepare-dataset-normalizer-text-malay-news-idayu/472169.mp3'})

In [5]:
# !pip3 install malaya-speech==1.4.0rc2 --no-deps
# !pip3 install malaya-boilerplate dataclasses herpetologist python-speech-features unidecode

In [6]:
import malaya_speech

model = malaya_speech.speaker_vector.nemo('huseinzol05/nemo-titanet_large')
_ = model.eval()

2025-07-18 23:50:08.116155: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752882608.125010  589543 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752882608.129344  589543 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752882608.134651  589543 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752882608.134661  589543 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752882608.134663  589543 computation_placer.cc:177] computation placer alr

In [7]:
idayu = model([malaya_speech.load('shafiqah-idayu-enhanced-v2-v2-trim.mp3')[0]])
husein = model([malaya_speech.load('husein-assistant.mp3')[0]])

  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):


In [8]:
!mkdir tts-done

In [11]:
def loop(rows):
    rows, _ = rows
    for r in tqdm(rows):
            
        index, r = r

        filename_done = f'tts-done/{index}.json'

        try:
            with open(filename_done) as fopen:
                json.load(fopen)
                continue
        except:
            pass
            
        f = r['filename_audio']
        if not os.path.exists(r['filename_audio']):
            continue
            
        speaker = 'idayu' if 'idayu' in f else 'husein'
        e = idayu if 'idayu' in f else husein
        
        try:
            with open(f'tts-force-alignment/{index}.json') as fopen:
                d = json.load(fopen)
        except Exception as e:
            continue

        scores = []
        for d_ in d:
            score = np.sum([d__['score'] for d__ in d_])
            scores.append(score)

        if not len(scores):
            continue

        d = d[np.argmin(scores)]

        d = unnormalize_alignment(d, pronunciation = pronunciation)
        d = merge_capital_sequence(d)
            
        try:
            embedding_f = f'embedding/{index}.npy'
            v = np.array(np.load(embedding_f))
        except:
            continue

        if len(set(clean(r['normalized_generate_text']).split()) & rejected):
            continue
        
        if d[0]['start'] > 3:
            continue
            
        r['speaker'] = speaker
        snd = parselmouth.Sound(f)
        pitch = snd.to_pitch()
        freq = pitch.selected_array['frequency']
        freq = freq[freq > 0]
        freq = [round(f, 3) for f in freq.tolist()]
        r['frequency'] = freq
        
        similarity = cosine_similarity(e, v[None])[0, 0]
        
        r['similarity'] = float(similarity)

        for i in range(len(d)):
            d[i]['score'] = round(d[i]['score'], 3)

        y, sr = sf.read(f)
        r['audio_length'] = len(y) / sr
        r['index'] = index
        r['alignment'] = d

        with open(filename_done, 'w') as fopen:
            json.dump(r, fopen)

In [12]:
rows = loop((df[:10], 0))

100%|██████████| 10/10 [00:00<00:00, 70.62it/s]


In [13]:
multiprocessing(df, loop, cores = 30)

 48%|████▊     | 35348/73618 [18:52<18:37, 34.23it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

 74%|███████▍  | 54554/73618 [27:31<08:56, 35.54it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

100%|██████████| 73618/73618 [37:08<00:00, 33.03it/s]
100%|██████████| 73618/73618 [37:05<00:00, 33.07it/s]
100%|██████████| 73618/73618 [37:48<00:00, 32.45it/s]
100%|██████████| 73618/73618 [37:31<00:00, 32.70it/s]
100%|██████████| 73618/73618 [37:55<00:00,

TypeError: 'NoneType' object is not iterable

In [19]:
from glob import glob

data = []
for f in tqdm(glob('tts-done/*.json')):
    with open(f) as fopen:
        d = json.load(fopen)
    data.append(d)

100%|██████████| 2126383/2126383 [09:03<00:00, 3912.29it/s] 


In [21]:
data = sorted(data, key = lambda x: x['index'])

In [23]:
data[-100000]

{'reference_text': 'Hi, saya adalah pembantu AI anda, selamat berkenalan. Apa yang saya boleh tolong untuk buatkan hari anda lebih ceria.',
 'generate_text': 'LAWAS colon Perusahaan Kecil dan Sederhana open parenthesis P K S close parenthesis mewakili sembilan puluh lapan perpuluhan lima peratus daripada komuniti perniagaan termasuk perusahaan mikro, komuniti luar bandar, kumpulan B empat puluh dan pembekal.',
 'normalized_generate_text': 'LAWAS colon Perusahaan Kecil dan Sederhana open parenthesis P K S close parenthesis mewakili sembilan puluh lapan perpuluhan lima peratus daripada komuniti perniagaan termasuk perusahaan mikro, komuniti luar bandar, kumpulan B empat puluh dan pembekal.',
 'reference_audio': None,
 'filename_audio': 'prepare-dataset-normalizer-text-malay-news-idayu/247772.mp3',
 'speaker': 'idayu',
 'frequency': [199.734,
  204.641,
  212.029,
  210.918,
  208.191,
  208.871,
  209.655,
  209.888,
  209.794,
  209.412,
  209.109,
  209.77,
  211.73,
  213.612,
  214.9

In [24]:
pd.DataFrame(data).to_parquet('processed-tts.parquet')

In [25]:
!ls -lh processed-tts.parquet

-rw-r--r-- 1 ubuntu ubuntu 5.6G Jul 19 02:10 processed-tts.parquet


In [27]:
from datasets import Dataset

dataset = Dataset.from_list(data)

In [28]:
dataset.push_to_hub('mesolitica/Malaysian-TTS-v2')

Uploading the dataset shards:   0%|          | 0/22 [00:00<?, ? shards/s]
Creating parquet from Arrow format:   0%|          | 0/97 [00:00<?, ?ba/s][A
Creating parquet from Arrow format:   5%|▌         | 5/97 [00:00<00:02, 41.99ba/s][A
Creating parquet from Arrow format:  10%|█         | 10/97 [00:00<00:02, 42.05ba/s][A
Creating parquet from Arrow format:  15%|█▌        | 15/97 [00:00<00:01, 42.00ba/s][A
Creating parquet from Arrow format:  21%|██        | 20/97 [00:00<00:01, 42.52ba/s][A
Creating parquet from Arrow format:  26%|██▌       | 25/97 [00:00<00:01, 43.27ba/s][A
Creating parquet from Arrow format:  31%|███       | 30/97 [00:00<00:01, 43.91ba/s][A
Creating parquet from Arrow format:  36%|███▌      | 35/97 [00:00<00:01, 44.42ba/s][A
Creating parquet from Arrow format:  41%|████      | 40/97 [00:00<00:01, 44.46ba/s][A
Creating parquet from Arrow format:  46%|████▋     | 45/97 [00:01<00:01, 44.83ba/s][A
Creating parquet from Arrow format:  52%|█████▏    | 50/97 [00:01<

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Malaysian-TTS-v2/commit/d3d53be9a6f753237ac84fbdd801ebaac53754bb', commit_message='Upload dataset', commit_description='', oid='d3d53be9a6f753237ac84fbdd801ebaac53754bb', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Malaysian-TTS-v2', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Malaysian-TTS-v2'), pr_revision=None, pr_num=None)