In [None]:
# !wget https://huggingface.co/datasets/huseinzol05/Malay-TTS-Yasmin/resolve/main/tts-malay-yasmin.tar.gz
# !wget https://huggingface.co/datasets/huseinzol05/Malay-TTS-Yasmin/resolve/main/populated-text.json
# !tar -xf tts-malay-yasmin.tar.gz

In [None]:
# !wget https://huggingface.co/datasets/huseinzol05/Malay-TTS-Yasmin/resolve/main/tts-malay-yasmin.tar.gz
# !wget https://huggingface.co/datasets/huseinzol05/Malay-TTS-Yasmin/resolve/main/populated-text.json
# !tar -xf tts-malay-yasmin.tar.gz

In [None]:
# !wget https://huggingface.co/datasets/huseinzol05/Malay-TTS-Yasmin/resolve/main/tts-malay-yasmin-parliament.tar.gz
# !wget https://huggingface.co/datasets/huseinzol05/Malay-TTS-Yasmin/resolve/main/populated-parliament.json
# !tar -xf tts-malay-yasmin-parliament.tar.gz

In [None]:
import parselmouth
import librosa
import pyworld as pw
from sklearn.preprocessing import StandardScaler
import numpy as np
import os

# An empty CUDA_VISIBLE_DEVICES hides every GPU from this process, so the notebook runs on CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = ''
# TensorFlow setting: allocate GPU memory on demand (has no effect while no GPU is visible).
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

In [None]:
import yaml

# Load the feature-extraction settings read throughout the notebook (sampling_rate, fft_size,
# hop_size, ...). `safe_load` is used because `yaml.load(stream)` without an explicit Loader is
# rejected by PyYAML >= 6 and can construct arbitrary Python objects on older releases.
with open('config.yaml') as fopen:
    config = yaml.safe_load(fopen)

config

In [None]:
import numpy as np

# https://github.com/TensorSpeech/TensorFlowTTS/blob/master/tensorflow_tts/utils/outliers.py
def is_outlier(x, p25, p75):
    """Return True when `x` lies on or outside the 1.5 * IQR fences around [p25, p75]."""
    below_fence = x <= p25 - 1.5 * (p75 - p25)
    above_fence = x >= p75 + 1.5 * (p75 - p25)
    return below_fence or above_fence


def remove_outlier(x, p_bottom: int = 25, p_top: int = 75):
    """Overwrite the outliers of `x` in place and return the same array.

    `p_bottom` and `p_top` are the percentiles handed to `np.percentile`; the resulting
    values are the fences passed to `is_outlier` for every element of `x`.
    """
    low_value = np.percentile(x, p_bottom)
    high_value = np.percentile(x, p_top)

    flagged = [
        position
        for position, sample in enumerate(x)
        if is_outlier(sample, low_value, high_value)
    ]

    # Flagged entries are zeroed first, so the maximum taken below is computed over the
    # remaining values plus those zeros; the flagged entries are then set to that maximum.
    x[flagged] = 0.0
    x[flagged] = np.max(x)
    return x

In [None]:
import re

# Control tokens; they take the first three positions of the symbol tables built below.
_pad, _start, _eos = 'pad', 'start', 'eos'

# Single-character inventories.
_special = '-'
_punctuation = "!'(),.:;? "
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_numbers = '0123456789'

# Control tokens, then '-', punctuation and letters.
MALAYA_SPEECH_SYMBOLS = [_pad, _start, _eos, *_special, *_punctuation, *_letters]

# Same table with the ten digits appended; `tts_encode` looks ids up in this one.
TTS_AZURE_SYMBOLS = [*MALAYA_SPEECH_SYMBOLS, *_numbers]

# Characters a cleaned sentence may start with (see `cleaning`).
INITIAL_SYMBOLS = [*_letters, *_numbers]

In [None]:
import json

# Transcript records of the main corpus; later cells read the 'cleaned' field of each record.
with open('populated-text.json') as fopen:
    texts = json.loads(fopen.read())

# Transcript records of the parliament corpus, same layout.
with open('populated-parliament.json') as fopen:
    parliament = json.loads(fopen.read())

In [None]:
def tts_encode(string: str, add_eos: bool = True):
    """Map each character of `string` to its position in TTS_AZURE_SYMBOLS.

    Characters that are not in the table are dropped. When `add_eos` is truthy the id of the
    'eos' token is appended.
    """
    ids = []
    for char in string:
        if char in TTS_AZURE_SYMBOLS:
            ids.append(TTS_AZURE_SYMBOLS.index(char))
    if not add_eos:
        return ids
    return ids + [TTS_AZURE_SYMBOLS.index('eos')]

In [None]:
from unidecode import unidecode
from malaya.text.normalization import digit, cardinal
import malaya

# Alternative kept for reference: the same normalizer built with date and time handling disabled.
#normalizer = malaya.normalize.normalizer(date = False, time = False)
# Normalizer built with malaya's default options; `cleaning` calls its `normalize` method.
normalizer = malaya.normalize.normalizer()

def put_spacing_num(string):
    """Surround every run of ASCII letters with spaces, then squeeze spaces and trim the ends.

    The effect is that letters glued to digits or symbols (e.g. "12kg") become separate tokens.
    """
    def _pad(match):
        return ' ' + match.group(0) + ' '

    padded = re.sub('[A-Za-z]+', _pad, string)
    squeezed = re.sub(r'[ ]+', ' ', padded)
    return squeezed.strip()

def convert_to_ascii(string):
    """Return `string` after passing it through `unidecode`."""
    transliterated = unidecode(string)
    return transliterated

def collapse_whitespace(string):
    """Replace every run of whitespace in `string` with a single space.

    The pattern is spelled out here because no `_whitespace_re` is defined anywhere in the
    notebook; referencing that name made every call fail with NameError.
    """
    return re.sub(r'\s+', ' ', string)

def put_spacing(string, chars = '()-'):
    """Put a space on both sides of every occurrence of each character in `chars`.

    The characters are handled one after another, in the order they appear in `chars`.
    """
    spaced = string
    for symbol in chars:
        padded_symbol = ' {} '.format(symbol)
        spaced = spaced.replace(symbol, padded_symbol)
    return spaced

# Replacements `cleaning` applies first, through `replace_chars` (which surrounds every
# replacement with spaces): ';' becomes ',', the characters _ ' ~ ` are blanked out, and
# '=', '*' and '%' are spelled out in words.
before = {';': ',', '_': '', '=': 'sama dengan', '*': 'asterisk',
          "'": '', '~': '', '`': '', '%': 'peratus'}

# Replacement `cleaning` applies after `normalizer.normalize`: a remaining '/' is spelled out.
after = {'/': 'garis miring'}
def replace_chars(string, chars):
    """Apply the substitutions in `chars` (a mapping old -> new) to `string`, in mapping order.

    Every replacement text is inserted with one space before and one after it.
    """
    result = string
    for old, new in chars.items():
        result = result.replace(old, ' {} '.format(new))
    return result

# (regex, separator, spoken separator) triples read by `fix_pattern_num`: two numbers joined by
# '/' are read with "garis miring" between them, two numbers joined by '-' with nothing between.
patterns_num = [(r"\b\d+(?:[\.,']\d+)?\b\/\b\d+(?:[\.,']\d+)?\b", '/', 'garis miring'),
           (r"\b\d+(?:[\.,']\d+)?\b\-\b\d+(?:[\.,']\d+)?\b", '-', '')]

# NOTE(review): not referenced by any visible code; `fix_rm` defines its own local `pattern_rm`.
pattern_rm = r"RM \b\d+(?:[\.,']\d+)?\b (?:ribu|puluh|juta)"

# Word-order swaps `cleaning` applies after normalization (through `replace_chars`).
replaces = {'dollar bilion': 'bilion dollar', 'dollar ribu': 'ribu dollar', 'dollar juta': 'juta dollar'}

def fix_pattern_num(string):
    """Spell out "<number><sep><number>" spans listed in `patterns_num`.

    For each (regex, separator, spoken) entry, every match is split on the separator, both
    halves are passed to `digit`, and the match is replaced by the two results with `spoken`
    between them. Repeated spaces are squeezed and the result is stripped.
    """
    for regex, separator, spoken in patterns_num:
        for found in re.findall(regex, string):
            left, right = found.split(separator)
            replacement = f'{digit(left)} {spoken} {digit(right)}'
            string = string.replace(found, replacement)
    return re.sub(r'[ ]+', ' ', string).strip()

def fix_dash_num(string):
    """Spell out "-<digits>" spans, reading the leading '-' as "dash".

    Each match is passed to `cardinal` and the word 'negatif' in its result is replaced by
    'dash'. Repeated spaces are squeezed and the result is stripped.
    """
    for found in re.findall(r"-\d+", string):
        spoken = cardinal(found).replace('negatif', 'dash')
        string = string.replace(found, spoken)
    return re.sub(r'[ ]+', ' ', string).strip()

def fix_num_dash(string):
    """Spell out "<digits>-" spans, dropping the trailing '-'.

    Each match, with its '-' removed, is passed to `cardinal` and replaced by the result.
    Repeated spaces are squeezed and the result is stripped.
    """
    for found in re.findall(r"\d+-", string):
        spoken = cardinal(found.replace('-', ''))
        string = string.replace(found, spoken)
    return re.sub(r'[ ]+', ' ', string).strip()

def cleaning(string, add_eos = False):
    """Normalize a transcript for TTS and encode it as symbol ids.

    Pipeline: transliterate with `convert_to_ascii`, apply the `before` replacements, spell out
    years, ISBNs, number pairs, dashed numbers and RM amounts with the `fix_*` helpers, space
    out brackets and dashes, run `normalizer.normalize`, apply the `after` and `replaces`
    substitutions, tidy the sentence ends, and finally separate letter runs.

    Parameters
    ----------
    string : str
        Raw transcript.
    add_eos : bool
        Forwarded to `tts_encode`; when truthy the 'eos' id is appended.

    Returns
    -------
    tuple
        ``(cleaned_text, ids)`` where ``ids`` is ``tts_encode(cleaned_text, add_eos = add_eos)``.
        An input that cleans down to nothing yields ``('', tts_encode(''))`` instead of raising
        IndexError.
    """
    string = convert_to_ascii(string)
    string = replace_chars(string, before)
    string = fix_1900(string)
    string = fix_isbn(string)
    string = fix_pattern_num(string)
    string = fix_dash_num(string)
    string = fix_num_dash(string)
    string = fix_rm(string)
    string = put_spacing(string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    # No debug print here: this runs once per utterance inside the multiprocessing workers.
    string = normalizer.normalize(string, normalize_text = False)['normalize']
    string = replace_chars(string, after)
    string = replace_chars(string, replaces)

    # Drop a dangling separator at the very end.
    if string and string[-1] in '-,':
        string = string[:-1]
    # A separator in second-to-last position is dropped together with the character after it.
    if len(string) >= 2 and string[-2] in '-,!:;':
        string = string[:-2]
    if not string.endswith('.'):
        string = string + '.'
    # `string` is non-empty here (it ends with '.'); drop a leading character that is neither a
    # letter nor a digit.
    if string[0] not in INITIAL_SYMBOLS:
        string = string[1:]
    string = put_spacing_num(string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    return string, tts_encode(string, add_eos = add_eos)

In [None]:
# NOTE(review): not referenced by any other visible cell — presumably transcript prefixes to
# exclude; confirm or remove.
rejected = ['Coppa Italia', 'Pak Ramli memerlukan']

In [None]:
# Spot-check the cleaned text of transcript 665. `cleaning` calls `fix_rm`, `fix_1900` and
# `fix_isbn`, which are defined in cells further down, so those cells must be run first.
cleaning(texts[665]['cleaned'])[0]

In [None]:
def fix_rm(string):
    """Spell out ringgit amounts written as ``RM <number>`` (optionally followed by a unit).

    Two passes are made: first over amounts followed by one of the unit words ribu / puluh /
    juta / bilion, then over the remaining plain amounts. For every match:

    * if the character right after it is '(' or '/', the 'RM' prefix stays in front, the
      number is split on '.' and each part is passed to `cardinal` (joined with ' , '), and
      every 'perpuluhan' in the text is replaced by ',';
    * otherwise the whole number is passed to `cardinal` and 'RM' is moved behind it;
    * a number that contains '.' and ends in '0' gets ' kosong' appended in both passes.

    Repeated spaces are squeezed and the result is stripped before returning.
    """
    def _spell(text, found, amount, unit):
        # Rewrite `found` inside `text`; `unit` is '' for plain amounts.
        start = text.find(found)
        if start < 0:
            # Duplicate match that an earlier replacement already rewrote.
            return text
        end = start + len(found)
        # Look ahead in the *current* text; a match that ends the text counts as followed by
        # a space, which never indexes out of range.
        following = text[end] if end < len(text) else ' '
        keep_prefix = following in '(/'
        if keep_prefix:
            spoken = ' , '.join([cardinal(part) for part in amount.split('.')])
        else:
            spoken = cardinal(amount)
        if amount[-1] == '0' and '.' in amount:
            spoken = f'{spoken} kosong'
        if unit:
            spoken = f'{spoken} {unit}'
        if keep_prefix:
            return text.replace(found, f'RM {spoken}').replace('perpuluhan', ',')
        return text.replace(found, f'{spoken} RM')

    pattern_unit = r"RM \b\d+(?:[\.,']\d+)?(?:[\.,']\d+)?\b (?:ribu|puluh|juta|bilion)"
    for found in re.findall(pattern_unit, string):
        # The pattern contains exactly two literal spaces, so the split yields three parts.
        _, amount, unit = found.split()
        string = _spell(string, found, amount, unit)

    pattern_plain = r"RM \b\d+(?:[\.,']\d+)?(?:[\.,']\d+)?\b"
    for found in re.findall(pattern_plain, string):
        string = _spell(string, found, found.split()[-1], '')

    string = re.sub(r'[ ]+', ' ', string).strip()
    return string

# Smoke check: an amount with a thousands separator and a decimal part ending in zero.
fix_rm('RM 500,000.00')

In [None]:
def fix_1900(string):
    """Spell out four-digit years 1900-1999 as "sembilan belas ...".

    * "1900"  -> "sembilan belas ratus"
    * "190x"  -> "sembilan belas kosong " + cardinal(x)
    * "19xy"  -> "sembilan belas " + cardinal(xy)

    Only a "19dd" group that is not part of a longer run of digits is rewritten, so numbers
    such as "21900" are left alone. A year directly followed by letters (e.g. "1990an") is
    still rewritten.
    """
    def _spell(match):
        last_two = match.group(0)[-2:]
        if last_two == '00':
            return 'sembilan belas ratus'
        if last_two[0] == '0':
            return 'sembilan belas kosong ' + cardinal(last_two[1])
        return 'sembilan belas ' + cardinal(last_two)

    # Lookarounds reject digits on either side; re.sub rewrites each match at its own
    # position rather than every occurrence of the same substring.
    return re.sub(r'(?<!\d)19\d\d(?!\d)', _spell, string)

# Smoke check for the "190x" branch (a zero in the tens position).
fix_1900('1902')

In [None]:
# Output root for the main corpus; `process` writes one .npy file per utterance into the
# sub-directories below.
directory = 'output-yasmin'
directories = ['audios', 'mels', 'text_ids', 'f0s', 'energies', 'pitches']
# os.makedirs avoids spawning a shell and, with exist_ok=True, is silent when the cell is
# re-run and the directories already exist.
os.makedirs(directory, exist_ok = True)
for d in directories:
    os.makedirs(os.path.join(directory, d), exist_ok = True)

In [None]:
# One work item per transcript record: (wav path, raw transcript, index, output root).
txts = [
    (f'female/{idx}.wav', record['cleaned'], idx, directory)
    for idx, record in enumerate(texts)
]

In [None]:
import IPython.display as ipd

In [None]:
# Listen to the raw recording of utterance 1821.
ipd.Audio('female/1821.wav')

In [None]:
# Show the stored transcript of utterance 11624.
texts[11624]['cleaned']

In [None]:
# Cleaned text for utterance 1821, to compare against the recording played above.
cleaning(texts[1821]['cleaned'])[0]

In [None]:
# Show the stored transcript of utterance 16192.
texts[16192]['cleaned']

In [None]:
def fix_isbn(string):
    """Spell out an ISBN digit by digit, reading each '-' as "dash".

    A run of digits and dashes is rewritten only when the word right before it is "isbn"
    (case-insensitive). The run is split on '-', every part is passed to `digit`, and the
    parts are joined with ' dash '. All other digit/dash runs are returned untouched.
    """
    def _spell(match):
        # The text before the match, minus the single character directly in front of it
        # (the separator between "ISBN" and the number). Clamping at 0 keeps a run at
        # position 0 or 1 from wrapping around to the end of the string or from indexing
        # into an empty word list.
        words_before = string[:max(match.start() - 1, 0)].split()
        if not words_before or words_before[-1].lower() != 'isbn':
            return match.group(0)
        return ' dash '.join([digit(part) for part in match.group(0).split('-')])

    # re.sub rewrites each run at its own position, so an identical run elsewhere in the text
    # that does not follow "isbn" is left alone.
    return re.sub(r'[0-9\-]+', _spell, string)
    
# Smoke check on transcript 1122 (presumably one that contains an ISBN — confirm).
fix_isbn(texts[1122]['cleaned'])

In [None]:
# List every work item whose transcript contains "13" followed by two digits.
for t in txts:
    if re.search(r'13\d\d', t[1]):
        print(t[2], t[1])

In [None]:
# For every transcript mentioning "ISBN", print its index, the raw text and the cleaned text
# side by side so the ISBN handling can be eyeballed.
for t in txts:
    if 'ISBN' in t[1]:
        print(t[2])
        print(t[1])
        print(cleaning(t[1])[0])
        print()

In [None]:
import malaya_speech
from malaya_speech import Pipeline
from tqdm import tqdm
# Voice-activity detector from malaya_speech's WebRTC wrapper; `process` calls it on each
# frame of the 16 kHz copy of the audio to find silent stretches.
vad = malaya_speech.vad.webrtc()

def process(txts,
            start_silent_trail = int(0.05 * config['sampling_rate']),
            middle_silent_trail = int(0.12 * config['sampling_rate']),
            end_silent_trail = int(0.1 * config['sampling_rate']),
            process_middle_silent = True,
            maxlen = 25):
    """Extract TTS training features for a batch of utterances and save them as .npy files.

    Parameters
    ----------
    txts : tuple
        Only ``txts[0]`` is read: a sequence of ``(wav_path, transcript, index, output_dir)``
        items. The cells in this notebook call ``process((items, 0))``.
    start_silent_trail : int
        Number of samples cut from the start of every recording; also the number of samples
        kept from a leading silent group.
    middle_silent_trail : int
        Number of samples kept at each end of a silent group in the middle of a recording.
    end_silent_trail : int
        Number of samples kept from a trailing silent group.
    process_middle_silent : bool
        When False, silent groups in the middle are kept whole.
    maxlen : float
        Utterances longer than this many seconds (after trimming) are skipped.

    Returns
    -------
    list
        ``[[audios, mels, text_ids, f0s, energies, pitches]]`` for the utterances that were
        not skipped. The same six items are also written to
        ``{output_dir}/{audios,mels,text_ids,f0s,energies,pitches}/{index}.npy``.
    """
    txts = txts[0]
    audios, mels, text_ids, f0s, energies, pitches = [], [], [], [], [], []

    for f in txts:
        directory = f[3]
        index = f[2]
        text = f[1]
        f = f[0]

        # `cleaning` returns (cleaned_text, ids). The end-of-sequence id is requested
        # explicitly instead of passing the wav path as a truthy `add_eos`.
        text = cleaning(text, add_eos = True)
        audio, _ = malaya_speech.load(f, sr = config['sampling_rate'])
        audio = audio[start_silent_trail:]

        if config['trim_silence']:
            # VAD runs on a 16 kHz integer copy; frame `no` of that copy is paired with frame
            # `no` of the full-rate audio so the trimming is applied to the original samples.
            y_= malaya_speech.resample(audio, config['sampling_rate'], 16000)
            y_ = malaya_speech.astype.float_to_int(y_)
            frames = list(malaya_speech.generator.frames(audio, 30, config['sampling_rate']))
            frames_ = list(malaya_speech.generator.frames(y_, 30, 16000, append_ending_trail = False))
            frames_webrtc = [(frames[no], vad(frame)) for no, frame in enumerate(frames_)]
            grouped_deep = malaya_speech.group.group_frames(frames_webrtc)
            grouped_deep = malaya_speech.group.group_frames_threshold(grouped_deep, 0.15)
            r = []
            for no, g in enumerate(grouped_deep):
                if g[1]:
                    # Voiced group: keep everything.
                    g = g[0].array
                else:
                    if no == 0:
                        g = g[0].array[-start_silent_trail:]
                    elif no == (len(grouped_deep) - 1):
                        g = g[0].array[:end_silent_trail]
                    else:
                        if process_middle_silent:
                            # Keep only the head and the tail of a silence in the middle.
                            g = np.concatenate([g[0].array[:middle_silent_trail], g[0].array[-middle_silent_trail:]])
                        else:
                            g = g[0].array

                r.append(g)
            audio = np.concatenate(r)

        if (len(audio) / config['sampling_rate']) > maxlen:
            print('skipped, audio too long')
            continue

        D = librosa.stft(
            audio,
            n_fft=config['fft_size'],
            hop_length=config['hop_size'],
            win_length=config['win_length'],
            window=config['window'],
            pad_mode='reflect',
        )
        S, _ = librosa.magphase(D)
        fmin = 0 if config["fmin"] is None else config["fmin"]
        # Default to the Nyquist frequency of the configured rate (the bare name
        # `sampling_rate` is not defined anywhere in the notebook).
        fmax = config['sampling_rate'] // 2 if config["fmax"] is None else config["fmax"]
        mel_basis = librosa.filters.mel(
            sr=config['sampling_rate'],
            n_fft=config["fft_size"],
            n_mels=config["num_mels"],
            fmin=fmin,
            fmax=fmax,
        )
        mel = np.log10(np.maximum(np.dot(mel_basis, S), 1e-10)).T
        # Pad, then cut the waveform to exactly len(mel) * hop_size samples.
        audio = np.pad(audio, (0, config["fft_size"]), mode="edge")
        audio = audio[: len(mel) * config['hop_size']]

        _f0, t = pw.dio(
            audio.astype(np.double),
            fs=config['sampling_rate'],
            f0_ceil=fmax,
            frame_period=1000 * config['hop_size'] / config['sampling_rate'],
        )
        f0 = pw.stonemask(audio.astype(np.double), _f0, t, config['sampling_rate'])
        # Make f0 exactly as long as the mel spectrogram.
        if len(f0) >= len(mel):
            f0 = f0[: len(mel)]
        else:
            f0 = np.pad(f0, (0, len(mel) - len(f0)))

        # extract energy
        energy = np.sqrt(np.sum(S ** 2, axis=0))
        f0 = remove_outlier(f0)
        energy = remove_outlier(energy)

        mel_len = len(mel)
        # Use the rate the audio was loaded at rather than a hard-coded 22050.
        snd = parselmouth.Sound(audio, sampling_frequency=config['sampling_rate'])
        pitch = snd.to_pitch(time_step=snd.duration / (mel_len + 3)
                         ).selected_array['frequency']

        if config["global_gain_scale"] > 0.0:
            audio *= config["global_gain_scale"]

        # Utterances whose f0 or energy is zero everywhere are dropped.
        if len(energy[energy != 0]) == 0 or len(f0[f0 != 0]) == 0:
            print('skipped')
            continue

        np.save(f'{directory}/audios/{index}.npy', audio)
        np.save(f'{directory}/mels/{index}.npy', mel)
        # NOTE(review): `text` is the (cleaned_text, ids) pair, not the ids alone; presumably
        # downstream loaders index into it — confirm before changing the stored layout.
        np.save(f'{directory}/text_ids/{index}.npy', text)
        np.save(f'{directory}/f0s/{index}.npy', f0)
        np.save(f'{directory}/energies/{index}.npy', energy)
        # The 'pitches' directory is created next to the others; store the contour there so
        # it is not lost when the caller discards the return value.
        np.save(f'{directory}/pitches/{index}.npy', pitch)

        audios.append(audio)
        mels.append(mel)
        text_ids.append(text)
        f0s.append(f0)
        energies.append(energy)
        pitches.append(pitch)

    return [[audios, mels, text_ids, f0s, energies, pitches]]

In [None]:
import matplotlib.pyplot as plt
import IPython.display as ipd

In [None]:
# Dry-run `process` on a single work item; `process` only reads element 0 of the tuple.
i = 1508
r = process((txts[i: i + 1], 0))[0]

In [None]:
# r[2] holds the `cleaning` results; print the cleaned text (element 0) of each one.
for n in range(len(r[2])):
    print(n, r[2][n][0])

In [None]:
# Play the processed waveform of the k-th utterance at the rate it was loaded with
# (`process` loads audio at config['sampling_rate']).
k = 0
ipd.Audio(r[0][k], rate = config['sampling_rate'])

In [None]:
# Top panel: mel spectrogram of utterance k; bottom panel: its waveform.
nrows = 2
fig, ax = plt.subplots(nrows = nrows, ncols = 1)
fig.set_figwidth(10)
fig.set_figheight(nrows * 3)
# The number of mel bins comes from the config used to extract the features.
mel_outputs_ = np.reshape(r[1][k], [-1, config['num_mels']])
im = ax[0].imshow(np.rot90(mel_outputs_), aspect='auto', interpolation='none')
fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax[0])
ax[1].plot(r[0][k])
plt.show()

In [None]:
import mp

# Run `process` over every work item, 1000 at a time. The return values are not collected
# (returned = False); the features are taken from the .npy files `process` writes.
# NOTE(review): `mp` is a module that is not shown here — presumably it splits `b` across 15
# workers and calls `process((chunk, worker_no))`; confirm.
for i in tqdm(range(0, len(txts), 1000)):
    index = min(i + 1000, len(txts))
    b = txts[i: index]
    mp.multiprocessing(b, process, cores = 15, returned = False)

In [None]:
# Output root for the parliament corpus; same sub-directory layout as the main corpus.
directory = 'output-yasmin-parliament'
directories = ['audios', 'mels', 'text_ids', 'f0s', 'energies', 'pitches']
# os.makedirs avoids spawning a shell and, with exist_ok=True, is silent when the cell is
# re-run and the directories already exist.
os.makedirs(directory, exist_ok = True)
for d in directories:
    os.makedirs(os.path.join(directory, d), exist_ok = True)

In [None]:
# One work item per parliament record: (wav path, raw transcript, index, output root).
txts = [
    (f'female-parliament/{idx}.wav', record['cleaned'], idx, directory)
    for idx, record in enumerate(parliament)
]

In [None]:
# Dry-run `process` on ten parliament work items starting at index 80.
i = 80
r = process((txts[i: i + 10], 0))[0]

In [None]:
# r[2] holds the `cleaning` results; print the cleaned text (element 0) of each one.
for n in range(len(r[2])):
    print(n, r[2][n][0])

In [None]:
# Play the processed waveform of the k-th utterance at the rate it was loaded with
# (`process` loads audio at config['sampling_rate']).
k = 0
ipd.Audio(r[0][k], rate = config['sampling_rate'])

In [None]:
# Run `process` over every parliament work item, 1000 at a time, discarding the return values
# (returned = False); the features are taken from the .npy files `process` writes.
for i in tqdm(range(0, len(txts), 1000)):
    index = min(i + 1000, len(txts))
    b = txts[i: index]
    mp.multiprocessing(b, process, cores = 15, returned = False)

In [None]:
!du -hs output-yasmin

In [None]:
!du -hs output-yasmin-parliament

In [None]:
# Running mean / scale accumulators for mel, energy and f0; the loops below feed them one
# utterance at a time with `partial_fit`.
scaler_mel = StandardScaler(copy=False)
scaler_energy = StandardScaler(copy=False)
scaler_f0 = StandardScaler(copy=False)

In [None]:
from glob import glob

# Mel files written by `process` for the main corpus; the count is shown as a sanity check.
mels = glob('output-yasmin/mels/*.npy')
len(mels)

In [None]:
# Accumulate statistics over the main corpus. The f0 and energy files are located by swapping
# the 'mels/' path component of each mel file.
for f in tqdm(mels):
    mel = np.load(f)
    f0 = np.load(f.replace('mels/', 'f0s/'))
    energy = np.load(f.replace('mels/', 'energies/'))
    
    scaler_mel.partial_fit(mel)
    # Zeros are excluded from the energy and f0 statistics.
    scaler_energy.partial_fit(energy[energy != 0].reshape(-1, 1))
    scaler_f0.partial_fit(f0[f0 != 0].reshape(-1, 1))

In [None]:
# Mel files written by `process` for the parliament corpus; the count is a sanity check.
mels = glob('output-yasmin-parliament/mels/*.npy')
len(mels)

In [None]:
# Continue accumulating into the same scalers with the parliament corpus, so the saved
# statistics cover both corpora.
for f in tqdm(mels):
    mel = np.load(f)
    f0 = np.load(f.replace('mels/', 'f0s/'))
    energy = np.load(f.replace('mels/', 'energies/'))
    
    scaler_mel.partial_fit(mel)
    # Zeros are excluded from the energy and f0 statistics.
    scaler_energy.partial_fit(energy[energy != 0].reshape(-1, 1))
    scaler_f0.partial_fit(f0[f0 != 0].reshape(-1, 1))

In [None]:
# Directory `save_statistics_to_file` writes the stats*.npy files into.
directory_stats = 'yasmin-stats'

In [None]:
def save_statistics_to_file(scaler_list, config):
    """Write the mean and scale of each fitted scaler to ``{directory_stats}/stats{name}.npy``.

    Parameters
    ----------
    scaler_list : iterable of (scaler, name)
        `scaler` must expose `mean_` and `scale_`; `name` is the file-name suffix
        (e.g. "" or "_f0").
    config : any
        Unused; kept so existing calls keep working.

    Each file holds a float32 array with the means in row 0 and the scales in row 1.
    """
    # os.makedirs handles paths with spaces and an already existing directory, which the shell
    # `mkdir` call it replaces did not.
    os.makedirs(directory_stats, exist_ok = True)
    for scaler, name in scaler_list:
        stats = np.stack((scaler.mean_, scaler.scale_))
        np.save(
            os.path.join(directory_stats, f"stats{name}.npy"),
            stats.astype(np.float32),
            allow_pickle=False,
        )

In [None]:
# Pair each fitted scaler with its file-name suffix: stats.npy, stats_energy.npy, stats_f0.npy.
scaler_list = [(scaler_mel, ""), (scaler_energy, "_energy"), (scaler_f0, "_f0")]
save_statistics_to_file(scaler_list, config)