In [1]:
# !wget https://f000.backblazeb2.com/file/malaya-speech-model/data/dari-pasentran-ke-istana.gz
# !tar -xf dari-pasentran-ke-istana.gz

In [20]:
import librosa
import pyworld as pw
from sklearn.preprocessing import StandardScaler
import numpy as np
import os

In [3]:
# !pip3 install malaya-gpu -U --no-deps

In [4]:
import yaml

# Read the feature-extraction settings (sampling rate, FFT/hop sizes, mel
# range, silence-trimming options) from config.yaml in the working directory.
# yaml.load() without an explicit Loader is deprecated and is rejected by
# newer PyYAML releases; safe_load() parses plain scalars/maps like these
# without constructing arbitrary Python objects.
with open('config.yaml') as fopen:
    config = yaml.safe_load(fopen)
    
config

  after removing the cwd from sys.path.


{'sampling_rate': 22050,
 'fft_size': 1024,
 'hop_size': 256,
 'win_length': None,
 'window': 'hann',
 'num_mels': 80,
 'fmin': 80,
 'fmax': 7600,
 'global_gain_scale': 1.0,
 'trim_silence': True,
 'trim_threshold_in_db': 60,
 'trim_frame_size': 2048,
 'trim_hop_size': 512}

In [5]:
import numpy as np

# https://github.com/TensorSpeech/TensorFlowTTS/blob/master/tensorflow_tts/utils/outliers.py
def is_outlier(x, p25, p75):
    """Return whether ``x`` lies on or beyond the 1.5 * IQR fences.

    ``p25`` and ``p75`` are the lower and upper percentile values; the fences
    sit 1.5 times their difference below ``p25`` and above ``p75``. A value
    exactly on a fence counts as an outlier.
    """
    spread = p75 - p25
    low_fence = p25 - 1.5 * spread
    high_fence = p75 + 1.5 * spread
    below = x <= low_fence
    return below or x >= high_fence


def remove_outlier(x, p_bottom: int = 25, p_top: int = 75):
    """Overwrite outliers in ``x`` in place and return ``x``.

    A value is an outlier when it lies on or beyond the 1.5 * IQR fences built
    from the ``p_bottom`` and ``p_top`` percentiles of ``x``. Outliers are
    first set to 0.0, and then set to the maximum of the array computed after
    that zeroing, so the replacement value never comes from an outlier itself.

    Args:
        x: 1-D numpy array; it is modified in place.
        p_bottom: percentile used as the lower quartile (default 25).
        p_top: percentile used as the upper quartile (default 75).

    Returns:
        The same array object ``x``. An empty array is returned untouched.
    """
    # Percentile and max are undefined for empty input; nothing to clean.
    if len(x) == 0:
        return x

    low = np.percentile(x, p_bottom)
    high = np.percentile(x, p_top)
    spread = high - low

    # Vectorised form of the per-element fence test.
    outliers = (x <= low - 1.5 * spread) | (x >= high + 1.5 * spread)

    # Zero first so that np.max below ignores the outlier magnitudes.
    x[outliers] = 0.0
    x[outliers] = np.max(x)
    return x

In [6]:
import re

# Special tokens and the character classes that make up the vocabulary.
_pad = "pad"
_eos = "eos"
_punctuation = "!'(),.:;? "
_special = "-"
_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"

# Full symbol table: padding token, special characters, punctuation,
# letters, then the end-of-sequence token.
MALAYA_SPEECH_SYMBOLS = [_pad, *_special, *_punctuation, *_letters, _eos]

# Pattern capturing the text before, inside and after a pair of curly braces:
_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")

In [7]:
def tts_encode(string: str, add_eos: bool = True):
    """Convert ``string`` into a list of symbol ids.

    Each character is looked up by position in ``MALAYA_SPEECH_SYMBOLS``; a
    character missing from the table raises ``ValueError``. When ``add_eos``
    is true, the id of the ``'eos'`` symbol is appended at the end.
    """
    ids = list(map(MALAYA_SPEECH_SYMBOLS.index, string))
    if add_eos:
        ids.append(MALAYA_SPEECH_SYMBOLS.index('eos'))
    return ids

In [8]:
from unidecode import unidecode
import malaya

# Shared malaya normalizer used by cleaning(). It is constructed with
# date = False and time = False (presumably turning off date/time
# normalisation - confirm against the malaya documentation).
normalizer = malaya.normalize.normalizer(date = False, time = False)

def put_spacing_num(string):
    """Separate runs of ASCII letters from whatever surrounds them.

    Every run of ``A-Z``/``a-z`` is wrapped in single spaces, repeated spaces
    are collapsed to one, and leading/trailing whitespace is stripped.
    """
    spaced = re.sub('[A-Za-z]+', r' \g<0> ', string)
    collapsed = re.sub(r'[ ]+', ' ', spaced)
    return collapsed.strip()

def convert_to_ascii(string):
    """Return ``string`` passed through ``unidecode``."""
    ascii_text = unidecode(string)
    return ascii_text

def collapse_whitespace(string):
    """Replace every run of whitespace in ``string`` with a single space.

    The pattern is written inline because no ``_whitespace_re`` constant is
    defined alongside the other module-level patterns, so referencing that
    name would raise ``NameError`` on the first call.
    """
    return re.sub(r'\s+', ' ', string)

def cleaning(string, normalize = True, add_eos = True):
    """Prepare raw text for TTS and encode it as symbol ids.

    Steps, in order: transliterate through ``convert_to_ascii``, replace
    ``&`` with `` dan ``, separate letter runs from other characters with
    ``put_spacing_num``, optionally run the shared ``normalizer``, and
    lower-case the result.

    Args:
        string: raw input text.
        normalize: when true, pass the text through ``normalizer.normalize``
            and keep the value stored under its ``'normalize'`` key.
        add_eos: forwarded to ``tts_encode``.

    Returns:
        A ``(cleaned_text, ids)`` tuple, where ``ids`` is the output of
        ``tts_encode(cleaned_text, add_eos = add_eos)``.
    """
    string = convert_to_ascii(string)
    string = string.replace('&', ' dan ')
    string = put_spacing_num(string)
    if normalize:
        # Keep the result mapping in its own name instead of rebinding
        # `string` to something that is temporarily not a str.
        normalized = normalizer.normalize(
            string,
            check_english = False,
            normalize_entity = False,
            normalize_text = False,
            normalize_url = True,
            normalize_email = True,
        )
        string = normalized['normalize']
    string = string.lower()
    return string, tts_encode(string, add_eos = add_eos)

In [9]:
from glob import glob
from tqdm import tqdm

# glob() returns paths in arbitrary, filesystem-dependent order. Sort them so
# that positional slices of `txts` pick the same files on every run.
txts = sorted(glob('text-audiobook/*istana*/*.txt'))
len(txts)

9168

In [10]:
# Read the full content of every transcript file, in the order of `txts`.
texts = []
for txt_path in tqdm(txts):
    with open(txt_path) as handle:
        texts.append(handle.read())

100%|██████████| 9168/9168 [00:00<00:00, 25399.86it/s]


In [44]:
import malaya_speech

# Per-utterance outputs, appended in lockstep so index i refers to the same file.
audios, mels, text_ids, f0s, energies = [], [], [], [], []

# NOTE(review): the running-statistics scalers below are disabled, yet later
# cells reference scaler_mel / scaler_energy / scaler_f0. Re-enable these lines
# (and the partial_fit calls at the end of the loop) before saving statistics.
# scaler_mel = StandardScaler(copy=False)
# scaler_energy = StandardScaler(copy=False)
# scaler_f0 = StandardScaler(copy=False)

# Only a 10-file slice is processed here.
for f in tqdm(txts[100:110]):
    with open(f) as fopen:
        text = fopen.read()
    # Skip transcripts containing "RM"/"rm" immediately followed by digits.
    if re.match('^.*(RM|rm)[0-9]+.*$', text):
        continue
    # cleaning() returns a (cleaned string, symbol ids) tuple; the tuple is kept as-is.
    text = cleaning(text)
    # Derive the audio path from the transcript path: swap the directory
    # names and drop the '.txt' suffix.
    r = f.replace('text-audiobook/', 
                  'dari-pasentran-ke-istana/').replace('output-text', 
                                                       'output-wav').replace('.txt', '')
    
    audio, _ = malaya_speech.load(r, sr = config['sampling_rate'])
    
    if config['trim_silence']:
        audio, _ = librosa.effects.trim(
            audio,
            top_db = config['trim_threshold_in_db'],
            frame_length = config['trim_frame_size'],
            hop_length = config['trim_hop_size'],
        )
        
    # Magnitude spectrogram from the STFT.
    D = librosa.stft(
        audio,
        n_fft=config['fft_size'],
        hop_length=config['hop_size'],
        win_length=config['win_length'],
        window=config['window'],
        pad_mode='reflect',
    )
    S, _ = librosa.magphase(D) 
    fmin = 0 if config["fmin"] is None else config["fmin"]
    # Fall back to half the configured sampling rate when fmax is unset. The
    # rate must come from `config`; no bare `sampling_rate` name exists here.
    fmax = config['sampling_rate'] // 2 if config["fmax"] is None else config["fmax"]
    mel_basis = librosa.filters.mel(
        sr=config['sampling_rate'],
        n_fft=config["fft_size"],
        n_mels=config["num_mels"],
        fmin=fmin,
        fmax=fmax,
    )
    # Log10 mel spectrogram, floored at 1e-10 and transposed so frames come first.
    mel = np.log10(np.maximum(np.dot(mel_basis, S), 1e-10)).T
    # Pad, then cut the waveform to exactly len(mel) * hop_size samples.
    audio = np.pad(audio, (0, config["fft_size"]), mode="edge")
    audio = audio[: len(mel) * config['hop_size']]
    
    # F0 estimate with pyworld (dio followed by stonemask), using a frame
    # period in milliseconds that corresponds to one hop.
    _f0, t = pw.dio(
        audio.astype(np.double),
        fs=config['sampling_rate'],
        f0_ceil=fmax,
        frame_period=1000 * config['hop_size'] / config['sampling_rate'],
    )
    f0 = pw.stonemask(audio.astype(np.double), _f0, t, config['sampling_rate'])
    # Force f0 to have one value per mel frame (truncate or zero-pad).
    if len(f0) >= len(mel):
        f0 = f0[: len(mel)]
    else:
        f0 = np.pad(f0, (0, len(mel) - len(f0)))

    # extract energy
    energy = np.sqrt(np.sum(S ** 2, axis=0))
    f0 = remove_outlier(f0)
    energy = remove_outlier(energy)
    
    # The gain is applied to the stored waveform only; the features above
    # were computed before scaling.
    if config["global_gain_scale"] > 0.0:
        audio *= config["global_gain_scale"]
        
    # Drop utterances whose energy or f0 track is entirely zero.
    if len(energy[energy != 0]) == 0 or len(f0[f0 != 0]) == 0:
        print('skipped')
        continue
    
    audios.append(audio)
    mels.append(mel)
    text_ids.append(text)
    f0s.append(f0)
    energies.append(energy)
    
#     scaler_mel.partial_fit(mel)
#     scaler_energy.partial_fit(energy[energy != 0].reshape(-1, 1))
#     scaler_f0.partial_fit(f0[f0 != 0].reshape(-1, 1))

100%|██████████| 10/10 [00:02<00:00,  3.35it/s]


In [45]:
def save_statistics_to_file(scaler_list, config):
    """Write each scaler's mean and scale to ``stats/stats{name}.npy``.

    Args:
        scaler_list: iterable of ``(scaler, name)`` pairs. Each scaler must
            expose ``mean_`` and ``scale_`` arrays; ``name`` is the filename
            suffix (for example ``""`` or ``"_f0"``).
        config: accepted for interface compatibility; not used here.

    Each file holds a float32 array with the mean stacked on top of the
    scale, saved with pickling disabled.
    """
    out_dir = "stats"
    # Create the directory without a shell call; an existing directory is fine.
    os.makedirs(out_dir, exist_ok=True)
    for scaler, name in scaler_list:
        stats = np.stack((scaler.mean_, scaler.scale_))
        np.save(
            os.path.join(out_dir, f"stats{name}.npy"),
            stats.astype(np.float32),
            allow_pickle=False,
        )

In [46]:
# Pair each scaler with the filename suffix used for its statistics file.
# NOTE(review): assumes scaler_mel, scaler_energy and scaler_f0 are fitted
# scalers already present in the kernel - confirm they are created and fitted
# before this cell runs.
scaler_list = [
    (scaler_mel, ""),
    (scaler_energy, "_energy"),
    (scaler_f0, "_f0"),
]
save_statistics_to_file(scaler_list, config)

In [48]:
# Fraction of elements for which the transformed mel equals mels[1].
# NOTE(review): if scaler_mel is a StandardScaler built with copy=False,
# transform() may write into mels[1] in place, which would make this
# comparison trivially 1.0 and leave mels[1] modified - TODO confirm intent.
(scaler_mel.transform(mels[1]) == mels[1]).mean()

1.0

In [49]:
# Show the first waveform collected in `audios`.
audios[0]

array([ 0.0000000e+00, -6.0160095e-06,  0.0000000e+00, ...,
       -5.8595715e-05, -5.8595715e-05, -5.8595715e-05])

In [50]:
import IPython.display as ipd

# Build an IPython audio widget for the first collected waveform, using the
# sampling rate from `config`.
ipd.Audio(audios[0], rate = config['sampling_rate'])