In [1]:
# !wget https://f000.backblazeb2.com/file/malaya-speech-model/data/salina.gz
# !tar -xf salina.gz

In [2]:
import librosa
import pyworld as pw
from sklearn.preprocessing import StandardScaler
import numpy as np
import os

In [3]:
# !pip3 install malaya-gpu -U --no-deps

In [4]:
import yaml

# Load the feature-extraction settings (sampling rate, FFT/hop sizes, mel
# range, silence-trimming options) used by the cells below.
with open('config.yaml') as fopen:
    # safe_load builds only plain Python objects. A bare yaml.load(fopen)
    # emits a warning on PyYAML 5.1+ and fails on PyYAML 6, where the Loader
    # argument is mandatory.
    config = yaml.safe_load(fopen)

config

  after removing the cwd from sys.path.


{'sampling_rate': 22050,
 'fft_size': 1024,
 'hop_size': 256,
 'win_length': None,
 'window': 'hann',
 'num_mels': 80,
 'fmin': 80,
 'fmax': 7600,
 'global_gain_scale': 1.0,
 'trim_silence': True,
 'trim_threshold_in_db': 60,
 'trim_frame_size': 2048,
 'trim_hop_size': 512}

In [5]:
import numpy as np

# https://github.com/TensorSpeech/TensorFlowTTS/blob/master/tensorflow_tts/utils/outliers.py
def is_outlier(x, p25, p75):
    """Tell whether ``x`` lies on or beyond the 1.5 * IQR fences.

    The fences are ``p25 - 1.5 * (p75 - p25)`` and ``p75 + 1.5 * (p75 - p25)``.
    A value exactly equal to a fence counts as an outlier.
    """
    fence_low = p25 - 1.5 * (p75 - p25)
    fence_high = p75 + 1.5 * (p75 - p25)
    too_low = x <= fence_low
    too_high = x >= fence_high
    return too_low or too_high


def remove_outlier(x, p_bottom: int = 25, p_top: int = 75):
    """Replace IQR outliers in ``x`` with the largest remaining value.

    A value is an outlier when it lies on or beyond the fences
    ``q_low - 1.5 * (q_high - q_low)`` / ``q_high + 1.5 * (q_high - q_low)``,
    where ``q_low`` and ``q_high`` are the ``p_bottom``-th and ``p_top``-th
    percentiles of ``x``.

    Parameters
    ----------
    x : numpy.ndarray
        One-dimensional array. It is modified IN PLACE and also returned.
    p_bottom, p_top : int
        Percentiles used as the lower and upper quartile.

    Returns
    -------
    numpy.ndarray
        The same array object as ``x``.

    Notes
    -----
    When both percentiles are equal (e.g. a mostly-constant contour) every
    element sits on a fence, so the whole array ends up as zeros.
    """
    # Nothing to clean; also avoids np.max raising on an empty array.
    if len(x) == 0:
        return x

    q_low = np.percentile(x, p_bottom)
    q_high = np.percentile(x, p_top)
    spread = 1.5 * (q_high - q_low)

    # Vectorised form of the per-element fence test (inclusive on both sides).
    outliers = (x <= q_low - spread) | (x >= q_high + spread)

    # Zero the outliers first so they cannot win the max() that replaces them.
    x[outliers] = 0.0
    x[outliers] = np.max(x)
    return x

In [6]:
import re

# Building blocks of the vocabulary: two multi-character tokens plus the
# single characters that may appear in cleaned text.
_pad = "pad"
_eos = "eos"
_special = "-"
_punctuation = "!'(),.:;? "
_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"

# Full symbol table; a symbol's position in this list is its integer id.
# Order: pad, special, punctuation, letters, eos.
MALAYA_SPEECH_SYMBOLS = [_pad, *_special, *_punctuation, *_letters, _eos]

# Matches text containing a {curly-braced} span, capturing the prefix, the
# braced content and the suffix.
_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")

In [7]:
def tts_encode(string: str, add_eos: bool = True):
    """Map ``string`` to a list of symbol ids from ``MALAYA_SPEECH_SYMBOLS``.

    Characters that are not in the symbol table are silently dropped. When
    ``add_eos`` is true the id of the ``'eos'`` token is appended.
    """
    encoded = []
    for char in string:
        if char in MALAYA_SPEECH_SYMBOLS:
            encoded.append(MALAYA_SPEECH_SYMBOLS.index(char))
    if not add_eos:
        return encoded
    return encoded + [MALAYA_SPEECH_SYMBOLS.index('eos')]

In [8]:
from unidecode import unidecode
import malaya

# Module-level malaya normalizer reused by `cleaning`, constructed with the
# `date` and `time` options switched off.
normalizer = malaya.normalize.normalizer(date = False, time = False)

def put_spacing_num(string):
    """Separate runs of ASCII letters from whatever touches them.

    Every run of ``[A-Za-z]`` is padded with a space on each side, repeated
    spaces are then squeezed to one, and the result is stripped. For example
    ``'RM10'`` becomes ``'RM 10'``.
    """
    padded = re.sub('[A-Za-z]+', r' \g<0> ', string)
    squeezed = re.sub(r'[ ]+', ' ', padded)
    return squeezed.strip()

def convert_to_ascii(string):
    """Return ``string`` after passing it through ``unidecode``."""
    ascii_text = unidecode(string)
    return ascii_text

def collapse_whitespace(string):
    """Replace every run of whitespace in ``string`` with a single space.

    Leading and trailing whitespace is collapsed too, not removed.
    """
    # The pattern is inlined because `_whitespace_re` is never defined in this
    # notebook, so the previous body raised NameError on any call.
    return re.sub(r'\s+', ' ', string)

def cleaning(string, normalize = True, add_eos = True):
    """Clean a transcript and encode it as symbol ids.

    Steps, in order: transliterate with `convert_to_ascii`, replace ``&`` with
    ``' dan '``, space out letter runs with `put_spacing_num`, optionally run
    the module-level malaya ``normalizer``, then lowercase.

    Parameters
    ----------
    string : str
        Raw transcript text.
    normalize : bool
        When true, pass the text through ``normalizer.normalize`` and keep the
        ``'normalize'`` entry of its result.
    add_eos : bool
        Forwarded to `tts_encode`.

    Returns
    -------
    tuple
        ``(cleaned_text, ids)`` where ``ids`` is the list from `tts_encode`.
    """
    string = convert_to_ascii(string)
    string = string.replace('&', ' dan ')
    string = put_spacing_num(string)
    if normalize:
        normalized = normalizer.normalize(string,
                                          check_english = False,
                                          normalize_entity = False,
                                          normalize_text = False,
                                          normalize_url = True,
                                          normalize_email = True)
        string = normalized['normalize']
    string = string.lower()
    return string, tts_encode(string, add_eos = add_eos)

In [9]:
from glob import glob
from tqdm import tqdm

# Collect every transcript file. glob gives no ordering guarantee, and the
# exported .npy files are numbered by position, so sort to keep the numbering
# reproducible across runs and machines.
txts = sorted(glob('salina/*salina*/*.txt'))
len(txts)

12579

In [10]:
# Read every transcript into memory, in the same order as `txts`.
texts = []
for transcript_path in tqdm(txts):
    with open(transcript_path) as handle:
        content = handle.read()
    texts.append(content)

100%|██████████| 12579/12579 [00:00<00:00, 44212.39it/s]


In [11]:
# Preview the transcript-path -> audio-path rewrite on the first file;
# `process` applies the same two replacements to locate each recording.
txts[0].replace('output-text', 'output-wav').replace('.txt', '')

'salina/output-wav-salina/tiga-4.mp3-441.wav'

In [12]:
import malaya_speech

def process(txts):
    """Extract TTS training features for one shard of transcript files.

    For each transcript the matching audio file is loaded, optionally
    silence-trimmed, and converted into a log10 mel spectrogram, an F0 contour
    (pyworld ``dio`` refined by ``stonemask``) and a per-frame energy contour.
    All settings come from the module-level ``config`` dict.

    Parameters
    ----------
    txts : sequence
        Wrapper whose first element is the list of transcript paths to handle
        (presumably how ``mp.multiprocessing`` hands a shard to a worker --
        confirm against the ``mp`` helper).

    Returns
    -------
    list
        A one-element list holding ``[audios, mels, text_ids, f0s, energies]``.
        Each ``text_ids`` entry is the ``(text, ids)`` tuple returned by
        `cleaning`.

    Notes
    -----
    A file is skipped when its transcript matches ``RM<digits>``, when the
    audio is longer than 25 seconds, or when the F0 or energy contour is all
    zeros after outlier removal.
    """
    txts = txts[0]
    audios, mels, text_ids, f0s, energies = [], [], [], [], []

    # Everything below depends only on `config`, so compute it once instead of
    # once per file.
    sampling_rate = config['sampling_rate']
    hop_size = config['hop_size']
    fmin = 0 if config["fmin"] is None else config["fmin"]
    # `sampling_rate` used to be an undefined name here, which raised
    # NameError whenever config["fmax"] was None.
    fmax = sampling_rate // 2 if config["fmax"] is None else config["fmax"]
    mel_basis = librosa.filters.mel(
        sr=sampling_rate,
        n_fft=config["fft_size"],
        n_mels=config["num_mels"],
        fmin=fmin,
        fmax=fmax,
    )
    # Frame period in milliseconds, so F0 frames line up with STFT hops.
    frame_period = 1000 * hop_size / sampling_rate

    for f in tqdm(txts):
        with open(f) as fopen:
            text = fopen.read()
        # Skip transcripts containing currency amounts such as "RM10".
        if re.match('^.*(RM|rm)[0-9]+.*$', text):
            continue
        text = cleaning(text)
        r = f.replace('output-text', 'output-wav').replace('.txt', '')

        audio, _ = malaya_speech.load(r, sr = sampling_rate)

        if (len(audio) / sampling_rate) > 25:
            print('skipped, audio too long')
            continue

        if config['trim_silence']:
            audio, _ = librosa.effects.trim(
                audio,
                top_db = config['trim_threshold_in_db'],
                frame_length = config['trim_frame_size'],
                hop_length = config['trim_hop_size'],
            )

        D = librosa.stft(
            audio,
            n_fft=config['fft_size'],
            hop_length=hop_size,
            win_length=config['win_length'],
            window=config['window'],
            pad_mode='reflect',
        )
        S, _ = librosa.magphase(D)
        # Log-mel with a 1e-10 floor, transposed so len(mel) is the frame count.
        mel = np.log10(np.maximum(np.dot(mel_basis, S), 1e-10)).T

        # Pad, then cut the waveform to exactly len(mel) * hop_size samples.
        audio = np.pad(audio, (0, config["fft_size"]), mode="edge")
        audio = audio[: len(mel) * hop_size]

        audio_double = audio.astype(np.double)
        _f0, t = pw.dio(
            audio_double,
            fs=sampling_rate,
            f0_ceil=fmax,
            frame_period=frame_period,
        )
        f0 = pw.stonemask(audio_double, _f0, t, sampling_rate)
        # Force the F0 contour to the same number of frames as the mel.
        if len(f0) >= len(mel):
            f0 = f0[: len(mel)]
        else:
            f0 = np.pad(f0, (0, len(mel) - len(f0)))

        # extract energy
        energy = np.sqrt(np.sum(S ** 2, axis=0))
        f0 = remove_outlier(f0)
        energy = remove_outlier(energy)

        if config["global_gain_scale"] > 0.0:
            audio *= config["global_gain_scale"]

        if len(energy[energy != 0]) == 0 or len(f0[f0 != 0]) == 0:
            print('skipped')
            continue

        audios.append(audio)
        mels.append(mel)
        text_ids.append(text)
        f0s.append(f0)
        energies.append(energy)

    return [[audios, mels, text_ids, f0s, energies]]

In [13]:
import mp

# Global accumulators; the sharded extraction loop extends these with each
# worker's results. Re-run this cell before re-running that loop, otherwise
# the lists keep growing.
audios, mels, text_ids, f0s, energies = [], [], [], [], []

In [14]:
# Process the transcripts 1000 at a time; each batch is fanned out over 10
# worker processes and every worker's five result lists are appended to the
# matching global accumulator.
for start in range(0, len(txts), 1000):
    shard = txts[start: start + 1000]
    worker_outputs = mp.multiprocessing(shard, process, cores = 10, returned = True)
    for output in worker_outputs:
        for sink, chunk in zip((audios, mels, text_ids, f0s, energies), output):
            sink.extend(chunk)

  7%|▋         | 7/100 [00:03<00:41,  2.22it/s]

skipped


 29%|██▉       | 29/100 [00:13<00:29,  2.39it/s]

skipped


 36%|███▌      | 36/100 [00:14<00:26,  2.42it/s]

skipped


 87%|████████▋ | 87/100 [00:35<00:04,  2.73it/s]

skipped


100%|██████████| 100/100 [00:40<00:00,  2.50it/s]
100%|██████████| 100/100 [00:43<00:00,  2.28it/s]
100%|██████████| 100/100 [00:44<00:00,  2.26it/s]
100%|██████████| 100/100 [00:44<00:00,  2.25it/s]
100%|██████████| 100/100 [00:44<00:00,  2.23it/s]
100%|██████████| 100/100 [00:44<00:00,  2.23it/s]
100%|██████████| 100/100 [00:45<00:00,  2.20it/s]
100%|██████████| 100/100 [00:45<00:00,  2.20it/s]
100%|██████████| 100/100 [00:46<00:00,  2.16it/s]
100%|██████████| 100/100 [00:46<00:00,  2.16it/s]
 83%|████████▎ | 83/100 [00:37<00:08,  1.99it/s]

skipped


100%|██████████| 100/100 [00:44<00:00,  2.26it/s]
100%|██████████| 100/100 [00:44<00:00,  2.26it/s]
100%|██████████| 100/100 [00:44<00:00,  2.25it/s]
100%|██████████| 100/100 [00:44<00:00,  2.25it/s]
100%|██████████| 100/100 [00:45<00:00,  2.21it/s]
100%|██████████| 100/100 [00:45<00:00,  2.20it/s]
100%|██████████| 100/100 [00:45<00:00,  2.19it/s]
 97%|█████████▋| 97/100 [00:46<00:01,  2.47it/s]
100%|██████████| 100/100 [00:47<00:00,  2.10it/s]
100%|██████████| 100/100 [00:47<00:00,  2.10it/s]
 49%|████▉     | 49/100 [00:23<00:20,  2.48it/s]

skipped


100%|██████████| 100/100 [00:41<00:00,  2.40it/s]
100%|██████████| 100/100 [00:42<00:00,  2.37it/s]
100%|██████████| 100/100 [00:44<00:00,  2.25it/s]
100%|██████████| 100/100 [00:44<00:00,  2.25it/s]
100%|██████████| 100/100 [00:45<00:00,  2.22it/s]
100%|██████████| 100/100 [00:45<00:00,  2.18it/s]
100%|██████████| 100/100 [00:46<00:00,  2.16it/s]
100%|██████████| 100/100 [00:46<00:00,  2.15it/s]
100%|██████████| 100/100 [00:47<00:00,  2.11it/s]
100%|██████████| 100/100 [00:48<00:00,  2.05it/s]
 13%|█▎        | 13/100 [00:06<00:44,  1.96it/s]

skipped


 24%|██▍       | 24/100 [00:12<00:41,  1.81it/s]

skipped


 55%|█████▌    | 55/100 [00:24<00:16,  2.65it/s]

skipped


 57%|█████▋    | 57/100 [00:26<00:17,  2.40it/s]

skipped


 64%|██████▍   | 64/100 [00:29<00:14,  2.52it/s]

skipped


100%|██████████| 100/100 [00:42<00:00,  2.33it/s]
100%|██████████| 100/100 [00:43<00:00,  2.29it/s]
100%|██████████| 100/100 [00:43<00:00,  2.28it/s]
100%|██████████| 100/100 [00:44<00:00,  2.26it/s]
100%|██████████| 100/100 [00:45<00:00,  2.21it/s]
100%|██████████| 100/100 [00:45<00:00,  2.18it/s]
100%|██████████| 100/100 [00:45<00:00,  2.18it/s]
100%|██████████| 100/100 [00:46<00:00,  2.14it/s]
100%|██████████| 100/100 [00:47<00:00,  2.13it/s]
100%|██████████| 100/100 [00:47<00:00,  2.13it/s]
  2%|▏         | 2/100 [00:01<01:03,  1.55it/s]

skipped


  3%|▎         | 3/100 [00:02<01:26,  1.12it/s]

skipped


 24%|██▍       | 24/100 [00:09<00:22,  3.34it/s]

skipped


100%|██████████| 100/100 [00:41<00:00,  2.40it/s]
100%|██████████| 100/100 [00:41<00:00,  2.38it/s]
100%|██████████| 100/100 [00:42<00:00,  2.34it/s]
100%|██████████| 100/100 [00:43<00:00,  2.30it/s]
100%|██████████| 100/100 [00:43<00:00,  2.28it/s]
100%|██████████| 100/100 [00:44<00:00,  2.27it/s]
100%|██████████| 100/100 [00:44<00:00,  2.23it/s]
100%|██████████| 100/100 [00:45<00:00,  2.19it/s]
100%|██████████| 100/100 [00:45<00:00,  2.18it/s]
100%|██████████| 100/100 [00:48<00:00,  2.07it/s]
 29%|██▉       | 29/100 [00:12<00:30,  2.34it/s]

skipped


 82%|████████▏ | 82/100 [00:36<00:09,  1.82it/s]

skipped


100%|██████████| 100/100 [00:43<00:00,  2.27it/s]
100%|██████████| 100/100 [00:44<00:00,  2.25it/s]
100%|██████████| 100/100 [00:44<00:00,  2.24it/s]
100%|██████████| 100/100 [00:45<00:00,  2.19it/s]
100%|██████████| 100/100 [00:45<00:00,  2.19it/s]
100%|██████████| 100/100 [00:46<00:00,  2.17it/s]
100%|██████████| 100/100 [00:46<00:00,  2.16it/s]
100%|██████████| 100/100 [00:46<00:00,  2.16it/s]
100%|██████████| 100/100 [00:46<00:00,  2.16it/s]
100%|██████████| 100/100 [00:46<00:00,  2.13it/s]
 21%|██        | 21/100 [00:11<00:32,  2.47it/s]

skipped


100%|██████████| 100/100 [00:43<00:00,  2.30it/s]
100%|██████████| 100/100 [00:43<00:00,  2.30it/s]
100%|██████████| 100/100 [00:43<00:00,  2.29it/s]
100%|██████████| 100/100 [00:43<00:00,  2.28it/s]
100%|██████████| 100/100 [00:44<00:00,  2.22it/s]
100%|██████████| 100/100 [00:45<00:00,  2.21it/s]
100%|██████████| 100/100 [00:45<00:00,  2.19it/s]
100%|██████████| 100/100 [00:46<00:00,  2.13it/s]
100%|██████████| 100/100 [00:47<00:00,  2.09it/s]
100%|██████████| 100/100 [00:47<00:00,  2.09it/s]
 10%|█         | 10/100 [00:05<00:45,  1.96it/s]

skipped


 78%|███████▊  | 78/100 [00:39<00:10,  2.11it/s]

skipped


100%|██████████| 100/100 [00:41<00:00,  2.39it/s]
100%|██████████| 100/100 [00:42<00:00,  2.34it/s]
100%|██████████| 100/100 [00:43<00:00,  2.29it/s]
100%|██████████| 100/100 [00:45<00:00,  2.21it/s]
100%|██████████| 100/100 [00:46<00:00,  2.17it/s]
100%|██████████| 100/100 [00:46<00:00,  2.16it/s]
100%|██████████| 100/100 [00:46<00:00,  2.16it/s]
100%|██████████| 100/100 [00:46<00:00,  2.16it/s]
100%|██████████| 100/100 [00:46<00:00,  2.14it/s]
100%|██████████| 100/100 [00:47<00:00,  2.11it/s]
 29%|██▉       | 29/100 [00:14<00:30,  2.30it/s]

skipped


 79%|███████▉  | 79/100 [00:36<00:08,  2.43it/s]

skipped


100%|██████████| 100/100 [00:42<00:00,  2.33it/s]
100%|██████████| 100/100 [00:43<00:00,  2.30it/s]
100%|██████████| 100/100 [00:44<00:00,  2.27it/s]
100%|██████████| 100/100 [00:44<00:00,  2.25it/s]
100%|██████████| 100/100 [00:44<00:00,  2.24it/s]
100%|██████████| 100/100 [00:45<00:00,  2.21it/s]
100%|██████████| 100/100 [00:45<00:00,  2.20it/s]
100%|██████████| 100/100 [00:46<00:00,  2.16it/s]
100%|██████████| 100/100 [00:46<00:00,  2.15it/s]
100%|██████████| 100/100 [00:46<00:00,  2.14it/s]
 89%|████████▉ | 89/100 [00:41<00:04,  2.26it/s]

skipped


100%|██████████| 100/100 [00:44<00:00,  2.26it/s]
100%|██████████| 100/100 [00:44<00:00,  2.24it/s]
100%|██████████| 100/100 [00:45<00:00,  2.21it/s]
100%|██████████| 100/100 [00:45<00:00,  2.20it/s]
100%|██████████| 100/100 [00:45<00:00,  2.18it/s]
100%|██████████| 100/100 [00:46<00:00,  2.16it/s]
100%|██████████| 100/100 [00:46<00:00,  2.15it/s]
100%|██████████| 100/100 [00:46<00:00,  2.15it/s]
100%|██████████| 100/100 [00:46<00:00,  2.14it/s]
100%|██████████| 100/100 [00:47<00:00,  2.13it/s]
 16%|█▌        | 16/100 [00:07<00:43,  1.95it/s]

skipped


 48%|████▊     | 48/100 [00:22<00:22,  2.31it/s]

skipped


 83%|████████▎ | 83/100 [00:39<00:05,  3.18it/s]

skipped


100%|██████████| 100/100 [00:43<00:00,  2.32it/s]
100%|██████████| 100/100 [00:44<00:00,  2.25it/s]
100%|██████████| 100/100 [00:45<00:00,  2.21it/s]
100%|██████████| 100/100 [00:45<00:00,  2.20it/s]
100%|██████████| 100/100 [00:45<00:00,  2.19it/s]
100%|██████████| 100/100 [00:45<00:00,  2.18it/s]
100%|██████████| 100/100 [00:46<00:00,  2.17it/s]
100%|██████████| 100/100 [00:46<00:00,  2.16it/s]
100%|██████████| 100/100 [00:46<00:00,  2.16it/s]
100%|██████████| 100/100 [00:46<00:00,  2.14it/s]
 36%|███▌      | 36/100 [00:15<00:29,  2.18it/s]

skipped


 40%|████      | 40/100 [00:19<00:34,  1.72it/s]

skipped


100%|██████████| 100/100 [00:42<00:00,  2.37it/s]
100%|██████████| 100/100 [00:43<00:00,  2.31it/s]
 99%|█████████▉| 99/100 [00:43<00:00,  2.17it/s]

skipped


100%|██████████| 100/100 [00:43<00:00,  2.28it/s]
100%|██████████| 100/100 [00:44<00:00,  2.25it/s]
100%|██████████| 100/100 [00:45<00:00,  2.22it/s]
100%|██████████| 100/100 [00:45<00:00,  2.18it/s]
100%|██████████| 100/100 [00:45<00:00,  2.18it/s]
100%|██████████| 100/100 [00:46<00:00,  2.13it/s]
100%|██████████| 100/100 [00:47<00:00,  2.11it/s]
100%|██████████| 100/100 [00:48<00:00,  2.07it/s]
100%|██████████| 57/57 [00:23<00:00,  2.46it/s]
100%|██████████| 57/57 [00:24<00:00,  2.34it/s]
100%|██████████| 57/57 [00:25<00:00,  2.24it/s]
100%|██████████| 57/57 [00:25<00:00,  2.22it/s]
100%|██████████| 57/57 [00:25<00:00,  2.20it/s]
100%|██████████| 57/57 [00:25<00:00,  2.20it/s]
100%|██████████| 57/57 [00:25<00:00,  2.20it/s]
100%|██████████| 57/57 [00:26<00:00,  2.19it/s]
100%|██████████| 57/57 [00:26<00:00,  2.15it/s]
100%|██████████| 9/9 [00:03<00:00,  2.95it/s]
100%|██████████| 57/57 [00:26<00:00,  2.13it/s]


In [15]:
# One scaler per feature type, fitted incrementally one utterance at a time.
scaler_mel, scaler_energy, scaler_f0 = (StandardScaler(copy=False) for _ in range(3))

for utt_mel, utt_f0, utt_energy in zip(mels, f0s, energies):
    # Zero entries are left out of the F0 and energy statistics.
    nonzero_energy = utt_energy[utt_energy != 0]
    nonzero_f0 = utt_f0[utt_f0 != 0]

    scaler_mel.partial_fit(utt_mel)
    scaler_energy.partial_fit(nonzero_energy.reshape(-1, 1))
    scaler_f0.partial_fit(nonzero_f0.reshape(-1, 1))

In [16]:
# Standardise every mel spectrogram with the fitted scaler, replacing the
# stored entry with the transformed one.
for idx, mel_frames in enumerate(tqdm(mels)):
    mels[idx] = scaler_mel.transform(mel_frames)

100%|██████████| 12547/12547 [00:05<00:00, 2285.21it/s]


In [17]:
def save_statistics_to_file(scaler_list, config):
    """Write each scaler's mean and scale to ``stats-female/stats<name>.npy``.

    Parameters
    ----------
    scaler_list : iterable of (scaler, str)
        Pairs of a fitted scaler exposing ``mean_`` and ``scale_`` and the
        suffix used in the output file name.
    config : Any
        Unused; kept so existing call sites keep working.

    Each file holds a float32 array stacking ``mean_`` (row 0) and ``scale_``
    (row 1), saved with ``allow_pickle=False``.
    """
    out_dir = "stats-female"
    # makedirs is portable and succeeds when the directory already exists,
    # unlike shelling out to `mkdir`.
    os.makedirs(out_dir, exist_ok=True)
    for scaler, name in scaler_list:
        stats = np.stack((scaler.mean_, scaler.scale_))
        np.save(
            os.path.join(out_dir, f"stats{name}.npy"),
            stats.astype(np.float32),
            allow_pickle=False,
        )

In [18]:
# Pair each fitted scaler with the suffix used in its stats file name, then
# write the statistics to disk.
scaler_list = list(
    zip(
        (scaler_mel, scaler_energy, scaler_f0),
        ("", "_energy", "_f0"),
    )
)
save_statistics_to_file(scaler_list, config)

In [19]:
# Delete any existing `output-female` directory (and everything in it) before
# the export cells below recreate and fill it.
!rm -rf output-female

In [20]:
# Create the export tree: output-female/<feature>/ for each feature type.
# os.makedirs creates the parent as needed, is portable, and does not fail
# when a directory already exists, unlike shelling out to `mkdir`.
directories = ['audios', 'mels', 'text_ids', 'f0s', 'energies']
for d in directories:
    os.makedirs(os.path.join('output-female', d), exist_ok=True)

In [21]:
# Write one .npy file per utterance and feature type, numbered by position.
for i in tqdm(range(len(mels))):
    np.save(f'output-female/audios/{i}.npy', audios[i])
    np.save(f'output-female/mels/{i}.npy', mels[i])
    # Each text_ids entry is a ragged (text, ids) tuple. NumPy >= 1.24 refuses
    # to turn that into an array implicitly, so build the object array
    # explicitly; the saved file is still a pickled 2-element object array.
    np.save(f'output-female/text_ids/{i}.npy', np.array(text_ids[i], dtype=object))
    np.save(f'output-female/f0s/{i}.npy', f0s[i])
    np.save(f'output-female/energies/{i}.npy', energies[i])

100%|██████████| 12547/12547 [01:49<00:00, 114.45it/s]
