In [1]:
# !wget https://f000.backblazeb2.com/file/malaya-speech-model/data/salina.gz
# !tar -xf salina.gz

In [2]:
import librosa
import pyworld as pw
from sklearn.preprocessing import StandardScaler
import numpy as np
import os

In [3]:
# !pip3 install malaya-gpu -U --no-deps

In [4]:
import yaml

# Load the feature-extraction settings (sampling rate, FFT / mel parameters,
# silence-trimming options) that the rest of the notebook reads from `config`.
# `yaml.safe_load` is used instead of `yaml.load(fopen)`: calling `yaml.load`
# without a Loader emits a warning on PyYAML 5.x and raises on PyYAML 6.x, and
# the safe loader is sufficient for a plain mapping of scalars.
with open('config.yaml') as fopen:
    config = yaml.safe_load(fopen)

config

  after removing the cwd from sys.path.


{'sampling_rate': 22050,
 'fft_size': 1024,
 'hop_size': 256,
 'win_length': None,
 'window': 'hann',
 'num_mels': 80,
 'fmin': 80,
 'fmax': 7600,
 'global_gain_scale': 1.0,
 'trim_silence': True,
 'trim_threshold_in_db': 20,
 'trim_frame_size': 2048,
 'trim_hop_size': 512}

In [5]:
import numpy as np

# https://github.com/TensorSpeech/TensorFlowTTS/blob/master/tensorflow_tts/utils/outliers.py
def is_outlier(x, p25, p75):
    """Tell whether ``x`` lies on or outside the 1.5 * IQR fences.

    ``p25`` and ``p75`` are the lower and upper reference percentiles; the
    fences sit 1.5 times their difference below ``p25`` and above ``p75``.
    Values exactly on a fence count as outliers.
    """
    whisker = 1.5 * (p75 - p25)
    lower = p25 - whisker
    upper = p75 + whisker
    return x <= lower or x >= upper


def remove_outlier(x, p_bottom: int = 25, p_top: int = 75):
    """Replace outliers in ``x`` in place and return ``x``.

    A value is an outlier when it lies on or beyond the fences
    ``low - 1.5 * (high - low)`` / ``high + 1.5 * (high - low)``, where
    ``low`` and ``high`` are the ``p_bottom``-th and ``p_top``-th percentiles
    of ``x``. Outliers are first zeroed and then overwritten with the maximum
    of the resulting array, i.e. the largest remaining value (or 0.0 if that
    is larger).

    Args:
        x: 1-D NumPy array; it is modified in place.
        p_bottom: lower percentile (0-100) used for the fences.
        p_top: upper percentile (0-100) used for the fences.

    Returns:
        The same array object ``x``. An empty array is returned unchanged.

    Note:
        Because the fence comparison is inclusive, an array whose
        ``p_bottom`` and ``p_top`` percentiles coincide (e.g. a constant
        array) has every value on the fence and ends up all zeros.
    """
    # Nothing to do for empty input (np.max would raise on it).
    if len(x) == 0:
        return x

    low = np.percentile(x, p_bottom)
    high = np.percentile(x, p_top)
    lower_fence = low - 1.5 * (high - low)
    upper_fence = high + 1.5 * (high - low)

    # Compare in float64 so the vectorised test gives the same answer as
    # comparing the elements one by one, whatever dtype ``x`` has.
    values = np.asarray(x, dtype=np.float64)
    outliers = (values <= lower_fence) | (values >= upper_fence)

    # Zero the outliers first so they cannot win the max taken below.
    x[outliers] = 0.0
    x[outliers] = np.max(x)
    return x

In [6]:
import re

# Building blocks of the symbol table: two multi-character markers plus the
# single-character symbols that may appear in cleaned text.
_pad = "pad"
_eos = "eos"
_special = "-"
_punctuation = "!'(),.:;? "
_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"

# Full symbol list; a symbol's position in this list is its integer id.
# Order: pad marker, special characters, punctuation, letters, eos marker.
MALAYA_SPEECH_SYMBOLS = [_pad, *_special, *_punctuation, *_letters, _eos]

# Matches text of the form "<before>{<inside>}<after>".
_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")

In [7]:
def tts_encode(string: str, add_eos: bool = True):
    """Map each character of ``string`` to its index in MALAYA_SPEECH_SYMBOLS.

    Characters that are not in the symbol list are dropped silently. When
    ``add_eos`` is true, the index of the ``'eos'`` marker is appended.
    Returns a list of ints.
    """
    ids = []
    for char in string:
        if char in MALAYA_SPEECH_SYMBOLS:
            ids.append(MALAYA_SPEECH_SYMBOLS.index(char))
    if add_eos:
        ids.append(MALAYA_SPEECH_SYMBOLS.index('eos'))
    return ids

In [8]:
from unidecode import unidecode
import malaya

# Shared text normalizer used by `cleaning`; it is constructed with the
# `date` and `time` options switched off.
normalizer = malaya.normalize.normalizer(date = False, time = False)

def put_spacing_num(string):
    """Separate runs of ASCII letters from whatever touches them.

    Every maximal run of letters is wrapped in single spaces (so "12kg"
    becomes "12 kg"), then repeated spaces are collapsed to one and leading /
    trailing whitespace is stripped.
    """
    def _pad_word(match):
        return ' ' + match.group(0) + ' '

    spaced = re.sub('[A-Za-z]+', _pad_word, string)
    collapsed = re.sub(r'[ ]+', ' ', spaced)
    return collapsed.strip()

def convert_to_ascii(string):
    """Return ``string`` passed through ``unidecode``."""
    ascii_text = unidecode(string)
    return ascii_text

def collapse_whitespace(string):
    """Replace every run of whitespace in ``string`` with a single space.

    The pattern is written inline because this notebook never defines the
    module-level ``_whitespace_re`` the function used to reference, so any
    call raised NameError.
    """
    return re.sub(r'\s+', ' ', string)

def cleaning(string, normalize = True, add_eos = True):
    """Clean a transcript and encode it as symbol ids.

    Steps, in order: transliterate via ``convert_to_ascii``, replace ``&``
    with `` dan ``, put spaces around letter runs (``put_spacing_num``),
    optionally run the shared ``normalizer``, then lower-case.

    Args:
        string: raw transcript text.
        normalize: when true, pass the text through ``normalizer.normalize``
            and keep the ``'normalize'`` field of its result.
        add_eos: forwarded to ``tts_encode``.

    Returns:
        A ``(cleaned_text, ids)`` tuple, where ``ids`` is the output of
        ``tts_encode`` for the cleaned text.
    """
    string = convert_to_ascii(string)
    string = string.replace('&', ' dan ')
    string = put_spacing_num(string)
    if normalize:
        normalized = normalizer.normalize(string,
                                          check_english = False,
                                          normalize_entity = False,
                                          normalize_text = False,
                                          normalize_url = True,
                                          normalize_email = True)
        string = normalized['normalize']
    string = string.lower()
    return string, tts_encode(string, add_eos = add_eos)

In [9]:
from glob import glob
from tqdm import tqdm

# Gather every transcript file from the two dataset locations into one list,
# then show how many were found.
txts = (
    glob('salina/*salina*/*.txt')
    + glob('../youtube/malay2/*salina-short*/*salina*/*.txt')
)
len(txts)

39482

In [10]:
import malaya_speech
from malaya_speech import Pipeline
# Voice-activity detector from `malaya_speech.vad.webrtc()`; `process` calls it
# on each frame of the 16 kHz resampled audio to label the frame before grouping.
vad = malaya_speech.vad.webrtc()

def process(txts, silent_trail = 500):
    """Extract TTS training features for one shard of transcript files.

    For every transcript path the matching audio is loaded, optionally
    silence-trimmed with the module-level ``vad``, and converted into a log-mel
    spectrogram, an F0 contour (pyworld) and a per-frame energy contour.

    Args:
        txts: sequence whose first element is the list of transcript paths to
            process (only ``txts[0]`` is read; presumably the wrapper that
            ``mp.multiprocessing`` puts around each shard; confirm against
            ``mp``).
        silent_trail: number of samples kept at each end of a segment the VAD
            did not flag, when ``config['trim_silence']`` is on.

    Returns:
        ``[[audios, mels, text_ids, f0s, energies]]``: five parallel lists
        wrapped in an outer list. Each ``text_ids`` entry is the
        ``(cleaned_text, ids)`` tuple returned by ``cleaning``.

    Files are skipped (nothing appended) when the transcript contains
    ``RM``/``rm`` immediately followed by digits, when the audio is longer
    than 25 seconds, when trimming leaves no segments, or when F0 or energy is
    entirely zero after outlier removal.
    """
    txts = txts[0]
    audios, mels, text_ids, f0s, energies = [], [], [], [], []

    # Loop-invariant settings, computed once per call. Binding `sampling_rate`
    # locally also fixes the NameError the `fmax is None` fallback used to hit.
    sampling_rate = config['sampling_rate']
    fmin = 0 if config["fmin"] is None else config["fmin"]
    fmax = sampling_rate // 2 if config["fmax"] is None else config["fmax"]
    mel_basis = librosa.filters.mel(
        sr=sampling_rate,
        n_fft=config["fft_size"],
        n_mels=config["num_mels"],
        fmin=fmin,
        fmax=fmax,
    )

    for f in tqdm(txts):
        with open(f) as fopen:
            text = fopen.read()
        # Skip transcripts containing e.g. "RM10" / "rm5".
        if re.match('^.*(RM|rm)[0-9]+.*$', text):
            continue
        text = cleaning(text)
        # The audio path mirrors the transcript path: other directory, no ".txt".
        wav_path = f.replace('output-text', 'output-wav').replace('.txt', '')

        audio, _ = malaya_speech.load(wav_path, sr = sampling_rate)

        if (len(audio) / sampling_rate) > 25:
            print('skipped, audio too long')
            continue

        if config['trim_silence']:
            # The VAD runs on a 16 kHz integer copy; frames of the original
            # audio are paired with the VAD decisions by frame index.
            y_ = malaya_speech.resample(audio, sampling_rate, 16000)
            y_ = malaya_speech.astype.float_to_int(y_)
            frames = list(malaya_speech.generator.frames(audio, 30, sampling_rate))
            frames_ = list(malaya_speech.generator.frames(y_, 30, 16000, append_ending_trail = False))
            frames_webrtc = [(frames[no], vad(frame)) for no, frame in enumerate(frames_)]
            grouped_deep = malaya_speech.group.group_frames(frames_webrtc)
            grouped_deep = malaya_speech.group.group_frames_threshold(grouped_deep, 0.1)

            segments = []
            for g in grouped_deep:
                samples = g[0].array
                # For segments the VAD did not flag, keep only `silent_trail`
                # samples at each end. Segments no longer than both trails are
                # kept whole: slicing them would duplicate the overlap and
                # lengthen the audio. The tail uses an explicit start index
                # because `samples[-0:]` would return the entire segment.
                if not g[1] and len(samples) > 2 * silent_trail:
                    samples = np.concatenate(
                        [samples[:silent_trail], samples[len(samples) - silent_trail:]]
                    )
                segments.append(samples)

            # np.concatenate raises on an empty list; treat it as a skip.
            if not segments:
                print('skipped, nothing left after trimming')
                continue
            audio = np.concatenate(segments)

        D = librosa.stft(
            audio,
            n_fft=config['fft_size'],
            hop_length=config['hop_size'],
            win_length=config['win_length'],
            window=config['window'],
            pad_mode='reflect',
        )
        S, _ = librosa.magphase(D)
        # Log10 mel spectrogram, floored at 1e-10 and transposed so that
        # len(mel) is the number of frames.
        mel = np.log10(np.maximum(np.dot(mel_basis, S), 1e-10)).T

        # Make the waveform exactly len(mel) * hop_size samples long.
        audio = np.pad(audio, (0, config["fft_size"]), mode="edge")
        audio = audio[: len(mel) * config['hop_size']]

        # F0: coarse estimate with DIO, refined with StoneMask, one value per hop.
        _f0, t = pw.dio(
            audio.astype(np.double),
            fs=sampling_rate,
            f0_ceil=fmax,
            frame_period=1000 * config['hop_size'] / sampling_rate,
        )
        f0 = pw.stonemask(audio.astype(np.double), _f0, t, sampling_rate)
        # Force f0 to the same number of frames as the mel spectrogram.
        if len(f0) >= len(mel):
            f0 = f0[: len(mel)]
        else:
            f0 = np.pad(f0, (0, len(mel) - len(f0)))

        # extract energy
        energy = np.sqrt(np.sum(S ** 2, axis=0))
        f0 = remove_outlier(f0)
        energy = remove_outlier(energy)

        if config["global_gain_scale"] > 0.0:
            audio *= config["global_gain_scale"]

        # Downstream statistics use only non-zero f0 / energy values, so a
        # file with none of either is unusable.
        if len(energy[energy != 0]) == 0 or len(f0[f0 != 0]) == 0:
            print('skipped')
            continue

        audios.append(audio)
        mels.append(mel)
        text_ids.append(text)
        f0s.append(f0)
        energies.append(energy)

    return [[audios, mels, text_ids, f0s, energies]]

In [11]:
import mp

# Global accumulators filled by the batch loop: one entry per kept utterance,
# index-aligned across all five lists.
audios = []
mels = []
text_ids = []
f0s = []
energies = []

In [12]:
# Feed the transcript list to `process` in slices of 1000 files, each slice
# spread over 10 worker processes, and merge what every worker returns into
# the global accumulators.
batch_size = 1000
for start in range(0, len(txts), batch_size):
    stop = min(start + batch_size, len(txts))
    batch = txts[start: stop]
    results = mp.multiprocessing(batch, process, cores = 10, returned = True)
    for result in results:
        # result holds [audios, mels, text_ids, f0s, energies] from one worker.
        audios.extend(result[0])
        mels.extend(result[1])
        text_ids.extend(result[2])
        f0s.extend(result[3])
        energies.extend(result[4])

 81%|████████  | 81/100 [00:45<00:08,  2.37it/s]

skipped


100%|██████████| 100/100 [00:56<00:00,  1.78it/s]
100%|██████████| 100/100 [00:56<00:00,  1.76it/s]
100%|██████████| 100/100 [00:57<00:00,  1.75it/s]
100%|██████████| 100/100 [00:57<00:00,  1.72it/s]
100%|██████████| 100/100 [00:58<00:00,  1.70it/s]
100%|██████████| 100/100 [00:59<00:00,  1.68it/s]
 95%|█████████▌| 95/100 [00:59<00:03,  1.32it/s]]
100%|██████████| 100/100 [00:59<00:00,  1.67it/s]
100%|██████████| 100/100 [00:59<00:00,  1.67it/s]
100%|██████████| 100/100 [01:02<00:00,  1.61it/s]
100%|██████████| 100/100 [00:57<00:00,  1.73it/s]
100%|██████████| 100/100 [00:58<00:00,  1.70it/s]
100%|██████████| 100/100 [00:59<00:00,  1.67it/s]
100%|██████████| 100/100 [01:00<00:00,  1.67it/s]
100%|██████████| 100/100 [01:00<00:00,  1.66it/s]
100%|██████████| 100/100 [01:00<00:00,  1.65it/s]
100%|██████████| 100/100 [01:02<00:00,  1.61it/s]
100%|██████████| 100/100 [01:03<00:00,  1.57it/s]
100%|██████████| 100/100 [01:03<00:00,  1.57it/s]
100%|██████████| 100/100 [01:04<00:00,  1.55it/s]


skipped


100%|██████████| 100/100 [00:29<00:00,  3.42it/s]
100%|██████████| 100/100 [00:29<00:00,  3.36it/s]
100%|██████████| 100/100 [00:30<00:00,  3.29it/s]
100%|██████████| 100/100 [00:30<00:00,  3.27it/s]
 97%|█████████▋| 97/100 [00:30<00:00,  3.62it/s]]
100%|██████████| 100/100 [00:30<00:00,  3.26it/s]
100%|██████████| 100/100 [00:30<00:00,  3.24it/s]
100%|██████████| 100/100 [00:31<00:00,  3.17it/s]
100%|██████████| 100/100 [00:32<00:00,  3.12it/s]
100%|██████████| 100/100 [00:33<00:00,  2.99it/s]
 21%|██        | 21/100 [00:07<00:26,  3.00it/s]

skipped


 45%|████▌     | 45/100 [00:11<00:16,  3.37it/s]

skipped


 91%|█████████ | 91/100 [00:25<00:02,  4.11it/s]]
100%|██████████| 100/100 [00:25<00:00,  3.85it/s]
100%|██████████| 100/100 [00:26<00:00,  3.79it/s]
100%|██████████| 100/100 [00:27<00:00,  3.58it/s]
100%|██████████| 100/100 [00:27<00:00,  3.58it/s]
100%|██████████| 100/100 [00:28<00:00,  3.56it/s]
100%|██████████| 100/100 [00:28<00:00,  3.56it/s]
100%|██████████| 100/100 [00:28<00:00,  3.54it/s]
100%|██████████| 100/100 [00:28<00:00,  3.52it/s]
100%|██████████| 100/100 [00:29<00:00,  3.41it/s]
  6%|▌         | 6/100 [00:01<00:31,  3.00it/s]

skipped


 16%|█▌        | 16/100 [00:04<00:16,  5.04it/s]

skipped


 78%|███████▊  | 78/100 [00:21<00:05,  4.12it/s]

skipped


 99%|█████████▉| 99/100 [00:23<00:00,  5.48it/s]

skipped


100%|██████████| 100/100 [00:23<00:00,  4.21it/s]
100%|██████████| 100/100 [00:23<00:00,  4.17it/s]
100%|██████████| 100/100 [00:24<00:00,  4.09it/s]
100%|██████████| 100/100 [00:24<00:00,  4.07it/s]
 99%|█████████▉| 99/100 [00:24<00:00,  3.95it/s]
100%|██████████| 100/100 [00:24<00:00,  4.07it/s]
100%|██████████| 100/100 [00:24<00:00,  4.06it/s]
100%|██████████| 100/100 [00:25<00:00,  3.88it/s]
100%|██████████| 100/100 [00:25<00:00,  3.86it/s]
100%|██████████| 100/100 [00:25<00:00,  3.85it/s]
100%|██████████| 100/100 [00:27<00:00,  3.57it/s]
 94%|█████████▍| 94/100 [00:27<00:01,  5.18it/s]]
100%|██████████| 100/100 [00:29<00:00,  3.39it/s]
100%|██████████| 100/100 [00:29<00:00,  3.36it/s]
100%|██████████| 100/100 [00:29<00:00,  3.35it/s]
100%|██████████| 100/100 [00:30<00:00,  3.29it/s]
100%|██████████| 100/100 [00:30<00:00,  3.24it/s]
100%|██████████| 100/100 [00:31<00:00,  3.21it/s]
100%|██████████| 100/100 [00:31<00:00,  3.20it/s]
100%|██████████| 100/100 [00:31<00:00,  3.19it/s]
1

skipped


100%|██████████| 100/100 [00:25<00:00,  3.93it/s]
 96%|█████████▌| 96/100 [00:25<00:01,  3.61it/s]]
100%|██████████| 100/100 [00:26<00:00,  3.75it/s]
100%|██████████| 100/100 [00:27<00:00,  3.70it/s]
100%|██████████| 100/100 [00:27<00:00,  3.68it/s]
100%|██████████| 100/100 [00:27<00:00,  3.66it/s]
100%|██████████| 100/100 [00:27<00:00,  3.59it/s]
100%|██████████| 100/100 [00:27<00:00,  3.57it/s]
100%|██████████| 100/100 [00:28<00:00,  3.55it/s]
100%|██████████| 100/100 [00:28<00:00,  3.51it/s]
 36%|███▌      | 36/100 [00:09<00:14,  4.42it/s]

skipped


 43%|████▎     | 43/100 [00:12<00:16,  3.47it/s]

skipped


 76%|███████▌  | 76/100 [00:20<00:07,  3.42it/s]

skipped


 85%|████████▌ | 85/100 [00:25<00:04,  3.00it/s]

skipped


100%|██████████| 100/100 [00:27<00:00,  3.63it/s]
100%|██████████| 100/100 [00:27<00:00,  3.58it/s]
100%|██████████| 100/100 [00:28<00:00,  3.57it/s]
100%|██████████| 100/100 [00:28<00:00,  3.51it/s]
 98%|█████████▊| 98/100 [00:28<00:00,  3.60it/s]]
100%|██████████| 100/100 [00:28<00:00,  3.51it/s]
 92%|█████████▏| 92/100 [00:28<00:01,  4.01it/s]
100%|██████████| 100/100 [00:29<00:00,  3.43it/s]
100%|██████████| 100/100 [00:29<00:00,  3.35it/s]
100%|██████████| 100/100 [00:29<00:00,  3.35it/s]
100%|██████████| 100/100 [00:27<00:00,  3.67it/s]
100%|██████████| 100/100 [00:28<00:00,  3.51it/s]
100%|██████████| 100/100 [00:28<00:00,  3.51it/s]
100%|██████████| 100/100 [00:28<00:00,  3.48it/s]
100%|██████████| 100/100 [00:29<00:00,  3.44it/s]
100%|██████████| 100/100 [00:29<00:00,  3.39it/s]
100%|██████████| 100/100 [00:29<00:00,  3.36it/s]
100%|██████████| 100/100 [00:30<00:00,  3.27it/s]
100%|██████████| 100/100 [00:30<00:00,  3.26it/s]
100%|██████████| 100/100 [00:31<00:00,  3.20it/s]
 

skipped


100%|██████████| 100/100 [00:23<00:00,  4.31it/s]
100%|██████████| 100/100 [00:24<00:00,  4.13it/s]
100%|██████████| 100/100 [00:24<00:00,  4.09it/s]
 90%|█████████ | 90/100 [00:24<00:02,  3.50it/s]]
100%|██████████| 100/100 [00:24<00:00,  4.06it/s]
 96%|█████████▌| 96/100 [00:24<00:01,  3.79it/s]]
100%|██████████| 100/100 [00:25<00:00,  3.92it/s]
100%|██████████| 100/100 [00:26<00:00,  3.81it/s]
100%|██████████| 100/100 [00:26<00:00,  3.79it/s]
100%|██████████| 100/100 [00:26<00:00,  3.74it/s]
 87%|████████▋ | 87/100 [00:23<00:05,  2.59it/s]]
100%|██████████| 100/100 [00:24<00:00,  4.08it/s]
100%|██████████| 100/100 [00:24<00:00,  3.13it/s]
100%|██████████| 100/100 [00:25<00:00,  3.75it/s]
100%|██████████| 100/100 [00:25<00:00,  3.93it/s]
100%|██████████| 100/100 [00:25<00:00,  3.89it/s]
100%|██████████| 100/100 [00:26<00:00,  3.82it/s]
100%|██████████| 100/100 [00:27<00:00,  3.66it/s]
100%|██████████| 100/100 [00:27<00:00,  3.65it/s]
100%|██████████| 100/100 [00:27<00:00,  3.63it/s]


skipped


100%|██████████| 100/100 [00:26<00:00,  3.72it/s]
100%|██████████| 100/100 [00:27<00:00,  3.68it/s]
 99%|█████████▉| 99/100 [00:27<00:00,  3.61it/s]]
 98%|█████████▊| 98/100 [00:27<00:00,  4.09it/s]]
100%|██████████| 100/100 [00:27<00:00,  3.59it/s]
100%|██████████| 100/100 [00:27<00:00,  3.58it/s]
100%|██████████| 100/100 [00:28<00:00,  3.57it/s]
 86%|████████▌ | 86/100 [00:23<00:02,  4.88it/s]

skipped


100%|██████████| 100/100 [00:25<00:00,  3.97it/s]
100%|██████████| 100/100 [00:25<00:00,  3.91it/s]
100%|██████████| 100/100 [00:26<00:00,  3.79it/s]
100%|██████████| 100/100 [00:26<00:00,  3.78it/s]
100%|██████████| 100/100 [00:26<00:00,  3.78it/s]
100%|██████████| 100/100 [00:26<00:00,  3.73it/s]
100%|██████████| 100/100 [00:26<00:00,  3.71it/s]
100%|██████████| 100/100 [00:27<00:00,  3.69it/s]
100%|██████████| 100/100 [00:27<00:00,  3.68it/s]
100%|██████████| 100/100 [00:27<00:00,  3.63it/s]
 95%|█████████▌| 95/100 [00:19<00:01,  4.63it/s]

skipped


100%|██████████| 100/100 [00:20<00:00,  4.97it/s]

100%|██████████| 100/100 [00:20<00:00,  4.89it/s]
100%|██████████| 100/100 [00:20<00:00,  4.83it/s]
100%|██████████| 100/100 [00:20<00:00,  4.79it/s]
 93%|█████████▎| 93/100 [00:20<00:01,  6.88it/s]
100%|██████████| 100/100 [00:21<00:00,  4.76it/s]
100%|██████████| 100/100 [00:21<00:00,  4.70it/s]
100%|██████████| 100/100 [00:21<00:00,  4.68it/s]
100%|██████████| 100/100 [00:21<00:00,  4.60it/s]
 24%|██▍       | 24/100 [00:05<00:16,  4.51it/s]

skipped


 87%|████████▋ | 87/100 [00:20<00:04,  2.86it/s]]
100%|██████████| 100/100 [00:20<00:00,  4.83it/s]
100%|██████████| 100/100 [00:20<00:00,  4.83it/s]
100%|██████████| 100/100 [00:20<00:00,  4.78it/s]
100%|██████████| 100/100 [00:21<00:00,  4.72it/s]
100%|██████████| 100/100 [00:21<00:00,  4.63it/s]
100%|██████████| 100/100 [00:22<00:00,  4.50it/s]
100%|██████████| 100/100 [00:22<00:00,  4.45it/s]
100%|██████████| 100/100 [00:22<00:00,  4.44it/s]
100%|██████████| 100/100 [00:22<00:00,  4.40it/s]
  9%|▉         | 9/100 [00:01<00:16,  5.36it/s]

skipped


100%|██████████| 100/100 [00:20<00:00,  4.98it/s]
100%|██████████| 100/100 [00:20<00:00,  4.86it/s]
100%|██████████| 100/100 [00:20<00:00,  4.78it/s]
100%|██████████| 100/100 [00:21<00:00,  4.74it/s]
100%|██████████| 100/100 [00:21<00:00,  4.73it/s]
100%|██████████| 100/100 [00:21<00:00,  4.67it/s]
100%|██████████| 100/100 [00:21<00:00,  4.67it/s]
100%|██████████| 100/100 [00:21<00:00,  4.65it/s]
100%|██████████| 100/100 [00:21<00:00,  4.65it/s]
100%|██████████| 100/100 [00:21<00:00,  4.55it/s]
100%|██████████| 100/100 [00:20<00:00,  4.87it/s]
100%|██████████| 100/100 [00:20<00:00,  4.85it/s]
100%|██████████| 100/100 [00:20<00:00,  4.85it/s]
100%|██████████| 100/100 [00:20<00:00,  4.84it/s]
100%|██████████| 100/100 [00:20<00:00,  4.83it/s]
100%|██████████| 100/100 [00:20<00:00,  4.81it/s]
100%|██████████| 100/100 [00:21<00:00,  4.74it/s]
100%|██████████| 100/100 [00:21<00:00,  4.73it/s]
100%|██████████| 100/100 [00:21<00:00,  4.72it/s]
100%|██████████| 100/100 [00:21<00:00,  4.70it/s]


skipped


 84%|████████▍ | 84/100 [00:16<00:03,  5.04it/s]

skipped


100%|██████████| 100/100 [00:19<00:00,  5.04it/s]
100%|██████████| 100/100 [00:20<00:00,  4.85it/s]
100%|██████████| 100/100 [00:20<00:00,  4.84it/s]
100%|██████████| 100/100 [00:20<00:00,  4.84it/s]
100%|██████████| 100/100 [00:20<00:00,  4.78it/s]
100%|██████████| 100/100 [00:20<00:00,  4.77it/s]
100%|██████████| 100/100 [00:21<00:00,  4.70it/s]
100%|██████████| 100/100 [00:21<00:00,  4.70it/s]
100%|██████████| 100/100 [00:21<00:00,  4.58it/s]
100%|██████████| 100/100 [00:21<00:00,  4.56it/s]
 49%|████▉     | 49/100 [00:11<00:10,  4.70it/s]

skipped


100%|██████████| 100/100 [00:18<00:00,  5.51it/s]
100%|██████████| 100/100 [00:19<00:00,  5.08it/s]
100%|██████████| 100/100 [00:20<00:00,  4.97it/s]
100%|██████████| 100/100 [00:20<00:00,  4.91it/s]
100%|██████████| 100/100 [00:21<00:00,  4.74it/s]
100%|██████████| 100/100 [00:21<00:00,  4.70it/s]
100%|██████████| 100/100 [00:22<00:00,  4.54it/s]
100%|██████████| 100/100 [00:22<00:00,  4.53it/s]
100%|██████████| 100/100 [00:22<00:00,  4.51it/s]
100%|██████████| 100/100 [00:22<00:00,  4.49it/s]
 97%|█████████▋| 97/100 [00:19<00:00,  6.08it/s]]
100%|██████████| 100/100 [00:19<00:00,  5.08it/s]
100%|██████████| 100/100 [00:19<00:00,  5.07it/s]
100%|██████████| 100/100 [00:19<00:00,  5.05it/s]
100%|██████████| 100/100 [00:20<00:00,  4.93it/s]
100%|██████████| 100/100 [00:20<00:00,  5.85it/s]
100%|██████████| 100/100 [00:20<00:00,  4.84it/s]
100%|██████████| 100/100 [00:20<00:00,  4.83it/s]
100%|██████████| 100/100 [00:20<00:00,  4.82it/s]
100%|██████████| 100/100 [00:21<00:00,  4.72it/s]


skipped


 53%|█████▎    | 53/100 [00:11<00:08,  5.46it/s]

skipped


100%|██████████| 100/100 [00:19<00:00,  5.11it/s]
 96%|█████████▌| 96/100 [00:19<00:00,  6.04it/s]]
 87%|████████▋ | 87/100 [00:19<00:03,  3.26it/s]
100%|██████████| 100/100 [00:20<00:00,  4.89it/s]
100%|██████████| 100/100 [00:20<00:00,  4.88it/s]
100%|██████████| 100/100 [00:20<00:00,  4.87it/s]
100%|██████████| 100/100 [00:20<00:00,  4.77it/s]
100%|██████████| 100/100 [00:21<00:00,  4.74it/s]
100%|██████████| 100/100 [00:21<00:00,  4.65it/s]
100%|██████████| 100/100 [00:21<00:00,  4.65it/s]
 41%|████      | 41/100 [00:08<00:13,  4.35it/s]

skipped


 44%|████▍     | 44/100 [00:09<00:14,  3.90it/s]

skipped


100%|██████████| 100/100 [00:19<00:00,  5.05it/s]
100%|██████████| 100/100 [00:20<00:00,  4.93it/s]
100%|██████████| 100/100 [00:20<00:00,  4.91it/s]
100%|██████████| 100/100 [00:20<00:00,  4.77it/s]
100%|██████████| 100/100 [00:21<00:00,  4.76it/s]
100%|██████████| 100/100 [00:21<00:00,  4.66it/s]
100%|██████████| 100/100 [00:21<00:00,  4.65it/s]
100%|██████████| 100/100 [00:21<00:00,  4.60it/s]
100%|██████████| 100/100 [00:21<00:00,  4.59it/s]
100%|██████████| 100/100 [00:21<00:00,  4.56it/s]
 43%|████▎     | 43/100 [00:08<00:13,  4.28it/s]

skipped


 67%|██████▋   | 67/100 [00:14<00:07,  4.18it/s]

skipped


 71%|███████   | 71/100 [00:14<00:04,  6.50it/s]

skipped


100%|██████████| 100/100 [00:20<00:00,  4.93it/s]
 98%|█████████▊| 98/100 [00:20<00:00,  5.85it/s]]
100%|██████████| 100/100 [00:20<00:00,  4.83it/s]
100%|██████████| 100/100 [00:20<00:00,  4.81it/s]
100%|██████████| 100/100 [00:21<00:00,  4.71it/s]
100%|██████████| 100/100 [00:21<00:00,  4.68it/s]
100%|██████████| 100/100 [00:21<00:00,  4.66it/s]
100%|██████████| 100/100 [00:21<00:00,  4.62it/s]
100%|██████████| 100/100 [00:21<00:00,  4.60it/s]
100%|██████████| 100/100 [00:21<00:00,  4.58it/s]
 35%|███▌      | 17/48 [00:03<00:08,  3.50it/s]

skipped


100%|██████████| 48/48 [00:08<00:00,  5.59it/s]
100%|██████████| 48/48 [00:09<00:00,  5.22it/s]
100%|██████████| 2/2 [00:00<00:00,  3.27it/s]s]
100%|██████████| 48/48 [00:09<00:00,  5.04it/s]
100%|██████████| 48/48 [00:10<00:00,  4.77it/s]
100%|██████████| 48/48 [00:10<00:00,  4.76it/s]
100%|██████████| 48/48 [00:10<00:00,  4.69it/s]
100%|██████████| 48/48 [00:10<00:00,  4.57it/s]
100%|██████████| 48/48 [00:10<00:00,  4.56it/s]
100%|██████████| 48/48 [00:10<00:00,  4.50it/s]
100%|██████████| 48/48 [00:10<00:00,  4.48it/s]


In [13]:
# One incrementally-fitted scaler per feature type.
scaler_mel, scaler_energy, scaler_f0 = (StandardScaler(copy=False) for _ in range(3))

for mel, f0, energy in zip(mels, f0s, energies):
    # Zero entries are excluded from the f0 / energy statistics; the remaining
    # values are reshaped into a single column, as the scaler expects 2-D input.
    energy_nonzero = energy[energy != 0].reshape(-1, 1)
    f0_nonzero = f0[f0 != 0].reshape(-1, 1)

    scaler_mel.partial_fit(mel)
    scaler_energy.partial_fit(energy_nonzero)
    scaler_f0.partial_fit(f0_nonzero)

In [14]:
# Standardise every mel spectrogram with the fitted mel scaler, replacing the
# list entries in place. Re-running this cell would normalise them a second time.
for i, mel_raw in enumerate(tqdm(mels)):
    mels[i] = scaler_mel.transform(mel_raw)

100%|██████████| 39445/39445 [00:09<00:00, 4091.63it/s]


In [15]:
def save_statistics_to_file(scaler_list, config, output_dir = 'female-stats'):
    """Write the mean / scale of each fitted scaler to ``stats<name>.npy``.

    Args:
        scaler_list: iterable of ``(scaler, name)`` pairs; each scaler must
            expose ``mean_`` and ``scale_`` arrays of equal shape.
        config: unused; kept so existing callers keep working.
        output_dir: directory that receives the files. It is created if
            missing and reused if it already exists.

    Each file holds a float32 array of shape ``(2, n_features)``: row 0 is
    the mean, row 1 the scale. Files are saved with ``allow_pickle=False``.
    """
    # os.makedirs replaces the shell `mkdir`: it works on any platform and
    # does not complain when the directory already exists.
    os.makedirs(output_dir, exist_ok=True)
    for scaler, name in scaler_list:
        stats = np.stack((scaler.mean_, scaler.scale_))
        np.save(
            os.path.join(output_dir, f"stats{name}.npy"),
            stats.astype(np.float32),
            allow_pickle=False,
        )

In [16]:
# Pair each fitted scaler with the suffix used in its stats file name, then
# write all three stats files.
_scalers = (scaler_mel, scaler_energy, scaler_f0)
_suffixes = ("", "_energy", "_f0")
scaler_list = list(zip(_scalers, _suffixes))
save_statistics_to_file(scaler_list, config)

In [17]:
!rm -rf output-female-v2

In [18]:
# Create the output tree: one sub-directory per feature type under
# output-female-v2. os.makedirs replaces the shell `mkdir` calls: it creates
# the parent as needed, works on any platform, and is a no-op when the
# directory already exists.
directories = ['audios', 'mels', 'text_ids', 'f0s', 'energies']
for d in directories:
    os.makedirs(os.path.join('output-female-v2', d), exist_ok=True)

In [19]:
# Write every utterance's features to disk, one .npy file per feature type,
# all sharing the utterance index as file name.
for i in tqdm(range(len(mels))):
    outputs = (
        (f'output-female-v2/audios/{i}.npy', audios[i]),
        (f'output-female-v2/mels/{i}.npy', mels[i]),
        (f'output-female-v2/text_ids/{i}.npy', text_ids[i]),
        (f'output-female-v2/f0s/{i}.npy', f0s[i]),
        (f'output-female-v2/energies/{i}.npy', energies[i]),
    )
    for path, payload in outputs:
        np.save(path, payload)

100%|██████████| 39445/39445 [03:44<00:00, 175.39it/s]
