In [1]:
import parselmouth
import librosa
import pyworld as pw
from pydub import AudioSegment
from sklearn.preprocessing import StandardScaler
import numpy as np
import os
import malaya_speech
from malaya_speech import Pipeline
from malaya_speech.utils.text import TextIDS
from glob import glob
import json

`pyaudio` is not available, `malaya_speech.streaming.pyaudio` is not able to use.


In [2]:
base_directory = '/home/husein/ssd2'

In [3]:
import yaml

with open('config.yaml') as fopen:
    config = yaml.safe_load(fopen)
    
config

{'sampling_rate': 22050,
 'fft_size': 1024,
 'hop_size': 256,
 'win_length': None,
 'window': 'hann',
 'num_mels': 80,
 'fmin': 80,
 'fmax': 7600,
 'global_gain_scale': 1.0,
 'trim_silence': True,
 'trim_threshold_in_db': 60,
 'trim_frame_size': 2048,
 'trim_hop_size': 512}

In [4]:
import numpy as np

# https://github.com/TensorSpeech/TensorFlowTTS/blob/master/tensorflow_tts/utils/outliers.py
def is_outlier(x, p25, p75):
    """Check if value is an outlier."""
    lower = p25 - 1.5 * (p75 - p25)
    upper = p75 + 1.5 * (p75 - p25)
    return x <= lower or x >= upper


def remove_outlier(x, p_bottom: int = 25, p_top: int = 75):
    """Remove outlier from x."""
    p_bottom = np.percentile(x, p_bottom)
    p_top = np.percentile(x, p_top)

    indices_of_outliers = []
    for ind, value in enumerate(x):
        if is_outlier(value, p_bottom, p_top):
            indices_of_outliers.append(ind)

    x[indices_of_outliers] = 0.0
    x[indices_of_outliers] = np.max(x)
    return x

In [5]:
import re

_pad = 'pad'
_start = 'start'
_eos = 'eos'
_punctuation = "!'(),.:;? "
_special = '-'
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'

MALAYA_SPEECH_SYMBOLS = (
    [_pad, _start, _eos] + list(_special) + list(_punctuation) + list(_letters)
)

In [6]:
def tts_encode(string: str, add_eos: bool = True):
    r = [MALAYA_SPEECH_SYMBOLS.index(c) for c in string if c in MALAYA_SPEECH_SYMBOLS]
    if add_eos:
        r = r + [MALAYA_SPEECH_SYMBOLS.index('eos')]
    return r

In [7]:
from unidecode import unidecode
import malaya

normalizer = malaya.normalize.normalizer()
sentence_tokenizer = malaya.text.function.split_into_sentences

tokenizer = TextIDS(
    pad_to=None,
    understand_punct=False,
    is_lower=True,
    normalizer=normalizer,
    sentence_tokenizer=sentence_tokenizer,
)

In [8]:
tokenizer.normalize('saya tak suka ayam.')

('saya tak suka ayam.',
 array([58, 40, 64, 40, 13, 59, 40, 50, 13, 58, 60, 50, 40, 13, 40, 64, 40,
        52,  9]))

In [9]:
# !wget https://huggingface.co/datasets/mesolitica/azure-tts-yasmin/resolve/main/postprocessing-edge-tts-news-yasmin.json

In [4]:
# !wget https://huggingface.co/datasets/mesolitica/azure-tts-yasmin/resolve/main/postprocessing-edge-tts-parliament-yasmin.json

In [10]:
with open('postprocessing-edge-tts-news-yasmin.json') as fopen:
    text = json.load(fopen)

directory = 'yasmin-multispeaker'
os.makedirs(directory, exist_ok=True)
yasmin = []
for t in text:
    index = os.path.split(t[0])[1].replace('.json', '')
    wav = t[0].replace('-text', '-wav').replace('.json', '.wav')
    wav = os.path.join(base_directory, wav)
    if os.path.exists(wav):
        yasmin.append((wav, t[1], directory))
        
len(yasmin)

48750

In [11]:
# !wget https://huggingface.co/datasets/mesolitica/azure-tts-osman/resolve/main/postprocessing-edge-tts-news.json

In [1]:
!wget https://huggingface.co/datasets/mesolitica/azure-tts-osman/resolve/main/postprocessing-edge-tts-parliament.json

--2023-03-12 11:51:09--  https://huggingface.co/datasets/mesolitica/azure-tts-osman/resolve/main/postprocessing-edge-tts-parliament.json
Resolving huggingface.co (huggingface.co)... 54.165.206.104, 54.160.18.166, 3.223.56.9, ...
Connecting to huggingface.co (huggingface.co)|54.165.206.104|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/repos/2e/ff/2effb71c92a38c30df1a938f66e3868e88c7ebb859410c37c47d0fd56510f503/4dc8df4b7fcddae6a793b1a0b3b5ca46ae07078d67be61ade023f38a417f1c89?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27postprocessing-edge-tts-parliament.json%3B+filename%3D%22postprocessing-edge-tts-parliament.json%22%3B&response-content-type=application%2Fjson&Expires=1678852270&Policy=eyJTdGF0ZW1lbnQiOlt7IlJlc291cmNlIjoiaHR0cHM6Ly9jZG4tbGZzLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzJlL2ZmLzJlZmZiNzFjOTJhMzhjMzBkZjFhOTM4ZjY2ZTM4NjhlODhjN2ViYjg1OTQxMGMzN2M0N2QwZmQ1NjUxMGY1MDMvNGRjOGRmNGI3ZmNkZGFlNmE3OTNiMWEwYjNiNWNhND

In [12]:
with open('postprocessing-edge-tts-news.json') as fopen:
    text = json.load(fopen)

directory = 'osman-multispeaker'
os.makedirs(directory, exist_ok=True)
osman = []
for t in text:
    index = os.path.split(t[0])[1].replace('.json', '')
    wav = t[0].replace('-text', '-wav').replace('.json', '.wav')
    wav = os.path.join(base_directory, wav)
    if os.path.exists(wav):
        osman.append((wav, t[1], directory))
        
len(osman)

48750

In [13]:
with open('gtts-text.json') as fopen:
    texts = json.load(fopen)

In [14]:
wavenet_a = []
directory = 'ms-MY-Wavenet-A-multispeaker'
os.makedirs(directory, exist_ok=True)
for no, t in enumerate(texts):
    wav = f'ms-MY-Wavenet-A/{no}.mp3'
    wav = os.path.join(base_directory, wav)
    if os.path.exists(wav):
        wavenet_a.append((wav, t, directory))
        
len(wavenet_a)

31069

In [15]:
wavenet_b = []
directory = 'ms-MY-Wavenet-B-multispeaker'
os.makedirs(directory, exist_ok=True)
for no, t in enumerate(texts):
    wav = f'ms-MY-Wavenet-B/{no}.mp3'
    wav = os.path.join(base_directory, wav)
    if os.path.exists(wav):
        wavenet_b.append((wav, t, directory))
        
len(wavenet_b)

31069

In [16]:
wavenet_c = []
directory = 'ms-MY-Wavenet-C-multispeaker'
os.makedirs(directory, exist_ok=True)
for no, t in enumerate(texts):
    wav = f'ms-MY-Wavenet-C/{no}.mp3'
    wav = os.path.join(base_directory, wav)
    if os.path.exists(wav):
        wavenet_c.append((wav, t, directory))
        
len(wavenet_c)

31069

In [17]:
wavenet_d = []
directory = 'ms-MY-Wavenet-D-multispeaker'
os.makedirs(directory, exist_ok=True)
for no, t in enumerate(texts):
    wav = f'ms-MY-Wavenet-D/{no}.mp3'
    wav = os.path.join(base_directory, wav)
    if os.path.exists(wav):
        wavenet_d.append((wav, t, directory))
        
len(wavenet_d)

31069

In [18]:
from unidecode import unidecode
import malaya

normalizer = malaya.normalize.normalizer(date = False, time = False, money = False)

def put_spacing_num(string):
    string = re.sub('[A-Za-z]+', lambda ele: ' ' + ele[0] + ' ', string)
    return re.sub(r'[ ]+', ' ', string).strip()

def convert_to_ascii(string):
    return unidecode(string)

def collapse_whitespace(string):
    return re.sub(_whitespace_re, ' ', string)

def cleaning(string, normalize = True, add_eos = False):
    sequence = []
    string = convert_to_ascii(string)
    string = string.replace('&', ' dan ')
    string = re.sub(r'[ ]+', ' ', string).strip()
    if string[-1] in ['-', ',']:
        string = string[:-1]
    if string[-1] != '.':
        string = string + '.'
    if normalize:
        string = normalizer.normalize(string, 
                                      check_english = False, 
                                      normalize_entity = False, 
                                      normalize_text = False,
                                      normalize_url = True,
                                      normalize_email = True,
                                      normalize_year = True)
        string = string['normalize']
    else:
        string = string
    string = put_spacing_num(string)
    string = ''.join([c for c in string if c in MALAYA_SPEECH_SYMBOLS])
    string = re.sub(r'[ ]+', ' ', string).strip()
    return string

In [19]:
import pandas as pd

df = pd.read_csv('/home/husein/ssd2/haqkiem/metadata.csv', header = None, sep = '|')
txts = df.values.tolist()

In [20]:
haqkiem = []
directory = 'haqkiem-multispeaker'
os.makedirs(directory, exist_ok=True)
for f in txts:
    text = f[1]
    f = f[0]
    wav = f'haqkiem/{f}.wav'
    wav = os.path.join(base_directory, wav)
    
    text = text.split('.,,')[0]
    if len(re.findall(r'(RM \d+,\d+\.\d+|RM \d+\.\d+)', text)):
        continue

    text = f'{text} .'
    text = cleaning(text)
    if os.path.exists(wav):
        haqkiem.append((wav, text, directory))
        
len(haqkiem)

4289

In [21]:
def cleaning(string, normalize = True, add_eos = False):
    sequence = []
    string = convert_to_ascii(string)
    string = string.replace('&', ' dan ')
    string = re.sub(r'[ ]+', ' ', string).strip()
    string = string
    string = put_spacing_num(string)
    string = ''.join([c for c in string if c in MALAYA_SPEECH_SYMBOLS])
    string = re.sub(r'[ ]+', ' ', string).strip()
    return string

In [22]:
with open('true-case-pasentran-turki.json') as fopen:
    data = json.load(fopen)
    
male = []
directory = 'male-multispeaker'
os.makedirs(directory, exist_ok=True)
for k, v in data.items():
    try:
        wav = os.path.join(base_directory, k)
        if os.path.exists(wav):
            male.append((wav, cleaning(v['true_case']), directory))
    except Exception as e:
        print(e, k, v)
        
len(male)

11083

In [23]:
with open('true-case-salina.json') as fopen:
    data = json.load(fopen)
    
female = []
directory = 'female-multispeaker'
os.makedirs(directory, exist_ok=True)
for k, v in data.items():
    try:
        wav = os.path.join(base_directory, k)
        if os.path.exists(wav):
            female.append((wav, cleaning(v['true_case']), directory))
    except Exception as e:
        print(e, k, v)
        
len(female)

12204

In [24]:
female[:2]

[('/home/husein/ssd2/salina/output-wav-salina/dua-3.mp3-0.wav',
  'Dua .',
  'female-multispeaker'),
 ('/home/husein/ssd2/salina/output-wav-salina/dua-3.mp3-1.wav',
  'Bilik mandi di bangunan PWD itu , memanjang bentuknya , bukannya ,',
  'female-multispeaker')]

In [25]:
import malaya_speech
import soundfile as sf
from malaya_speech import Pipeline
from datasets import Audio
from tqdm import tqdm

def process(txts, 
            start_silent_trail = int(0.15 * config['sampling_rate']),
            middle_silent_trail = int(0.2 * config['sampling_rate']),
            end_silent_trail = int(0.2 * config['sampling_rate']),
            process_middle_silent = True,
            maxlen = 20):
    
    reader = Audio(sampling_rate = 22050)
    vad = malaya_speech.vad.webrtc()
    txts = txts[0]
    audios, text_ids = [], []

    for f in tqdm(txts):
        
        directory = f[2]
        text = f[1]
        f = f[0]
        
        if '.mp3' in f:
            sound = AudioSegment.from_mp3(f)
            samples = sound.get_array_of_samples()
            samples = np.array(samples)
            samples = malaya_speech.astype.int_to_float(samples)
            audio = malaya_speech.resample(samples, sound.frame_rate, 22050)
            
        else:
            audio = reader.decode_example(reader.encode_example(f))['array']

        if config['trim_silence']:
            y_= malaya_speech.resample(audio, config['sampling_rate'], 16000)
            y_ = malaya_speech.astype.float_to_int(y_)
            frames = list(malaya_speech.generator.frames(audio, 30, config['sampling_rate']))
            frames_ = list(malaya_speech.generator.frames(y_, 30, 16000, append_ending_trail = False))
            frames_webrtc = [(frames[no], vad(frame)) for no, frame in enumerate(frames_)]
            grouped_deep = malaya_speech.group.group_frames(frames_webrtc)
            grouped_deep = malaya_speech.group.group_frames_threshold(grouped_deep, 0.15)
            r = []
            for no, g in enumerate(grouped_deep):
                if g[1]:
                    g = g[0].array
                else:
                    if no == 0:
                        g = g[0].array[-start_silent_trail:]
                    elif no == (len(grouped_deep) - 1):
                        g = g[0].array[:end_silent_trail]
                    else:
                        if process_middle_silent:
                            g = np.concatenate([g[0].array[:middle_silent_trail], g[0].array[-middle_silent_trail:]])
                        else:
                            g = g[0].array
                        
                r.append(g)
            audio = np.concatenate(r)
        
        if (len(audio) / config['sampling_rate']) > maxlen:
            continue
        
        if (len(audio) / config['sampling_rate']) < 0.5:
            continue
            
        audio = np.pad(audio, (0, config["fft_size"]), mode="edge")
        f = os.path.split(f)[1]
        new_f = f'{directory}/{f}'.replace('.mp3', '.wav').replace('.flac', '.wav')
#         new_f = f'{directory}/{f}'.replace('.wav', '.mp3')
#         torchaudio.save(new_f, torch.tensor(audio.astype('float32')).unsqueeze(0), 
#                         22050, format='mp3')
        
        sf.write(new_f, audio, 22050)
        audios.append(new_f)
        text_ids.append(text)
    
    return [[audios, text_ids]]

In [26]:
speakers = [
    yasmin,
    osman,
    wavenet_a,
    wavenet_b,
    wavenet_c,
    wavenet_d,
    haqkiem,
    male,
    female,
]
len(speakers)

9

In [27]:
audios, text_ids, speakers_id = [], [], []
for i in range(len(speakers)):
    r = process((speakers[i][:10],))[0]
    audios.extend(r[0])
    text_ids.extend(r[1])
    speakers_id.extend([i] * len(r[1]))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  6.76it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  8.80it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  9.13it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  9.37it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  9.28it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  9.46it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 71.85it/s]
100%|█████████████████████████████

In [28]:
r = process((osman[:10],))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  9.10it/s]


In [29]:
r[0][0]

['osman-multispeaker/138.wav',
 'osman-multispeaker/32958.wav',
 'osman-multispeaker/10582.wav',
 'osman-multispeaker/456.wav',
 'osman-multispeaker/18717.wav',
 'osman-multispeaker/30899.wav',
 'osman-multispeaker/45320.wav',
 'osman-multispeaker/6376.wav',
 'osman-multispeaker/47642.wav',
 'osman-multispeaker/19271.wav']

In [30]:
import IPython.display as ipd
ipd.Audio(r[0][0][-2])

In [31]:
import mp

audios, text_ids, speakers_id = [], [], []
batch_size = 5000
for n in range(len(speakers)):
    for i in range(0, len(speakers[n]), batch_size):
        b = speakers[n][i: i + batch_size]
        results = mp.multiprocessing(b, process, cores = 10, returned = True)
        for result in results:
            audios.extend(result[0])
            text_ids.extend(result[1])
            speakers_id.extend([n] * len(result[1]))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:02<00:00,  7.96it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:03<00:00,  7.90it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:04<00:00,  7.80it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:04<00:00,  7.77it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:04<00:00,  7.74it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:04<00:00,  7.74it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:04<00:00,  7.74it/s]
100%|█████████████████████████████

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:05<00:00,  7.60it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:06<00:00,  7.50it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:07<00:00,  7.42it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:03<00:00,  7.82it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:04<00:00,  7.75it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:05<00:00,  7.69it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:05<00:00,  7.69it/s]
100%|█████████████████████████████

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:48<00:00,  4.61it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:48<00:00,  4.60it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:48<00:00,  4.60it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:48<00:00,  4.60it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:48<00:00,  4.59it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 106/106 [00:20<00:00,  5.28it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 106/106 [00:21<00:00,  5.03it/s]
100%|█████████████████████████████

In [32]:
from tqdm import tqdm
import random

data = []
for i in tqdm(range(len(audios))):
    data.append((os.path.join(os.getcwd(), audios[i]), speakers_id[i], text_ids[i]))
    
random.shuffle(data)

100%|███████████████████████████████████████████████████████████████████████████████████████| 249245/249245 [00:00<00:00, 1001181.12it/s]


In [33]:
len(data)

249245

In [34]:
import json

with open('multispeaker-clean-vits.json', 'w') as fopen:
    json.dump(data, fopen)