In [1]:
import parselmouth
import librosa
import pyworld as pw
from pydub import AudioSegment
from sklearn.preprocessing import StandardScaler
import numpy as np
import os
import malaya_speech
from malaya_speech import Pipeline
from malaya_speech.utils.text import TextIDS
from glob import glob
import json
import pickle

`pyaudio` is not available, `malaya_speech.streaming.pyaudio` is not able to use.


In [2]:
# !pip3 install ~/dev/malaya-speech --no-deps

In [3]:
base_directory = '/home/husein/ssd2'

In [4]:
import yaml

# Load the audio preprocessing settings (sampling rate, FFT size, silence
# trimming options, ...) from config.yaml in the current working directory.
with open('config.yaml') as fopen:
    config = yaml.safe_load(fopen)
    
config

{'sampling_rate': 22050,
 'fft_size': 1024,
 'hop_size': 256,
 'win_length': None,
 'window': 'hann',
 'num_mels': 80,
 'fmin': 80,
 'fmax': 7600,
 'global_gain_scale': 1.0,
 'trim_silence': True,
 'trim_threshold_in_db': 60,
 'trim_frame_size': 2048,
 'trim_hop_size': 512}

In [5]:
import numpy as np

# https://github.com/TensorSpeech/TensorFlowTTS/blob/master/tensorflow_tts/utils/outliers.py
def is_outlier(x, p25, p75):
    """Return whether ``x`` lies on or outside the 1.5 * IQR fences.

    Args:
        x: value to test.
        p25: lower quartile of the reference distribution.
        p75: upper quartile of the reference distribution.

    Returns:
        Truthy when ``x <= p25 - 1.5 * (p75 - p25)`` or
        ``x >= p75 + 1.5 * (p75 - p25)``. The comparisons are inclusive, so a
        value sitting exactly on a fence counts as an outlier.
    """
    # Both fences sit the same distance away from their quartile.
    fence = 1.5 * (p75 - p25)
    below = x <= p25 - fence
    return below or x >= p75 + fence


def remove_outlier(x, p_bottom: int = 25, p_top: int = 75):
    """Overwrite the outliers of ``x`` in place and return ``x``.

    Args:
        x: array of values; it is modified in place through index assignment.
        p_bottom: percentile used as the lower quartile passed to ``is_outlier``.
        p_top: percentile used as the upper quartile passed to ``is_outlier``.

    Returns:
        The same object ``x`` with every outlier replaced by ``np.max(x)``,
        where the maximum is taken after the outliers were set to 0.0.
    """
    low_value = np.percentile(x, p_bottom)
    high_value = np.percentile(x, p_top)

    outlier_positions = [
        position
        for position, value in enumerate(x)
        if is_outlier(value, low_value, high_value)
    ]

    # Zero the outliers first so they cannot win the maximum taken next.
    x[outlier_positions] = 0.0
    x[outlier_positions] = np.max(x)
    return x

In [6]:
import re

# Multi-character marker tokens; each one occupies a single slot at the front
# of the symbol table.
_pad = 'pad'
_start = 'start'
_eos = 'eos'
# Single-character symbols; every character becomes its own table entry.
_punctuation = "!'(),.:;? "
_special = '-'
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'

# Ordered symbol table: a symbol's position in this list is its integer id.
# Order is markers, '-', punctuation (including space), then letters.
MALAYA_SPEECH_SYMBOLS = (
    [_pad, _start, _eos] + list(_special) + list(_punctuation) + list(_letters)
)

In [7]:
def tts_encode(string: str, add_eos: bool = True, symbols = None):
    """Convert ``string`` into a list of symbol ids.

    Args:
        string: text to encode; it is processed character by character.
        add_eos: when True, the id of the ``'eos'`` symbol is appended.
        symbols: optional sequence of symbols whose positions are the ids.
            Defaults to the module-level ``MALAYA_SPEECH_SYMBOLS``.

    Returns:
        List of integer ids. Characters that are not in the symbol table are
        silently dropped.

    Raises:
        ValueError: if ``add_eos`` is True and ``'eos'`` is not in the table.
    """
    if symbols is None:
        symbols = MALAYA_SPEECH_SYMBOLS

    # Build the lookup once instead of scanning the list twice per character.
    # setdefault keeps the first position, matching list.index().
    lookup = {}
    for position, symbol in enumerate(symbols):
        lookup.setdefault(symbol, position)

    r = [lookup[c] for c in string if c in lookup]
    if add_eos:
        r = r + [symbols.index('eos')]
    return r

In [8]:
from unidecode import unidecode
import malaya

# Text normalizer and sentence splitter from malaya; both are handed to
# TextIDS below.
normalizer = malaya.normalize.normalizer()
sentence_tokenizer = malaya.text.function.split_into_sentences

# Tokenizer used by later cells through tokenizer.normalize(...).
# NOTE(review): the exact effect of pad_to / understand_punct / is_lower is
# defined by malaya_speech's TextIDS — confirm there before changing them.
tokenizer = TextIDS(
    pad_to=None,
    understand_punct=False,
    is_lower=True,
    normalizer=normalizer,
    sentence_tokenizer=sentence_tokenizer,
)

In [9]:
malaya_speech.__version__

'1.4.0'

In [10]:
tokenizer.normalize('saya tak suka ayam.', add_fullstop = False)

('saya tak suka ayam',
 array([58, 40, 64, 40, 13, 59, 40, 50, 13, 58, 60, 50, 40, 13, 40, 64, 40,
        52]))

In [11]:
# Lookup table keyed by '<pickle file name>-<speaker label>'; its values become
# the keys of `total` in the next cell.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
with open('/home/husein/dev/malaya-speech/data/youtube/mapping-youtube-speakers-90.json') as fopen:
    speaker_mapping = json.load(fopen)

In [12]:
from tqdm import tqdm
from collections import defaultdict

# Every processed pickle, in sorted path order.
pkls = sorted(glob('/home/husein/ssd2/processed-youtube-v2/*.pkl'))
# Maps a speaker_mapping value to a list of
# [d['wav_data'], d['asr_model'][0], 'multispeaker-noisy'] entries.
total = defaultdict(list)
for pkl in tqdm(pkls):
    
    # NOTE(review): pickle.load can execute arbitrary code; only run this on
    # files you produced yourself.
    with open(pkl, 'rb') as fopen:
        data = pickle.load(fopen)
        
    # File name without directory and '.pkl' suffix; prefix of the mapping key.
    filename = os.path.split(pkl)[1].replace('.pkl', '')
        
    for d in data:
        # Skip entries whose d['asr_model'][0] (used later as the transcript)
        # has fewer than 2 elements.
        if len(d['asr_model'][0]) < 2:
            continue
        # Keep only entries whose d['asr_model'][1][0] reaches 0.08
        # (presumably an ASR confidence score — TODO confirm).
        if d['asr_model'][1][0] >= 0.08:
            speaker = d['classification_model'][1]
            speaker_name = f'{filename}-{speaker}'
            
            # Raises KeyError when speaker_name is missing from speaker_mapping.
            total[speaker_mapping[speaker_name]].append([d['wav_data'], d['asr_model'][0], 'multispeaker-noisy'])
            
# total = sorted(total, reverse = False, key = lambda x: x['asr_model'][1][0])
# Number of distinct speakers collected.
len(total)

100%|█████████████████████████████████████| 5145/5145 [00:01<00:00, 4088.70it/s]


4581

In [13]:
import malaya_speech
import soundfile as sf
from malaya_speech import Pipeline
from datasets import Audio
from tqdm import tqdm

def _last_samples(samples, n):
    """Return the final ``n`` samples of ``samples`` (empty when ``n <= 0``)."""
    # samples[-0:] would return the whole array, so zero is handled explicitly.
    if n <= 0:
        return samples[:0]
    return samples[-n:]


def _shrink_middle_silence(samples, n):
    """Keep only the first and last ``n`` samples of a silent segment."""
    # A segment no longer than 2 * n is already short enough. Slicing both ends
    # would overlap and duplicate samples, which lengthens the pause.
    if len(samples) <= 2 * n:
        return samples
    return np.concatenate([samples[:n], _last_samples(samples, n)])


def process(txts,
            start_silent_trail = int(0.15 * config['sampling_rate']),
            middle_silent_trail = int(0.2 * config['sampling_rate']),
            end_silent_trail = int(0.2 * config['sampling_rate']),
            process_middle_silent = True,
            maxlen = 20):
    """Load, silence-trim and re-save a batch of utterances as wav files.

    Args:
        txts: wrapper whose first element is the batch. Each batch item is
            indexed as ``(audio path, text, output directory)``.
        start_silent_trail: number of samples kept from the end of a leading
            silent segment.
        middle_silent_trail: number of samples kept from each end of a silent
            segment between two voiced segments.
        end_silent_trail: number of samples kept from the start of a trailing
            silent segment.
        process_middle_silent: when False, middle silent segments are kept
            untouched.
        maxlen: utterances longer than this many seconds after trimming are
            skipped.

    Returns:
        ``[[audios, text_ids]]`` where ``audios`` are the written wav paths and
        ``text_ids`` the matching texts. Utterances shorter than 0.5 s or
        longer than ``maxlen`` are left out.
    """
    # One sample rate for decoding, resampling and writing.
    sr = config['sampling_rate']
    reader = Audio(sampling_rate = sr)
    vad = malaya_speech.vad.webrtc()
    txts = txts[0]
    audios, text_ids = [], []
    created_dirs = set()

    for f in tqdm(txts):

        directory = f[2]
        text = f[1]
        f = f[0]

        if '.mp3' in f:
            sound = AudioSegment.from_mp3(f)
            # NOTE(review): multi-channel files presumably come back with
            # interleaved samples — confirm the inputs are mono.
            samples = np.array(sound.get_array_of_samples())
            samples = malaya_speech.astype.int_to_float(samples)
            audio = malaya_speech.resample(samples, sound.frame_rate, sr)

        else:
            audio = reader.decode_example(reader.encode_example(f))['array']

        if config['trim_silence']:
            # The VAD runs on a 16 kHz integer copy. Its per-frame decisions
            # are paired by index with 30 ms frames of the original audio.
            y_ = malaya_speech.resample(audio, sr, 16000)
            y_ = malaya_speech.astype.float_to_int(y_)
            frames = list(malaya_speech.generator.frames(audio, 30, sr))
            frames_ = list(malaya_speech.generator.frames(y_, 30, 16000, append_ending_trail = False))
            frames_webrtc = [(frames[no], vad(frame)) for no, frame in enumerate(frames_)]
            grouped_deep = malaya_speech.group.group_frames(frames_webrtc)
            grouped_deep = malaya_speech.group.group_frames_threshold(grouped_deep, 0.15)
            r = []
            last = len(grouped_deep) - 1
            for no, g in enumerate(grouped_deep):
                segment = g[0].array
                if not g[1]:
                    # Silent segment: how much is kept depends on its position.
                    if no == 0:
                        segment = _last_samples(segment, start_silent_trail)
                    elif no == last:
                        segment = segment[:end_silent_trail]
                    elif process_middle_silent:
                        segment = _shrink_middle_silence(segment, middle_silent_trail)
                r.append(segment)

            # np.concatenate raises on an empty list; nothing to save then.
            if not r:
                continue
            audio = np.concatenate(r)

        duration = len(audio) / sr
        if duration > maxlen:
            continue

        if duration < 0.5:
            continue

        audio = np.pad(audio, (0, config["fft_size"]), mode="edge")
        # Output name is '<parent dir>-<file name>' with a .wav extension.
        new_f = '-'.join(f.split('/')[-2:]).replace('.mp3', '.wav')
        new_f = os.path.join(directory, new_f)

        # sf.write does not create missing directories.
        if directory and directory not in created_dirs:
            os.makedirs(directory, exist_ok = True)
            created_dirs.add(directory)

        sf.write(new_f, audio, sr)
        audios.append(new_f)
        text_ids.append(text)

    return [[audios, text_ids]]

In [14]:
# !rm -rf multispeaker-noisy
# !mkdir multispeaker-noisy

In [15]:
# import mp

# audios, text_ids, speakers_id = [], [], []
# batch_size = 5000

# count = 0
# maps = {}
# for k in tqdm(total.keys()):
#     nested_count = 0
#     for i in range(0, len(total[k]), batch_size):
#         b = total[k][i: i + batch_size]
#         results = mp.multiprocessing(b, process, cores = min(3, len(b)), returned = True)
#         for result in results:
#             if len(result[0]):
#                 audios.extend(result[0])
#                 text_ids.extend(result[1])
#                 speakers_id.extend([count] * len(result[1]))
#                 nested_count += 1
    
#     if nested_count > 0:
#         maps[count] = k
#         count += 1

In [16]:
# Rebuild the audio / text / speaker lists from wav files that already exist on
# disk, without re-running process().
# `count` is the next contiguous speaker id; `maps` maps that id to its key in
# `total`.
count = 0
maps = {}
audios, text_ids, speakers_id = [], [], []
for k in tqdm(total.keys()):
    b = total[k]
    exists = 0
    for b_ in b:
        f = b_[0]
        directory = b_[-1]
        # Same output-name rule as process(): '<parent dir>-<file name>' with
        # '.mp3' replaced by '.wav', placed inside `directory`.
        new_f = '-'.join(f.split('/')[-2:]).replace('.mp3', '.wav')
        new_f = os.path.join(directory, new_f)
        if os.path.exists(new_f):
            audios.append(new_f)
            text_ids.append(b_[1])
            speakers_id.append(count)
            exists += 1
            
    # An id is consumed only when the speaker has at least one existing file,
    # so the assigned ids stay contiguous.
    if exists:
        maps[count] = k
        count += 1

100%|████████████████████████████████████| 4581/4581 [00:00<00:00, 14937.49it/s]


In [17]:
# Sanity check: the distinct speaker ids should be exactly 0..N-1. Any gap is
# printed as (expected position, actual id); no output means contiguous.
s = sorted(set(speakers_id))
for n, k in enumerate(s):
    if n != k:
        print(n, k)

In [18]:
len(speakers_id)

49831

In [19]:
from tqdm import tqdm
import random

# The three lists are filled in lockstep; fail loudly if that ever changes,
# because zip() would otherwise truncate silently.
assert len(audios) == len(speakers_id) == len(text_ids)

# The working directory does not change inside the loop, so resolve it once.
cwd = os.getcwd()

# One (absolute wav path, speaker id, text) tuple per utterance.
data = [
    (os.path.join(cwd, audio_path), speaker_id, text)
    for audio_path, speaker_id, text in tqdm(
        zip(audios, speakers_id, text_ids), total = len(audios)
    )
]

# NOTE(review): unseeded shuffle — the saved order differs on every run.
random.shuffle(data)

100%|█████████████████████████████████| 49831/49831 [00:00<00:00, 829597.73it/s]


In [20]:
import json

# Persist the shuffled (path, speaker id, text) entries; JSON stores each tuple
# as a list.
with open('multispeaker-noisy-vits.json', 'w') as fopen:
    json.dump(data, fopen)

In [21]:
# import json

# with open('multispeaker-noisy-vits.json') as fopen:
#     data = json.load(fopen)

In [22]:
# Load the clean multi-speaker dataset; each entry is read below as
# d[0], d[1] (speaker id) and d[-1] (text).
with open('multispeaker-clean-vits.json') as fopen:
    clean = json.load(fopen)

In [23]:
from collections import defaultdict
from tqdm import tqdm

# Group the clean entries by speaker id (d[1]). The stored text is the first
# element returned by tokenizer.normalize for d[-1].
speakers_clean = defaultdict(list)
for d in tqdm(clean):
    s = tokenizer.normalize(d[-1], add_fullstop = False, normalize = False)[0]
    speakers_clean[d[1]].append([d[0], d[1], s])

100%|████████████████████████████████| 249245/249245 [00:16<00:00, 14913.48it/s]


In [24]:
# Print the number of utterances per clean speaker id.
for k, v in speakers_clean.items():
    print(k, len(v))

5 31059
1 48733
2 31059
0 48734
4 31059
3 31059
8 12204
7 11049
6 4289


In [25]:
import random

In [26]:
# Start the combined dataset with at most 13000 randomly chosen utterances per
# clean speaker, so the large speakers do not dominate.
# NOTE(review): random.sample is unseeded, so the selection differs per run.
combined = []
for k, v in speakers_clean.items():
    combined.extend(random.sample(v, min(len(v), 13000)))

In [27]:
len(combined)

105542

In [28]:
# Load the second clean dataset and append every entry to `combined`.
with open('multispeaker-clean-v2-vits.json') as fopen:
    clean_v2 = json.load(fopen)
    
# Speaker ids are shifted by 9, presumably to follow the 9 speaker ids of
# multispeaker-clean — confirm if that dataset changes.
# This loop appends to `combined`, so re-running the cell duplicates entries.
for d in tqdm(clean_v2):
    s = tokenizer.normalize(d[-1], add_fullstop = False, normalize = False)[0]
    combined.append([d[0], d[1] + 9, s])

100%|████████████████████████████████████| 2951/2951 [00:00<00:00, 12088.87it/s]


In [29]:
len(combined)

108493

In [32]:
# Append the noisy multi-speaker entries held in `data`.
# NOTE(review): the offset 9 + 3 presumably skips the 9 speaker ids of
# multispeaker-clean and 3 ids used by multispeaker-clean-v2 — confirm.
# This loop appends to `combined`, so re-running the cell duplicates entries.
for d in tqdm(data):
    s = tokenizer.normalize(d[-1], add_fullstop = False, normalize = False)[0]
    combined.append([d[0], d[1] + 9 + 3, s])
    
# Total number of entries after the append.
len(combined)

100%|██████████████████████████████████| 49831/49831 [00:01<00:00, 44163.41it/s]


158324

In [33]:
import IPython.display as ipd
ipd.Audio(combined[-10][0])

In [34]:
combined[-10]

['/home/husein/speech-bahasa/multispeaker-noisy/10_x_Episod_Penuh____Bing_-_Episod_Terbaik__Bing_Bahasa_Melayu-PY2TduWM0sw-11.wav',
 169,
 'jalan jalan']

In [35]:
# Shuffle in place, then persist the combined [path, speaker id, text] entries.
# NOTE(review): unseeded shuffle — the saved order differs on every run.
random.shuffle(combined)

with open('multispeaker-combined-vits.json', 'w') as fopen:
    json.dump(combined, fopen)

In [36]:
# Sanity check on the combined set: the distinct speaker ids (b[1]) should be
# exactly 0..N-1. Any gap is printed as (expected position, actual id).
s = sorted(set([b[1] for b in combined]))
for n, k in enumerate(s):
    if n != k:
        print(n, k)

In [37]:
len(s)

4589

In [38]:
max([d[1] for d in combined])

4588

In [39]:
min([d[1] for d in combined])

0

In [40]:
# Persist the noisy speaker id -> speaker key mapping. The ids here are the
# un-offset ones from `maps`, not the shifted ids stored in `combined`.
# json.dump writes the integer keys as strings.
with open('multispeaker-noisy-mapping.json', 'w') as fopen:
    json.dump(maps, fopen)

In [41]:
maps

{0: '"Abam_peluk_saya_lama_atas_pentas_akhir_MLM"-_Ali_Puteh_menangis_imbau_saat_manis_dengan_arwah_abang-_MdgGr7VD7w-speaker 0',
 1: "Main_'The_Detectives'_Dengan_Bront_Palarae_&_Fikry_Ibrahim___SEISMIK_Plays-_cYdHZBlZhM-speaker 1",
 2: '"Alif_bersih_daripada_ubat_terlarang"_-_Nora_dedah_laporan_bedah_siasat_punca_kematian_anaknya-gx9JlO12W_4-speaker 1',
 3: '"Anak_Melayu,_India,_Cina_hulur_RM10,_RM15"_-_Komposer_Ooi_Eow_Jin_kini_terlantar_sakit-0x5coikIeBs-speaker 1',
 4: '"Berlakon_dengan_Opah_Aminah_paling_susah..._sumpah!"_-_Ayda_Jebat_a.k.a_Johana_Bukan_Gadis_Biasa-G8gUxijCp4o-speaker 0',
 5: '"Eee_Keling!"-1DM5sIvcgV4-speaker 0',
 6: '"Jangan_sombong_jadi_orang"_-_Mobile_Legends_\'team_player\'_Indonesia-Q5GQS6na0ug-speaker 1',
 7: "The_Fashion_Weak_Podcast_Ep_32_-_Getting_Surprised_Reading_Melissa_Campbell's_(@heymelbelle)_IG_DMs-mJuKUwFpDgI-speaker 3",
 8: '"Main_Game_Tak_Ada_Masa_Depan"-9kROug2q59U-speaker 0',
 9: '"Marah_Sebab_Sayang"_Ada_Kesan_Psikologi-8pkbRj0-ntg-speaker 