In [1]:
import json
import re
import string

# Control tokens (multi-character names, each occupying a single id).
_pad = 'pad'
_start = 'start'
_eos = 'eos'

# Single-character symbol groups; each character gets its own id.
_punctuation = "!'(),.:;? "
_special = '-'
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'

# Symbol table, in id order: control tokens, special, punctuation, letters.
MALAYA_SPEECH_SYMBOLS = [_pad, _start, _eos, *_special, *_punctuation, *_letters]

# Reverse lookup used to decode id sequences: integer id -> symbol.
vocab = dict(enumerate(MALAYA_SPEECH_SYMBOLS))

# Per-speaker dataset files. Order matters: downstream cells use the position
# in this list (via enumerate) as the integer speaker id.
files = [
    '/home/husein/ssd3/tts/multispeaker-clean-vits-husein-chatbot.json',  # speaker 0
    '/home/husein/ssd3/tts/multispeaker-clean-vits-shafiqah-idayu-chatbot.json',  # speaker 1
    '/home/husein/ssd3/tts/multispeaker-clean-vits-anwar-ibrahim.json',  # speaker 2
]

In [2]:
import phonemizer

# Shared espeak backend, created once and reused by every phonemize() call below.
global_phonemizer = phonemizer.backend.EspeakBackend(
    language='ms',               # espeak voice code passed to the backend
    preserve_punctuation=True,   # keep punctuation marks in the phonemized output
    with_stress=True,            # keep stress marks in the phoneme strings
)

In [3]:
from tqdm import tqdm

"""
LJSpeech-1.1/wavs/LJ022-0023.wav|ðɪ ˌoʊvɚwˈɛlmɪŋ mədʒˈɔːɹᵻɾi ʌv pˈiːpəl ɪn ðɪs kˈʌntɹi nˈoʊ hˌaʊ tə sˈɪft ðə wˈiːt fɹʌmðə tʃˈæf ɪn wʌt ðeɪ hˈɪɹ ænd wʌt ðeɪ ɹˈiːd .|0
"""

# Build one `path|phonemes|speaker_id` line per dataset entry, for every speaker file.
texts = []
for no, f in enumerate(files):
    # `no` (the file's position in `files`) is written out as the speaker id.
    # JSON is UTF-8 by specification, so do not depend on the locale encoding.
    with open(f, encoding='utf-8') as fopen:
        d = json.load(fopen)
    # The file is closed here; only the parsed data is needed for the slow loop.
    for d_ in tqdm(d):
        # d_[-1] is a sequence of symbol ids; the last id is dropped before
        # decoding (presumably the eos marker -- TODO confirm against the
        # dataset writer). d_[0] is used verbatim as the first output field.
        t = ''.join([vocab[i] for i in d_[-1][:-1]]).split()
        # Keep punctuation-only tokens verbatim. A per-character check is used
        # because `word in string.punctuation` is a *substring* test, which
        # accepts "()" but rejects "?!" and so treats such tokens inconsistently.
        phonemes = [
            word
            if all(ch in string.punctuation for ch in word)
            else global_phonemizer.phonemize([word], strip=True)[0]
            for word in t
        ]
        phonemes = ' '.join(phonemes)
        texts.append(f'{d_[0]}|{phonemes}|{no}')

len(texts)

100%|███████████████████████████████████████████████████████████████████████████████| 127137/127137 [00:33<00:00, 3828.71it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 141475/141475 [00:35<00:00, 3957.03it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 106014/106014 [00:26<00:00, 3944.16it/s]


374626

In [4]:
# The phoneme strings contain IPA (non-ASCII) characters, so pin the encoding
# instead of relying on the platform/locale default.
with open('train_list.txt', 'w', encoding='utf-8') as fopen:
    fopen.write('\n'.join(texts))

In [5]:
import random

# Build the validation list: a small random sample of entries per speaker file,
# formatted as `path|phonemes|speaker_id`.
VAL_SAMPLES_PER_SPEAKER = 30
VAL_SEED = 1234

# Dedicated, seeded RNG so the validation list is identical on every
# Restart & Run All and does not disturb the global `random` state.
val_rng = random.Random(VAL_SEED)

# NOTE(review): these entries are sampled from the same files that were written
# in full to train_list.txt, so the validation set overlaps the training set.
# Confirm this is intended; otherwise hold these entries out of the train list.
texts = []
for no, f in enumerate(files):
    # `no` (the file's position in `files`) is written out as the speaker id.
    with open(f, encoding='utf-8') as fopen:
        d = json.load(fopen)
    # Cap the sample size so a file with fewer entries does not raise ValueError.
    d = val_rng.sample(d, min(VAL_SAMPLES_PER_SPEAKER, len(d)))
    for d_ in tqdm(d):
        # d_[-1] is a sequence of symbol ids; the last id is dropped before
        # decoding (presumably the eos marker -- TODO confirm).
        t = ''.join([vocab[i] for i in d_[-1][:-1]]).split()
        # Keep punctuation-only tokens verbatim. A per-character check is used
        # because `word in string.punctuation` is a *substring* test, which
        # accepts "()" but rejects "?!" and so treats such tokens inconsistently.
        phonemes = [
            word
            if all(ch in string.punctuation for ch in word)
            else global_phonemizer.phonemize([word], strip=True)[0]
            for word in t
        ]
        phonemes = ' '.join(phonemes)
        texts.append(f'{d_[0]}|{phonemes}|{no}')

len(texts)

100%|███████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 3249.97it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 4043.61it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 4292.75it/s]


90

In [6]:
# The phoneme strings contain IPA (non-ASCII) characters, so pin the encoding
# instead of relying on the platform/locale default.
with open('val_list.txt', 'w', encoding='utf-8') as fopen:
    fopen.write('\n'.join(texts))