In [1]:
import json
import random
from glob import glob
from tqdm import tqdm
from collections import defaultdict

files = glob('*vits*.json')
files = [f for f in files if 'combine' not in f and 'multispeaker-clean-vits.json' not in f]
files

['multispeaker-clean-vits-anwar-ibrahim.json',
 'multispeaker-clean-vits-kp-ms.json',
 'multispeaker-clean-vits-shafiqah-idayu-chatbot.json',
 'multispeaker-clean-vits-husein-chatbot.json',
 'multispeaker-clean-vits-kp-zh.json']

In [2]:
combine_all = []
for f in files:
    with open(f) as fopen:
        d = json.load(fopen)
    print(f, len(d))
    combine_all.extend(d)
    
len(combine_all)

multispeaker-clean-vits-anwar-ibrahim.json 106014
multispeaker-clean-vits-kp-ms.json 160265
multispeaker-clean-vits-shafiqah-idayu-chatbot.json 141475
multispeaker-clean-vits-husein-chatbot.json 127137
multispeaker-clean-vits-kp-zh.json 111128


646019

In [3]:
speakers = {}
for i in range(len(combine_all)):
    speakers[combine_all[i][0].split('tts/')[1].split('/')[0]] = combine_all[i][1]
    
speakers

{'anwar-ibrahim-chatbot': 8,
 'kp-ms-chatbot': 8,
 'shafiqah-idayu-chatbot': 10,
 'husein-chatbot': 10,
 'kp-zh-chatbot': 8}

In [4]:
new_speakers = {
    'anwar-ibrahim-chatbot': 'Anwar Ibrahim',
    'shafiqah-idayu-chatbot': 'Shafiqah Idayu',
    'husein-chatbot': 'Husein',
    'kp-ms-chatbot': 'KP',
    'kp-zh-chatbot': 'KP',
}

In [5]:
import re

_pad = 'pad'
_start = 'start'
_eos = 'eos'
_punctuation = "!'(),.:;? "
_special = '-'
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'

MALAYA_SPEECH_SYMBOLS = (
    [_pad, _start, _eos] + list(_special) + list(_punctuation) + list(_letters)
)
''.join([MALAYA_SPEECH_SYMBOLS[i] for i in combine_all[1][2]])

'Walaupun secara teorinya mungkin untuk membina struktur kecil dengan pencungkil gigi , ia agak mencabar dan tidak sesuai untuk rumah bersaiz biasa .eos'

In [6]:
for i in tqdm(range(len(combine_all))):
    new_id = new_speakers[combine_all[i][0].split('tts/')[1].split('/')[0]]
    combine_all[i][1] = new_id
    if isinstance(combine_all[i][2], list):
        combine_all[i][2] = ''.join([MALAYA_SPEECH_SYMBOLS[c] for c in combine_all[i][2][:-1]])

100%|█████████████████████████████████████████████████████████████████████████████| 646019/646019 [00:01<00:00, 395009.10it/s]


In [7]:
files = defaultdict(int)
for d in combine_all:
    files[d[0]] += 1
    
len(files), len(combine_all)

(646019, 646019)

In [8]:
for k, v in files.items():
    if v > 1:
        print(k)

In [9]:
combine_all[-1]

['/home/husein/ssd3/tts/kp-zh-chatbot/kp-chinese-texts-part4-7824.wav',
 'KP',
 '今天呢我们开始讲一本书 , 叫你是孩子最好的玩具 .']

In [10]:
from pypinyin import lazy_pinyin, Style
import jieba

jieba.initialize()

def is_chinese(c):
    return (
        "\u3100" <= c <= "\u9fff"
    )

def convert_char_to_pinyin(text_list, polyphone=True):
    final_text_list = []
    custom_trans = str.maketrans(
        {";": ",", "“": '"', "”": '"', "‘": "'", "’": "'", '，': ', ', '！': '. ', '。': '. '}
    ) 

    for text in text_list:
        char_list = []
        text = text.translate(custom_trans)
        for seg in jieba.cut(text):
            seg_byte_len = len(bytes(seg, "UTF-8"))
            if seg_byte_len == len(seg):  # if pure alphabets and symbols
                if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
                    char_list.append(" ")
                char_list.extend(seg)
            elif polyphone and seg_byte_len == 3 * len(seg):  # if pure east asian characters
                seg_ = lazy_pinyin(seg, style=Style.TONE3, tone_sandhi=True)
                for i, c in enumerate(seg):
                    if is_chinese(c):
                        char_list.append(" ")
                    char_list.append(seg_[i])
            else:  # if mixed characters, alphabets and symbols
                for c in seg:
                    if ord(c) < 256:
                        char_list.extend(c)
                    elif is_chinese(c):
                        char_list.append(" ")
                        char_list.extend(lazy_pinyin(c, style=Style.TONE3, tone_sandhi=True))
                    else:
                        char_list.append(c)
        final_text_list.append(char_list)

    return final_text_list

def normalize(text):
    converted = convert_char_to_pinyin(text.split())
    converted = [''.join(c) for c in converted]
    return ' '.join(converted).strip()

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.311 seconds.
Prefix dict has been built successfully.


In [11]:
speakers = defaultdict(list)
for r in tqdm(combine_all):
    a = {
        'audio_filename': r[0],
        'prompt': r[1],
        'transcription': normalize(r[2])
    }
    speakers[r[1]].append(a)

100%|███████████████████████████████████████████████████████████████████████████████| 646019/646019 [02:02<00:00, 5254.95it/s]


In [12]:
from sklearn.model_selection import train_test_split

train, test = [], []
for k, v in speakers.items():
    train_, test_ = train_test_split(v, test_size = 10)
    train.extend(train_)
    test.extend(test_)

In [13]:
random.shuffle(train)
random.shuffle(test)

In [14]:
from datasets import DatasetDict, Dataset

dataset_dict = DatasetDict({
    'train': Dataset.from_list(train),
    'test': Dataset.from_list(test)
})

In [16]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['audio_filename', 'prompt', 'transcription'],
        num_rows: 645979
    })
    test: Dataset({
        features: ['audio_filename', 'prompt', 'transcription'],
        num_rows: 40
    })
})

In [17]:
dataset_dict['train'][0]

{'audio_filename': '/home/husein/ssd3/tts/husein-chatbot/husein-chatbot-normalized-v2-54278.wav',
 'prompt': 'Husein',
 'transcription': 'Pokok tidak mempunyai rasa , kerana ia tidak boleh dimakan .'}

In [18]:
dataset_dict.push_to_hub('huseinzol05/mesolitica-tts-combined')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/646 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/476 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/huseinzol05/mesolitica-tts-combined/commit/bf486f8397018d1668e4344d3b09009d1fa2efda', commit_message='Upload dataset', commit_description='', oid='bf486f8397018d1668e4344d3b09009d1fa2efda', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/huseinzol05/mesolitica-tts-combined', endpoint='https://huggingface.co', repo_type='dataset', repo_id='huseinzol05/mesolitica-tts-combined'), pr_revision=None, pr_num=None)