In [1]:
# !pip3.10 install -e . --no-deps
# !pip3.10 install torchdiffeq x-transformers jieba pypinyin ema_pytorch
# !wget https://gist.githubusercontent.com/huseinzol05/98974ae8c6c7a65d4bc0af9f5003786a/raw/2e06e71ef7349a57bc58cc9913ae6bae1f9f8447/mp.py

In [59]:
from datasets.arrow_writer import ArrowWriter
from datasets.arrow_reader import ArrowReader

In [2]:
import json
import os
from glob import glob
from tqdm import tqdm
from datasets.arrow_writer import ArrowWriter
from concurrent.futures import ProcessPoolExecutor
from f5_tts.model.utils import (
    repetition_found,
    convert_char_to_pinyin,
)
from importlib.resources import files

# Transcripts containing any of these characters are dropped by
# `deal_with_audio_dir` (an Arabic letter and two hiragana characters).
en_filters = ["ا", "い", "て"]

# Text front-end; when set to 'pinyin' the text is passed through
# `convert_char_to_pinyin` before being stored.
tokenizer = 'pinyin'
dataset_name = "Emilia_Malaysian_" + tokenizer

# Output folder: two levels above the installed `f5_tts` package, under data/.
save_dir = "/".join([str(files("f5_tts").joinpath("../../")), "data", dataset_name])
save_dir

'/home/husein/ssd3/F5-TTS/src/f5_tts/../../data/Emilia_Malaysian_pinyin'

In [3]:
def deal_with_audio_dir(file):
    """Turn one transcript JSON file into training records.

    The JSON file is expected to hold a list of segment objects, each with
    ``"text"``, ``"start"`` and ``"end"`` keys. The segment at position ``no``
    is paired with the clip ``<stem>_<no>.mp3`` in the same folder as the JSON
    file, where ``<stem>`` is the JSON file name without its ``.json`` suffix.

    Segments are skipped when their text contains one of ``en_filters`` or
    when ``repetition_found(text, length=4)`` is true.

    Args:
        file: Path to the transcript ``.json`` file (absolute or relative).

    Returns:
        A tuple ``(sub_result, durations, vocab_set)``. ``sub_result`` is a
        list of ``{"audio_path", "text", "duration"}`` dicts, ``durations`` the
        matching list of ``end - start`` values, and ``vocab_set`` the set of
        tokens seen in the kept texts. All three are empty when the file
        cannot be read or parsed.
    """
    sub_result, durations = [], []
    vocab_set = set()

    folder, basename = os.path.split(file)
    # Strip the extension from the file *name* only. The earlier
    # `file.replace('.json', '')` kept the directory part (so a relative path
    # got its folder joined twice) and also rewrote any directory whose name
    # happened to contain ".json".
    suffix = '.json'
    stem = basename[:-len(suffix)] if basename.endswith(suffix) else basename

    try:
        with open(file, encoding="utf-8") as fopen:
            d = json.load(fopen)
    except (OSError, ValueError):
        # Best-effort: an unreadable or malformed transcript contributes
        # nothing. ValueError covers json.JSONDecodeError and decode errors,
        # while KeyboardInterrupt / SystemExit are no longer swallowed.
        return sub_result, durations, vocab_set

    for no, obj in enumerate(d):
        text = obj["text"].strip()
        if any(f in text for f in en_filters) or repetition_found(text, length=4):
            continue

        if tokenizer == "pinyin":
            text = convert_char_to_pinyin([text], polyphone=True)[0]
        duration = obj["end"] - obj['start']
        # `no` is the position in the JSON list, so skipped segments leave
        # gaps in the clip numbering rather than shifting later clips.
        audio_path = os.path.join(folder, f'{stem}_{no}.mp3')
        sub_result.append({"audio_path": audio_path, "text": text, "duration": duration})
        durations.append(duration)
        vocab_set.update(text)

    return sub_result, durations, vocab_set

In [29]:
import soundfile as sf
from tqdm import tqdm
import subprocess
import re

def get_length(file):
    """Return the duration of a media file in seconds, as reported by ffmpeg.

    Runs ``ffmpeg -i <file>`` and parses the ``Duration: HH:MM:SS.xx,`` line
    from its output. stderr is merged into stdout so the line is captured
    whichever stream ffmpeg writes it to.

    Args:
        file: Path of the audio file to probe.

    Returns:
        The duration in seconds as a float.

    Raises:
        ValueError: If the output contains no parsable ``Duration`` line
            (for example when the file is missing or is not a media file).
        OSError: If the ``ffmpeg`` executable cannot be started.
    """
    completed = subprocess.run(
        ['ffmpeg', '-i', file],
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
    )
    # The banner echoes file names and metadata that are not guaranteed to be
    # valid UTF-8; a stray byte must not abort the probe.
    output = (completed.stdout or b"").decode(errors='replace')

    match = re.search(
        r"Duration:\s{1}(?P<hours>\d+?):(?P<minutes>\d+?):(?P<seconds>\d+\.\d+?),",
        output,
        re.DOTALL)
    if match is None:
        # Previously this surfaced as "'NoneType' object has no attribute
        # 'groupdict'", which hid the offending file.
        raise ValueError(f"ffmpeg reported no duration for {file!r}")

    parts = match.groupdict()
    return float(parts['hours']) * 60 * 60 + \
        float(parts['minutes']) * 60 + float(parts['seconds'])

def loop(data, audio_root='/home/husein/ssd3'):
    """Build training records for one chunk of the transcription table.

    Args:
        data: A 2-tuple ``(rows, _)``. ``rows`` is an iterable of dicts with an
            ``"audio"`` key (path relative to ``audio_root``) and a
            ``"transcription"`` key; the second element is ignored.
        audio_root: Directory the relative ``"audio"`` paths are joined onto.
            Defaults to the location used when this notebook was run; bind a
            different value with ``functools.partial`` when the corpus lives
            elsewhere.

    Returns:
        A one-element list ``[[sub_result, durations, vocab_set]]``:
        ``sub_result`` is a list of ``{"audio_path", "text", "duration"}``
        dicts, ``durations`` the matching list of clip lengths in seconds
        (from ``get_length``) and ``vocab_set`` the set of tokens seen.
    """
    rows, _ = data
    sub_result, durations = [], []
    vocab_set = set()

    for d in tqdm(rows):
        audio = os.path.join(audio_root, d['audio'])
        duration = get_length(audio)
        text = d["transcription"].strip()

        if tokenizer == "pinyin":
            text = convert_char_to_pinyin([text], polyphone=True)[0]

        sub_result.append({"audio_path": audio, "text": text, "duration": duration})
        durations.append(duration)
        vocab_set.update(text)

    # Wrapped in an outer list; the merge cell iterates `gather` as a flat
    # sequence of [sub_result, durations, vocab_set] triples.
    return [[sub_result, durations, vocab_set]]

In [7]:
import pandas as pd

# Read the transcription table and turn it straight into a list of row dicts,
# so `data` is only ever bound to the list that the later cells consume.
data = pd.read_parquet('/home/husein/ssd3/verify-text.parquet').to_dict(orient='records')

In [9]:
# Peek at the first record to check the schema: an `audio` path and its `transcription`.
data[0]

{'audio': 'malaysian-podcast_processed_24k/Cara Nak Apply Student Exchange [vFhLEniT9X8]/Cara Nak Apply Student Exchange [vFhLEniT9X8]_0.mp3',
 'transcription': 'Cara nak apply, macam Puteri kan time internship. So, Puteri punya keluar dekat group internship, aa, dia keluar satu form.'}

In [56]:
# loop((data[:10], 0))

In [67]:
import mp

# Run `loop` over all records through the `mp` helper with cores = 20.
# The recorded progress bars show this took roughly 2h20m.
# NOTE(review): `mp.multiprocessing` is assumed to hand each worker a
# `(chunk, index)` tuple and to concatenate the returned lists; confirm in mp.py.
gather = mp.multiprocessing(data, loop, cores = 20)

100%|█████████████████████████████████████████████████████████████████████████████| 121911/121911 [2:19:14<00:00, 14.59it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 121911/121911 [2:19:14<00:00, 14.59it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 10.23it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 121911/121911 [2:21:29<00:00, 14.36it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 121911/121911 [2:22:10<00:00, 14.29it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 121911/121911 [2:22:21<00:00, 14.27it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 121911/121911 [2:22:29<00:00, 14.26it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 121911/121911 [2:22:36<00:00, 14.25it/s]


In [68]:
# Corpus-wide accumulators: records, clip durations and the token vocabulary.
result, duration_list, text_vocab_set = [], [], set()

# Each entry of `gather` is one worker's [sub_result, durations, vocab_set].
for sub_result, durations, vocab_set in gather:
    result += sub_result
    duration_list += durations
    text_vocab_set |= vocab_set

In [69]:
# Create the output folder (and any missing parents). `exist_ok=True` makes the
# cell safe to re-run and avoids the gap between checking and creating.
os.makedirs(save_dir, exist_ok=True)

In [70]:
# !wget https://huggingface.co/SWivid/F5-TTS/resolve/main/F5TTS_Base/vocab.txt -O {save_dir}/vocab.txt

In [71]:
# 1) Records with the local audio paths, exactly as produced by `loop`.
with ArrowWriter(path=f"{save_dir}/raw-original.arrow") as writer:
    for line in tqdm(result, desc="Writing to raw-original.arrow ..."):
        writer.write(line)
    # ArrowWriter buffers rows in batches; finalize() flushes the trailing
    # partial batch so the file holds every record (and so its row count
    # matches duration.json below).
    writer.finalize()

# 2) The same records with the audio root rewritten for the training machine.
with ArrowWriter(path=f"{save_dir}/raw.arrow") as writer:
    for line in tqdm(result, desc="Writing to raw.arrow ..."):
        # Write a remapped copy instead of editing `line` in place: `result`
        # keeps its local paths, so re-running this cell cannot leak
        # /workspace paths into raw-original.arrow.
        remapped = {
            **line,
            'audio_path': line['audio_path'].replace('/home/husein/ssd3', '/workspace'),
        }
        writer.write(remapped)
    writer.finalize()

# Durations are also saved on their own, in record order, so a batch sampler
# (e.g. DynamicBatchSampler) can read them without opening the arrow file.
with open(f"{save_dir}/duration.json", "w", encoding="utf-8") as f:
    json.dump({"duration": duration_list}, f, ensure_ascii=False)

Writing to raw.arrow ...: 100%|███████████████████████████████████████████████| 2438225/2438225 [00:19<00:00, 127594.78it/s]
Writing to raw.arrow ...: 100%|███████████████████████████████████████████████| 2438225/2438225 [00:19<00:00, 124918.45it/s]


In [72]:
from datasets import Dataset as Dataset_
# Load the freshly written raw.arrow back as a Dataset (sanity check that it opens).
dataset = Dataset_.from_file(f"{save_dir}/raw.arrow")

In [73]:
# Preview the /workspace form of the first record's audio path, i.e. the form stored in raw.arrow.
result[0]['audio_path'].replace('/home/husein/ssd3', '/workspace')

'/workspace/malaysian-podcast_processed_24k/Cara Nak Apply Student Exchange [vFhLEniT9X8]/Cara Nak Apply Student Exchange [vFhLEniT9X8]_0.mp3'

In [75]:
from f5_tts.model.dataset import load_dataset
from f5_tts.model.utils import get_tokenizer

In [76]:
# Load the token -> index map and the vocabulary size for the pinyin tokenizer.
# NOTE(review): presumably this reads data/Emilia_Malaysian_pinyin/vocab.txt
# (dataset name + tokenizer); confirm in f5_tts.model.utils.get_tokenizer.
vocab_char_map, vocab_size = get_tokenizer('Emilia_Malaysian', 'pinyin')

In [77]:
# Mel-spectrogram settings, kept as individual names and bundled into one
# keyword dict for the dataset loader.
target_sample_rate = 24000
n_fft = 1024
win_length = 1024
hop_length = 256
n_mel_channels = 100
mel_spec_type = "vocos"  # 'vocos' or 'bigvgan'

mel_spec_kwargs = {
    "n_fft": n_fft,
    "hop_length": hop_length,
    "win_length": win_length,
    "n_mel_channels": n_mel_channels,
    "target_sample_rate": target_sample_rate,
    "mel_spec_type": mel_spec_type,
}

In [78]:
# Build the training dataset with the mel-spectrogram settings defined above.
# NOTE(review): presumably this picks up raw.arrow and duration.json from
# data/Emilia_Malaysian_pinyin; confirm in f5_tts.model.dataset.load_dataset.
train_dataset = load_dataset('Emilia_Malaysian', 'pinyin', mel_spec_kwargs=mel_spec_kwargs)

Loading dataset ...


In [79]:
from f5_tts.model.utils import list_str_to_idx

In [81]:
# list_str_to_idx(train_dataset[0]['text'], vocab_char_map).shape

In [84]:
!ls data/Emilia_Malaysian_pinyin

duration.json  raw.arrow  raw-original.arrow  vocab.txt


In [85]:
from huggingface_hub import HfApi
# Client reused by the upload cells below.
# NOTE(review): no token is passed, so authentication presumably comes from the
# environment or a cached login; confirm before running on another machine.
api = HfApi()

# Publish duration.json to the dataset repo under the same relative path.
api.upload_file(
    path_or_fileobj="data/Emilia_Malaysian_pinyin/duration.json",
    path_in_repo="data/Emilia_Malaysian_pinyin/duration.json",
    repo_id="mesolitica/Malaysian-Voice-Conversion",
    repo_type="dataset",
)

duration.json:   0%|          | 0.00/15.4M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Malaysian-Voice-Conversion/commit/e4e3f8a78592db1a99eae1c7dc22f1be20159e18', commit_message='Upload data/Emilia_Malaysian_pinyin/duration.json with huggingface_hub', commit_description='', oid='e4e3f8a78592db1a99eae1c7dc22f1be20159e18', pr_url=None, pr_revision=None, pr_num=None)

In [86]:
# Publish vocab.txt to the dataset repo under the same relative path.
api.upload_file(
    path_or_fileobj="data/Emilia_Malaysian_pinyin/vocab.txt",
    path_in_repo="data/Emilia_Malaysian_pinyin/vocab.txt",
    repo_id="mesolitica/Malaysian-Voice-Conversion",
    repo_type="dataset",
)

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Malaysian-Voice-Conversion/commit/594784d5e2d28dd3cd838cf5709db83f82c44f64', commit_message='Upload data/Emilia_Malaysian_pinyin/vocab.txt with huggingface_hub', commit_description='', oid='594784d5e2d28dd3cd838cf5709db83f82c44f64', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Publish raw.arrow (the /workspace-path variant) to the dataset repo.
# NOTE(review): the saved output shows this upload at 0% of 2.39G with no
# CommitInfo, so it appears not to have finished when the notebook was saved.
api.upload_file(
    path_or_fileobj="data/Emilia_Malaysian_pinyin/raw.arrow",
    path_in_repo="data/Emilia_Malaysian_pinyin/raw.arrow",
    repo_id="mesolitica/Malaysian-Voice-Conversion",
    repo_type="dataset",
)

raw.arrow:   0%|          | 0.00/2.39G [00:00<?, ?B/s]