In [None]:
# Load the Common Voice dataset (downloaded files are cached under .cache)

from datasets import load_dataset, Audio

# Common Voice language code of the target low-resource language.
low_resource_language_code = "cy"  # Welsh

# Load every split for that language, then re-cast the "audio" column so
# samples are decoded at 16 kHz.
ds = load_dataset(
    "mozilla-foundation/common_voice_11_0",
    low_resource_language_code,
).cast_column("audio", Audio(sampling_rate=16000))

In [None]:
from tqdm import tqdm
import soundfile as sf
import os

def export_split(split, output_dir):
    """Write each audio sample of ``split`` to disk and return a manifest.

    Samples are written as ``<output_dir>/<index:05d>.mp3``, where the index
    is the sample's position in ``split``.

    Args:
        split: A sized iterable of samples. Each sample must provide
            ``sample["audio"]["array"]``, ``sample["audio"]["sampling_rate"]``
            and ``sample["sentence"]``.
        output_dir: Directory that receives the audio files; created if it
            does not exist yet.

    Returns:
        A list with one ``{"speech": <file path>, "text": <sentence>}`` dict
        per sample, in the same order as ``split``.
    """
    os.makedirs(output_dir, exist_ok=True)
    records = []
    for i, sample in tqdm(enumerate(split), total=len(split), desc=output_dir):
        audio = sample["audio"]
        filename = os.path.join(output_dir, f"{i:05d}.mp3")
        # NOTE(review): format="mp3" presumably needs a libsndfile build with
        # MP3 support -- confirm for the target environment.
        sf.write(filename, audio["array"], audio["sampling_rate"], format="mp3")
        records.append({
            "speech": filename,
            "text": sample["sentence"],
        })
    return records


# Export both splits with the same routine instead of two copy-pasted loops.
train_dataset = export_split(ds["train"], "./speech/train")
valid_dataset = export_split(ds["validation"], "./speech/valid")

In [None]:
# Create the dump files (lists of texts and audio file locations) for each split

import espnetez as ez

# Per-field description handed to create_dump_file for every split.
# NOTE(review): each value looks like [dump file name, data type] -- confirm
# against the espnetez documentation.
data_info = dict(
    speech=["wav.scp", "sound"],
    text=["text", "text"],
)

# Write one dump directory per split, train first and then valid.
for dump_dir, records in (
    ("./dump/train", train_dataset),
    ("./dump/valid", valid_dataset),
):
    ez.data.create_dump_file(dump_dir, records, data_info)

In [None]:
# train & save a tokenizer

import espnetez as ez

# Text files whose sentences feed the tokenizer; more dump text files can be
# appended here to train sentencepiece on several datasets.
sentence_sources = ["dump/train/text"]
ez.preprocess.prepare_sentences(sentence_sources, "dump/spm")

# Train the sentencepiece model on the prepared sentences.
ez.preprocess.train_sentencepiece("dump/spm/train.txt", "data/bpemodel", vocab_size=1000)