## LeanSpeech Training: HFC-Female (en-US)
This notebook allows you to train [LeanSpeech TTS](https://github.com/mush42/leanspeech) on the [Hi-Fi-CAPTAIN en-US female dataset](https://ast-astrec.nict.go.jp/en/release/hi-fi-captain/).


## Plumbing

In [None]:
#@markdown ### Google Colab Anti-Disconnect
#@markdown Avoid automatic disconnection. Still, it will disconnect after **6 to 12 hours**.

import IPython
js_code = '''
function ClickConnect(){
console.log("Working");
document.querySelector("colab-toolbar-button#connect").click()
}
setInterval(ClickConnect,60000)
'''
display(IPython.display.Javascript(js_code))


#@markdown ### Check GPU type
#@markdown A higher capable GPU can lead to faster training speeds. By default, you will have a **Tesla T4**.
!nvidia-smi

## Prepare environment

In [None]:
#@markdown ### Clone leanSpeech repository

import os

if not os.path.isdir(os.path.join(os.getcwd(), "leanspeech")):
    print("Cloning leanspeech repository...")
    !git clone --depth=1 https://github.com/mush42/leanspeech

!cd ./leanSpeech

#@markdown ### Upgrade packages
!pip install --upgrade pip setuptools wheel

#@markdown ### Install leanspeech dependencies
!pip install -r requirements.txt

## Constants

In [None]:
# Download URL of the `hfc_en-US_F.zip` dataset archive (the same URL the
# dataset-formatting cell passes to `wget`).
HFC_FEMALE_DATASET_URL = "https://ast-astrec.nict.go.jp/release/hi-fi-captain/hfc_en-US_F.zip"
# Google Drive sharing link; judging by the name, it points to a Matcha ONNX model.
# NOTE(review): this is a `/view` page link and nothing in the visible notebook
# downloads it, while the synthetic-dataset cell expects a local
# `matcha-hfc_female.onnx` file -- confirm how that file is meant to be fetched.
HFC_FEMALE_MATCHA_ONNX = "https://drive.google.com/file/d/18ysTCnXipCzzK6LxJtXFJvpDa6QKt7Ol/view?usp=sharing"


## Upload sentences to generate synthetic dataset

In [None]:
from google.colab import files

# `files.upload()` opens the browser upload dialog; the result is used below as a
# mapping of uploaded filename -> raw bytes.
uploaded = files.upload()
if not uploaded:
    raise RuntimeError(
        "No file was uploaded. Upload a UTF-8 text file with one sentence per line."
    )
if len(uploaded) > 1:
    print(f"Warning: {len(uploaded)} files were uploaded; only the first one is used.")

# Take the first (filename, content) pair. Unpacking `tuple(uploaded.items())`
# into two names would try to unpack the tuple *of pairs* and raise ValueError
# when exactly one file is uploaded.
filename, file_content = next(iter(uploaded.items()))
print(f"Uploaded file: {filename}")

source_text = file_content.decode("utf-8")
# Keep only non-blank lines, with surrounding whitespace removed.
source_lines = [line for l in source_text.splitlines() if (line := l.strip())]
print(f"Found {len(source_lines)} lines in the uploaded file")

with open("sentences.txt", "w", encoding="utf-8") as outfile:
    outfile.write("\n".join(source_lines))
print("Wrote lines to `sentences.txt`")

## Generate synthetic dataset

In [None]:
# Runs leanspeech's `matcha_synthetic` tool on `sentences.txt` (written by the
# upload cell). The last positional argument is `./data/synthetic`; the merge
# cell later reads `train.txt` and `val.txt` from that directory.
# NOTE(review): `matcha-hfc_female.onnx` is not downloaded anywhere in the visible
# notebook -- it must already exist in the working directory; confirm.
# NOTE(review): `--cuda` presumably requires a GPU runtime -- confirm.
!python3 -m leanspeech.tools.matcha_synthetic \
    --batch-size 64 \
    --mel-mean "-6.38385" \
    --mel-std 2.541796 \
    --val-percent 0.025 \
    --cuda \
    matcha-hfc_female.onnx \
    en-us \
    sentences.txt \
    ./data/synthetic


## Format dataset as ljspeech

In [None]:
print("Downloading dataset...")
!wget "https://ast-astrec.nict.go.jp/release/hi-fi-captain/hfc_en-US_F.zip"

print("Formatting dataset as ljspeech...")
import csv
import os
from pathlib import Path, PurePosixPath
from zipfile import ZipFile

raw_data_output_dir = "./data/raw/"

train_output_dir = raw_data_output_dir .joinpath("train")
val_output_dir = raw_data_output_dir .joinpath("val")

wav_train_output_dir = train_output_dir.joinpath("wav")
wav_val_output_dir = val_output_dir.joinpath("wav")

wav_train_output_dir .mkdir(parents=True, exist_ok=True)
wav_val_output_dir .mkdir(parents=True, exist_ok=True)

wav2sent_train = {}
wav2sent_val = {}

print("Copying wav files...")
with open("hfc_en-US_F.zip", "r") as zfile:
    for fp in zfile.filelist:
        p = PurePosixPath(fp)
        if p.suffix != ".txt":
            continue
        content = zfile.read(fp).decode("utf-8")
        lines = [l for l in content.splitlines() if l.strip()]
        w2s = wav2sent_train if "val" not in p.name else wav2sent_val
        for line in lines:
            filestem, sent = line.split(" ", 1)
            w2s [filestem] = sent

    for fp in zfile.filelist:
        p = PurePosixPath(fp)
        if p.suffix != ".wav":
            continue
        if p.stem in wav2sent_train:
            output_path = os.fspath(wav_train_output_dir  / p.name)
        elif p.stem in wav2sent_val:
            output_path = os.fspath(wav_val_output_dir  / p.name)
        else:
            print(f"Warning: file `{fp} not found in train/val list.")
            continue
        zfile.extract(fp, path=output_path)

print("Writing metadata.csv")
with open(train_output_dir .joinpath("metadata.csv"), "w", encoding="utf-8") as cfile:
    writer = csv.writer(cfile, delimiter="|")
    writer.writerows(tuple(wav2sent_train.items()))

with open(val_output_dir .joinpath("metadata.csv"), "w", encoding="utf-8") as cfile:
    writer = csv.writer(cfile, delimiter="|")
    writer.writerows(tuple(wav2sent_val.items()))

print("Done formatting dataset as `ljspeech`")

!ls ./data/raw/

## Preprocess ground-truth dataset

In [None]:
# Runs leanspeech's `preprocess_dataset` tool with the dataset name
# `hfc_female-en-US`, followed by `./data/raw` (written by the formatting cell)
# and `./data/gt` (the merge cell later reads `train.txt` / `val.txt` from it).
# NOTE(review): the name is spelled `en-US` here but the training cell uses
# `hfc_female-en_US` -- confirm which spelling the leanspeech configs expect.
!python3 -m leanspeech.tools.preprocess_dataset hfc_female-en-US ./data/raw ./data/gt

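## Merge datasets

**Note:** the cell that merges the synthetic and ground-truth file lists is located after the training cell in this notebook. Run that merge cell before starting training.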

## Start training

In [None]:
!python3 leanspeech.train experiment="hfc_female-en_US"

In [None]:
import random
from pathlib import Path

# NOTE(review): this cell sits after the training cell in the notebook but
# produces the merged file lists, so it presumably has to be run before
# training starts -- confirm and reorder the cells if so.

# Fixed seed so that re-running the cell produces the same shuffled order.
SHUFFLE_SEED = 1234

data_dir = Path("./data")
# Directory the merged file lists are written to (the "expected location").
merged_dir = data_dir / "hfc_female-en_US"


def read_nonempty_lines(path):
    """Return the non-blank lines of the UTF-8 text file at `path`."""
    text = Path(path).read_text(encoding="utf-8")
    return [line for line in text.splitlines() if line.strip()]


train_synth_lines = read_nonempty_lines(data_dir / "synthetic" / "train.txt")
val_synth_lines = read_nonempty_lines(data_dir / "synthetic" / "val.txt")
train_gt_lines = read_nonempty_lines(data_dir / "gt" / "train.txt")
val_gt_lines = read_nonempty_lines(data_dir / "gt" / "val.txt")

all_train_lines = train_synth_lines + train_gt_lines
all_val_lines = val_synth_lines + val_gt_lines

# A private Random instance keeps the shuffle reproducible without touching the
# global `random` state.
rng = random.Random(SHUFFLE_SEED)
rng.shuffle(all_train_lines)
rng.shuffle(all_val_lines)

# Write straight into the target directory. `exist_ok=True` makes the cell safe
# to re-run, whereas a plain `mkdir` fails once the directory exists.
merged_dir.mkdir(parents=True, exist_ok=True)
(merged_dir / "train.txt").write_text("\n".join(all_train_lines), encoding="utf-8")
(merged_dir / "val.txt").write_text("\n".join(all_val_lines), encoding="utf-8")

print(f"Wrote {len(all_train_lines)} train and {len(all_val_lines)} val entries to {merged_dir}")
