# 6 - Train tts

Based on [this colab](https://colab.research.google.com/drive/1N_B_38MMRk1BUqwI_C829TyGpNmppUqK?usp=sharing#scrollTo=m_HkOd4jwqIb) and [this other one](https://colab.research.google.com/drive/1q2mhEiclQVyNe20U9fLbzVobfDiLtXSQ?usp=sharing#scrollTo=PS3jyscLSDEc).




## 1 - Setup

In [None]:
%%capture
!pip install pyloudnorm
!git clone https://github.com/xiph/rnnoise.git
!sudo apt-get install curl autoconf automake libtool python-dev pkg-config sox
%cd /content/rnnoise
!sh autogen.sh
!sh configure
!make clean
!make

In [None]:
%%capture
%cd /content
!sudo apt-get install espeak-ng
!git clone https://github.com/coqui-ai/TTS.git
!pip install TTS

In [None]:
from IPython.display import Audio
import librosa
from google.colab import drive
from pathlib import Path
import shutil
import os
import subprocess
import soundfile as sf
import pyloudnorm as pyln
import sys
import glob
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import torch

In [None]:
def display_audio(path):
  x, sr = librosa.load(path)
  display(Audio(x, rate=sr))

def save_for_interence(input, output=None):
  input = Path(input)
  if output is None:
    name_parts = input.name.split('.')
    name_parts[0] = f"{name_parts[0]}_inference"
    name = '.'.join(name_parts)
    output = input.parent / name
  else:
    output = Path(output)
  #load model
  model = torch.load(input)
  keys = [k for k in model["model"].keys() if k.startswith('disc.')]
  for k in keys:
    del model["model"][k]

  torch.save(model, output)

  return output

def read_text(text, 
              model_path,
              config_path="/root/.local/share/tts/tts_models--es--css10--vits/config.json",
              out_path="/content/example.wav"):
  """Read a text using a model"""
  status = subprocess.run(["tts", 
                           "--text", text, 
                           "--model_path", str(model_path),
                           "--config_path", str(config_path), 
                           "--out_path", str(out_path)])
  if status.returncode:
    raise RuntimeError(f"Process finish with error {status}")
  return Path(out_path)

Mount drive

In [None]:
drive.mount('/content/drive/')

Mounted at /content/drive/


## 2 - Choose pretrained model
We will choose the model tts_models/es/css10/vits, pretrained with a male voice in spanish.

Can be listed with
```
!tts --list_models
```

**Need to run to download the model pretrained in the cache folder.**

 Should be stored at _/root/.local/share/tts/tts_models--es--css10--vits_

In [None]:
!tts --text "Es el vecino el que elige el alcalde y es el alcalde el que quiere que sean los vecinos el alcalde, fin de la cita." --model_name "tts_models/es/css10/vits" --out_path /content/example.wav

 > Downloading model to /root/.local/share/tts/tts_models--es--css10--vits
100% 101M/101M [00:04<00:00, 21.1MiB/s]
 > Model's license - bsd-3-clause
 > Check https://opensource.org/licenses for more info.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > initialization of speaker-

In [None]:
x, sr = librosa.load("/content/example.wav")
Audio(x, rate=sr)

## 3.A - Preprocessing and store dataset in drive (skip if done)

Download dataset

In [None]:
dataset_path = "/content/drive/MyDrive/Máster/DLASP/Final/dataset_clean.zip"
shutil.copy(dataset_path, "/content/dataset_clean.zip")
dataset_original = Path('/content/dataset/')
dataset_original.mkdir(exist_ok=True)
shutil.unpack_archive("/content/dataset_clean.zip", dataset_original)

Process dataset

In [None]:
dataset_processed = Path("/content/dataset_processed")
dataset_processed.mkdir(exist_ok=True)

rnn = "/content/rnnoise/examples/rnnoise_demo"

#paths = Path(src).glob("**/*.wav")
#paths = Path(orig_wavs).glob("**/*.wav")
paths = dataset_original.glob("*.wav")

for filepath in tqdm(list(paths), leave=False):
  target_filepath= dataset_processed / filepath.name

  subprocess.run(["sox", "-G", "-v", "0.95", filepath, "48k.wav", "remix", "-", "rate", "48000"])
  # convert wav to raw
  subprocess.run(["sox", "48k.wav", "-c", "1", "-r", "48000", "-b", "16", "-e", "signed-integer", "-t", "raw", "temp.raw"])
 
  # apply rnnoise
  subprocess.run([rnn, "temp.raw", "rnn.raw"])

  # convert raw back to wav
  subprocess.run(["sox", "-G", "-v", "0.95", "-r", "48k", "-b", "16", "-e", "signed-integer", "rnn.raw", "-t", "wav", "rnn.wav"])

  # apply high/low pass filter and change sr to 22050Hz
  subprocess.run(["sox", "rnn.wav", str(target_filepath), "remix", "-", "highpass", "100", "lowpass", "7000", "rate", "22050"])
  data, rate = sf.read(target_filepath)

  # peak normalize audio to -1 dB
  peak_normalized_audio = pyln.normalize.peak(data, -1.0)

  # measure the loudness first
  meter = pyln.Meter(rate) # create BS.1770 meter
  loudness = meter.integrated_loudness(data)

  # loudness normalize audio to -25 dB LUFS
  loudness_normalized_audio = pyln.normalize.loudness(data, loudness, -25.0)
  sf.write(target_filepath, data=loudness_normalized_audio, samplerate=22050)
  

Test differences

In [None]:
n = 3
display_audio(dataset_original / f"segment{n}.wav")
display_audio(dataset_processed / f"segment{n}.wav")

Save to drive

In [None]:
shutil.make_archive("dataset_processed", 'zip', dataset_processed)
shutil.copy("dataset_processed.zip", "/content/drive/MyDrive/Máster/DLASP/Final/dataset_processed.zip" )
metadata = pd.read_csv(dataset_original / "metadata.csv")
metadata.to_csv("/content/drive/MyDrive/Máster/DLASP/Final/metadata.csv", index=False)

## 3.B - Load dataset processed from drive

Copy metadata file and zip with wavs

In [None]:
shutil.copy("/content/drive/MyDrive/Máster/DLASP/Final/dataset_processed.zip", 
            "dataset_processed.zip")
metadata = pd.read_csv("/content/drive/MyDrive/Máster/DLASP/Final/metadata.csv")

Uncompress zip

In [None]:
dataset_processed = Path("/content/dataset_processed")
dataset_processed.mkdir(exist_ok=True)
shutil.unpack_archive("dataset_processed.zip", dataset_processed)

## 4 - Format metadata file and test pretrained model


### 4.1 - Format metadata
Clean strange characters and remove sentences with numbers (I didnt find a normalizer to replace numbers with its text version in spanish).

We keep a total of 4596 sentences.

Store the metada file in *dataset_processed / "metadata.csv"*

In [None]:
#metadata = pd.read_csv("/content/drive/MyDrive/Máster/DLASP/Final/metadata.csv")
# Replace strange characters
replacements = {"'": '', 
                '…': '',
                '%': ' por ciento',
                'î': 'i',
                'ê':'e',
                'è':'e',
                'е':'e',
                'к': 'k',
                'т':'t',
                '-' : ' ',
                '¡' : '',
                '¿' : '',
                #'é': 'e', # For any reason é is not encoded in the model
                'ü': 'u',
                }
for k, v in replacements.items():
  metadata.text = metadata.text.str.replace(k, v, regex=False)

metadata.text = metadata.text.str.strip()

# Convert to lowercase
metadata.text = metadata.text.str.lower()

# Remove sentences with numbers
for number in range(11):
  metadata = metadata[~metadata.text.str.contains(str(number))]

# Need to set the same speaker than in the finetuned example
metadata["speaker_name"] = "tux"

# Rename columns to match coqui formatter
metadata = metadata.rename(
    columns={"filename":"audio_file"}).drop(
        columns=["index", "song"])
    
metadata = metadata.reset_index(drop=True).copy()

# Store in data_processed / metadata.csv
metadata.to_csv(dataset_processed / "metadata.csv", sep='|', index=False)

metadata.head()

Unnamed: 0,text,audio_file,speaker_name
0,"busco una calma inalcanzable, la atmósfera aqu...",segment1.wav,tux
1,"quiero estar solo, si solo todo estará bien",segment2.wav,tux
2,"que nadie me hable, que no rompan este silenci...",segment3.wav,tux
3,"hoy quiero sentir el frío, vértigo que el mund...",segment4.wav,tux
4,"harto de fingir excusas, musa siento huir de m...",segment5.wav,tux


Check there aren't strange characters

In [None]:
np.unique(list(''.join(list(metadata.text))))

array([' ', '!', ',', '.', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
       'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u',
       'v', 'w', 'x', 'y', 'z', 'á', 'é', 'í', 'ñ', 'ó', 'ú'], dtype='<U1')

## 4.2 Test pretrained model cached
Test model cached is correctly download (in the expected path)

In [None]:
!tts --text "La cerámica de talavera no es cosa menor. Dicho de otro modo, es cosa mayor." --model_path /root/.local/share/tts/tts_models--es--css10--vits/model_file.pth.tar --config_path /root/.local/share/tts/tts_models--es--css10--vits/config.json > /dev/null

display_audio("tts_output.wav")

## 5.1 - Prepare training

Path where the training will be stored

In [None]:
output_path = Path("/content/output/")
output_path.mkdir(exist_ok=True)

Create training script

In [None]:
training_script_content = r"""
from TTS.config import load_config

from trainer import Trainer, TrainerArgs
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import Vits, VitsAudioConfig
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig

from TTS.tts.utils.languages import LanguageManager
from TTS.tts.utils.speakers import SpeakerManager
from pathlib import Path

# Folder where the pretrained path is cached
pretrained_folder = Path('/root/.local/share/tts/tts_models--es--css10--vits/')

# Load base configuration
vits_config = load_config(str(pretrained_folder / 'config.json'))
vits_config.output_path="/content/output/"
vits_config.run_name='tts_models--es--nach--vits'
vits_config.model_args.num_speakers = 1
vits_config.lr = 0.00001
#vits_config.lr_gen = vits_config.lr_gen / 100
vits_config.save_all_best = True
# Need discriminator for training
vits_config.init_discriminator = True
vits_config.model_args.init_discriminator = True
vits_config.epochs = 100

# Colab standard only has 2 threads
vits_config.num_loader_workers = 2
vits_config.num_eval_loader_workers = 2

# Override dataset config
vits_config.datasets = BaseDatasetConfig(
    formatter="coqui", 
    meta_file_train="metadata.csv", 
    path='/content/dataset_processed',
    language="es",
)

# Override test sentences
vits_config.test_sentences=[
    ['Un arcoíris\u200b o arco iris es un fenómeno óptico y meteorológico que '
    'causa la aparición en la atmósfera terrestre de un arco multicolor.',
     'tux', None, 'es']
]

# Override languages id file 
language_ids_file = str(pretrained_folder / 'language_ids.json')
vits_config.language_ids_file = language_ids_file
vits_config.model_args.language_ids_file = language_ids_file
# Audio processor
ap = AudioProcessor.init_from_config(vits_config)

# Load tokenizer
tokenizer, config = TTSTokenizer.init_from_config(vits_config)

# Load the training and eval samples
train_samples, eval_samples = load_tts_samples(
    config.datasets,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

# Load speaker  and language manager
speaker_manager = SpeakerManager.init_from_config(vits_config)
language_manager = LanguageManager(language_ids_file)

# Define model
model = Vits(config, ap, tokenizer, 
             speaker_manager=speaker_manager,
             language_manager=language_manager)


trainer_args = TrainerArgs()

trainer = Trainer(
    args=trainer_args,
    config=config,
    output_path='/content/output/',
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

trainer.fit()
"""


In [None]:
training_file = output_path / "training.py"
with open(training_file, 'w') as f:
  f.write(training_script_content)

## 5.2 Training

In [None]:
!CUDA_VISIBLE_DEVICES="0" python /content/output/training.py --restore_path /content/output/tts_models--es--nach--vits-December-06-2022_02+29PM-0000000/best_model_471189.pth

[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
     | > loss_disc_real_2: 0.25302  (0.22755)
     | > loss_disc_real_3: 0.23748  (0.22599)
     | > loss_disc_real_4: 0.25903  (0.24469)
     | > loss_disc_real_5: 0.25045  (0.24088)
     | > loss_0: 2.73395  (2.66132)
     | > grad_norm_0: 17.72506  (11.94886)
     | > loss_gen: 1.81327  (2.02746)
     | > loss_kl: 2.49938  (2.50559)
     | > loss_feat: 3.03084  (3.16602)
     | > loss_mel: 21.54112  (21.48723)
     | > loss_duration: 1.28240  (1.36507)
     | > amp_scaler: 512.00000  (512.00000)
     | > loss_1: 30.16701  (30.55137)
     | > grad_norm_1: 150.38744  (180.36931)
     | > current_lr_0: 0.00020 
     | > current_lr_1: 0.00020 
     | > step_time: 1.02140  (1.00363)
     | > loader_time: 0.00920  (0.00738)


[1m   --> STEP: 42/132 -- GLOBAL_STEP: 481000[0m
     | > loss_disc: 2.64772  (2.63540)
     | > loss_disc_real_0: 0.20863  (0.20213)
     | > loss_disc_real_1: 0.22897  (0.20725)
     | > l

Create backup in drive (~20gb)

In [None]:
training_folder = "/content/output/tts_models--es--nach--vits-December-06-2022_03+32PM-0000000"
shutil.make_archive("nach_training_backup", 
                    "zip", training_folder)
shutil.copy("nach_training_backup.zip", 
            "/content/drive/MyDrive/Máster/DLASP/Final/nach_training_backup.zip")

'/content/drive/MyDrive/Máster/DLASP/Final/nach_training_backup.zip'

## Test model

In [258]:
# Select checkpoint
training_path = Path("/content/output/tts_models--es--nach--vits-December-06-2022_03+32PM-0000000/")
model_path = training_path / "best_model_476866.pth"
# Prepare model for inference (remove discriminator network)
# Only need to be once
model_inference_path = save_for_interence(model_path)

# Test
text = ("Mi padre es el sol, mi madre la luna. "
        "Mi hermano es el viento y el planeta tierra mi cuna. "
        "Mis unicos hijos son las frases que me invento, "
        "y mi mayor regalo es vivir este momento")
wav_path = read_text(text=text, model_path=model_inference_path)

display_audio(wav_path)