In [8]:
import os
import torch
import gruut
from trainer import Trainer, TrainerArgs
from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.config import BaseAudioConfig, BaseDatasetConfig
from TTS.tts.configs.fast_speech_config import FastSpeechConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.forward_tts import ForwardTTS
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.tts.models.tacotron2 import Tacotron2

# Check if CUDA is available
print(torch.cuda.is_available())
USE_CUDA = torch.cuda.is_available()

output_path = "/srv/storage/idmctal@storage1.nancy.grid5000.fr/2023/m2/adrelingyte/data/spanish/cml_tts_dataset_spanish_v0.1"

if not os.path.exists(output_path):
    os.makedirs(output_path)

# Create a BaseDatasetConfig object
dataset_config = BaseDatasetConfig(formatter="custom_formatter", meta_file_train="train.csv", path=os.path.join(output_path))

# Load the dataset using your custom formatter
train_samples, eval_samples = load_tts_samples([dataset_config], eval_split=True)

audio_config = BaseAudioConfig(
    sample_rate=24000,
    do_trim_silence=True,
    trim_db=23.0,
    signal_norm=False,
    mel_fmin=0.0,
    mel_fmax=8000,
    spec_gain=1.0,
    log_func="np.log",
    ref_level_db=20,
    preemphasis=0.0,
)

# Configure your Tacotron2 model
config = Tacotron2Config(
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    precompute_num_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=100,
    use_phonemes=True,
    phonemizer="gruut",
    phoneme_language="es-es",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache2"),
    print_step=25,
    print_eval=False,
    mixed_precision=True,
    output_path=output_path,
    datasets=[dataset_config],
    use_speaker_embedding=True,
    min_text_len=0,
    max_text_len=500,
    min_audio_len=100000,  # Adjust this value based on your requirements
    max_audio_len=400000,  # Adjust this value based on your requirements
)

config.audio.fft_size = 2048
config.audio.win_length = 1200
config.audio.hop_length = 256
config.audio.num_mels = 80

ap = AudioProcessor.init_from_config(audio_config)

# Initialize the TTSTokenizer
tokenizer, config = TTSTokenizer.init_from_config(config)

# Initialize the gruut phonemizer

# Initialize speaker manager
speaker_manager = SpeakerManager()
speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name")
config.num_speakers = speaker_manager.num_speakers

# Initialize Tacotron2 model
model = Tacotron2(config, ap, tokenizer, speaker_manager=speaker_manager)
trainer = Trainer(
    TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)

# Training loop with phonemized text printing

# Train the model
trainer.fit()


False


 | > Found 168523 files in /srv/storage/idmctal@storage1.nancy.grid5000.fr/2023/m2/adrelingyte/data/spanish/cml_tts_dataset_spanish_v0.1
 > Setting up Audio Processor...
 | > sample_rate:24000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:23.0
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Init speaker_embedding layer.


 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: True
 | > Precision: fp16
 | > Num. of CPUs: 8
 | > Num. of Torch Threads: 1
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=/srv/storage/idmctal@storage1.nancy.grid5000.fr/2023/m2/adrelingyte/data/spanish/cml_tts_dataset_spanish_v0.1/run-November-03-2023_09+39PM-5cf4ea1

 > Model has 32649330 parameters


 > `speakers.pth` is saved to /srv/storage/idmctal@storage1.nancy.grid5000.fr/2023/m2/adrelingyte/data/spanish/cml_tts_dataset_spanish_v0.1/run-November-03-2023_09+39PM-5cf4ea1/speakers.pth.
 > `speakers_file` is updated in the config.json.



[4m[1m > EPOCH: 0/100[0m
 --> /srv/storage/idmctal@storage1.nancy.grid5000.fr/2023/m2/adrelingyte/data/spanish/cml_tts_dataset_spanish_v0.1/run-November-03-2023_09+39PM-5cf4ea1




> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: es-es
		| > phoneme backend: gruut
| > Number of instances : 166838



[1m > TRAINING (2023-11-03 21:41:08) [0m


 | > Preprocessing samples
 | > Max text length: 305
 | > Min text length: 0
 | > Avg text length: 137.913679273466
 | 
 | > Max audio length: 399382.0
 | > Min audio length: 100821.0
 | > Avg audio length: 245005.40084841967
 | > Num. instances discarded samples: 32942
 | > Batch group size: 0.
{'text': 'mer l el estudiante se desmayó', 'ph_hat': 'tɾes', 'token_ids': array([22, 82,  8, 21]), 'token_ids_len': 4}{'text': 'decidle que habían dado aviso a ordwinter de mi viaje a inglaterra', 'ph_hat': 'tɾeθe', 'token_ids': array([ 22,  82,   8, 116,   8]), 'token_ids_len': 5}{'text': 'los moros tienen una fiesta de caballos que llama la fantasía', 'ph_hat': 'dʝeθ', 'token_ids': array([  7, 105,   8, 116]), 'token_ids_len': 4}


{'text': 'rica y abundante de todas las cosas necesarias para la vida humana', 'ph_hat': 'onθe', 'token_ids': array([ 17,  16, 116,   8]), 'token_ids_len': 4}
{'text': 'has amado la justicia y aborrecido la maldad', 'ph_hat': 'sʝete', 'token_ids': array([ 21, 105, 

 > Keyboard interrupt detected.
 > Saving model before exiting...

 > CHECKPOINT : /srv/storage/idmctal@storage1.nancy.grid5000.fr/2023/m2/adrelingyte/data/spanish/cml_tts_dataset_spanish_v0.1/run-November-03-2023_09+39PM-5cf4ea1/checkpoint_0.pth


: 