In [None]:
import pandas as pd
import os
# Root directory of the Spanish CML-TTS dataset; the metadata CSV and the
# audio files referenced by it are resolved relative to this directory.
root_path = "/srv/storage/idmctal@storage1.nancy.grid5000.fr/2023/m2/adrelingyte/data/spanish/cml_tts_dataset_spanish_v0.1"

# Outputs are written into the dataset root itself.
output_path = root_path

# Location of the pipe-separated training metadata.
train_csv_path = os.path.join(root_path, "train.csv")

# Read the metadata table and preview its first rows.
df = pd.read_csv(train_csv_path, sep='|')
print(df.head())

# Prefix every relative "wav_filename" entry with the dataset root so each
# row carries the full path to its audio file.
df['full_audio_path'] = [os.path.join(root_path, rel_path) for rel_path in df['wav_filename']]

# Sanity check: show the resolved path of the first row.
print(df['full_audio_path'][0])

In [None]:
import os

import torch
from trainer import Trainer, TrainerArgs
from TTS.config import BaseAudioConfig, BaseDatasetConfig
from TTS.tts.configs.fast_speech_config import FastSpeechConfig
from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.forward_tts import ForwardTTS
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

# Query CUDA availability once, report it, and keep the flag for later use.
USE_CUDA = torch.cuda.is_available()
print(USE_CUDA)

# Directory that holds the dataset and receives all training artefacts
# (run folders, phoneme cache).
output_path = "/srv/storage/idmctal@storage1.nancy.grid5000.fr/2023/m2/adrelingyte/data/spanish/cml_tts_dataset_spanish_v0.1"

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(output_path, exist_ok=True)

# Dataset description: "train.csv" inside `output_path`, parsed by the
# formatter registered under the name "custom_formatter".
# NOTE(review): no `formatter=` callable is passed to `load_tts_samples`, so the
# name must be resolvable by the library itself — confirm where
# "custom_formatter" is defined.
dataset_config = BaseDatasetConfig(formatter="custom_formatter", meta_file_train="train.csv", path=output_path)

# Load the samples and let the loader carve out an evaluation split.
# NOTE(review): the training log reports a max text length of 2, which
# suggests the formatter may not be returning the transcript column — verify.
train_samples, eval_samples = load_tts_samples([dataset_config], eval_split=True)

# Single source of truth for all audio/STFT settings. The STFT sizes used to be
# patched onto `config.audio` after the fact while the AudioProcessor was built
# from this separate object, so the processor and the saved model config
# disagreed (sample rate, FFT size, window length).
audio_config = BaseAudioConfig(
    sample_rate=24000,
    fft_size=2048,
    win_length=1200,
    hop_length=256,
    num_mels=80,
    do_trim_silence=True,
    trim_db=23.0,
    signal_norm=False,
    mel_fmin=0.0,
    mel_fmax=8000,
    spec_gain=1.0,
    log_func="np.log",
    ref_level_db=20,
    preemphasis=0.0,
)

# Tacotron2Config: all model related values for training, validating and testing.
config = Tacotron2Config(
    audio=audio_config,  # make the model config carry the same audio settings as `ap`
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    precompute_num_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=100,
    use_phonemes=True,
    phonemizer="gruut",
    phoneme_language="es-es",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache2"),
    print_step=25,
    print_eval=False,
    mixed_precision=True,
    output_path=output_path,
    datasets=[dataset_config],
    use_speaker_embedding=True,
    min_text_len=0,
    max_text_len=500,
    min_audio_len=100000,  # Adjust this value based on your requirements
    max_audio_len=400000,  # Adjust this value based on your requirements
)

# Build the audio processor from the model config so feature extraction uses
# exactly the parameters that are stored alongside the checkpoints.
ap = AudioProcessor.init_from_config(config)

# Initialize the TTSTokenizer; it returns the (possibly updated) config as well.
tokenizer, config = TTSTokenizer.init_from_config(config)



In [5]:
# Collect speaker IDs from every sample (train and eval) so the speaker
# embedding table covers all speakers present in the dataset.
all_samples = train_samples + eval_samples
speaker_manager = SpeakerManager()
speaker_manager.set_ids_from_data(all_samples, parse_key="speaker_name")

# The model reads the speaker count from the config, so set it before
# constructing the network.
config.num_speakers = speaker_manager.num_speakers

# Multi-speaker Tacotron2 wired to the shared audio processor and tokenizer.
model = Tacotron2(
    config,
    ap,
    tokenizer,
    speaker_manager=speaker_manager,
)


 > Init speaker_embedding layer.


In [6]:
# Default trainer arguments; the model config and the output directory are
# passed separately.
trainer_args = TrainerArgs()

# Assemble the trainer with the model and both sample splits.
trainer = Trainer(
    trainer_args,
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

  return torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count
 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: True
 | > Precision: fp16
 | > Num. of CPUs: 32
 | > Num. of Torch Threads: 16
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=/srv/storage/idmctal@storage1.nancy.grid5000.fr/2023/m2/adrelingyte/data/spanish/cml_tts_dataset_spanish_v0.1/run-November-01-2023_06+26PM-5e2fae5
  from .autonotebook import tqdm as notebook_tqdm

 > Model has 32649330 parameters


 > `speakers.pth` is saved to /srv/storage/idmctal@storage1.nancy.grid5000.fr/2023/m2/adrelingyte/data/spanish/cml_tts_dataset_spanish_v0.1/run-November-01-2023_06+26PM-5e2fae5/speakers.pth.
 > `speakers_file` is updated in the config.json.


In [None]:
# Run training with the trainer built above; the config requests 100 epochs
# and the captured log starts at epoch 0.
trainer.fit()


[4m[1m > EPOCH: 0/100[0m
 --> /srv/storage/idmctal@storage1.nancy.grid5000.fr/2023/m2/adrelingyte/data/spanish/cml_tts_dataset_spanish_v0.1/run-November-01-2023_06+26PM-5e2fae5




> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: es-es
		| > phoneme backend: gruut
| > Number of instances : 166838



[1m > TRAINING (2023-11-01 18:26:31) [0m


 | > Preprocessing samples
 | > Max text length: 2
 | > Min text length: 1
 | > Avg text length: 1.9506109219095418
 | 
 | > Max audio length: 399382.0
 | > Min audio length: 100821.0
 | > Avg audio length: 245005.40084841967
 | > Num. instances discarded samples: 32942
 | > Batch group size: 0.
ot͡ʃoot͡ʃo

 [!] Character '͡' not found in the vocabulary. Discarding it. [!] Character '͡' not found in the vocabulary. Discarding it.



  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
