In [None]:
import tensorflow as tf

# Check for GPU availability
if tf.test.is_gpu_available():
    print("GPU is available and ready for use.")
else:
    print("No GPU available. Check your setup.")


In [4]:
import pandas as pd
import os

column_names = ['wav_filename',	'wav_filesize',	'transcript',	'transcript_wav2vec',	'levenshtein',	'duration',	'num_words','client_id', 'path']

df = pd.read_csv('/srv/storage/idmctal@storage1.nancy.grid5000.fr/2023/m2/adrelingyte/data/spanish/cml_tts_dataset_spanish_v0.1/train.csv', delimiter='|')
df.columns = column_names
column_data = df.iloc[:, 3]
print(column_data)

# Search for the character '͡' in a specific column and handle missing values
search_character = '͡'
column_name = 'transcript'
result = df[df[column_name].str.contains(search_character, na=False)]

# Display the rows where the character was found
print(result)



0         vengo a verte pasar todos los días aporciten c...
1         entró efectivamente el tártaro con áspero cont...
2         reparte el señor del huerto la fruta y no ella...
3         que para huír de un hablador de estos querría ...
4         los lagartos los escarabajos los insectos buri...
                                ...                        
168518    señor don luis de santo orcaz voy a deciros pu...
168519    todo está resuelto y por ahora os dan con la p...
168520    la seora marquesa de leiba al recoger a la señ...
168521    por tanto he aquí que nuevamente excitaré yo l...
168522    porque perecerá la sabiduría de sus sabios y s...
Name: transcript_wav2vec, Length: 168523, dtype: object
Empty DataFrame
Columns: [wav_filename, wav_filesize, transcript, transcript_wav2vec, levenshtein, duration, num_words, client_id, path]
Index: []


In [1]:
import os
import torch
from trainer import Trainer, TrainerArgs
import gruut
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.config import BaseAudioConfig, BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.tts.models.tacotron2 import Tacotron2

print(torch.cuda.is_available())
USE_CUDA = torch.cuda.is_available()

file_output= "/srv/storage/idmctal@storage1.nancy.grid5000.fr/2023/m2/adrelingyte/data/spanish/cml_tts_dataset_spanish_v0.1/tmp"

output_path = "/srv/storage/idmctal@storage1.nancy.grid5000.fr/2023/m2/adrelingyte/data/spanish/cml_tts_dataset_spanish_v0.1"

if not os.path.exists(output_path):
    os.makedirs(output_path)
    

# Create a BaseDatasetConfig object
dataset_config = BaseDatasetConfig(formatter="custom_formatter2", meta_file_train="train.csv", path=os.path.join(output_path))



# Load the dataset using your custom formatter

audio_config = BaseAudioConfig(
    sample_rate=24000,
    do_trim_silence=True,
    trim_db=23.0,
    signal_norm=False,
    mel_fmin=0.0,
    mel_fmax=8000,
    spec_gain=1.0,
    log_func="np.log",
    ref_level_db=20,
    preemphasis=0.0,
)

# Configure your Tacotron2 model
config = Tacotron2Config(
    run_name="CTacotron2",
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    precompute_num_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=100,
    use_phonemes=True,
    phonemizer="gruut",
    phoneme_language="es",
    phoneme_cache_path=os.path.join(file_output, "phoneme_cache3"),
    print_step=25,
    print_eval=True,
    mixed_precision=True,
    output_path=file_output,
    datasets=[dataset_config],
    use_speaker_embedding=True,
    min_text_len=0,
    max_text_len=500,
    min_audio_len=100000,  # Adjust this value based on your requirements
    max_audio_len=400000,  # Adjust this value based on your requirements
)

config.audio.fft_size = 2048
config.audio.win_length = 1200
config.audio.hop_length = 256
config.audio.num_mels = 80

ap = AudioProcessor.init_from_config(audio_config)

train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

# Initialize the TTSTokenizer
tokenizer, config = TTSTokenizer.init_from_config(config)



False
 > Setting up Audio Processor...
 | > sample_rate:24000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:23.0
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024


AttributeError: module 'TTS.tts.datasets' has no attribute 'custom_formatter2'

In [2]:
speaker_manager = SpeakerManager()
speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name")
config.num_speakers = speaker_manager.num_speakers

model = Tacotron2(config, ap, tokenizer, speaker_manager=speaker_manager)


 > Init speaker_embedding layer.


In [3]:
trainer = Trainer(
    TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)


 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: True
 | > Precision: fp16
 | > Num. of CPUs: 8
 | > Num. of Torch Threads: 1
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=/srv/storage/idmctal@storage1.nancy.grid5000.fr/2023/m2/adrelingyte/data/spanish/cml_tts_dataset_spanish_v0.1/tmp/CTacotron2-November-04-2023_06+11PM-5cf4ea1

 > Model has 32649842 parameters


 > `speakers.pth` is saved to /srv/storage/idmctal@storage1.nancy.grid5000.fr/2023/m2/adrelingyte/data/spanish/cml_tts_dataset_spanish_v0.1/tmp/CTacotron2-November-04-2023_06+11PM-5cf4ea1/speakers.pth.
 > `speakers_file` is updated in the config.json.


In [4]:
trainer.fit()


[4m[1m > EPOCH: 0/100[0m
 --> /srv/storage/idmctal@storage1.nancy.grid5000.fr/2023/m2/adrelingyte/data/spanish/cml_tts_dataset_spanish_v0.1/tmp/CTacotron2-November-04-2023_06+11PM-5cf4ea1




> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: es
		| > phoneme backend: gruut
| > Number of instances : 166838



[1m > TRAINING (2023-11-04 18:11:56) [0m


 | > Preprocessing samples
 | > Max text length: 305
 | > Min text length: 0
 | > Avg text length: 137.913679273466
 | 
 | > Max audio length: 399382.0
 | > Min audio length: 100821.0
 | > Avg audio length: 245005.40084841967
 | > Num. instances discarded samples: 32942
 | > Batch group size: 0.


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


 > SSIM loss is out-of-range -inf, setting it 0.0



[1m   --> TIME: 2023-11-04 18:26:14 -- STEP: 0/4185 -- GLOBAL_STEP: 0[0m
     | > decoder_loss: 44.86867904663086  (44.86867904663086)
     | > postnet_loss: 46.83922576904297  (46.83922576904297)
     | > stopnet_loss: 0.6435643434524536  (0.6435643434524536)
     | > ga_loss: 0.006162174046039581  (0.006162174046039581)
     | > decoder_diff_spec_loss: 0.4541545510292053  (0.4541545510292053)
     | > postnet_diff_spec_loss: 4.502472400665283  (4.502472400665283)
     | > decoder_ssim_loss: 0.8203125  (0.8203125)
     | > postnet_ssim_loss: 0.0  (0.0)
     | > loss: 25.04558563232422  (25.04558563232422)
     | > align_error: 0.983642578125  (0.983642578125)
     | > grad_norm: tensor(5.7063)  (tensor(5.7063))
     | > current_lr: 2.5000000000000002e-08 
     | > step_time: 852.0242  (852.0241842269897)
     | > loader_time: 5.4737  (5.473690986633301)





: 