In [1]:
# Mount Google Drive into the Colab filesystem; the dataset and output paths
# used by the training cell are located under this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


## Attempt to Train Polish dataset (100 samples in directory)

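Unsuccessful: training stops at the 7th epoch (shown as `EPOCH: 6/25` in the log below) with a `TypeError` raised while synthesizing the test sentences.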

In [7]:
import os

from trainer import Trainer, TrainerArgs

from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig, CapacitronVAEConfig
from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

# One directory on the mounted Drive serves both as the dataset location and as
# the directory that training runs are written to.
data_path = "/content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/"
output_path = data_path

# The dataset is read with the "ljspeech" formatter, with train.csv as the
# training metadata file inside data_path.
dataset_options = {
    "formatter": "ljspeech",
    "meta_file_train": "train.csv",
    "path": data_path,
}
dataset_config = BaseDatasetConfig(**dataset_options)

# Audio-processing settings, gathered in one mapping and expanded into the
# BaseAudioConfig constructor as keyword arguments.
audio_options = dict(
    sample_rate=24000,
    do_trim_silence=True,
    trim_db=60.0,
    signal_norm=False,
    mel_fmin=0.0,
    mel_fmax=11025,
    spec_gain=1.0,
    log_func="np.log",
    ref_level_db=20,
    preemphasis=0.0,
)
audio_config = BaseAudioConfig(**audio_options)

# Using the standard Capacitron config
capacitron_config = CapacitronVAEConfig(capacitron_VAE_loss_alpha=1.0, capacitron_capacity=50)

# Stepwise learning-rate schedule given as [step threshold, learning rate] pairs.
# The thresholds are expected to be in increasing order. The third entry used to
# read 4e5, which put it out of order between 2e4 and 6e4; it is corrected to 4e4
# so the 3e-4 stage sits between the 5e-4 and 1e-4 stages.
gradual_learning_rates = [
    [0, 1e-3],
    [2e4, 5e-4],
    [4e4, 3e-4],
    [6e4, 1e-4],
    [8e4, 5e-5],
]

# Model/training configuration for Tacotron2 with the Capacitron VAE enabled.
config = Tacotron2Config(
    run_name="Capacitron-Tacotron2",
    audio=audio_config,
    capacitron_vae=capacitron_config,
    use_capacitron_vae=True,
    batch_size=32,  # Tune this to your gpu
    max_audio_len=40 * 24000,  # Tune this to your gpu
    min_audio_len=2 * 24000,
    eval_batch_size=4,
    num_loader_workers=2,
    num_eval_loader_workers=2,
    precompute_num_workers=2,
    run_eval=True,
    # NOTE(review): the recorded run of this cell fails with a TypeError inside
    # test_run during EPOCH 6, right after " | > Synthesizing test sentences." is
    # printed. That looks tied to the first test-sentence synthesis allowed by
    # this delay; the traceback is truncated, so the root cause still needs to be
    # confirmed.
    test_delay_epochs=6,
    ga_alpha=0.0,
    r=2,
    optimizer="CapacitronOptimizer",
    optimizer_params={"RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6}, "SGD": {"lr": 1e-5, "momentum": 0.9}},
    attention_type="dynamic_convolution",
    grad_clip=0.0,  # Important! We overwrite the standard grad_clip with capacitron_grad_clip
    double_decoder_consistency=False,
    epochs=25,
    use_phonemes=True,
    phoneme_language="pl",
    phonemizer="espeak",
    phoneme_cache_path=os.path.join(data_path, "phoneme_cache3"),
    stopnet_pos_weight=15,
    print_step=25,
    print_eval=True,
    mixed_precision=False,
    seq_len_norm=True,
    output_path=output_path,
    datasets=[dataset_config],
    use_speaker_embedding=True,
    lr=1e-3,
    lr_scheduler="StepwiseGradualLR",
    lr_scheduler_params={"gradual_learning_rates": gradual_learning_rates},
    scheduler_after_epoch=False,  # scheduler doesn't work without this flag
    # Need to experiment with these below for capacitron
    loss_masking=False,
    decoder_loss_alpha=1.0,
    postnet_loss_alpha=1.0,
    postnet_diff_spec_alpha=0.0,
    decoder_diff_spec_alpha=0.0,
    decoder_ssim_alpha=0.0,
    postnet_ssim_alpha=0.0,
)

# Audio processor built from the audio section of the model config.
ap = AudioProcessor(**config.audio.to_dict())

# The tokenizer initializer also hands back a config object, which replaces
# `config` from here on.
tokenizer, config = TTSTokenizer.init_from_config(config)

# Load the samples described by dataset_config, requesting an evaluation split.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

# Register speaker ids from every loaded sample and store the speaker count
# in the config before the model is constructed.
all_samples = train_samples + eval_samples
speaker_manager = SpeakerManager()
speaker_manager.set_ids_from_data(all_samples, parse_key="speaker_name")
config.num_speakers = speaker_manager.num_speakers

model = Tacotron2(config, ap, tokenizer, speaker_manager=speaker_manager)

# Assemble the trainer; the audio processor is passed along as a training asset.
trainer_args = TrainerArgs()
training_assets = {"audio_processor": ap}
trainer = Trainer(
    trainer_args,
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
    training_assets=training_assets,
)

# Run the training loop.
trainer.fit()

 > Setting up Audio Processor...
 | > sample_rate:24000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:11025
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60.0
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 | > Found 100 files in /content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir
 > Init speaker_embedding layer.


 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: False
 | > Precision: float32
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 2
 | > Num. of Torch Threads: 1
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=/content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/Capacitron-Tacotron2-November-01-2023_05+21PM-0000000

 > Model has 34615866 parameters


 > `speakers.pth` is saved to /content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/Capacitron-Tacotron2-November-01-2023_05+21PM-0000000/speakers.pth.
 > `speakers_file` is updated in the config.json.



[4m[1m > EPOCH: 0/25[0m
 --> /content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/Capacitron-Tacotron2-November-01-2023_05+21PM-0000000

[1m > TRAINING (2023-11-01 17:21:29) [0m




> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: pl
		| > phoneme backend: espeak
| > Number of instances : 99
 | > Preprocessing samples
 | > Max text length: 250
 | > Min text length: 115
 | > Avg text length: 183.6161616161616
 | 
 | > Max audio length: 358822.0
 | > Min audio length: 240022.0
 | > Avg audio length: 294283.83838383836
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.



[1m   --> TIME: 2023-11-01 17:21:40 -- STEP: 0/4 -- GLOBAL_STEP: 0[0m
     | > decoder_loss: 39.23311233520508  (39.23311233520508)
     | > postnet_loss: 41.16348648071289  (41.16348648071289)
     | > capaciton_reconstruction_loss: 474607.15625  (474607.15625)
     | > capacitron_vae_loss: -0.0005424190312623978  (-0.0005424190312623978)
     | > capacitron_vae_beta_loss: 45.73677062988281  (45.73677062988281)
     | > capacitron_vae_kl_term: 4.263230800628662  (4.263230800628662)
     | > capacitron_beta: 1.0  (1.0)
     | > stopnet_loss: 0.7341765761375427  (0.7341765761375427)
     | > loss: 81.13023376464844  (81.13023376464844)
     | > align_error: 0.9730942882597446  (0.9730942882597446)
     | > grad_norm: 0  (0)
     | > current_lr: 0.001 
     | > step_time: 5.3831  (5.383062124252319)
     | > loader_time: 5.7199  (5.71993088722229)






[1m > EVALUATION [0m





> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: pl
		| > phoneme backend: espeak
| > Number of instances : 1
 | > Preprocessing samples
 | > Max text length: 244
 | > Min text length: 244
 | > Avg text length: 244.0
 | 
 | > Max audio length: 348502.0
 | > Min audio length: 348502.0
 | > Avg audio length: 348502.0
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.


[1m   --> STEP: 0[0m
     | > decoder_loss: 40.663272857666016  (40.663272857666016)
     | > postnet_loss: 40.65840148925781  (40.65840148925781)
     | > capaciton_reconstruction_loss: 620206.625  (620206.625)
     | > capacitron_vae_loss: -0.0004353635595180094  (-0.0004353635595180094)
     | > capacitron_vae_beta_loss: 45.69575881958008  (45.69575881958008)
     | > capacitron_vae_kl_term: 4.228603363037109  (4.228603363037109)
     | > capacitron_beta: 0.9983475208282471  (0.9983475208282471)
     | > stopnet_loss: 0.6581758856773376  (0.6581758856773376)
     | > loss: 81.97940826416016  (81.97940826416016)
     | > align_error: 0.9743180014193058  (0.9743180014193058)






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time: 0.41062259674072266 [0m(+0)
     | > avg_decoder_loss: 40.663272857666016 [0m(+0)
     | > avg_postnet_loss: 40.65840148925781 [0m(+0)
     | > avg_capaciton_reconstruction_loss: 620206.625 [0m(+0)
     | > avg_capacitron_vae_loss: -0.0004353635595180094 [0m(+0)
     | > avg_capacitron_vae_beta_loss: 45.69575881958008 [0m(+0)
     | > avg_capacitron_vae_kl_term: 4.228603363037109 [0m(+0)
     | > avg_capacitron_beta: 0.9983475208282471 [0m(+0)
     | > avg_stopnet_loss: 0.6581758856773376 [0m(+0)
     | > avg_loss: 81.97940826416016 [0m(+0)
     | > avg_align_error: 0.9743180014193058 [0m(+0)

 > BEST MODEL : /content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/Capacitron-Tacotron2-November-01-2023_05+21PM-0000000/best_model_4.pth

[4m[1m > EPOCH: 1/25[0m
 --> /content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/Capacitron-Tacotron2-November-01-2023_05+21PM-0000000

[1m > TRAINING (2023-11-01 17:22:1




[4m[1m > EPOCH: 2/25[0m
 --> /content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/Capacitron-Tacotron2-November-01-2023_05+21PM-0000000

[1m > TRAINING (2023-11-01 17:22:59) [0m

[1m > EVALUATION [0m

[1m   --> STEP: 0[0m
     | > decoder_loss: 35.80514144897461  (35.80514144897461)
     | > postnet_loss: 27.7950496673584  (27.7950496673584)
     | > capaciton_reconstruction_loss: 578298.8125  (578298.8125)
     | > capacitron_vae_loss: -0.00043189729331061244  (-0.00043189729331061244)
     | > capacitron_vae_beta_loss: 45.331939697265625  (45.331939697265625)
     | > capacitron_vae_kl_term: 4.206365585327148  (4.206365585327148)
     | > capacitron_beta: 0.9899179935455322  (0.9899179935455322)
     | > stopnet_loss: 0.4913251996040344  (0.4913251996040344)
     | > loss: 64.09107971191406  (64.09107971191406)
     | > align_error: 0.9776151347905397  (0.9776151347905397)


  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.7110562324523926 [0m(-0.1017




[4m[1m > EPOCH: 3/25[0m
 --> /content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/Capacitron-Tacotron2-November-01-2023_05+21PM-0000000

[1m > TRAINING (2023-11-01 17:23:46) [0m

[1m > EVALUATION [0m

[1m   --> STEP: 0[0m
     | > decoder_loss: 29.225866317749023  (29.225866317749023)
     | > postnet_loss: 11.129799842834473  (11.129799842834473)
     | > capaciton_reconstruction_loss: 511884.1875  (511884.1875)
     | > capacitron_vae_loss: -0.00042898187530227005  (-0.00042898187530227005)
     | > capacitron_vae_beta_loss: 45.025936126708984  (45.025936126708984)
     | > capacitron_vae_kl_term: 4.25537109375  (4.25537109375)
     | > capacitron_beta: 0.9842890501022339  (0.9842890501022339)
     | > stopnet_loss: 0.3188757300376892  (0.3188757300376892)
     | > loss: 40.67411422729492  (40.67411422729492)
     | > align_error: 0.9806569013744593  (0.9806569013744593)


  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.641592264175415 [0m(-0.0694639




[4m[1m > EPOCH: 4/25[0m
 --> /content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/Capacitron-Tacotron2-November-01-2023_05+21PM-0000000

[1m > TRAINING (2023-11-01 17:24:33) [0m

[1m > EVALUATION [0m

[1m   --> STEP: 0[0m
     | > decoder_loss: 20.429006576538086  (20.429006576538086)
     | > postnet_loss: 5.5633416175842285  (5.5633416175842285)
     | > capaciton_reconstruction_loss: 413216.125  (413216.125)
     | > capacitron_vae_loss: -0.00042412534821778536  (-0.00042412534821778536)
     | > capacitron_vae_beta_loss: 44.516197204589844  (44.516197204589844)
     | > capacitron_vae_kl_term: 4.491630554199219  (4.491630554199219)
     | > capacitron_beta: 0.9781980514526367  (0.9781980514526367)
     | > stopnet_loss: 0.15812799334526062  (0.15812799334526062)
     | > loss: 26.150053024291992  (26.150053024291992)
     | > align_error: 0.9835020024329424  (0.9835020024329424)






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6665661334991455 [0m(+0.02497386932373047)
     | > avg_decoder_loss:[92m 20.429006576538086 [0m(-8.796859741210938)
     | > avg_postnet_loss:[92m 5.5633416175842285 [0m(-5.566458225250244)
     | > avg_capaciton_reconstruction_loss:[92m 413216.125 [0m(-98668.0625)
     | > avg_capacitron_vae_loss:[91m -0.00042412534821778536 [0m(+4.856527084484696e-06)
     | > avg_capacitron_vae_beta_loss:[92m 44.516197204589844 [0m(-0.5097389221191406)
     | > avg_capacitron_vae_kl_term:[91m 4.491630554199219 [0m(+0.23625946044921875)
     | > avg_capacitron_beta:[92m 0.9781980514526367 [0m(-0.006090998649597168)
     | > avg_stopnet_loss:[92m 0.15812799334526062 [0m(-0.1607477366924286)
     | > avg_loss:[92m 26.150053024291992 [0m(-14.52406120300293)
     | > avg_align_error:[91m 0.9835020024329424 [0m(+0.0028451010584831238)

 > BEST MODEL : /content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/Capacitro




[4m[1m > EPOCH: 6/25[0m
 --> /content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/Capacitron-Tacotron2-November-01-2023_05+21PM-0000000

[1m > TRAINING (2023-11-01 17:26:07) [0m

[1m   --> TIME: 2023-11-01 17:26:26 -- STEP: 1/4 -- GLOBAL_STEP: 25[0m
     | > decoder_loss: 10.671897888183594  (10.671897888183594)
     | > postnet_loss: 10.251362800598145  (10.251362800598145)
     | > capaciton_reconstruction_loss: 245835.296875  (245835.296875)
     | > capacitron_vae_loss: -0.00041657095425762236  (-0.00041657095425762236)
     | > capacitron_vae_beta_loss: 40.057464599609375  (40.057464599609375)
     | > capacitron_vae_kl_term: 8.716302871704102  (8.716302871704102)
     | > capacitron_beta: 0.9702973365783691  (0.9702973365783691)
     | > stopnet_loss: 0.13627004623413086  (0.13627004623413086)
     | > loss: 21.059114456176758  (21.059114456176758)
     | > align_error: 0.9840188752859831  (0.9840188752859831)
     | > grad_norm: 0  (0.0)
     | > current_lr: 0.001

 | > Synthesizing test sentences.


 ! Run is kept in /content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/Capacitron-Tacotron2-November-01-2023_05+21PM-0000000


Internal Python error in the inspect module.
Below is the traceback from this internal error.

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/trainer/trainer.py", line 1808, in fit
    self._fit()
  File "/usr/local/lib/python3.10/dist-packages/trainer/trainer.py", line 1764, in _fit
    self.test_run()
  File "/usr/local/lib/python3.10/dist-packages/trainer/trainer.py", line 1680, in test_run
    test_outputs = self.model.test_run(self.training_assets)
  File "/usr/local/lib/python3.10/dist-packages/TTS/tts/models/base_tacotron.py", line 157, in test_run
    outputs_dict = synthesis(
  File "/usr/local/lib/python3.10/dist-packages/TTS/tts/utils/synthesis.py", line 221, in synthesis
    outputs = run_model_torch(
  File "/usr/local/lib/python3.10/dist-packages/TTS/tts/utils/synthesis.py", line 53, in run_model_torch
    outputs = _func(
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    retu

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/trainer/trainer.py", line 1808, in fit
    self._fit()
  File "/usr/local/lib/python3.10/dist-packages/trainer/trainer.py", line 1764, in _fit
    self.test_run()
  File "/usr/local/lib/python3.10/dist-packages/trainer/trainer.py", line 1680, in test_run
    test_outputs = self.model.test_run(self.training_assets)
  File "/usr/local/lib/python3.10/dist-packages/TTS/tts/models/base_tacotron.py", line 157, in test_run
    outputs_dict = synthesis(
  File "/usr/local/lib/python3.10/dist-packages/TTS/tts/utils/synthesis.py", line 221, in synthesis
    outputs = run_model_torch(
  File "/usr/local/lib/python3.10/dist-packages/TTS/tts/utils/synthesis.py", line 53, in run_model_torch
    outputs = _func(
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/TTS/tts/models/tacotro

TypeError: ignored