In [1]:
import os
import yaml
import torch
import numpy as np
import torch.nn as nn
from pathlib import Path
from torch.utils.data import DataLoader


from audioml.fastspeech.model import Text2Mel
from audioml.dataset.feature_dataset import SpeechFeatureDataset
from audioml.processing.text_speech_alignment import TTSTokenizer

W0609 15:57:38.831000 60845 site-packages/torch/distributed/elastic/multiprocessing/redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


# Config

In [2]:
os.listdir(Path(os.getcwd()).parent / 'audioml')
config_path = Path(os.getcwd()).parent / 'audioml' / 'config.yaml'
print(config_path)

with open(config_path, 'r') as f:
    config = yaml.safe_load(f)

/Users/mayankanand/Documents/audio/audio/audioml/config.yaml


In [3]:
FEATURE_DIR = Path(os.getcwd()).parent / 'data' / 'processed' / 'lj_speech_feature'

# Dataloader

In [11]:
batch_size = 4
feature_dataset = SpeechFeatureDataset(
    feature_dir=FEATURE_DIR,
    batch_size=batch_size,
    sort=True,
    drop_last=False
)

[NeMo I 2025-06-09 16:04:01 nemo_logging:393] Found existing object /Users/mayankanand/.cache/torch/NeMo/NeMo_2.3.1/Aligner/5b0d70eb6a09c1a8470b745034a1a00b/Aligner.nemo.
[NeMo I 2025-06-09 16:04:01 nemo_logging:393] Re-using file from: /Users/mayankanand/.cache/torch/NeMo/NeMo_2.3.1/Aligner/5b0d70eb6a09c1a8470b745034a1a00b/Aligner.nemo
[NeMo I 2025-06-09 16:04:01 nemo_logging:393] Instantiating model from pre-trained checkpoint


 NeMo-text-processing :: INFO     :: Creating ClassifyFst grammars.
[NeMo W 2025-06-09 16:04:15 nemo_logging:405] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.
[NeMo W 2025-06-09 16:04:15 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.torch.data.TTSDataset
      manifest_filepath: /data3/LJSpeech/nvidia_ljspeech_train.json
      sample_rate: 22050
      sup_data_path: /data3/LJSpeech/align_supplementary/
      sup_data_types:
      - align_prior_matrix
      n_fft: 1024
      win_length: 1024
      hop_length: 256
      window: hann
      n_mels: 80
      low

[NeMo I 2025-06-09 16:04:15 nemo_logging:393] PADDING: 1
[NeMo I 2025-06-09 16:04:15 nemo_logging:393] Model AlignerModel was successfully restored from /Users/mayankanand/.cache/torch/NeMo/NeMo_2.3.1/Aligner/5b0d70eb6a09c1a8470b745034a1a00b/Aligner.nemo.


 NeMo-text-processing :: INFO     :: Creating ClassifyFst grammars.


In [12]:
group_size = 8
shuffle=True
feature_dataloader = DataLoader(
    feature_dataset,
    batch_size=group_size * batch_size,
    shuffle=shuffle,
    collate_fn=feature_dataset.collate_function
)

In [13]:
for batchs in feature_dataloader:
    for batch in batchs:
        print(batch['raw_text'])
        print(batch['token_length'])
    break

Token Length: [105, 118, 60, 61, 81, 89, 106, 53, 56, 111, 118, 65, 112, 79, 127, 85, 108, 97, 71, 106, 107, 58, 73, 27, 143, 122, 87, 95, 101, 129, 46, 92]
Sorted IDX: [24 29 14 25  1 10 12  9 16 20  6 19  0 28 17 27 31  5 26 15  4 13 22 18
 11  3  2 21  8  7 30 23]
idx_arr: [[24, 29, 14, 25], [1, 10, 12, 9], [16, 20, 6, 19], [0, 28, 17, 27], [31, 5, 26, 15], [4, 13, 22, 18], [11, 3, 2, 21], [8, 7, 30, 23]]
["Oswald's known actions in the building immediately after the assassination are consistent with his having been at the southeast corner window of the sixth floor", 'Courvoisier wished to commit suicide in Newgate, but was prevented by the vigilant supervision to which he was subjected while in jail.', "the firm's paper went down further and further in value; an application to the Committee of Bankers for assistance was peremptorily refused,", 'Both Director Hoover and Belmont expressed to the Commission the great concern of the FBI, which is shared by the Secret Service,']
[143, 1

In [8]:
for batch in batchs:
    print(batch['token_length'])

[148, 139, 132, 130]
[124, 124, 124, 123]
[121, 116, 116, 115]
[112, 111, 110, 108]
[106, 103, 101, 99]
[98, 95, 89, 89]
[86, 78, 73, 70]
[53, 53, 44, 21]


In [7]:
token_length = [111, 45, 119, 68, 58, 129, 100, 121, 133, 114, 140, 47, 104, 101, 94, 128, 78, 71, 125, 112, 45, 33, 119, 67, 92, 117, 109, 57, 57, 100, 118, 132]

In [9]:
np.argsort(token_length)[::-1]

array([10,  8, 31,  5, 15, 18,  7,  2, 22, 30, 25,  9, 19,  0, 26, 12, 13,
        6, 29, 14, 24, 16, 17,  3, 23,  4, 27, 28, 11, 20,  1, 21])

In [11]:
token_length[8]

133

# Model

## Tokenizer

In [9]:
tokenizer = TTSTokenizer()

[NeMo I 2025-06-08 10:27:58 nemo_logging:393] Found existing object /Users/mayankanand/.cache/torch/NeMo/NeMo_2.3.1/Aligner/5b0d70eb6a09c1a8470b745034a1a00b/Aligner.nemo.
[NeMo I 2025-06-08 10:27:58 nemo_logging:393] Re-using file from: /Users/mayankanand/.cache/torch/NeMo/NeMo_2.3.1/Aligner/5b0d70eb6a09c1a8470b745034a1a00b/Aligner.nemo
[NeMo I 2025-06-08 10:27:58 nemo_logging:393] Instantiating model from pre-trained checkpoint


 NeMo-text-processing :: INFO     :: Creating ClassifyFst grammars.
[NeMo W 2025-06-08 10:28:11 nemo_logging:405] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.
[NeMo W 2025-06-08 10:28:11 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.torch.data.TTSDataset
      manifest_filepath: /data3/LJSpeech/nvidia_ljspeech_train.json
      sample_rate: 22050
      sup_data_path: /data3/LJSpeech/align_supplementary/
      sup_data_types:
      - align_prior_matrix
      n_fft: 1024
      win_length: 1024
      hop_length: 256
      window: hann
      n_mels: 80
      low

[NeMo I 2025-06-08 10:28:11 nemo_logging:393] PADDING: 1
[NeMo I 2025-06-08 10:28:12 nemo_logging:393] Model AlignerModel was successfully restored from /Users/mayankanand/.cache/torch/NeMo/NeMo_2.3.1/Aligner/5b0d70eb6a09c1a8470b745034a1a00b/Aligner.nemo.


 NeMo-text-processing :: INFO     :: Creating ClassifyFst grammars.


## Text2Mel

In [10]:
text2mel = Text2Mel(config)

# Forward Pass

In [11]:
phones = tokenizer.batch_tokenize(batch['raw_text'])

In [12]:
input_ids, src_mask, duration = phones['input_ids'], phones['mask_ids'], batch['duration']

In [13]:
text2mel_output = text2mel(input_ids, src_mask, train=True, gt_duration=duration)

In [14]:
text2mel_output.keys()

dict_keys(['mel_spec', 'mel_mask', 'log_duration', 'duration', 'pitch', 'energy'])

### Pitch Predicted Features

In [16]:
pitch_output = text2mel_output['pitch']

In [17]:
pitch_spec = pitch_output['pitch_spectrogram']
pitch_mean = pitch_output['pitch_mean']
pitch_std = pitch_output['pitch_std']
pitch_f0 = pitch_output['reconstructed_f0'] # This is not notmalized

# print(f"===== Pitch Predicted Features =====")
print(f"Pitch Spectrogram: {pitch_spec.shape}")
print(f"Pitch Mean:        {pitch_mean.shape}")
print(f"Pitch std:         {pitch_std.shape}")
print(f"Pitch F0:          {pitch_f0.shape}")

Pitch Spectrogram: torch.Size([8, 616, 10])
Pitch Mean:        torch.Size([8])
Pitch std:         torch.Size([8])
Pitch F0:          torch.Size([8, 616])


### Pitch Target/Labels

In [19]:
torch.tensor(batch['pitch_contour_mean'])

tensor([5.2549, 5.1765, 5.4370, 5.3852, 5.2531, 5.3193, 5.3611, 5.2811])

In [20]:
print(f"Pitch Spectrogram: {batch['pitch_spectrogram'].shape}")
print(f"Pitch Mean:        {torch.tensor(batch['pitch_contour_mean']).shape}")
print(f"Pitch STD:         {torch.tensor(batch['pitch_contour_std']).shape}")
print(f"Pitch F0:          {torch.tensor(batch['pitch_contour']).shape}")

Pitch Spectrogram: torch.Size([8, 616, 10])
Pitch Mean:        torch.Size([8])
Pitch STD:         torch.Size([8])
Pitch F0:          torch.Size([8, 616])


In [21]:
mse_loss = nn.MSELoss()
mae_loss = nn.L1Loss()

# Loss

**Targets/Labels**
1. Mel-Spectrogram
2. Duration (log_duration)
3. Pitch-Spectrogram
4. Energy

In [None]:
class FastSpeech2Loss(nn.Module):
    """ FastSpeech2 Loss """

    def __init__(self):
        super(FastSpeech2Loss, self).__init__()
        self.mse_loss = nn.MSELoss()
        self.mae_loss = nn.L1Loss()

    def forward(self, inputs, predictions, src_mask):
        # Inputs (Ground Truth) / Target
        # Duration Target
        log_duration_target = torch.log(torch.clamp(inputs['duration'], min=1.0)).masked_fill(src_mask.bool(), 0)
        # Mel-Spectrogram Target
        mel_spec_target = inputs['mel_spectrogram']
        # Pitch Target
        pitch_spec_target = inputs['pitch_spectrogram']
        pitch_contour_target = inputs['pitch_contour']
        pitch_mean_target = inputs['pitch_contour_mean']
        pitch_std_target = inputs['pitch_contour_std']
        # Energy Target
        energy_target = inputs['energy']
        
        # Predictions
        # Mel-Spectrogram Prediction
        pred_mel_spec = predictions['mel_spec']
        mel_mask = predictions['mel_mask']
        # Pitch Feature Prediction
        pitch_predictions = predictions['pitch']
        pred_pitch_spec = pitch_predictions['pitch_spectrogram']
        pred_f0 = pitch_predictions['reconstructed_f0']
        pred_pitch_mean = pitch_predictions['pitch_contour_mean']
        pred_pitch_std = pitch_predictions['pitch_contour_std']
        # Energy Prediction
        energy_predictions = predictions['energy']
        pred_energy = energy_predictions['raw_energy']
        # Duration Prediction
        pred_log_duration = predictions['log_duration']
        
        # src_masks = ~src_masks
        # mel_masks = ~mel_masks
        # log_duration_targets = torch.log(duration_targets.float() + 1)
        mel_spec_target = mel_spec_target[:, :mel_masks.shape[1], :]
        mel_masks = mel_masks[:, :mel_masks.shape[1]]

        log_duration_targets.requires_grad = False
        mel_spec_target.requires_grad = False
        pitch_spec_target.requires_grad = False
        pitch_contour_target.requires_grad = False
        pitch_mean_target.requires_grad = False
        pitch_std_target.requires_grad = False
        energy_target.requires_grad = False
        
        # Loss Calculation
        # Mel-Spectrogram loss
        mel_loss = self.mae_loss(pred_mel_spec, mel_spec_target)
        # Pitch Loss
        pitch_spectrogram_loss = self.mse_loss(pred_pitch_spec, pitch_spec_target)
        pitch_mean_loss = self.mse_loss(pred_pitch_mean, pitch_mean_target)
        pitch_std_loss = self.mse_loss(pred_pitch_std, pitch_std_target)
        # Energy Loss
        energy_loss = self.mse_loss(pred_energy, energy_target)
        # Duration Loss
        duration_loss = self.mse_loss(pred_log_duration, log_duration_target)

        total_loss = (
            mel_loss + duration_loss + pitch_spectrogram_loss + pitch_mean_loss + pitch_std_loss + energy_loss
        )

        return (
            total_loss,
            mel_loss,
            duration_loss,
            pitch_spectrogram_loss,
            pitch_mean_loss,
            pitch_std_loss,
            energy_loss
        )

# Train Loop

In [4]:
from tqdm import tqdm

## Load Data

In [8]:
n_epochs = 4
total_steps = 10000

# Load Dataset
batch_size = 16

In [None]:
feature_dataset = SpeechFeatureDataset(
    feature_dir=FEATURE_DIR,
    batch_size=batch_size,
    sort=True,
    drop_last=False
)

group_size = 4
shuffle=True
feature_dataloader = DataLoader(
    feature_dataset,
    batch_size=group_size * batch_size,
    shuffle=shuffle,
    collate_fn=feature_dataset.collate_function
)

[NeMo I 2025-06-09 15:57:54 nemo_logging:393] Found existing object /Users/mayankanand/.cache/torch/NeMo/NeMo_2.3.1/Aligner/5b0d70eb6a09c1a8470b745034a1a00b/Aligner.nemo.
[NeMo I 2025-06-09 15:57:54 nemo_logging:393] Re-using file from: /Users/mayankanand/.cache/torch/NeMo/NeMo_2.3.1/Aligner/5b0d70eb6a09c1a8470b745034a1a00b/Aligner.nemo
[NeMo I 2025-06-09 15:57:54 nemo_logging:393] Instantiating model from pre-trained checkpoint


 NeMo-text-processing :: INFO     :: Creating ClassifyFst grammars.
[NeMo W 2025-06-09 15:58:07 nemo_logging:405] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.
[NeMo W 2025-06-09 15:58:07 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.torch.data.TTSDataset
      manifest_filepath: /data3/LJSpeech/nvidia_ljspeech_train.json
      sample_rate: 22050
      sup_data_path: /data3/LJSpeech/align_supplementary/
      sup_data_types:
      - align_prior_matrix
      n_fft: 1024
      win_length: 1024
      hop_length: 256
      window: hann
      n_mels: 80
      low

[NeMo I 2025-06-09 15:58:07 nemo_logging:393] PADDING: 1
[NeMo I 2025-06-09 15:58:07 nemo_logging:393] Model AlignerModel was successfully restored from /Users/mayankanand/.cache/torch/NeMo/NeMo_2.3.1/Aligner/5b0d70eb6a09c1a8470b745034a1a00b/Aligner.nemo.


 NeMo-text-processing :: INFO     :: Creating ClassifyFst grammars.


## Load Model

### Tokenizer

In [6]:
tokenizer = TTSTokenizer()

[NeMo I 2025-06-09 15:58:30 nemo_logging:393] Found existing object /Users/mayankanand/.cache/torch/NeMo/NeMo_2.3.1/Aligner/5b0d70eb6a09c1a8470b745034a1a00b/Aligner.nemo.
[NeMo I 2025-06-09 15:58:30 nemo_logging:393] Re-using file from: /Users/mayankanand/.cache/torch/NeMo/NeMo_2.3.1/Aligner/5b0d70eb6a09c1a8470b745034a1a00b/Aligner.nemo
[NeMo I 2025-06-09 15:58:30 nemo_logging:393] Instantiating model from pre-trained checkpoint


 NeMo-text-processing :: INFO     :: Creating ClassifyFst grammars.
[NeMo W 2025-06-09 15:58:43 nemo_logging:405] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.
[NeMo W 2025-06-09 15:58:43 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.torch.data.TTSDataset
      manifest_filepath: /data3/LJSpeech/nvidia_ljspeech_train.json
      sample_rate: 22050
      sup_data_path: /data3/LJSpeech/align_supplementary/
      sup_data_types:
      - align_prior_matrix
      n_fft: 1024
      win_length: 1024
      hop_length: 256
      window: hann
      n_mels: 80
      low

[NeMo I 2025-06-09 15:58:43 nemo_logging:393] PADDING: 1
[NeMo I 2025-06-09 15:58:43 nemo_logging:393] Model AlignerModel was successfully restored from /Users/mayankanand/.cache/torch/NeMo/NeMo_2.3.1/Aligner/5b0d70eb6a09c1a8470b745034a1a00b/Aligner.nemo.


 NeMo-text-processing :: INFO     :: Creating ClassifyFst grammars.


### Text2Mel

In [22]:
text2mel = Text2Mel(config)
text2mel.train()

Text2Mel(
  (encoder): Encoder(
    (embedding_layer): Embedding(
      (tok_emb): Embedding(114, 384)
      (pos_emb): Embedding(512, 384)
    )
    (fft_layers): ModuleList(
      (0-1): 2 x FFTBlock(
        (mha): MultiHeadAttention(
          (w_k): Linear(in_features=384, out_features=384, bias=False)
          (w_q): Linear(in_features=384, out_features=384, bias=False)
          (w_v): Linear(in_features=384, out_features=384, bias=False)
          (dropout): Dropout(p=0.1, inplace=False)
          (w_o): Linear(in_features=384, out_features=384, bias=True)
        )
        (layer_norm1): LayerNorm()
        (conv1d): Conv1D(
          (layers): Sequential(
            (0): Conv1d(384, 1536, kernel_size=(3,), stride=(1,), padding=(1,))
            (1): ReLU()
            (2): Conv1d(1536, 384, kernel_size=(3,), stride=(1,), padding=(1,))
          )
        )
        (layer_norm2): LayerNorm()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (variance_adapt

### Loop

In [9]:
epoch = 0
train = True

In [None]:
outer_loop = tqdm(total=total_steps, desc="Training", position=0)
outer_loop.n = restore_step
outer_loop.update()

while True:
    inner_loop = tqdm(total=n_epochs, desc=f"Epoch: {epoch}", position=1)

    for batchs in feature_dataloader:
        for batch in batchs:


            # Model Forward
            tokens = tokenizer.batch_tokenize(batch['raw_text'])
            input_ids, src_mask, duration = tokens['input_ids'], tokens['mask_ids'], batch['duration']

            model_output = text2mel(input_ids, src_mask, train=train, gt_duration=duration)

            

In [20]:
batch['duration'].shape

torch.Size([4, 56])

In [17]:
tokens = tokenizer.batch_tokenize(batch['raw_text'])

In [21]:
tokens['input_ids'].shape, tokens['mask_ids'].shape

(torch.Size([4, 56]), torch.Size([4, 56]))