In [1]:
import os
os.chdir('../')

In [2]:
%pwd

'd:\\MLOps-Project\\text-to-speech-using-mlops'

In [15]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelBuildingConfig:
    root_dir : Path
    text_num_embeddings : int
    embedding_size : int
    encoder_embedding_size  : int
    dim_feedforward : int
    postnet_embedding_size : int
    encoder_kernel_size : int
    postnet_kernel_size : int
    num_heads : int
    dropout : float
    batch_first : bool
    mel_freq : int
    max_mel_time : int

In [16]:
from src.simpletts.constants import *
from src.simpletts.utils.common import create_directories, read_yaml

class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])
        
        
    def get_model_building_config(self) -> ModelBuildingConfig:
        config = self.config.model_building
        create_directories([config.root_dir])

        model_building_config = ModelBuildingConfig(
            root_dir= self.config.root_dir,
            text_num_embeddings = self.params.text_num_embeddings,
            embedding_size = self.params.embedding_size,
            encoder_embedding_size = self.params.encoder_embedding_size,
            dim_feedforward = self.params.dim_feedforward,
            postnet_embedding_size = self.params.postnet_embedding_size,
            encoder_kernel_size = self.params.encoder_kernel_size,
            postnet_kernel_size = self.params.postnet_kernel_size,
            dropout=self.params.dorout,
            num_heads=self.params.num_heads,
            batch_first = self.params.batch_first,
            mel_freq = self.params.mel_freq,
            max_mel_time = self.params.max_mel_time
        )
        
        return model_building_config

In [5]:
import math
import torch
import torch.nn.functional as F
import torch.nn as nn

import pandas as pd
from tqdm import tqdm



class EncoderBlock(nn.Module):
    def __init__(self, config : ModelBuildingConfig):
        self.config = config
        super(EncoderBlock, self).__init__()
        self.norm1 = nn.LayerNorm(
            normalized_shape=self.config.embedding_size,
            num_heads = self.config.num_heads,
            dropout = self.config.dropout,
            batch_first = self.config.batch_first
        )
        
        self.dropout1 = torch.nn.Dropout(self.config.dropout)
        self.norm2 = nn.LayerNorm(
            normalized_shape=self.config.embedding_size
        )
        
        self.linear1 = nn.Linear(
            self.config.embedding_size,
            self.config.dim_feedforward
        )
        
        self.dropout2 = torch.nn.Dropout(self.config.dropout)
        self.linear2 = nn.Linear(
            self.config.dim_feedforward,
            self.config.embedding_size
        )
        self.dropout3 = torch.nn.Dropout(self.config.dropout)
        
        
    def forward(
        self,
        x,
        attn_mask = None,
        key_padding_mask  = None
    ):
        x_out = self.norm1(x)
        x_out, _ = self.attn(
            query = x_out,
            key = x_out,
            value = x_out,
            attn_mask  = attn_mask,
            key_padding_mask = key_padding_mask
        )
        x_out = self.dropout1(x_out)
        x = x + x_out
        
        x_out = self.norm2(x)
        x_out  = F.relu(x_out)
        x_out = self.dropout2(x_out)
        x_out = self.linear2(x_out)
        x_out = self.dropout3(x_out)
        
        x = x + x_out
        
        return x

In [6]:
class DecoderBlcok(nn.Module):
    def __init__(self, config : ModelBuildingConfig):
        self.config = config
        super(DecoderBlcok, self).__init__()
        self.norm1 = nn.LayerNorm(
            normalized_shape=self.config.embedding_size
        )
        self.self_attn = torch.nn.MultiheadAttention(
            embed_dim=self.config.embedding_size,
            num_heads=self.config.num_heads,
            dropout=self.config.dropout,
            batch_first=True
        )
        self.dropout1 = torch.nn.Dropout(self.config.dropout)
        
        self.norm2 = nn.LayerNorm(
            normalized_shape=self.config.embedding_size
        )
        self.attn = torch.nn.MultiheadAttention(
            embed_dim=self.config.embedding_size,
            num_heads= self.config.num_heads,
            dropout=self.config.num_heads,
            batch_first=True
        )
        
        self.dropout2 = torch.nn.Dropout(self.config.dropout)
        self.nomr3 = nn.LayerNorm(
            normalized_shape=self.config.embedding_size
        )
        
        self.liner1 = nn.Linear(
            self.config.embedding_size,
            self.config.dim_feedforward
        )
        self.dropout3 = torch.dropout(self.config.dropout)
        self.liner2 = nn.Linear(
            self.config.dim_feedforward,
            self.config.embedding_size
        )        
        
        self.dropout4 = torch.nn.Dropout(self.config.dropout)
        
    def forward(self,
                x,
                memory,
                x_attn_mask = None,
                x_key_mask = None,
                memory_attn_mask = None,
                memory_key_padding_mask = None,
                ):
        x_out,_ = self.self_attn(
            query = x,
            key = x,
            value = x,
            attn_mask = x_attn_mask,
            key_padding_mask = x_key_mask
        )
        x_out = self.dropout1(x_out)
        x = self.norm1(x + x_out)
        
        x_out, _ = self.attn(
            query = x,
            key = memory,
            value = memory,
            attn_mask = memory_attn_mask,
            key_padding_mask = memory_key_padding_mask
        )
        x_out  = self.dropout2(x_out)
        x = self.norm2(x + x_out)
        
        x_out = self.liner1(x)
        x_out = F.relu(x_out)
        x_out = self.dropout3(x_out)
        x_out = self.liner2(x_out)
        x = self.nomr3(x + x_out)
        
        return x

In [9]:
class EncoderPreNet(nn.Module):
    def __init__(self, config = ModelBuildingConfig):
        self.config = config
        super(EncoderBlock, self).__init__()
        self.embedding =  nn.Embedding(
            num_embeddings=self.config.text_num_embeddings,
            embedding_dim= self.config.encoder_embedding_size
        )
        
        self.linear_1 = nn.Linear(
            self.config.embedding_size,
            self.config.encoder_embedding_size
        )
        
        self.linear_2 = nn.Linear(
            self.config.encoder_embedding_size,
            self.config.embedding_size
        )
        self.conv_1 = nn.Conv2d(
            self.config.encoder_embedding_size,
            self.config.encoder_embedding_size,
            kernel_size=self.config.encoder_kernel_size,
            stride=1,
            padding = int((self.config.encoder_kernel_size - 1) / 2),
            dilation=1
            
        )
        self.bn_1 = nn.BatchNorm1d(
            self.config.encoder_embedding_size
        )
        self.dropout_1 = torch.nn.Dropout(self.config.dropout)
        
        self.conv_2 = nn.Conv1d(
            self.config.encoder_embedding_size,
            self.config.encoder_embedding_size,
            kernel_size=self.config.encoder_kernel_size,
            stride= 1,
            padding= int((self.config.encoder_kernel_size -1) / 2),
            dilation=1
        )
        self.bn_2 = nn.BatchNorm1d(
            self.config.encoder_embedding_size
        )
        self.dropout_2 = torch.nn.Dropout(self.config.dropout)
        self.conv_3 = nn.Conv1d(
            self.config.encoder_embedding_size,
            self.config.embedding_size,
            kernel_size=self.config.encoder_kernel_size,
            stride=1,
            padding=int((self.config.encoder_kernel_size - 1) / 2),
            dilation=1
        )
        self.bn_3 = nn.BatchNorm1d(
            self.config.encoder_embedding_size
        )
        
        self.dropout_3 = nn.Dropout(self.config.dropout)
        
        
    def forward(self,text):
        x = self.embedding(text)
        x = self.linear_1(x)
        
        x = x.transpose(2,1)
        
        x = self.conv_1(x)
        x = self.bn_1(x)
        x = F.relu(x)
        x = self.dropout_1(x)
        
        x = self.conv_2(x)
        x = self.bn_2(x)
        x = F.relu(x)
        x = self.dropout_2(x)
        
        
        x = self.conv_3(x)
        x = self.bn_3(x)
        x = F.relu(x)
        x = self.dropout_3(x)
        
        x = x.transpose(2,1)
        x = self.linear_2(x)
        
        return x     

In [13]:
class PostNet(nn.Module):
    def __init__(self, config : ModelBuildingConfig):
        self.config = config
        
        super(PostNet, self).__init__()
        self.conv_1 = nn.Conv1d(
            self.config.mel_freq,
            self.config.postnet_embedding_size,
            kernel_size=self.config.postnet_kernel_size,
            stride=1,
            padding= int((self.config.postnet_kernel_size - 1) / 2),
            dilation=1
        )
        self.bn_1 = nn.BatchNorm1d(
            self.config.postnet_embedding_size
        )
        self.dropout_1 = torch.nn.Dropout(self.config.dropout)
        
        self.conv_2 = nn.Conv1d(
            self.config.mel_freq,
            self.config.postnet_embedding_size,
            kernel_size=self.config.postnet_kernel_size,
            stride=1,
            padding=int((self.config.postnet_kernel_size - 1) / 2),
            dilation= 1
        )
        self.bn_2 = nn.BatchNorm1d(
            self.config.postnet_embedding_size
        )
        self.dropout_2 = nn.Dropout(self.config.dropout)
        
        self.conv_3 = nn.Conv1d(
            self.config.mel_freq,
            self.config.postnet_embedding_size,
            kernel_size=self.config.postnet_kernel_size,
            stride=1,
            padding= int((self.config.postnet_kernel_size - 1) / 2),
            dilation=1
        )
        self.bn_3 = nn.BatchNorm1d(
            self.config.postnet_embedding_size
        )
        self.dropout_3  = nn.Dropout(self.config.dropout)
        
        self.conv_4 = nn.Conv1d(
            self.config.mel_freq,
            self.config.postnet_embedding_size,
            kernel_size=self.config.postnet_kernel_size,
            stride=1,
            padding= int((self.config.postnet_kernel_size - 1) / 2),
            dilation=1
        )
        
        self.bn_4  = nn.BatchNorm1d(
            self.config.postnet_embedding_size
        )
        
        self.dropout_4 = nn.Dropout(self.config.dropout)
        
        self.conv_5 = nn.Conv1d(
            self.config.mel_freq,
            self.config.postnet_embedding_size,
            kernel_size=self.config.postnet_kernel_size,
            stride=1,
            padding= int((self.config.postnet_kernel_size - 1) / 2),
            dilation=1
        )
        
        self.bn_5  = nn.BatchNorm1d(
            self.config.postnet_embedding_size
        )
        
        self.dropout_5 = nn.Dropout(self.config.dropout)
        
        self.conv_6 = nn.Conv1d(
            self.config.mel_freq,
            self.config.postnet_embedding_size,
            kernel_size=self.config.postnet_kernel_size,
            stride=1,
            padding= int((self.config.postnet_kernel_size - 1) / 2),
            dilation=1
        )
        
        self.bn_6  = nn.BatchNorm1d(
            self.config.postnet_embedding_size
        )
        
        self.dropout_6 = nn.Dropout(self.config.dropout)
        
        
        
    def forward(self, x):
        x = x.transpose(2,1)
        x = self.conv_1(x)
        x = self.bn_1(x)
        x = torch.tanh(x)
        x = self.dropout_1(x)
        x = self.conv_2(x)
        x = self.bn_2(x)
        x = torch.tanh(x)
        x = self.dropout_2(x)
        x = self.conv_3(x)
        x = self.bn_3(x)
        x = torch.tanh(x)
        x = self.dropout_3(x)
        x = self.conv_4(x)
        x = self.bn_4(x)
        x = torch.tanh(x)
        x = self.dropout_4(x)
        x = self.conv_5(x)
        x = self.bn_5(x)
        x = torch.tanh(x)
        x = self.dropout_5(x)
        x = self.conv_6(x)
        x = self.bn_6(x)
        x = torch.tanh(x)
        x = self.dropout_6(x)
        x = x.transpose(1,2)
        
        return x

In [14]:
class DecoderPreNet(nn.Module):
    def __init__(self, config : ModelBuildingConfig):
        self.config = config 
        super(DecoderPreNet, self).__init__()
        self.linear_1 = nn.Linear(self.config.mel_freq, self.config.embedding_size)
        self.linear_2 = nn.Linear(self.config.embedding_size, self.config.embedding_size)
        
    def forward(self, x):
        x = self.linear_1(x)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=True)
        x = self.linear_2(x)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=True)
        return x
        

In [18]:
from src.simpletts.components.data_transformation import mask_from_seq_lengths
class TransformerTTS(nn.Module):
    def __init__(self, config : ModelBuildingConfig, device = 'cuda'):
        self.config = config
        super(TransformerTTS, self).__init__()
        self.encoder_prenet = EncoderPreNet(config)
        self.decoder_prenet = DecoderPreNet(config)
        self.postnet = PostNet(config)
        self.pos_encoder = nn.Embedding(num_embeddings=self.config.max_mel_time, embedding_dim=self.config.embedding_size)
        self.encoder_block1 = EncoderBlock(config)
        self.encoder_block2 = EncoderBlock(config)
        self.encoder_block3 = EncoderBlock(config)
        self.decoder_block1 = DecoderBlcok(config)
        self.decoder_block2 = DecoderBlcok(config)
        self.decoder_block3 = DecoderBlcok(config)
        self.linear1 = nn.Linear(self.config.embedding_size, self.config.mel_freq)
        self.linear2 = nn.Linear(self.config.embedding_size + 1)
        self.norm_memory = nn.LayerNorm(normalized_shape=self.config.embedding_size)
        
        
    def forward(self, text, text_len, mel, mel_len):
        N = text.shape[0]
        S = text.shape[1]
        TIME = mel.shape[1]
            
        # Create mask
        self.src_key_padding_mask = torch.zeros((N, S), device=text.device).masked_fill(
        ~mask_from_seq_lengths(text_len, max_length=S), float("-inf")
        )
        self.src_mask = torch.zeros((S, S), device=text.device).masked_fill(
             torch.triu(torch.full((S, S), True, dtype=torch.bool), diagonal=1).to(text.device),       
            float("-inf")
        )
        self.tgt_key_padding_mask = torch.zeros((N, TIME), device=mel.device).masked_fill(
            ~mask_from_seq_lengths(mel_len, max_length=TIME), float("-inf")
        )
        self.tgt_mask = torch.zeros((TIME, TIME), device=mel.device).masked_fill(
            torch.triu(torch.full((TIME, TIME), True, device=mel.device, dtype=torch.bool), diagonal=1),       
            float("-inf")
        )
        self.memory_mask = torch.zeros((TIME, S), device=mel.device).masked_fill(
            torch.triu(torch.full((TIME, S), True, device=mel.device, dtype=torch.bool), diagonal=1),       
            float("-inf")
        )
            
            
        # Encoder 
        text_x = self.encoder_prenet(text)
        pos_codes = self.pos_encode(torch.arange(self.config.max_mel_time).to(mel.device))
        S = text_x.shape[1]
        text_x = text_x + pos_codes[:S]
        text_x = self.encoder_block1(text_x, attn_mask = self.src_mask, key_padding_mask = self.src_key_padding_mask)
        text_x = self.encoder_block2(text_x, attn_mask = self.src_mask, key_padding_mask = self.src_key_padding_mask)
        text_x = self.encoder_block3(text_x, attn_mask = self.src_mask, key_padding_mask = self.src_key_padding_mask)
        text_x = self.norm_memory(text_x)
        
        # Decoder 
        mel_x = self.decoder_prenet(mel)
        mel_x = mel_x + pos_codes[:TIME]
        mel_x = self.decoder_block1(x  = mel_x, memory = text_x, x_attn_mask = self.tgt_mask,
                                    x_key_padding_mask = self.tgt_key_padding_mask,
                                    memory_key_padding_mask = self.src_key_padding_mask)
        mel_x = self.decoder_block2(x  = mel_x, memory = text_x, x_attn_mask = self.tgt_mask,
                                    x_key_padding_mask = self.tgt_key_padding_mask,
                                    memory_key_padding_mask = self.src_key_padding_mask)
        mel_x = self.decoder_block3(x  = mel_x, memory = text_x, x_attn_mask = self.tgt_mask,
                                    x_key_padding_mask = self.tgt_key_padding_mask,
                                    memory_key_padding_mask = self.src_key_padding_mask)
        
        
        # Output Processing
        mel_linear = self.linear1(mel_x)
        mel_postnet = self.postnet(mel_linear)
        mel_postnet = mel_linear + mel_postnet
        stop_token = self.linear2(mel_x)
        
        
        # Masking 
        bool_mel_mask = self.tgt_key_padding_mask.ne[0].unsqueeze(-1).repeat(1,1,self.config.mel_freq)
        mel_linear = mel_linear.masked_fill(bool_mel_mask, 0)
        mel_postnet = mel_postnet.masked_fill(bool_mel_mask, 0)
        stop_token = stop_token.masked_fill(bool_mel_mask[:, :, 0].unsqueeze(-1).squeeze(2))
        return mel_postnet, mel_linear, stop_token

In [21]:
from torchinfo import summary
import torch
from pathlib import Path

# Initialize ConfigurationManager
config_manager = ConfigurationManager()
model_config = config_manager.get_model_building_config()

# Initialize the model
model = TransformerTTS(model_config)

# Print and save model summary
def print_and_save_summary(model, input_size, file_path):
    model_summary = summary(model, input_size=input_size, depth=10, col_names=["input_size", "output_size", "num_params", "kernel_size", "mult_adds"])
    print(model_summary)
    
    with open(file_path, 'w') as f:
        f.write(str(model_summary))
    print(f"Model summary saved to {file_path}")

# Example input sizes (you may need to adjust these based on your specific configuration)
text_input_size = (1, 100)  # (batch_size, sequence_length)
text_len_input_size = (1,)
mel_input_size = (1, model_config.max_mel_time, model_config.mel_freq)
mel_len_input_size = (1,)

input_size = (text_input_size, text_len_input_size, mel_input_size, mel_len_input_size)

# Define the file path for saving the summary
summary_file_path = Path(model_config.root_dir) / "model_summary.txt"

# Print and save the model summary
print_and_save_summary(model, input_size, summary_file_path)

[2024-09-03 02:31:40,414: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-09-03 02:31:40,417: INFO: common: yaml file: params.yaml loaded successfully]
[2024-09-03 02:31:40,419: INFO: common: created directory at: artifacts]
[2024-09-03 02:31:40,419: INFO: common: created directory at: artifacts/model]


BoxKeyError: "'ConfigBox' object has no attribute 'root_dir'"

In [20]:
%pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0
Note: you may need to restart the kernel to use updated packages.


