In [1]:
import os
# Print the directory the kernel was started in.
!pwd
# Move one level up so that the relative paths used by later cells
# (e.g. config/config.yaml) resolve from the parent directory.
# NOTE(review): this cell is not idempotent - every re-run climbs one more level.
os.chdir('../')
# Print the directory again to confirm the change.
!pwd

/Users/nikhil0035/Documents/GitHub/Machine_Translation_using_Transformers/research


/Users/nikhil0035/Documents/GitHub/Machine_Translation_using_Transformers


In [2]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class TrainingConfig:
    """Immutable bundle of training hyper-parameters.

    Instances are created by ``ConfigurationManager.get_training_config``,
    which copies the same-named entries from the loaded params file.
    ``frozen=True`` makes attribute assignment after construction raise.
    """

    # Read from the params file; displayed later in the notebook (value 1 in the recorded run).
    batch_size: int
    # Upper bound of the epoch loop in ``train_model.train``.
    num_epochs: int
    # Learning rate from the params file.
    lr: float
    # Presumably the fixed token sequence length - not used by the code visible here; TODO confirm.
    seq_len: int
    # Presumably the model embedding width - not used by the code visible here; TODO confirm.
    d_model: int

In [3]:
from Translate.constants import *
from Translate.utils.common import *
from Translate.entity.config_entity import Config_Data


import torchtext.datasets as datasets
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim.lr_scheduler import LambdaLR

import warnings
from tqdm import tqdm
import os
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def get_weights_file_path(config, epoch: str):
    """Build the relative path of the checkpoint file for one epoch tag.

    Args:
        config: Object exposing ``datasource``, ``model_folder`` and
            ``model_basename``.
        epoch: Text appended to the base name (for example ``"05"``).

    Returns:
        ``<datasource>_<model_folder>/<model_basename><epoch>.pt`` as a string,
        relative to the current working directory.
    """
    folder_name = "{}_{}".format(config.datasource, config.model_folder)
    file_name = "{}{}.pt".format(config.model_basename, epoch)
    checkpoint = Path('.').joinpath(folder_name, file_name)
    return str(checkpoint)

# Find the latest weights file in the weights folder
def latest_weights_file_path(config):
    """Return the path of the most recent checkpoint, or ``None`` if there is none.

    Candidates are the entries of ``<datasource>_<model_folder>/`` whose name
    starts with ``config.model_basename``. They are ranked by the integer epoch
    that follows the base name (``tmodel_100.pt`` beats ``tmodel_99.pt``); a
    plain lexicographic sort would get this wrong as soon as the epoch numbers
    differ in digit count. Files whose tag is not a plain number rank below
    every numbered checkpoint and are ordered by name among themselves.

    Args:
        config: Object exposing ``datasource``, ``model_folder`` and
            ``model_basename``.

    Returns:
        The selected path as a string, or ``None`` when nothing matches.
    """
    model_folder = f"{config.datasource}_{config.model_basename and config.model_folder or config.model_folder}"
    prefix = config.model_basename
    weights_files = list(Path(model_folder).glob(f"{prefix}*"))
    if not weights_files:
        return None

    def _rank(path):
        # Text between the base name and the extension, e.g. "07" in "tmodel_07.pt".
        tag = path.stem[len(prefix):]
        epoch = int(tag) if tag.isdecimal() else -1
        # The path string breaks ties (e.g. "7" vs "07") deterministically.
        return (epoch, str(path))

    return str(max(weights_files, key=_rank))

In [5]:
class ConfigurationManager:
    """Loads the YAML config and params files and builds typed config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the config YAML file.
            params_filepath: Location of the params YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_training_config(self)->TrainingConfig:
        """Copy the training hyper-parameters from the params file into a TrainingConfig."""
        hyper = self.params
        return TrainingConfig(
            batch_size=hyper.batch_size,
            num_epochs=hyper.num_epochs,
            lr=hyper.lr,
            seq_len=hyper.seq_len,
            d_model=hyper.d_model,
        )

    def get_config(self) -> Config_Data:
        """Copy the ``config_data`` section of the config file into a Config_Data."""
        section = self.config.config_data

        # Every entry is forwarded unchanged under the same keyword name.
        field_names = (
            "batch_size",
            "num_epochs",
            "lr",
            "seq_len",
            "d_model",
            "datasource",
            "lang_src",
            "lang_tgt",
            "model_folder",
            "model_basename",
            "preload",
            "tokenizer_file",
            "experiment_name",
        )
        return Config_Data(**{name: getattr(section, name) for name in field_names})


In [6]:
# Load the YAML files once (this also creates the artifacts directory, see the log output below).
config_obj=ConfigurationManager()
# Hyper-parameters taken from the params file.
param = config_obj.get_training_config()
# Data/model settings taken from the ``config_data`` section of the config file.
config = config_obj.get_config()

[2024-01-31 22:35:23,045: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-01-31 22:35:23,048: INFO: common: yaml file: params.yaml loaded successfully]
[2024-01-31 22:35:23,048: INFO: common: created directory at: artifacts]


In [7]:
class train_model():
    """Training driver: runs the optimisation loop and saves one checkpoint per epoch."""

    def __init__(self,config,param,train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt,model):
        """Store the objects used by :meth:`train`.

        Args:
            config: Object exposing ``datasource``, ``model_folder``,
                ``model_basename`` and ``preload``.
            param: Object exposing ``num_epochs`` and ``lr``.
            train_dataloader: Iterable of batches; each batch is indexed with
                ``'encoder_input'``, ``'decoder_input'``, ``'encoder_mask'``,
                ``'decoder_mask'`` and ``'label'``.
            val_dataloader: Stored only; :meth:`train` does not use it.
            tokenizer_src: Stored only; :meth:`train` does not use it.
            tokenizer_tgt: Target tokenizer; supplies the vocabulary size and
                the ``[PAD]`` id for the loss.
            model: Module exposing ``encode``, ``decode`` and ``project``.
        """
        self.param=param
        self.config=config
        self.train_dataloader=train_dataloader
        self.val_dataloader=val_dataloader
        self.tokenizer_src=tokenizer_src
        self.tokenizer_tgt=tokenizer_tgt
        self.model = model

    @staticmethod
    def _select_device():
        """Choose CUDA, then MPS, then CPU; print the choice; return a ``torch.device``."""
        # ``torch.backends.mps`` is missing on old PyTorch builds, hence the getattr guard.
        mps_backend = getattr(torch.backends, "mps", None)
        if torch.cuda.is_available():
            name = "cuda"
        elif mps_backend is not None and mps_backend.is_available():
            name = "mps"
        else:
            name = "cpu"

        print("Using device:", name)
        if name == "cuda":
            # Query the active CUDA device by its integer index.
            index = torch.cuda.current_device()
            print(f"Device name: {torch.cuda.get_device_name(index)}")
            print(f"Device memory: {torch.cuda.get_device_properties(index).total_memory / 1024 ** 3} GB")
        elif name == "mps":
            print(f"Device name: <mps>")
        else:
            print("NOTE: If you have a GPU, consider using it for training.")
            print("      On a Windows machine with NVidia GPU, check this video: https://www.youtube.com/watch?v=GMSjDTU8Zlc")
            print("      On a Mac machine, run: pip3 install --pre torch torchvision torchaudio torchtext --index-url https://download.pytorch.org/whl/nightly/cpu")
        return torch.device(name)

    def _restore_checkpoint(self, model, optimizer, device):
        """Load the checkpoint selected by ``config.preload``, if any.

        Returns:
            ``(initial_epoch, global_step)``; ``(0, 0)`` when nothing is loaded.
        """
        config = self.config
        preload = config.preload
        if preload == 'latest':
            model_filename = latest_weights_file_path(config)
        elif preload:
            model_filename = get_weights_file_path(config, preload)
        else:
            model_filename = None

        if not model_filename:
            print('No model to preload, starting from scratch')
            return 0, 0

        print(f'Preloading model {model_filename}')
        # map_location lets a checkpoint saved on another device type be loaded here.
        # NOTE: torch.load unpickles the file - only load checkpoints you trust.
        state = torch.load(model_filename, map_location=device)
        model.load_state_dict(state['model_state_dict'])
        optimizer.load_state_dict(state['optimizer_state_dict'])
        # Resume with the epoch after the one that was saved.
        return state['epoch'] + 1, state['global_step']

    def train(self):
        """Train ``self.model`` and write a checkpoint after every epoch.

        Runs epochs from the restored start epoch (0 without a checkpoint) up
        to ``param.num_epochs``. Each checkpoint stores the epoch, the model
        and optimizer state dicts and the global step count, at the path given
        by ``get_weights_file_path``. Returns ``None``.
        """
        # Use the objects handed to the constructor, not notebook-level globals.
        config = self.config
        param = self.param

        device = self._select_device()

        Path(f"{config.datasource}_{config.model_folder}").mkdir(parents=True, exist_ok=True)

        model = self.model.to(device)

        # The learning rate comes from the training parameters instead of a hard-coded value.
        optimizer = torch.optim.Adam(model.parameters(), lr=param.lr, eps=1e-9)

        initial_epoch, global_step = self._restore_checkpoint(model, optimizer, device)

        # Labels are compared against target-vocabulary logits (see the view below),
        # so the padding id to ignore is taken from the target tokenizer.
        pad_id = self.tokenizer_tgt.token_to_id('[PAD]')
        loss_fn = nn.CrossEntropyLoss(ignore_index=pad_id, label_smoothing=0.1).to(device)

        for epoch in range(initial_epoch, param.num_epochs):
            if device.type == 'cuda':
                torch.cuda.empty_cache()
            model.train()
            batch_iterator = tqdm(self.train_dataloader, desc=f"Processing Epoch {epoch:02d}")

            for batch in batch_iterator:

                encoder_input = batch['encoder_input'].to(device) # (b, seq_len)
                decoder_input = batch['decoder_input'].to(device) # (B, seq_len)
                encoder_mask = batch['encoder_mask'].to(device) # (B, 1, 1, seq_len)
                decoder_mask = batch['decoder_mask'].to(device) # (B, 1, seq_len, seq_len)

                # Run the tensors through the encoder, decoder and the projection layer
                encoder_output = model.encode(encoder_input, encoder_mask) # (B, seq_len, d_model)
                decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)
                proj_output = model.project(decoder_output) # (B, seq_len, vocab_size)

                # Compare the output with the label
                label = batch['label'].to(device) # (B, seq_len)

                # Flatten to (B * seq_len, vocab_size) vs (B * seq_len,) for the cross entropy
                loss = loss_fn(proj_output.view(-1, self.tokenizer_tgt.get_vocab_size()), label.view(-1))
                batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

                # Backpropagate the loss
                loss.backward()

                # Update the weights, then clear the gradients for the next batch
                optimizer.step()
                optimizer.zero_grad(set_to_none=True)

                global_step += 1

            # Save everything needed to resume after this epoch
            model_filename = get_weights_file_path(config, f"{epoch:02d}")
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'global_step': global_step
            }, model_filename)

In [8]:
from Translate.pipeline.stage_01_data_injestion import DataIngestionTrainingPipeline
from Translate.pipeline.stage_02_prepare_model import PrepareModelPipeline

In [9]:
# Run the data-ingestion stage; main() yields the two dataloaders and the two tokenizers.
data_obj = DataIngestionTrainingPipeline()
train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = data_obj.main()

[2024-01-31 22:35:23,081: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-01-31 22:35:23,082: INFO: common: yaml file: params.yaml loaded successfully]
[2024-01-31 22:35:23,082: INFO: common: created directory at: artifacts]
[2024-01-31 22:35:28,455: INFO: data_injestion: Max length of source sentence: 309]
[2024-01-31 22:35:28,455: INFO: data_injestion: Max length of target sentence: 274]


In [10]:
# Build the model; main() is given the source and target vocabulary sizes.
prepare_model_obj = PrepareModelPipeline()
model = prepare_model_obj.main(tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size())

[2024-01-31 22:35:28,461: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-01-31 22:35:28,462: INFO: common: yaml file: params.yaml loaded successfully]
[2024-01-31 22:35:28,463: INFO: common: created directory at: artifacts]


In [11]:
# Bundle configuration, data, tokenizers and model into the training driver.
train_obj = train_model(config,param,train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt,model)

In [12]:
# Display the batch size read from the params file.
param.batch_size

1

In [13]:
# Start training.
# NOTE(review): the recorded output stops at the first progress-bar line of epoch 0,
# so no completed iteration is shown - re-run and confirm the loop proceeds.
train_obj.train()

Using device: mps
Device name: <mps>
No model to preload, starting from scratch


Processing Epoch 00:   0%|          | 0/29098 [00:00<?, ?it/s]

: 