In [None]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from src.Transformers import Transformer

In [None]:
import torch
import torch.nn as nn
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Read the dataset from a CSV file located relative to the working directory,
# report its (rows, columns) shape, and preview the first rows as the cell output.
df = pd.read_csv('machine_translation_dataset.csv')
print(f'Size of the dataframe is {df.shape}')
df.head()

In [None]:
# Work on a random subset of at most 20,000 rows to keep training time manageable.
# - min(...) keeps the cell from raising ValueError when the CSV holds fewer rows
#   than requested (sampling without replacement cannot exceed the population).
# - random_state pins the draw so Restart & Run All trains on the same rows.
dataset = df.sample(n=min(20000, len(df)), random_state=42).reset_index(drop=True)
dataset

In [None]:
def preprocess_texts(source_texts, target_texts, max_src_len=None, max_tgt_len=None):
    """Turn two parallel collections of sentences into padded id tensors.

    Each side gets its own freshly fitted ``Tokenizer``; the resulting id
    sequences are post-padded (padding appended at the end) to a fixed length
    and converted to ``torch.long`` tensors.

    Args:
        source_texts: Iterable of source-language sentences.
        target_texts: Iterable of target-language sentences.
        max_src_len: Padded length for the source side. When ``None`` the
            length of the longest tokenized source sentence is used.
        max_tgt_len: Padded length for the target side. When ``None`` the
            length of the longest tokenized target sentence is used.

    Returns:
        A 6-tuple ``(src_sequences, tgt_sequences, src_vocab_size,
        tgt_vocab_size, max_src_len, max_tgt_len)``. Each vocabulary size is
        ``len(tokenizer.word_index) + 1``; the extra slot leaves room for id 0,
        which the rest of this notebook treats as padding.
    """
    def _fit_and_encode(texts):
        # Build a vocabulary from `texts` and encode the same texts with it.
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(texts)
        encoded = tokenizer.texts_to_sequences(texts)
        return encoded, len(tokenizer.word_index) + 1

    # Stage 1: independent vocabularies for the two languages.
    src_ids, src_vocab_size = _fit_and_encode(source_texts)
    tgt_ids, tgt_vocab_size = _fit_and_encode(target_texts)

    # Stage 2: fall back to the longest observed sentence when no length is given.
    if max_src_len is None:
        max_src_len = max(map(len, src_ids))
    if max_tgt_len is None:
        max_tgt_len = max(map(len, tgt_ids))

    # Stage 3: pad every sentence to its side's fixed length.
    src_padded = pad_sequences(src_ids, maxlen=max_src_len, padding='post')
    tgt_padded = pad_sequences(tgt_ids, maxlen=max_tgt_len, padding='post')

    # Stage 4: hand back integer tensors together with the sizes the model needs.
    return (
        torch.tensor(src_padded, dtype=torch.long),
        torch.tensor(tgt_padded, dtype=torch.long),
        src_vocab_size,
        tgt_vocab_size,
        max_src_len,
        max_tgt_len,
    )


In [None]:
def initialize_model(src_vocab_size, tgt_vocab_size, max_seq_len, embedding_dim=512, num_layers=6, expansion_factor=4, n_heads=8):
    """Construct the project's ``Transformer`` for a source/target vocabulary pair.

    ``Transformer`` receives a single ``vocab_size``, so the larger of the two
    vocabulary sizes is passed; every other argument is forwarded unchanged
    under the same keyword name.

    Args:
        src_vocab_size: Size of the source vocabulary.
        tgt_vocab_size: Size of the target vocabulary.
        max_seq_len: Forwarded as ``max_seq_len``.
        embedding_dim: Forwarded as ``embedding_dim``.
        num_layers: Forwarded as ``num_layers``.
        expansion_factor: Forwarded as ``expansion_factor``.
        n_heads: Forwarded as ``n_heads``.

    Returns:
        The newly constructed ``Transformer`` instance.
    """
    # One size that is large enough to index ids from either vocabulary.
    shared_vocab_size = max(src_vocab_size, tgt_vocab_size)

    hyperparameters = {
        'vocab_size': shared_vocab_size,
        'embedding_dim': embedding_dim,
        'max_seq_len': max_seq_len,
        'num_layers': num_layers,
        'expansion_factor': expansion_factor,
        'n_heads': n_heads,
    }
    return Transformer(**hyperparameters)

In [None]:
def train_model(model, src_sequences, tgt_sequences, num_epochs=10, learning_rate=0.001, checkpoint_dir='checkpoints', batch_size=32):
    """Train ``model`` with Adam and cross-entropy, checkpointing after every epoch.

    The data is consumed in order, in consecutive slices of ``batch_size`` rows
    (no shuffling). Target id 0 is treated as padding: it is ignored by the
    loss and excluded from the running accuracy.

    Args:
        model: ``torch.nn.Module`` invoked as
            ``model(src_batch, tgt_batch, tgt_mask=None)``. The last dimension
            of its output is used as the per-class scores.
        src_sequences: Tensor of source token ids; dim 0 indexes the examples.
        tgt_sequences: Tensor of target token ids, row-aligned with
            ``src_sequences``.
        num_epochs: Number of full passes over the data.
        learning_rate: Learning rate handed to ``torch.optim.Adam``.
        checkpoint_dir: Directory for the checkpoints; created when missing.
        batch_size: Number of rows per optimisation step.

    Returns:
        None. The model is updated in place and left in training mode. One file
        ``model_epoch_<k>.pt`` per epoch is written to ``checkpoint_dir``,
        holding the epoch number, the model and optimizer state dicts, and the
        sum of the per-batch losses of that epoch.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(checkpoint_dir, exist_ok=True)

    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    num_samples = src_sequences.size(0)

    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0
        correct_predictions = 0
        total_tokens = 0
        batches_seen = 0

        # The bar counts processed samples (total/update are in rows, not batches).
        with tqdm(total=num_samples, desc=f'Epoch {epoch + 1}/{num_epochs}', unit='batch') as pbar:
            for i in range(0, num_samples, batch_size):
                src_batch = src_sequences[i:i+batch_size]
                tgt_batch = tgt_sequences[i:i+batch_size]

                # Clear gradients for EVERY step. Clearing only once per epoch
                # makes each update use the sum of all earlier batch gradients.
                optimizer.zero_grad()

                # NOTE(review): the decoder is given the same, unshifted tokens
                # it is scored against and no target mask is supplied. Confirm
                # that Transformer shifts/masks internally; otherwise it can
                # learn to copy its input.
                outputs = model(src_batch, tgt_batch, tgt_mask=None)

                # Flatten to (tokens, classes) vs (tokens,) for the criterion.
                loss = criterion(outputs.view(-1, outputs.size(-1)), tgt_batch.view(-1))

                loss.backward()
                optimizer.step()

                # Token-level accuracy over non-padding positions only.
                predicted = outputs.argmax(dim=-1)
                mask = tgt_batch != 0
                correct_predictions += (predicted == tgt_batch).masked_select(mask).sum().item()
                total_tokens += mask.sum().item()

                epoch_loss += loss.item()
                batches_seen += 1

                pbar.update(src_batch.size(0))
                accuracy = correct_predictions / total_tokens if total_tokens > 0 else 0
                # Each loss.item() is already a per-batch mean, so the running
                # average is taken over batches, not over samples.
                pbar.set_postfix(loss=epoch_loss / batches_seen, accuracy=accuracy)

        checkpoint_path = os.path.join(checkpoint_dir, f'model_epoch_{epoch+1}.pt')
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': epoch_loss,
        }, checkpoint_path)
        print(f'Model checkpoint saved at {checkpoint_path}')


In [None]:
def predict(model, src_sequence, max_tgt_len):
    """Greedily decode ``max_tgt_len`` target token ids for one source sequence.

    The decoder input starts as all zeros (the id this notebook treats as
    padding). At step ``i`` the model is run on the source and the current
    decoder input, and the highest-scoring class at output position ``i`` is
    written into position ``i`` of the decoder input. Decoding always runs for
    exactly ``max_tgt_len`` steps; there is no early stop.

    Args:
        model: Module invoked as ``model(src, tgt, tgt_mask=None)`` whose
            output is indexed as ``output[:, i, :]``.
        src_sequence: Tensor of source token ids without a batch dimension.
        max_tgt_len: Number of target positions to generate.

    Returns:
        ``torch.long`` tensor of ``max_tgt_len`` predicted ids, on the same
        device as ``src_sequence``.

    Note:
        The model is switched to evaluation mode and left in it.
    """
    model.eval()
    with torch.no_grad():
        src_sequence = src_sequence.unsqueeze(0)  # add the batch dimension
        # Allocate on the source's device so decoding also works when the
        # inputs and model are not on the CPU.
        tgt_sequence = torch.zeros(
            (1, max_tgt_len), dtype=torch.long, device=src_sequence.device
        )

        for i in range(max_tgt_len):
            output = model(src_sequence, tgt_sequence, tgt_mask=None)
            # Keep only the best class at the position being filled in.
            tgt_sequence[:, i] = output[:, i, :].argmax(dim=-1)

        return tgt_sequence.squeeze(0)

In [None]:
# Encode both columns of the sampled frame; the padded lengths are left for
# preprocess_texts to derive from the longest sentence on each side.
(
    src_sequences,
    tgt_sequences,
    src_vocab_size,
    tgt_vocab_size,
    max_src_len,
    max_tgt_len,
) = preprocess_texts(
    source_texts=dataset['Kannada Sentences'],
    target_texts=dataset['English Sentences'],
)

In [None]:
# A single max_seq_len has to cover both sides, so pass the longer padded length.
model = initialize_model(
    src_vocab_size,
    tgt_vocab_size,
    max_seq_len=max(max_src_len, max_tgt_len),
)

In [None]:
# Train for 10 epochs; batch size and checkpoint directory use train_model's defaults.
train_model(
    model,
    src_sequences,
    tgt_sequences,
    num_epochs=10,
    learning_rate=0.001,
)