In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from tqdm import tqdm
import torch
import torch.nn as nn
from collections import Counter

In [3]:
from Transformers import Transformer
from Pipeline import TextPreprocessingPipeline
from BPE_tokenizer import BPE

In [4]:
df = pd.read_csv('../Data/dataset_kag.csv')
print(f'Size of the dataframe is {df.shape}')
df.head()

Size of the dataframe is (300000, 2)


Unnamed: 0,English,Kannada
0,Hes a scientist.,ಇವರು ಸಂಶೋಧಕ ಸ್ವಭಾವದವರು.
1,'But we speak the truth aur ye sach hai ke Guj...,"""ಆದರೆ ಸತ್ಯ ಹೊರ ಬಂದೇ ಬರುತ್ತದೆ ಎಂದು ಹೇಳಿದ ರಾಹುಲ್..."
2,8 lakh crore have been looted.,ಕಳ್ಳತನವಾಗಿದ್ದ 8 ಲಕ್ಷ ರೂ.
3,I read a lot into this as well.,ಇದರ ಬಗ್ಗೆ ನಾನೂ ಸಾಕಷ್ಟು ಓದಿದ್ದೇನೆ.
4,She was found dead with the phone's battery ex...,ಆಕೆಯ ತಲೆಯ ಹತ್ತಿರ ಇರಿಸಿಕೊಂಡಿದ್ದ ಫೋನ್‌ನ ಬ್ಯಾಟರಿ ...


In [5]:
dataset = df.sample(20000).reset_index(drop=True)
dataset.head()

Unnamed: 0,English,Kannada
0,"Congress leader in Lok Sabha, Mallikarjun Kharge.","ಮಲ್ಲಿಕಾರ್ಜುನ ಖರ್ಗೆ, ಲೋಕಸಭೆಯ ಕಾಂಗ್ರೆಸ್‌ ಪಕ್ಷದ ನಾಯಕ"
1,SRINAGAR: Ahead of the Republic Day celebratio...,ಶ್ರೀನಗರ: ದೇಶಾದ್ಯಂತ ಸ್ವಾತಂತ್ರ್ಯೋತ್ಸವದ ಸಂಭ್ರಮಕ್ಕ...
2,You will have physical and mental happiness.,ಶಾರೀರಿಕವಾಗಿ ಮತ್ತು ಮಾನಸಿಕವಾಗಿ ತುಂಬಾ ಸಂತೋಷದಿಂದ ಇ...
3,[ Pictures on page 29],[ ಪುಟ 29ರಲ್ಲಿರುವ ಚಿತ್ರ]
4,Why did,ಯಾಕೆ ಮಾಡಿಸಿದ್ದು?


class PreProcess:
    def __init__(self):
        print("Loading the required files")
        self.english_contractions = '../Data/english_contractions.json'
        if not os.path.isfile(self.english_contractions):
            raise FileNotFoundError("Contraction file does not exist")
        else:
            print("JSON file exists at location")

        # Initialize normalizers
        self.text_processor_eng = PP.TextNormalizerEnglish(self.english_contractions)
        self.text_processor_kan = PP.TextNormalizerKannada()

        # Initialize vocabularies
        self.vocab_eng = set()
        self.vocab_kan = set()

        # Special tokens
        self.special_tokens = {
            "<PAD>": 0,
            "<SOS>": 1,
            "<EOS>": 2,
            "<UNK>": 3
        }

        # Load BPE tokenizer
        self.tokenizer = self.load_tokenizer('../Models/tokenizer.pkl')

    def load_tokenizer(self, tokenizer_file):
        print("Loading BPE tokenizer...")
        with open(tokenizer_file, 'rb') as f:
            merges = pickle.load(f)
        print("BPE tokenizer loaded successfully.")
        print(type(merges))
        tokenizer = BPE(corpus=None, vocab_size=None)
        tokenizer.merges = merges
        return tokenizer

    def preprocess_english(self, sentence):
        print("Normalizing the English sentence")
        normalized_sentence = self.text_processor_eng.normalize(sentence)
        self.build_vocabulary(normalized_sentence, lang='english')
        return normalized_sentence

    def preprocess_kannada(self, sentence):
        print("Normalizing the Kannada sentence")
        normalized_sentence = self.text_processor_kan.normalize(sentence)
        self.build_vocabulary(normalized_sentence, lang='kannada')
        return normalized_sentence

    def build_vocabulary(self, sentence, lang):
        if lang == 'english':
            words = sentence.split()
            self.vocab_eng.update(words)
        elif lang == 'kannada':
            words = sentence.split()
            self.vocab_kan.update(words)

    def find_max_sequence_length(self, sentences):
        max_length = max(len(sentence.split()) for sentence in sentences)
        return max_length

    def pad_sentences(self, sentence, max_length):
            # Tokenize using the BPE tokenizer
        print(f'Original Sentence : {sentence}')
        tokens = self.tokenizer.tokenize(sentence)
        print(f'Generated Tokens : {tokens}')
        print("Performing Padding")
            # Add <SOS> at the start and <EOS> at the end
        padded_sentence = ['<SOS>'] + tokens + ['<EOS>']
            # Calculate how many <PAD> tokens are needed
        padding_length = max_length - len(padded_sentence)
            # Pad with <PAD> token if necessary (post padding)
        if padding_length > 0:
                padded_sentence += ['<PAD>'] * padding_length
        return padded_sentence

In [7]:
preprocess = TextPreprocessingPipeline('C:\Users\Amodini\Downloads\Barbie\B.A.R.B.I.E-main\Data\english_contractions.json')
eng_sentences = [
    "I'm going to the store.",
    "This is an example sentence of 7 tokens."
]
kan_sentences = [
    "ನಾನು ಅಂಗಡಿಗೆ ಹೋಗುತ್ತಿದ್ದೇನೆ.",
    "ಈ ಒಂದು ಉದಾಹರಣೆ ವಾಕ್ಯವಾಗಿದೆ."
]

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (2994196487.py, line 1)

In [None]:
def initialize_model(src_vocab_size, tgt_vocab_size, max_seq_len, embedding_dim=512, num_layers=6, expansion_factor=4, n_heads=8):
    model = Transformer(
        vocab_size=max(src_vocab_size, tgt_vocab_size),
        embedding_dim=embedding_dim,
        max_seq_len=max_seq_len,
        num_layers=num_layers,
        expansion_factor=expansion_factor,
        n_heads=n_heads
    )
    return model

In [8]:
def train_model(model, src_sequences, tgt_sequences, num_epochs=10, learning_rate=0.001, checkpoint_dir='checkpoints', batch_size=32):
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0
        correct_predictions = 0
        total_tokens = 0
        
        optimizer.zero_grad()
        
        # Wrap the data loader with tqdm for progress tracking
        with tqdm(total=src_sequences.size(0), desc=f'Epoch {epoch + 1}/{num_epochs}', unit='batch') as pbar:
            for i in range(0, src_sequences.size(0), batch_size):
                src_batch = src_sequences[i:i+batch_size]
                tgt_batch = tgt_sequences[i:i+batch_size]
                
                # Forward pass
                outputs = model(src_batch, tgt_batch, tgt_mask=None)
                
                # Compute loss
                loss = criterion(outputs.view(-1, outputs.size(-1)), tgt_batch.view(-1))
                
                # Backward pass and optimization
                loss.backward()
                optimizer.step()
                
                # Compute accuracy
                predicted = outputs.argmax(dim=-1)
                mask = tgt_batch != 0  # Assume 0 is the padding index
                correct_predictions += (predicted == tgt_batch).masked_select(mask).sum().item()
                total_tokens += mask.sum().item()
                
                epoch_loss += loss.item()
                
                # Update progress bar
                pbar.update(src_batch.size(0))
                accuracy = correct_predictions / total_tokens if total_tokens > 0 else 0
                pbar.set_postfix(loss=epoch_loss / (i + src_batch.size(0)), accuracy=accuracy)
        
        checkpoint_path = os.path.join(checkpoint_dir, f'model_epoch_{epoch+1}.pt')
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': epoch_loss,
        }, checkpoint_path)
        print(f'Model checkpoint saved at {checkpoint_path}')


In [None]:
def predict(model, src_sequence, max_tgt_len):
    model.eval()
    with torch.no_grad():
        src_sequence = src_sequence.unsqueeze(0)  # Add batch dimension
        tgt_sequence = torch.zeros((1, max_tgt_len), dtype=torch.long)  # Initial empty target sequence

        for i in range(max_tgt_len):
            output = model(src_sequence, tgt_sequence, tgt_mask=None)
            prediction = output[:, i, :].argmax(dim=-1)
            tgt_sequence[:, i] = prediction

        return tgt_sequence.squeeze(0)

In [9]:

all_tokens = [word.lower() for text in df['Kannada'] for word in text.split()]

unique_tokens = set(all_tokens)

src_vocabulary_size = len(unique_tokens)
print("Vocabulary Size:", src_vocabulary_size)

all_tokens2 = [word.lower() for text in df['English'] for word in text.split()]

unique_tokens2 = set(all_tokens2)

tgt_vocabulary_size = len(unique_tokens2)
print("Vocabulary Size:", tgt_vocabulary_size)

Vocabulary Size: 345752
Vocabulary Size: 133039


In [14]:
sentence_lengths1 = df['Kannada'].dropna().apply(lambda x: len(x.split()))

# Calculate the maximum sentence length
src_max_length = sentence_lengths1.max()

print("Maximum sentence length:", src_max_length)

sentence_lengths2 = df['English'].dropna().apply(lambda x: len(x.split()))

# Calculate the maximum sentence length
tgt_max_length = sentence_lengths2.max()

print("Maximum sentence length:", tgt_max_length)

Maximum sentence length: 135
Maximum sentence length: 238


In [None]:
src_sequences, tgt_sequences, src_vocab_size, tgt_vocab_size, max_src_len, max_tgt_len = preprocess(dataset['Kannada Sentences'], dataset['English Sentences'],src_vocabulary_size,tgt_vocabulary_size,src_max_length,tgt_max_length)

Traceback (most recent call last):
  File "c:\Users\Amodini\.vscode\extensions\ms-python.python-2024.20.0-win32-x64\python_files\python_server.py", line 130, in exec_user_input
    retval = callable_(user_input, user_globals)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<string>", line 1, in <module>
NameError: name 'preprocess' is not defined



In [None]:
model = initialize_model(src_vocab_size, tgt_vocab_size, max_seq_len=max(max_src_len, max_tgt_len))

Traceback (most recent call last):
  File "c:\Users\Amodini\.vscode\extensions\ms-python.python-2024.20.0-win32-x64\python_files\python_server.py", line 130, in exec_user_input
    retval = callable_(user_input, user_globals)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<string>", line 1, in <module>
NameError: name 'src_vocab_size' is not defined



In [None]:
train_model(model, src_sequences, tgt_sequences, num_epochs=10, learning_rate=0.001)

Traceback (most recent call last):
  File "c:\Users\Amodini\.vscode\extensions\ms-python.python-2024.20.0-win32-x64\python_files\python_server.py", line 130, in exec_user_input
    retval = callable_(user_input, user_globals)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<string>", line 1, in <module>
NameError: name 'model' is not defined

