- Remove non-alphanumeric characters to simplify training

In [None]:
from transformer import Transformer # this is the transformer.py file
import torch
import numpy as np

In [None]:
# Mount Google Drive (Colab only) so the corpus files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Parallel corpus files on the mounted Drive. The two files are read line by line and
# paired by line number; only the first TOTAL_SENTENCES lines of each are kept.
english_file = '/content/drive/My Drive/NLP-Project/train.en'
marathi_file = '/content/drive/My Drive/NLP-Project/train.mr'

# Generated this by filtering Appendix code (presumably refers to the vocabularies below)

# Special tokens that appear in both vocabularies alongside the single characters.
START_TOKEN = '<START>'
PADDING_TOKEN = '<PADDING>'
END_TOKEN = '<END>'

# Character-level target vocabulary. A token's position in this list is its integer id
# (the lookup tables are built from it with enumerate). START_TOKEN comes first;
# PADDING_TOKEN and END_TOKEN come last.
# NOTE(review): the third row appears to contain Telugu-script characters next to the
# Devanagari ones — confirm they are intended.
# NOTE(review): anusvara 'ं' and visarga 'ः' do not appear in this list, so any sentence
# containing them fails is_valid_tokens() — confirm that dropping those sentences is intended.
marathi_vocabulary = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
                      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', 'ˌ',
                      'ँ', 'ఆ', 'ఇ', 'ా', 'ి', 'ీ', 'ు', 'ూ',
                      'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ॠ', 'ऌ', 'ऎ', 'ए', 'ऐ', 'ऒ', 'ओ', 'औ',
                      'क', 'ख', 'ग', 'घ', 'ङ',
                      'च', 'छ', 'ज', 'झ', 'ञ',
                      'ट', 'ठ', 'ड', 'ढ', 'ण',
                      'त', 'थ', 'द', 'ध', 'न',
                      'प', 'फ', 'ब', 'भ', 'म',
                      'य', 'र', 'ऱ', 'ल', 'ळ', 'व', 'श', 'ष', 'स', 'ह',
                      '़', 'ऽ', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'ॄ', 'ॅ', 'े', 'ै', 'ॉ', 'ो', 'ौ', '्', 'ॐ', '।', '॥', '॰', 'ॱ', PADDING_TOKEN, END_TOKEN]

# Character-level source vocabulary: digits, lower-case ASCII letters and punctuation,
# framed by the same special tokens as the Marathi list. A token's position is its id.
# Upper-case letters are not listed (English sentences are lower-cased when loaded),
# and neither is ';'.
english_vocabulary = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
                        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                        ':', '<', '=', '>', '?', '@',
                        '[', '\\', ']', '^', '_', '`',
                        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                        'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
                        'y', 'z',
                        '{', '|', '}', '~', PADDING_TOKEN, END_TOKEN]




# Lookup tables in both directions; a token's id is its position in the vocabulary list.
index_to_marathi = dict(enumerate(marathi_vocabulary))
marathi_to_index = {token: position for position, token in enumerate(marathi_vocabulary)}
index_to_english = dict(enumerate(english_vocabulary))
english_to_index = {token: position for position, token in enumerate(english_vocabulary)}

# Read the parallel corpus. The encoding is passed explicitly because open() otherwise
# uses a platform-dependent default, which can fail or garble the Devanagari text.
# Assumes both files are UTF-8 encoded.
with open(english_file, 'r', encoding='utf-8') as file:
    english_sentences = file.readlines()
with open(marathi_file, 'r', encoding='utf-8') as file:
    marathi_sentences = file.readlines()

# Limit Number of sentences
TOTAL_SENTENCES = 500000
english_sentences = english_sentences[:TOTAL_SENTENCES]
marathi_sentences = marathi_sentences[:TOTAL_SENTENCES]
# Drop the trailing newline of every line; English is lower-cased as well, which matches
# the lower-case-only english_vocabulary.
english_sentences = [sentence.rstrip('\n').lower() for sentence in english_sentences]
marathi_sentences = [sentence.rstrip('\n') for sentence in marathi_sentences]



import numpy as np
# Report the sentence length (in characters) found at the chosen percentile, per language.
PERCENTILE = 97
marathi_lengths = [len(sentence) for sentence in marathi_sentences]
english_lengths = [len(sentence) for sentence in english_sentences]
marathi_percentile_length = np.percentile(marathi_lengths, PERCENTILE)
english_percentile_length = np.percentile(english_lengths, PERCENTILE)
print(f"{PERCENTILE}th percentile length marathi: {marathi_percentile_length}")
print(f"{PERCENTILE}th percentile length English: {english_percentile_length}")



# Sequence-length budget in characters: the length filter below keeps only sentences
# shorter than this value minus one. The same value is assigned again next to the
# model hyperparameters.
max_sequence_length = 180

def is_valid_tokens(sentence, vocab):
    """Return True when every distinct token of `sentence` is a member of `vocab`.

    An empty sentence has no tokens to reject, so it is reported as valid.
    """
    return all(token in vocab for token in set(sentence))

def is_valid_length(sentence, max_sequence_length):
    """Return True when `sentence` has fewer than `max_sequence_length - 1` tokens.

    One position is held back because the end token is re-added later.
    """
    token_count = len(list(sentence))
    exclusive_limit = max_sequence_length - 1
    return token_count < exclusive_limit

# Keep only the sentence pairs the character-level model can represent: both sides must
# pass the length check and every character must exist in that language's vocabulary.
# English sentences are validated too, because english_to_index (built from
# english_vocabulary) is what the Transformer is given for the source side; a character
# outside it presumably cannot be mapped to an id.
marathi_vocabulary_set = set(marathi_vocabulary)  # set lookups instead of scanning the list per character
english_vocabulary_set = set(english_vocabulary)

valid_sentence_indicies = []
for index in range(len(marathi_sentences)):
    marathi_sentence, english_sentence = marathi_sentences[index], english_sentences[index]
    if is_valid_length(marathi_sentence, max_sequence_length) \
      and is_valid_length(english_sentence, max_sequence_length) \
      and is_valid_tokens(marathi_sentence, marathi_vocabulary_set) \
      and is_valid_tokens(english_sentence, english_vocabulary_set):
        valid_sentence_indicies.append(index)

print(f"Number of sentences: {len(marathi_sentences)}")
print(f"Number of valid sentences: {len(valid_sentence_indicies)}")


# Both lists are rebuilt from the same indices so they stay aligned pair by pair.
marathi_sentences = [marathi_sentences[i] for i in valid_sentence_indicies]
english_sentences = [english_sentences[i] for i in valid_sentence_indicies]


import torch

# Hyperparameters handed to the Transformer constructor below.
d_model = 512
# NOTE(review): the DataLoaders in the next cell are built with batch_size=64, not this value.
batch_size = 512
ffn_hidden = 2048
num_heads = 16
drop_prob = 0.1
num_layers = 2
max_sequence_length = 180
mr_vocab_size = len(marathi_vocabulary)

transformer = Transformer(d_model,
                          ffn_hidden,
                          num_heads,
                          drop_prob,
                          num_layers,
                          max_sequence_length,
                          mr_vocab_size,
                          english_to_index,
                          marathi_to_index,
                          START_TOKEN,
                          END_TOKEN,
                          PADDING_TOKEN)

# The loss is reached through the already-imported `torch` module: the bare name `nn`
# is only imported in a later cell, so using it here fails on a fresh kernel.
# Positions whose label is the padding token are ignored by the loss.
criterion = torch.nn.CrossEntropyLoss(ignore_index=marathi_to_index[PADDING_TOKEN])
optimizer = torch.optim.Adam(transformer.parameters(), lr=1e-4)
# Use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

97th percentile length marathi: 186.0
97th percentile length English: 199.0
Number of sentences: 500000
Number of valid sentences: 159495


In [None]:
from torch.utils.data import Dataset, DataLoader

# Dataset class
class TextDataset(Dataset):
    """Pairs each English sentence with the Marathi sentence at the same index.

    Args:
        english_sentences: sequence of source sentences.
        marathi_sentences: sequence of target sentences, aligned by index with
            `english_sentences`.

    Raises:
        ValueError: if the two sequences differ in length. The dataset length is taken
            from the English side, so a mismatch would otherwise surface as an
            IndexError part-way through an epoch or silently drop sentences.
    """

    def __init__(self, english_sentences, marathi_sentences):
        if len(english_sentences) != len(marathi_sentences):
            raise ValueError(
                "english_sentences and marathi_sentences must have the same length, got "
                f"{len(english_sentences)} and {len(marathi_sentences)}"
            )
        self.english_sentences = english_sentences
        self.marathi_sentences = marathi_sentences

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.english_sentences)

    def __getitem__(self, idx):
        """Return the (english, marathi) pair stored at position `idx`."""
        return self.english_sentences[idx], self.marathi_sentences[idx]

# Split the dataset into training and validation sets: the leading 80% of the sentence
# pairs (in their current order) train the model, the remaining tail validates it.
split_ratio = 0.8
split_index = int(len(english_sentences) * split_ratio)
train_english, valid_english = english_sentences[:split_index], english_sentences[split_index:]
train_marathi, valid_marathi = marathi_sentences[:split_index], marathi_sentences[split_index:]

train_dataset = TextDataset(english_sentences=train_english, marathi_sentences=train_marathi)
valid_dataset = TextDataset(english_sentences=valid_english, marathi_sentences=valid_marathi)

# Only the training loader shuffles; validation batches keep their order.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=64, shuffle=False)

In [None]:
from torch import nn

# Per-position loss (reduction='none') that ignores positions whose label is the padding token.
# NOTE(review): `criterian` and the `optim` created below are not the objects passed to
# train_and_validate() later in the notebook, which receives `criterion` and `optimizer`.
criterian = nn.CrossEntropyLoss(ignore_index=marathi_to_index[PADDING_TOKEN],
                                reduction='none')

# Re-initialise every parameter that has more than one dimension with Xavier-uniform
# values; parameters with a single dimension (or none) are left untouched.
for params in transformer.parameters():
    if params.dim() > 1:
        nn.init.xavier_uniform_(params)

optim = torch.optim.Adam(transformer.parameters(), lr=1e-4)
# Use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# Additive value written into blocked attention positions.
NEG_INFTY = -1e9

def create_masks(eng_batch, mr_batch):
    """Build the three additive attention masks for one batch of sentence pairs.

    Allowed positions hold 0 and blocked positions hold NEG_INFTY. For every sentence,
    all positions from `len(sentence) + 1` onwards are treated as padding (the extra
    slot presumably leaves room for an added start/end token — TODO confirm).

    Args:
        eng_batch: sequence of English sentences.
        mr_batch: sequence of Marathi sentences, aligned with `eng_batch`.

    Returns:
        Tuple `(encoder_self_attention_mask, decoder_self_attention_mask,
        decoder_cross_attention_mask)`, each of shape
        `(len(eng_batch), max_sequence_length, max_sequence_length)`.
    """
    batch_len = len(eng_batch)
    mask_shape = (batch_len, max_sequence_length, max_sequence_length)

    # Strict upper triangle: a position may not attend to any later position.
    causal = torch.ones(max_sequence_length, max_sequence_length, dtype=torch.bool).triu(diagonal=1)

    enc_pad = torch.zeros(mask_shape, dtype=torch.bool)
    dec_self_pad = torch.zeros(mask_shape, dtype=torch.bool)
    dec_cross_pad = torch.zeros(mask_shape, dtype=torch.bool)

    for row in range(batch_len):
        eng_pad_start = len(eng_batch[row]) + 1
        mr_pad_start = len(mr_batch[row]) + 1

        # Encoder self-attention: English padding blocked as key (column) and as query (row).
        enc_pad[row, :, eng_pad_start:] = True
        enc_pad[row, eng_pad_start:, :] = True

        # Decoder self-attention: the same, driven by the Marathi length.
        dec_self_pad[row, :, mr_pad_start:] = True
        dec_self_pad[row, mr_pad_start:, :] = True

        # Cross-attention: keys (columns) come from English, queries (rows) from Marathi.
        dec_cross_pad[row, :, eng_pad_start:] = True
        dec_cross_pad[row, mr_pad_start:, :] = True

    encoder_self_attention_mask = torch.where(enc_pad, NEG_INFTY, 0)
    # The look-ahead mask broadcasts over the batch and is OR-ed with the padding mask.
    decoder_self_attention_mask = torch.where(causal | dec_self_pad, NEG_INFTY, 0)
    decoder_cross_attention_mask = torch.where(dec_cross_pad, NEG_INFTY, 0)
    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask

Modify the mask so that padding tokens cannot look ahead.
In the encoder, tokens before a padding token should be masked with -1e9, while tokens after it should be masked with -inf.


Note that the target mask starts with two rows of unmasked items: https://github.com/SamLynnEvans/Transformer/blob/master/Beam.py#L55


In [None]:
import matplotlib.pyplot as plt

# Training and validation function
def _masked_batch_loss(model, criterion, eng_batch, mr_batch, device):
    """Forward one batch through `model` and return its loss averaged over non-padding labels.

    Shared by the training and validation passes of train_and_validate().
    """
    encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_batch, mr_batch)
    mr_predictions = model(eng_batch,
                           mr_batch,
                           encoder_self_attention_mask.to(device),
                           decoder_self_attention_mask.to(device),
                           decoder_cross_attention_mask.to(device),
                           enc_start_token=False,
                           enc_end_token=False,
                           dec_start_token=True,
                           dec_end_token=True)
    labels = model.decoder.sentence_embedding.batch_tokenize(mr_batch, start_token=False, end_token=True)
    # Labels are moved to `device` once so the loss and the padding mask share a device.
    flat_labels = labels.view(-1).to(device)
    loss = criterion(
        mr_predictions.view(-1, len(marathi_vocabulary)).to(device),
        flat_labels
    )
    valid_indices = flat_labels != marathi_to_index[PADDING_TOKEN]
    # With a per-position criterion this is the mean over non-padding positions; with a
    # criterion that already returns a scalar mean it evaluates to that same scalar.
    return (loss * valid_indices).sum() / valid_indices.sum()


def train_and_validate(num_epochs, train_loader, valid_loader, model, criterion, optimizer, device):
    """Train `model` for `num_epochs` epochs, validating after each one.

    Args:
        num_epochs: number of passes over `train_loader`.
        train_loader: iterable yielding `(eng_batch, mr_batch)` training batches.
        valid_loader: iterable yielding `(eng_batch, mr_batch)` validation batches.
        model: the translation model; called with the two batches, the three masks
            from create_masks() and the start/end-token flags.
        criterion: loss applied to the flattened predictions and labels.
        optimizer: optimizer stepping the model's parameters.
        device: device on which the model, masks and loss are placed.

    Returns:
        Tuple `(train_losses, valid_losses)`: two lists with one entry per epoch, each
        entry being the average of that epoch's per-batch losses.
    """
    # The masks are sent to `device`, so the model has to live there as well
    # (a no-op when it is already on that device).
    model.to(device)

    train_losses = []
    valid_losses = []
    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0
        for eng_batch, mr_batch in train_loader:
            optimizer.zero_grad()
            loss = _masked_batch_loss(model, criterion, eng_batch, mr_batch, device)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)
        train_losses.append(avg_train_loss)

        # Validation phase: same loss, no gradient tracking and no parameter updates.
        model.eval()
        total_valid_loss = 0
        with torch.no_grad():
            for eng_batch, mr_batch in valid_loader:
                loss = _masked_batch_loss(model, criterion, eng_batch, mr_batch, device)
                total_valid_loss += loss.item()

        avg_valid_loss = total_valid_loss / len(valid_loader)
        valid_losses.append(avg_valid_loss)

        print(f"Epoch {epoch}: Train Loss: {avg_train_loss:.4f}, Valid Loss: {avg_valid_loss:.4f}")

    return train_losses, valid_losses

# Run training and validation
num_epochs = 200
train_losses, valid_losses = train_and_validate(
    num_epochs, train_loader, valid_loader, transformer, criterion, optimizer, device
)

# Plot losses: both curves share one figure, with epochs numbered from 1 on the x-axis.
epoch_axis = range(1, num_epochs + 1)
plt.figure(figsize=(10, 5))
plt.plot(epoch_axis, train_losses, 'bo-', label='Training loss')
plt.plot(epoch_axis, valid_losses, 'ro-', label='Validation loss')
plt.title('Training and Validation Losses')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()

Epoch 0: Train Loss: 2.5232, Valid Loss: 2.0366
Epoch 1: Train Loss: 1.9192, Valid Loss: 1.6853
Epoch 2: Train Loss: 1.6796, Valid Loss: 1.5354


## Inference

In [None]:
# Switch the model to evaluation mode before it is used for inference.
transformer.eval()
def translate(eng_sentence):
  """Greedily decode a Marathi translation of `eng_sentence`, one token per step.

  Each step feeds the English sentence and the Marathi text produced so far through the
  model, takes the highest-scoring token at the current position and appends it.
  Decoding stops when END_TOKEN is produced or after max_sequence_length steps.

  Args:
    eng_sentence: the English sentence to translate.

  Returns:
    The generated string. When decoding stopped on END_TOKEN, the END_TOKEN text is the
    last part of the returned string.
  """
  eng_sentence = (eng_sentence,)
  mr_sentence = ("",)
  # Inference only: without no_grad every one of the repeated forward passes would
  # record an autograd graph that is never used.
  with torch.no_grad():
    for word_counter in range(max_sequence_length):
      encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_sentence, mr_sentence)
      predictions = transformer(eng_sentence,
                                mr_sentence,
                                encoder_self_attention_mask.to(device),
                                decoder_self_attention_mask.to(device),
                                decoder_cross_attention_mask.to(device),
                                enc_start_token=False,
                                enc_end_token=False,
                                dec_start_token=True,
                                dec_end_token=False)
      # Scores for the position being generated in the single sentence of the batch.
      next_token_prob_distribution = predictions[0][word_counter]
      next_token_index = torch.argmax(next_token_prob_distribution).item()
      next_token = index_to_marathi[next_token_index]
      mr_sentence = (mr_sentence[0] + next_token, )
      if next_token == END_TOKEN:
        break
  return mr_sentence[0]

In [None]:
# Sample translation. The printed text ends with the END_TOKEN string because
# translate() appends that token before it stops.
translation = translate("what should we do when the day starts?")
print(translation)

या या या या का का आ आ या आ हे आहे आ.<END>


In [None]:
# Second sample: a short question.
translation = translate("how is this the truth?")
print(translation)

त्या काय काय काय काही?<END>


In [None]:
# Third sample: a longer declarative sentence without end punctuation.
translation = translate("the world is a large place with different people")
print(translation)

पण प्रकार प्रण्या प्रण्याच्या प्रत्या आहेत<END>


## Experiments left:

- Use word-based, BPE-based, or SentencePiece tokenization instead of single characters.
- Increase the number of encoder/decoder layers for better translations. Only a small number was used here (`num_layers = 2`).
- Tune the layers, cross-attention, self-attention, multi-head attention, and the other hyperparameters.

In [None]:
from transformers import AutoTokenizer, AutoModel

import os

import getpass

# Access tokens must never be written into the notebook. The token is taken from the
# HF_TOKEN environment variable, or asked for without echo when that variable is unset.
# SECURITY: the token that used to be hardcoded here has been exposed and should be revoked.
ACCESS_TOKEN = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face access token: ")
# HF_TOKEN is the variable name huggingface_hub reads for authentication; the variable
# was previously named after the token value itself, which nothing looks up.
os.environ["HF_TOKEN"] = ACCESS_TOKEN


# NOTE(review): the recorded output of this cell is a ValueError saying the checkpoint's
# model type `auto` is not a recognised architecture, so nothing below has succeeded.
# Save the tokenizer
tokenizer = AutoTokenizer.from_pretrained("mgharpur/eng2mar", use_fast=True)
tokenizer.save_pretrained("mgharpur/eng2mar")

# Save the model
# NOTE(review): `transformer` is the custom Transformer built earlier in this notebook;
# these lines assume it exposes a Hugging Face-style `.config` and that its state dict
# matches an AutoModel — TODO confirm. For a plain PyTorch module,
# torch.save(transformer.state_dict(), path) is the usual way to persist the weights.
model = AutoModel.from_config(transformer.config)
model.load_state_dict(transformer.state_dict())
model.save_pretrained("mgharpur/eng2mar")

ValueError: The checkpoint you are trying to load has model type `auto` but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date.

In [None]:
# Installs the Hugging Face `transformers` package (no version is pinned).
# NOTE(review): this cell sits after the cell that already imports `transformers`; on a
# fresh kernel without the package it has to run before that import.
!pip install transformers



In [None]:
from transformers import AutoConfig, AutoModel, AutoTokenizer

# Second attempt: load the configuration stored under "mgharpur/eng2mar" and build the
# model from it.
# NOTE(review): the recorded output is a ValueError saying model type `transformer` is
# not a recognised architecture. A model created with from_config is built from the
# configuration alone; nothing here copies the weights of the `transformer` trained above.
config = AutoConfig.from_pretrained("mgharpur/eng2mar")

# Now, use this configuration to create the model and tokenizer objects
model = AutoModel.from_config(config)
tokenizer = AutoTokenizer.from_pretrained("mgharpur/eng2mar")

# Save the model and tokenizer
model.save_pretrained("mgharpur/eng2mar")
tokenizer.save_pretrained("mgharpur/eng2mar")

ValueError: The checkpoint you are trying to load has model type `transformer` but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date.