In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated "<directory>:<percent-encoded URL>" entries, one per data source.
# NOTE(review): the URL is a signed link stamped X-Goog-Date=20240319T055216Z with
# X-Goog-Expires=259200 (seconds), so it is presumably no longer valid; hard-coded
# signed URLs should not be committed.
DATA_SOURCE_MAPPING = 'samanantar:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1272055%2F2119948%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240319%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240319T055216Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Db85fd6ad1b47458687401bbb352499aa55dce605eed7180a602266bcd036028bbd2be47b7dad90248cf17ecfaa37fe2b278340511fd118b412b8dd3aaabd88e48bf45f077a89877d7d02d7e27145c3e6ecdd26fcfb46f85117e67e311dab40dd36e734e2c1300ca93021019b70691b54613a4d3b462a9dd1232a18ec84e4e985764588644539c8b3a7080bda082e195776c2a777b1b09deb6abd9f569280a06df5a0e0b6caa33787c8ab34d2809b343db2f1ae53cc532a86265dac01f6e195811a17bd2132a38b6e6fc2c211c028be382e2761fe2a0d4764f39ccdaf93a95245a2dbbf844314530c138088d20f25376c0d436d452a8d75535fa1b8d995d0ddda'

# Locations that mimic Kaggle's directory layout.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced by any other cell visible in this notebook.
KAGGLE_SYMLINK='kaggle'

# Start from a clean input directory: unmount (errors silenced), delete, recreate.
!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose the directories as ../input and ../working; an existing link is left as is.
try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<encoded url>" entry and unpack it below
# KAGGLE_INPUT_PATH/<directory>. A failure for one source is reported and the
# loop moves on to the next one.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only; everything after it is the percent-encoded URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header may be absent; the progress bar then stays empty
            # instead of crashing on int(None).
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # The tar branch reopens the file by name, so buffered bytes must be
            # on disk before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Do not bind the archive to the name `tarfile`: that would
                # replace the module and break every later iteration.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import math
import torch.nn.functional as f

In [None]:
# Parallel corpus files: line i of one file is the translation of line i of the other.
english_file = "/kaggle/input/samanantar/final_data/en-te/train.en"
telugu_file = "/kaggle/input/samanantar/final_data/en-te/train.te"

# Special tokens shared by both vocabularies.
START_TOKEN = "<START>"
PADDING_TOKEN = "<PADDING>"
END_TOKEN = "<END>"

# Character-level English vocabulary. List position becomes the token id, so
# START_TOKEN is id 0 and PADDING_TOKEN / END_TOKEN take the last two ids.
# Note that ';' is not listed.
english_vocab = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
                        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                        ':', '<', '=', '>', '?', '@',
                        'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R',
                        'S','T','U','V','W','X','Y','Z',
                        '[', '\\', ']', '^', '_', '`',
                        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                        'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
                        'y', 'z',
                        '{', '|', '}', '~', PADDING_TOKEN, END_TOKEN]

# Character-level Telugu vocabulary: ASCII punctuation and digits, then Telugu
# letters, vowel signs and digits as individual code points. ';' is not listed.
# NOTE(review): the first three non-ASCII entries look like Devanagari code points
# rather than Telugu ones - confirm they are intended.
telugu_vocab = [ START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
                '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                ':', '<', '=', '>', '?', '@',
                '[', '\\', ']', '^', '_', '`','{', '|', '}', '~',
                'े', '्', '॥', 'ఁ', 'ం', 'ః', 'అ', 'ఆ', 'ఇ', 'ఈ', 'ఉ', 'ఊ', 'ఋ', 'ఌ',
                'ఎ', 'ఏ', 'ఐ', 'ఒ', 'ఓ', 'ఔ', 'క', 'ఖ', 'గ', 'ఘ', 'ఙ', 'చ', 'ఛ', 'జ', 'ఝ',
                'ఞ', 'ట', 'ఠ', 'డ', 'ఢ', 'ణ', 'త', 'థ', 'ద', 'ధ', 'న', 'ప', 'ఫ', 'బ', 'భ',
                'మ', 'య', 'ర', 'ఱ', 'ల', 'ళ', 'వ', 'శ', 'ష', 'స', 'హ', 'ా', 'ి', 'ీ', 'ు',
                'ూ', 'ృ', 'ె', 'ే', 'ై', 'ొ', 'ో', 'ౌ', '్', 'ౖ', 'ౘ', 'ౙ', 'ౠ', 'ౡ',
                '౦', '౧', '౨', '౩', '౪', '౫', '౬', '౭', '౮', '౯',
                PADDING_TOKEN, END_TOKEN
               ]

In [None]:
# Iterating a string yields individual code points, so one written syllable
# can expand into several entries.
text = "కృష్ణ"
[code_point for code_point in text]

In [None]:
# Concatenating a consonant with a vowel sign gives a two-code-point string.
''.join(('క', 'ె'))

In [None]:
# Lookup tables in both directions between a character and its integer id
# (the id is the character's position in the vocabulary list).
index_to_telugu = dict(enumerate(telugu_vocab))
telugu_to_index = {char: idx for idx, char in index_to_telugu.items()}
index_to_english = dict(enumerate(english_vocab))
english_to_index = {char: idx for idx, char in index_to_english.items()}


In [None]:
# Read both sides of the parallel corpus, one sentence per line.
# The encoding is given explicitly so the Telugu text decodes the same way on
# every platform instead of depending on the locale's default.
with open(english_file, 'r', encoding='utf-8') as file:
    english_sentences = file.readlines()
with open(telugu_file, 'r', encoding='utf-8') as file:
    telugu_sentences = file.readlines()

In [None]:
# Line counts of the English and the Telugu file, in that order.
for corpus in (english_sentences, telugu_sentences):
    print(len(corpus))

In [None]:
# First three raw lines of each language (trailing newlines still attached).
for corpus in (english_sentences, telugu_sentences):
    print(corpus[:3])

In [None]:
TOT_SEN = 30000

# Every line still ends with '\n', which has to be stripped.
# Training does not need the whole corpus, so only the first TOT_SEN pairs are
# kept. Slicing (rather than indexing range(TOT_SEN)) also works when a file
# holds fewer than TOT_SEN lines.
english_sentences = [sentence.rstrip('\n') for sentence in english_sentences[:TOT_SEN]]
telugu_sentences = [sentence.rstrip('\n') for sentence in telugu_sentences[:TOT_SEN]]

In [None]:
# Same preview after stripping the newlines.
for corpus in (english_sentences, telugu_sentences):
    print(corpus[:3])

In [None]:
# plan to pass embeddings for each token rather than word
max(len(x) for x in english_sentences), max(len(x) for x in telugu_sentences)

In [None]:
PERCENTAILE = 97
# Character length below which PERCENTAILE % of the sentences fall, as
# (english, telugu). Both values are returned as one tuple because a cell only
# displays its last expression; the English figure was previously discarded.
(
    np.percentile([len(x) for x in english_sentences], PERCENTAILE),
    np.percentile([len(x) for x in telugu_sentences], PERCENTAILE),
)

In [None]:
MAX_SEQ_LEN = 200

def is_valid_sen(sen, vocab):
    """Return True when every character of ``sen`` is contained in ``vocab``.

    An empty sentence is considered valid.
    """
    return all(token in vocab for token in sen)

def is_valid_length(sen, max_len):
    """Return True when ``sen`` has fewer than ``max_len - 1`` characters.

    The slack leaves room for the special tokens added during tokenisation.
    """
    return len(sen) < max_len - 1

# Keep only the pairs where both sentences use known characters and are short
# enough; pairs stay aligned because they are filtered together.
kept_pairs = [
    (english_sentences[idx], telugu_sentences[idx])
    for idx in range(len(telugu_sentences))
    if is_valid_sen(english_sentences[idx], english_vocab)
    and is_valid_sen(telugu_sentences[idx], telugu_vocab)
    and is_valid_length(english_sentences[idx], MAX_SEQ_LEN)
    and is_valid_length(telugu_sentences[idx], MAX_SEQ_LEN)
]
telugu__sentences = [tel_sen for _, tel_sen in kept_pairs]
english__sentences = [eng_sen for eng_sen, _ in kept_pairs]


In [None]:
# Number of sentences that survived filtering (telugu, english); the two must match.
tuple(map(len, (telugu__sentences, english__sentences)))

In [None]:
def tokeniser(sen, lan_to_index):
    """Convert one sentence into a 1-D tensor of token ids.

    The result is START id, one id per character, END id, then PADDING ids
    until the sequence holds MAX_SEQ_LEN entries (no padding is added when it
    is already that long or longer).
    """
    char_ids = [lan_to_index[ch] for ch in sen]
    token_ids = [lan_to_index[START_TOKEN]] + char_ids + [lan_to_index[END_TOKEN]]
    missing = MAX_SEQ_LEN - len(token_ids)
    if missing > 0:
        token_ids += [lan_to_index[PADDING_TOKEN]] * missing
    return torch.tensor(token_ids)

# Tokenise every filtered sentence with its language's lookup table.
telugu_sen_tokensied = list(map(lambda sen: tokeniser(sen, telugu_to_index), telugu__sentences))
english_sen_tokensied = list(map(lambda sen: tokeniser(sen, english_to_index), english__sentences))


In [None]:
# First two tokenised sentences of each language.
for tokenised in (telugu_sen_tokensied, english_sen_tokensied):
    print(tokenised[:2])

In [None]:
from torch.utils.data import Dataset, DataLoader

class Data(Dataset):
    """Dataset of aligned (english, telugu) items.

    Item ``i`` is the pair made of the ``i``-th entry of each sequence; the
    dataset length is the length of the English sequence.
    """

    def __init__(self, english_sentences, telugu_sentences):
        self.english_sentences, self.telugu_sentences = english_sentences, telugu_sentences

    def __len__(self):
        return len(self.english_sentences)

    def __getitem__(self, ind):
        pair = (self.english_sentences[ind], self.telugu_sentences[ind])
        return pair


In [None]:
# Dataset over the pre-tokenised id tensors.
data_set = Data(english_sentences=english_sen_tokensied,
                telugu_sentences=telugu_sen_tokensied)

In [None]:
# Number of sentence pairs held by the dataset.
len(data_set)

In [None]:
# First (english, telugu) item of the dataset.
data_set[0]

In [None]:
BATCH_SIZE = 32
# Serve the dataset in order, BATCH_SIZE items at a time.
train_loader = DataLoader(data_set, batch_size=BATCH_SIZE)

In [None]:
# Show the first batch only: the two batch tensors, then their sizes.
for ind, (english, telugu) in enumerate(train_loader):
    for batch_tensor in (english, telugu):
        print(batch_tensor)
    for batch_tensor in (english, telugu):
        print(batch_tensor.size())
    break

In [None]:
NEG_INF = -1e9

def create_masks(eng_batch, tel_batch, max_seq_len=None):
    """Build the additive attention masks for one batch.

    Two kinds of masking are combined: padding masks (padding positions must
    not take part in attention) and a look-ahead mask (a decoder position must
    not see later positions).

    Args:
        eng_batch: batch of English sentences; ``len(eng_batch[i])`` is taken
            as the number of characters of sentence ``i``.
        tel_batch: batch of Telugu sentences, same convention.
        max_seq_len: padded sequence length; defaults to ``MAX_SEQ_LEN``.

    Returns:
        ``(encoder_padding_mask, decoder_self_attention_mask,
        decoder_padding_mask_cross_attention)``, each of shape
        ``[batch, max_seq_len, max_seq_len]`` holding 0 where attention is
        allowed and ``NEG_INF`` where it is blocked.
    """
    if max_seq_len is None:
        max_seq_len = MAX_SEQ_LEN
    batch = len(eng_batch)
    # True marks a position that must NOT be attended to.
    look_ahead_mask = torch.triu(torch.full([max_seq_len, max_seq_len], True), diagonal = 1)
    encoder_padding_mask = torch.full([batch, max_seq_len, max_seq_len], True)
    decoder_padding_mask_self_attention = torch.full([batch, max_seq_len, max_seq_len], True)
    decoder_padding_mask_cross_attention = torch.full([batch, max_seq_len, max_seq_len], True)

    for i in range(batch):
        # +1 keeps the END token that tokenisation appends after the characters
        # inside the unmasked region.
        eng_len = min(len(eng_batch[i]) + 1, max_seq_len)
        tel_len = min(len(tel_batch[i]) + 1, max_seq_len)
        encoder_padding_mask[i, :eng_len, :eng_len] = False
        decoder_padding_mask_self_attention[i, :tel_len, :tel_len] = False
        # Cross attention: rows are decoder (Telugu) queries, columns are
        # encoder (English) keys, so the column limit is the English length.
        decoder_padding_mask_cross_attention[i, :tel_len, :eng_len] = False

    encoder_padding_mask = torch.where(encoder_padding_mask, NEG_INF, 0)
    decoder_self_attention_mask = torch.where(look_ahead_mask | decoder_padding_mask_self_attention, NEG_INF, 0)
    decoder_padding_mask_cross_attention = torch.where(decoder_padding_mask_cross_attention, NEG_INF, 0)

    return encoder_padding_mask, decoder_self_attention_mask, decoder_padding_mask_cross_attention


In [None]:
# Small look-ahead mask: True strictly above the diagonal.
look = torch.full([20, 20], True).triu(diagonal = 1)
print(look)

In [None]:
# A batch of two 5x5 boolean masks, all True.
e = torch.ones(2, 5, 5, dtype=torch.bool)
print(e)

In [None]:
# Clear the top-left 3x3 corner of both masks.
for sample in (0, 1):
    e[sample, :3, :3] = False
print(e)

**MODEL START**

In [None]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied along the last dimension.

    Linear(d_model -> hidden), ReLU, Dropout, Linear(hidden -> d_model).
    """

    def __init__(self, d_model, hidden, drop_prob = 0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p = drop_prob)

    def forward(self, x):
        # x: [..., d_model] -> expanded: [..., hidden] -> result: [..., d_model]
        expanded = self.dropout(self.relu(self.linear1(x)))
        return self.linear2(expanded)


class LayerNormalization(nn.Module):
    """Layer normalisation over the trailing ``len(parameter_shape)`` dimensions.

    Args:
        parameter_shape: list giving the shape of the normalised dimensions
            (``[d_model]`` in this notebook).
        eps: constant added to the variance for numerical stability.
    """

    def __init__(self, parameter_shape, eps = 1e-5):
        super(LayerNormalization, self).__init__()
        self.parameter_shape = parameter_shape
        self.eps = eps
        # Learnable scale and shift. The shift starts at zero (it used to start
        # at one) so that a freshly built layer returns the plain normalised
        # input, as standard layer normalisation does.
        self.gamma = nn.Parameter(torch.ones(parameter_shape))
        self.beta = nn.Parameter(torch.zeros(parameter_shape))

    def forward(self, inputs):
        # For parameter_shape == [d_model] this is just [-1].
        dims = [-(i + 1) for i in range(len(self.parameter_shape))]
        # keepdim=True keeps a size-1 axis so the statistics broadcast against inputs.
        mean = inputs.mean(dim = dims, keepdim = True)
        var = ((inputs - mean) ** 2).mean(dim = dims, keepdim = True)
        std = (var + self.eps).sqrt()  # same shape as mean and var
        # After this step every normalised slice has mean 0 and std 1.
        y = (inputs - mean) / std
        # gamma and beta broadcast over all leading dimensions.
        out = self.gamma * y + self.beta

        return out


def scaled_dot_product(q, k, v, mask):
    """Scaled dot-product attention.

    Scores are ``q @ k^T / sqrt(head_dim)``; an optional additive ``mask`` is
    applied before the softmax over the last axis.

    Returns:
        ``(values, attention)``: the attention-weighted sum of ``v`` and the
        attention weights.
    """
    head_dim = q.size()[-1]
    scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(head_dim)
    if mask is not None:
        # Swap the first two axes so the mask broadcasts over the new leading
        # axis, then swap back.
        swap_first_two = (1, 0, 2, 3)
        scores = (scores.permute(swap_first_two) + mask).permute(swap_first_two)
    attention = scores.softmax(dim = -1)
    values = torch.matmul(attention, v)

    return values, attention


class MultiHeadAttention(nn.Module):
    """Multi-head self-attention.

    One linear layer produces queries, keys and values together; attention is
    computed per head and the heads are merged by a final linear layer.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model//num_heads
        self.qkv_layer = nn.Linear(d_model, 3*d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, mask):
        # x: [batch, seq, d_model]
        batch_size, seq_length, _ = x.size()
        # [batch, seq, 3*d_model] -> [batch, heads, seq, 3*head_dim]
        per_head = (
            self.qkv_layer(x)
            .reshape(batch_size, seq_length, self.num_heads, 3*self.head_dim)
            .permute(0, 2, 1, 3)
        )
        # q, k, v: [batch, heads, seq, head_dim]
        q, k, v = per_head.chunk(3, dim = -1)
        values, _ = scaled_dot_product(q, k, v, mask)
        # Merge the heads again: [batch, seq, heads*head_dim]
        merged = values.permute(0, 2, 1, 3).reshape(batch_size, seq_length, self.num_heads * self.head_dim)
        return self.linear_layer(merged)


class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding table.

    ``forward()`` takes no input and returns a ``[max_seq, d_model]`` tensor
    whose even columns hold sines and odd columns hold cosines.
    """

    def __init__(self, d_model, max_seq):
        super().__init__()
        self.max_seq_length = max_seq
        self.d_model = d_model

    def forward(self):
        exponents = torch.arange(0, self.d_model, 2).float() / self.d_model
        denominator = torch.pow(10000, exponents)
        position = torch.arange(self.max_seq_length).reshape(self.max_seq_length, 1)
        angles = position / denominator
        # Stacking on a new last axis and flattening interleaves sin and cos.
        interleaved = torch.stack([torch.sin(angles), torch.cos(angles)], dim=2)
        return torch.flatten(interleaved, start_dim=1, end_dim=2)


class SentenceEmbeddings(nn.Module):
    """Turn a batch of raw sentences into position-aware embeddings.

    Sentences are tokenised character by character, padded to ``max_seq_len``,
    embedded, summed with the positional encoding and passed through dropout.

    Args:
        max_seq_len: length every tokenised sentence is padded to.
        d_model: embedding size.
        start_token, end_token, pad_token: the special token strings; each must
            be a key of ``language_to_index``.
        language_to_index: mapping from character / special token to its id.
    """

    def __init__(self, max_seq_len, d_model, start_token, end_token, pad_token, language_to_index):
        super().__init__()
        self.max_seq_len = max_seq_len
        self.d_model = d_model
        self.start_token = start_token
        self.end_token = end_token
        self.pad_token = pad_token
        self.lan_to_index = language_to_index
        self.vocab = len(language_to_index)

        self.embed = nn.Embedding(self.vocab, self.d_model)
        self.dropout = nn.Dropout(p = 0.1)
        self.position_encoder = PositionalEncoding(d_model, max_seq_len)

    def batch_tokenize(self, batch, start_token=False, end_token=True):
        """Tokenise ``batch`` into an id tensor of shape ``[batch, max_seq_len]``.

        Uses the special tokens and length stored on the instance rather than
        notebook-level globals, so the module does not depend on cell state.

        Args:
            batch: sequence of sentences (strings).
            start_token: prepend the START id when True (default False).
            end_token: append the END id when True (default True).
        """
        pad_id = self.lan_to_index[self.pad_token]

        def tokeniser(sen):
            sen_to_ind = [self.lan_to_index[x] for x in sen]
            if start_token:
                sen_to_ind.insert(0, self.lan_to_index[self.start_token])
            if end_token:
                sen_to_ind.append(self.lan_to_index[self.end_token])
            # Multiplying by a non-positive count adds nothing.
            sen_to_ind.extend([pad_id] * (self.max_seq_len - len(sen_to_ind)))
            return torch.tensor(sen_to_ind)

        return torch.stack([tokeniser(sentence) for sentence in batch])

    def forward(self, x, start_token=False, end_token=True):
        # x: batch of raw sentences
        tokens = self.batch_tokenize(x, start_token, end_token)  # [batch, max_seq]
        # Keep the ids and the positional table on the embedding's device so the
        # module also works after model.to(device).
        tokens = tokens.to(self.embed.weight.device)
        embedded = self.embed(tokens)  # [batch, max_seq, d_model]
        pos = self.position_encoder().to(embedded.device)
        return self.dropout(embedded + pos)  # [batch, max_seq, d_model]

class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalisation. Input and output have the same shape.
    """

    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super().__init__()
        self.attention = MultiHeadAttention(d_model = d_model, num_heads = num_heads)
        self.norm1 = LayerNormalization(parameter_shape = [d_model])
        self.dropout1 = nn.Dropout(p = drop_prob)

        self.ffn = PositionwiseFeedForward(d_model = d_model, hidden = ffn_hidden, drop_prob = drop_prob)
        self.norm2 = LayerNormalization(parameter_shape = [d_model])
        self.dropout2 = nn.Dropout(p = drop_prob)

    def forward(self, x, self_attention_mask):
        # Self-attention sub-layer with residual connection.
        skip = x.clone()
        attended = self.dropout1(self.attention(x, mask = self_attention_mask))
        x = self.norm1(attended + skip)

        # Feed-forward sub-layer with residual connection.
        skip = x.clone()
        transformed = self.dropout2(self.ffn(x))
        return self.norm2(transformed + skip)


class SequentialEncoder(nn.Sequential):
    """``nn.Sequential`` variant whose layers take ``(x, self_attention_mask)``.

    The output of each layer becomes the ``x`` of the next one; the mask is
    passed unchanged to every layer.
    """

    def forward(self, *inputs):
        hidden, self_attention_mask = inputs
        # Iterating an nn.Sequential yields its registered modules in order.
        for layer in self:
            hidden = layer(hidden, self_attention_mask)
        return hidden

class Encoder(nn.Module):
    """Sentence embedding followed by a stack of ``num_layers`` encoder layers."""

    def __init__(self, drop_prob, d_model, ffn_hidden, num_heads, num_layers, max_seq, en_to_ind, start_token, end_token, pad_token):
        super(Encoder, self).__init__()
        self.sentenceEmbedding = SentenceEmbeddings(max_seq, d_model, start_token, end_token, pad_token, en_to_ind)
        # A plain nn.Sequential forwards a single argument; the layers need
        # both x and the mask, hence the custom container.
        encoder_layers = [EncoderLayer(d_model, ffn_hidden, num_heads, drop_prob)
                          for _ in range(num_layers)]
        self.layers = SequentialEncoder(*encoder_layers)

    def forward(self, x, self_attention_mask):
        # x: batch of raw sentences -> embedded: [batch, max_seq, d_model]
        embedded = self.sentenceEmbedding(x)
        return self.layers(embedded, self_attention_mask)


class CrossMultiHeadAttention(nn.Module):
    """Multi-head cross attention.

    Keys and values are projected from ``x``, queries from ``y``; the output
    has one vector per position of ``y``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.kv_layer = nn.Linear(d_model , 2 * d_model)
        self.q_layer = nn.Linear(d_model , d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, y, cross_attention_mask):
        # x: [batch, source_len, d_model], y: [batch, target_len, d_model]
        batch_size, source_len, _ = x.size()
        # Queries follow y's own length; reshaping them with x's length only
        # worked while both sequences were padded to the same size.
        target_len = y.size(1)
        kv = self.kv_layer(x)  # [batch, source_len, 2*d_model]
        q = self.q_layer(y)    # [batch, target_len, d_model]
        # Split into heads: kv -> [batch, heads, source_len, 2*head_dim]
        kv = kv.reshape(batch_size, source_len, self.num_heads, 2*self.head_dim).permute(0, 2, 1, 3)
        # q -> [batch, heads, target_len, head_dim]
        q = q.reshape(batch_size, target_len, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        k, v = kv.chunk(2, dim = -1)  # each [batch, heads, source_len, head_dim]
        # values: [batch, heads, target_len, head_dim]
        values, attention = scaled_dot_product(q, k, v, cross_attention_mask)
        values = values.permute(0, 2, 1, 3).reshape(batch_size, target_len, self.num_heads * self.head_dim)
        out = self.linear_layer(values)

        return out  # [batch, target_len, d_model]



class DecoderLayer(nn.Module):
    """One decoder block.

    Three sub-layers run on ``y``: masked self-attention, cross attention
    against ``x``, and a feed-forward network. Each is followed by dropout, a
    residual connection and layer normalisation.
    """

    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super(DecoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model = d_model, num_heads = num_heads)
        self.norm1 = LayerNormalization(parameter_shape = [d_model])
        self.dropout1 = nn.Dropout(p = drop_prob)

        self.cross_attention = CrossMultiHeadAttention(d_model = d_model, num_heads = num_heads)
        self.norm2 = LayerNormalization(parameter_shape = [d_model])
        self.dropout2 = nn.Dropout(p = drop_prob)

        self.ffn = PositionwiseFeedForward(d_model = d_model, hidden = ffn_hidden, drop_prob = drop_prob)
        self.norm3 = LayerNormalization(parameter_shape = [d_model])
        self.dropout3 = nn.Dropout(p = drop_prob)

    def forward(self, x, y, self_attention_mask, cross_attention_mask):
        # Self-attention over y.
        skip = y.clone()
        y = self.norm1(self.dropout1(self.attention(y, self_attention_mask)) + skip)

        # Cross attention: keys and values come from x, queries from y.
        skip = y.clone()
        y = self.norm2(self.dropout2(self.cross_attention(x, y, cross_attention_mask)) + skip)

        # Feed-forward network.
        skip = y
        y = self.norm3(self.dropout3(self.ffn(y)) + skip)

        return y


class SequentialDecoder(nn.Sequential):
    """``nn.Sequential`` variant whose layers take ``(x, y, self_mask, cross_mask)``.

    Only ``y`` is updated from layer to layer; ``x`` and both masks are passed
    unchanged to every layer.
    """

    def forward(self, *inputs):
        x, target, self_attention_mask, cross_attention_mask = inputs
        # Iterating an nn.Sequential yields its registered modules in order.
        for layer in self:
            target = layer(x, target, self_attention_mask, cross_attention_mask)
        return target


class Decoder(nn.Module):
    """Sentence embedding followed by a stack of ``num_layers`` decoder layers."""

    def __init__(self, drop_prob, d_model, ffn_hidden, num_heads, num_layers, max_seq, te_to_ind, start_token, end_token, pad_token):
        super(Decoder, self).__init__()
        self.sentenceEmbedding = SentenceEmbeddings(max_seq, d_model, start_token, end_token, pad_token, te_to_ind)
        # A plain nn.Sequential forwards a single argument; the layers need x,
        # y and two masks, hence the custom container.
        decoder_layers = [DecoderLayer(d_model, ffn_hidden, num_heads, drop_prob)
                          for _ in range(num_layers)]
        self.layers = SequentialDecoder(*decoder_layers)

    def forward(self, x, y, self_attention_mask, cross_attention_mask):
        # x: encoder output, y: batch of raw target sentences
        embedded_target = self.sentenceEmbedding(y)
        return self.layers(x, embedded_target, self_attention_mask, cross_attention_mask)


class Transformer(nn.Module):
    """Encoder-decoder model with a final linear projection to the target vocabulary.

    ``forward`` takes batches of raw source and target sentences plus the three
    attention masks and returns one score per target-vocabulary entry for
    every position.
    """

    def __init__(self, drop_prob, d_model, ffn_hidden, num_heads, no_layers, max_seq, en_to_ind, te_to_ind, te_vocab_size,START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super(Transformer, self).__init__()

        shared_args = (drop_prob, d_model, ffn_hidden, num_heads, no_layers, max_seq)
        special_tokens = (START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.encoder = Encoder(*shared_args, en_to_ind, *special_tokens)
        self.decoder = Decoder(*shared_args, te_to_ind, *special_tokens)
        self.linear = nn.Linear(d_model, te_vocab_size)

    def forward(self, x, y, encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask):
        encoded = self.encoder(x, encoder_self_attention_mask)
        decoded = self.decoder(encoded, y, decoder_self_attention_mask, decoder_cross_attention_mask)
        return self.linear(decoded)


In [None]:
# Dataset of the raw (untokenised) sentence pairs. Note that this rebinds
# `data_set`, which an earlier cell bound to the pre-tokenised tensors.
data_set = Data(english__sentences, telugu__sentences)
data_batched = DataLoader(data_set, BATCH_SIZE)

In [None]:
# Show the first batch of the loader the training loop iterates over
# (`data_batched`, raw sentences) rather than the earlier tokenised `train_loader`.
for eng, tel in data_batched:
    print(eng)
    print(tel)
    break

In [None]:
# max_seq is tied to MAX_SEQ_LEN (instead of a literal 200) because the masks
# are built for MAX_SEQ_LEN positions; the two must always agree.
model = Transformer(
    drop_prob=0.1,
    d_model=512,
    ffn_hidden=1024,
    num_heads=8,
    no_layers=5,
    max_seq=MAX_SEQ_LEN,
    en_to_ind=english_to_index,
    te_to_ind=telugu_to_index,
    te_vocab_size=len(telugu_vocab),
    START_TOKEN=START_TOKEN,
    END_TOKEN=END_TOKEN,
    PADDING_TOKEN=PADDING_TOKEN,
)
# Per-token losses (reduction='none') with padding positions ignored; the
# training loop averages over the non-padding tokens itself.
criterion = nn.CrossEntropyLoss(ignore_index=telugu_to_index[PADDING_TOKEN],
                                reduction='none')
learning_rate = 0.1
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

In [None]:
EPOCHS = 1
SAMPLE_EVERY = 100  # run the greedy-decoding sanity check every N batches

for epoch in range(EPOCHS):
    for batch_num, (eng, tel) in enumerate(data_batched):
        model.train()
        encoder_padding_mask, decoder_self_attention_mask, decoder_padding_mask_cross_attention = create_masks(eng, tel)

        # Gradients are cleared BEFORE the backward pass. Clearing them between
        # backward() and step() made every update a no-op.
        optimizer.zero_grad()
        # output: [batch, max_seq, te_vocab_size]
        output = model(eng, tel, encoder_padding_mask, decoder_self_attention_mask, decoder_padding_mask_cross_attention)
        # NOTE(review): the decoder input and these labels come from the same
        # tokenisation (no START token, no shift), so position j is trained to
        # predict the token it receives at position j. Confirm whether a
        # START-prefixed decoder input was intended.
        labels = model.decoder.sentenceEmbedding.batch_tokenize(tel)  # [batch, max_seq]
        loss = criterion(
            output.view(-1, len(telugu_to_index)),
            labels.view(-1)
        )
        # Average over the non-padding tokens only.
        valid_indicies = labels.view(-1) != telugu_to_index[PADDING_TOKEN]
        loss = loss.sum() / valid_indicies.sum()
        loss.backward()
        optimizer.step()

        # A decode is up to MAX_SEQ_LEN forward passes, so it is not run after
        # every batch.
        if batch_num % SAMPLE_EVERY != 0:
            continue

        # Greedy decoding of a fixed sentence; expected translation: "అనఘ నా సోదరి"
        model.eval()  # disable dropout while sampling
        en_sample = ["Anagha is my sister"]
        # Start from an empty target: the special-token strings are not made of
        # vocabulary characters and cannot be tokenised character by character.
        te_sample = [""]
        with torch.no_grad():
            for j in range(MAX_SEQ_LEN):
                encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(en_sample, te_sample)
                # Feed the sample sentence, not the current training batch.
                output_sam = model(en_sample, te_sample, encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask)
                next_token_prob_distribution = output_sam[0][j]
                next_token_index = torch.argmax(next_token_prob_distribution).item()
                next_token = index_to_telugu[next_token_index]
                # Stop on any special token instead of appending its text.
                if next_token in (END_TOKEN, PADDING_TOKEN, START_TOKEN):
                    break
                te_sample = [te_sample[0] + next_token, ]
        print(te_sample)

