In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import random
import torch
from torch import nn


In [None]:
df = pd.read_csv('/content/npi.txt', sep='\t', names = ['eng', 'nep', 'info'])
df.sample(10)

Unnamed: 0,eng,nep,info
1624,Are you still looking for Tom?,के तपाइँ अझै टमलाई खोज्दै हुनुहुन्छ?,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
475,This is my choice.,यो मेरो रोजाई हो।,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
2534,Tom says he has never tried eating whale meat.,टम भन्छन् कि उनले कहिल्यै ह्वेलको मासु खाने प्...,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
1692,Tom always had a lot of money.,टमसँग सधैं धेरै पैसा थियो।,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
2511,"Tom speaks three languages, including French.",टम फ्रान्सेली सहित तीन भाषा बोल्छन्।,CC-BY 2.0 (France) Attribution: tatoeba.org #8...
1413,Tom has a chemistry degree.,टमसँग रसायनशास्त्रको डिग्री छ।,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
519,I decided the same.,मैले पनि त्यही निर्णय गरें ।,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
2391,Tom gave almost all his money to charity.,टमले आफ्नो प्रायः सबै पैसा परोपकारलाई दिए।,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
276,He is unmarried.,उहाँ अविवाहित हुनुहुन्छ ।,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1108,I took care of Tom's cat.,मैले टमको बिरालोको हेरचाह गरें।,CC-BY 2.0 (France) Attribution: tatoeba.org #3...


In [None]:
df.shape

(2689, 3)

In [None]:
#data preprocessing

#lowercase
df['nep'] = df['nep'].str.lower()
df['eng'] = df['eng'].str.lower()

#remove punctuations

import string
df['nep'] = df['nep'].str.translate(str.maketrans('', '', string.punctuation))
df['eng'] = df['eng'].str.translate(str.maketrans('', '', string.punctuation))

In [None]:
import string
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [None]:
df.sample(5)

Unnamed: 0,eng,nep,info
2012,tom works every day except monday,टम सोमबार बाहेक हरेक दिन काम गर्छ।,CC-BY 2.0 (France) Attribution: tatoeba.org #6...
321,where did he go,ऊ कहाँ गयो,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1394,no one brought us anything,हामीलाई कसैले केही ल्याएनन्।,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
358,i eat vegetables,म तरकारी खान्छु।,CC-BY 2.0 (France) Attribution: tatoeba.org #8...
1987,those books are written in french,ती पुस्तकहरू फ्रेन्चमा लेखिएका छन्।,CC-BY 2.0 (France) Attribution: tatoeba.org #1...


In [None]:
# remove extra space
df['nep'] = df['nep'].str.strip()
df['eng'] = df['eng'].str.strip()

In [None]:
#add start and end marks as <start> and <end> respectively
df['nep'] = '<start> ' + df['nep'] + ' <end>'
# df['eng'] = '<start> ' + df['eng'] + ' <end>'

In [None]:
#tokenization
#find vocab size of df['eng'] and df['cn']

from collections import Counter
eng_counter = Counter([word for line in df['eng'] for word in line.split()])
n_counter = Counter([word for line in df['nep'] for word in line.split()])

eng_vocabs = list(eng_counter.keys())
nep_vocabs = list(n_counter.keys())


print(eng_vocabs[:5], nep_vocabs[:5])

print(len(eng_vocabs), len(nep_vocabs))


['who', 'hide', 'stay', 'hello', 'smile'] ['<start>', 'को', '<end>', 'लुकाउनुहोस्।', 'लुक।']
1992 3200


In [None]:
#find the biggest sequence lenght with that sequence

max_len_eng = max([len(line.split(' ')) for line in df['eng']])
max_len_nep = max([len(line.split(' ')) for line in df['nep']])

print(max_len_eng, max_len_nep)

25 22


In [None]:
ll =([len(line.split()) for line in df['eng']])
print(ll.index(3))
df['eng'][23]

10


'is it bad'

In [None]:
input_words = sorted(eng_vocabs)
target_words = sorted(nep_vocabs)
print(input_words[-5:])

['zealand', 'zero', 'zoologist', 'zoology', '€100']


In [None]:
#Machine translation begins

num_encoder_tokens = len(input_words) + 1 #+1 for zero padding
num_decoder_tokens = len(target_words) + 1

print(num_encoder_tokens, num_decoder_tokens)

1993 3201


In [None]:
#index: word
input_word_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_word_index = dict([(word, i+1) for i, word in enumerate(target_words)])
print(input_word_index.items())

dict_items([('10', 1), ('100', 2), ('13', 3), ('14', 4), ('19', 5), ('20', 6), ('2013', 7), ('20th', 8), ('230', 9), ('30', 10), ('3000', 11), ('40', 12), ('430', 13), ('93', 14), ('a', 15), ('able', 16), ('aboard', 17), ('about', 18), ('above', 19), ('abroad', 20), ('abused', 21), ('accept', 22), ('accepted', 23), ('accident', 24), ('accountant', 25), ('accurate', 26), ('accurately', 27), ('across', 28), ('act', 29), ('active', 30), ('actor', 31), ('actually', 32), ('add', 33), ('addict', 34), ('address', 35), ('admit', 36), ('admitted', 37), ('advance', 38), ('adventures', 39), ('advice', 40), ('afraid', 41), ('after', 42), ('afternoon', 43), ('again', 44), ('age', 45), ('aggressive', 46), ('ago', 47), ('agree', 48), ('agreed', 49), ('ahead', 50), ('airport', 51), ('alcohol', 52), ('alice', 53), ('all', 54), ('allergic', 55), ('allergies', 56), ('allowed', 57), ('almost', 58), ('alone', 59), ('along', 60), ('already', 61), ('also', 62), ('alternative', 63), ('always', 64), ('am', 65)

In [None]:
rev_input_char_index = dict((i, word) for word, i in input_word_index.items())
rev_target_char_index = dict((i, word) for word, i in target_word_index.items())

In [None]:
#Train test split garam aba
from sklearn.model_selection import train_test_split
X = df['eng']
y = df['nep']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((2151,), (538,))

In [None]:
#generate batch data
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    for j in range(0, len(X), batch_size):

        encoder_input_data = np.zeros((max_len_eng, batch_size),dtype='float32')
        decoder_input_data = np.zeros((max_len_nep, batch_size),dtype='float32')
        decoder_target_data = np.zeros((max_len_nep, batch_size, num_decoder_tokens),dtype='float32')

        for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):

            for no, each in enumerate(input_text.split()):
                encoder_input_data[no, i] = input_word_index[each] #encoder input sequience

            for no, each in enumerate(target_text.split()):
                if no<len(target_text.split())-1:
                    decoder_input_data[no, i] = target_word_index[each] # decoder input sequence
                if no>0: #decoder target seq does not include the start token so offset by 1
                    decoder_target_data[no -1, i, target_word_index[each]] = 1.
        yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [None]:
encoder_in_data = np.zeros((len(df['eng']), 25), dtype = 'float32')

decoder_in_data = np.zeros((len(df['nep']), 22), dtype = 'float32')

decoder_target_data = np.zeros((len(df['nep']), 22, num_decoder_tokens), dtype = 'float32')

In [None]:
for i, (input_text, target_text) in enumerate(zip(df['eng'], df['nep'])):

  for no, each in enumerate(input_text.split()):
    encoder_in_data[i, no] = input_word_index[each] #encoder input sequience

  for no, each in enumerate(target_text.split()):
      decoder_in_data[i, no] = target_word_index[each] # decoder input sequence

      if no>0: #decoder target seq does not include the start token so offset by 1
          decoder_target_data[i, no - 1, target_word_index[each]] = 1.

In [None]:
latent_dim = 50

In [None]:

class Encoder(nn.Module):
  def __init__(self, input_size, embedding_size, hidden_size):
    super(Encoder, self).__init__()
    self.input_size = input_size
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size

    self.embedding = nn.Embedding(self.input_size, self.embedding_size)
    self.lstm = nn.LSTM(self.embedding_size, self.hidden_size)


  def forward(self, x):
    embedding = self.embedding(x)
    output, (hidden, cell) = self.lstm(embedding)
    encoder_state = (hidden, cell)
    return encoder_state

In [None]:
encoder = Encoder(num_encoder_tokens, latent_dim, latent_dim)

In [None]:
print(encoder)

Encoder(
  (embedding): Embedding(1993, 50)
  (lstm): LSTM(50, 50)
)


In [None]:
class Decoder(nn.Module):
  def __init__(self, input_size, embedding_size, hidden_size, output_size):
    super(Decoder, self).__init__()
    self.input_size = input_size
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size
    self.output_size = output_size

    self.embedding = nn.Embedding(self.input_size, self.embedding_size)
    self.lstm = nn.LSTM(self.embedding_size, self.hidden_size)
    self.fc = nn.Linear(self.hidden_size, self.output_size)

  def forward(self, x, encoder_state):
    x = x.unsqueeze(0)
    embedding = self.embedding(x)

    output, (hidden, cell) = self.lstm(embedding, encoder_state)
    prediction = self.fc(output)
    prediction = prediction.squeeze(0)
    decoder_sates = (hidden, cell)

    return prediction, decoder_sates

In [None]:
decoder = Decoder(num_decoder_tokens, latent_dim, latent_dim, num_decoder_tokens)

In [None]:
print(decoder)

Decoder(
  (embedding): Embedding(3201, 50)
  (lstm): LSTM(50, 50)
  (fc): Linear(in_features=50, out_features=3201, bias=True)
)


In [None]:
import random

class Seq2Seq(nn.Module):
  def __init__(self, encoder, decoder):
    super(Seq2Seq, self).__init__()
    self.encoder = encoder
    self.decoder = decoder

  def forward(self, source, target, teacher_forcing_ratio = 0.5):
    batch_size = source.shape[1]
    target_len = target.shape[0]
    target_vocab_size = self.decoder.output_size
    # target_vocab_size = num_decoder_tokens

    outputs = torch.zeros(target_len, batch_size, target_vocab_size)
    # encoder_state = self.encoder(source)
    hidden, cell = self.encoder(source)

    x = target[0]
    for t in range(1, target_len):
      output, (hidden, cell) = self.decoder(x, (hidden, cell))
      outputs[t] = output
      best_guess = output.argmax(1)
      x = target[t] if random.random() < teacher_forcing_ratio else best_guess

    return outputs

In [None]:
lr = 0.01
epochs = 20
step = 0

model = Seq2Seq(encoder, decoder)
optimizer = torch.optim.Adam(model.parameters(), lr = lr)
criterion = nn.CrossEntropyLoss()

In [None]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 256

In [None]:
epoch_loss = 0.0
best_val_loss = float('inf')
losses = []
best_epoch = -1
ts1 = [] #k ko lagi hola?


for epoch in range(epochs):
  epoch_loss_list = []
  print(f'Epoch {epoch+1}/{epochs}::')

  model.train(True)
  for i, (source, target) in enumerate(generate_batch(X_train, y_train, batch_size)):
    # optimizer.zero_grad()
    input_data_enc = torch.tensor(source[0]).long()
    input_data_dec = torch.tensor(source[1]).long()
    target = torch.tensor(target.argmax(2)).long()

    output = model(input_data_enc, input_data_dec)
    output = output[1:].reshape(-1, output.shape[2])
    target = target[1:].reshape(-1)

    optimizer.zero_grad()
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()

    step += 1
    epoch_loss += loss.item()
    epoch_loss_list.append(loss.item())


    if epoch_loss < best_val_loss:
      best_val_loss = epoch_loss
      best_epoch = epoch


    #early stopping
    if((epoch - best_epoch) >= 10):
      print('no improvement on 10 big epochs')
      break


    print(f"Iterations/loss {i}: {loss.item()}")
    losses.append(np.mean(epoch_loss_list))
    ts1.append(step)


torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,
}, 'seq2seq_lstm.pth')


Epoch 1/2::
Iterations/loss 0: 1.4086347818374634
Iterations/loss 1: 1.5799700021743774
Iterations/loss 2: 1.6649569272994995
Iterations/loss 3: 1.6305102109909058
Iterations/loss 4: 1.5708730220794678
Iterations/loss 5: 1.4610536098480225
Iterations/loss 6: 1.5300309658050537
Iterations/loss 7: 1.6260234117507935
Iterations/loss 8: 0.6045559644699097
Epoch 2/2::
Iterations/loss 0: 1.3805632591247559
Iterations/loss 1: 1.5453180074691772
Iterations/loss 2: 1.6194521188735962
Iterations/loss 3: 1.6051305532455444
Iterations/loss 4: 1.5881344079971313
Iterations/loss 5: 1.4936143159866333
Iterations/loss 6: 1.5563358068466187
Iterations/loss 7: 1.538793683052063
Iterations/loss 8: 0.6193491220474243


In [None]:
print(losses)

[1.4086347818374634, 1.4943023920059204, 1.55118723710378, 1.5710179805755615, 1.5709889888763429, 1.552666425704956, 1.5494327885763985, 1.559006616473198, 1.452956544028388, 1.3805632591247559, 1.4629406332969666, 1.5151111284891765, 1.5376159846782684, 1.547719669342041, 1.5387021104494731, 1.5412212099347795, 1.54091776907444, 1.4385212527381048]


In [None]:
model = Seq2Seq(Encoder(num_encoder_tokens, latent_dim, latent_dim), Decoder(num_decoder_tokens, latent_dim, latent_dim, num_decoder_tokens))


checkpoint = torch.load("seq2seq_lstm.pth")
model.load_state_dict(checkpoint['model_state_dict'])

def decode_sequence(sentence, max_length=50):
    model.eval()
    # lower, removing punctuations,
    tokens =  (''.join(char for char in re.sub(" +", " ", re.sub("'", '', sentence).lower()) if char not in string.punctuation)).split()

    text_to_indices = [ input_word_index[token] for token in tokens]
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1)

    # Build encoder hidden, cell state
    with torch.no_grad():
        hidden, cell = model.encoder(sentence_tensor)

    outputs = [target_word_index["<start>"]]

    for _ in range(max_length):
        previous_word = torch.LongTensor([outputs[-1]])

        with torch.no_grad():
            output, ( hidden, cell ) = model.decoder(previous_word, (hidden, cell))
            best_guess = output.argmax(1).item()

        outputs.append(best_guess)

        # Model predicts it's the end of the sentence
        if best_guess == "<END>":
            break

    translated_sentence = [rev_target_char_index.get(idx, '<PAD>') for idx in outputs]
    return translated_sentence[1:]

In [None]:
k = 506

In [None]:
k += 1
decoded_sentence = decode_sequence(X_train[k:k+1].values[0])
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Chinese Translation:', y_train[k:k+1].values[0])
print('Predicted Chinese Translation:', decoded_sentence)

Input English sentence: i went hiking
Actual Chinese Translation: <start> म पदयात्रामा गएँ। <end>
Predicted Chinese Translation: ['छैन।', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
