In [8]:
# import pandas as pd
# import numpy as np
# raw_data = './data_torch/pmindia.v1.hi-en.csv'

# df = pd.read_csv(raw_data)

# str_punct = '''[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~।]'''
# # remove very long sentences and sentences where translations are 
# # not of roughly equal length
# df['eng_len'] = df['english_sentence'].str.count(' ')
# df['hi_len'] = df['hindi_sentence'].str.count(' ')

# # %matplotlib inline

# # df.eng_len.plot()

# # df.head()

# # df.shape

# df.query('hi_len < 50 & eng_len < 50').shape

# df = df.query('hi_len < 50 & eng_len < 50')
# df = df.query('hi_len < eng_len * 1.5 & hi_len * 1.5 > eng_len')
# df['english_sentence'] = df['english_sentence'].apply(lambda x : re.sub(str_punct,'',x))
# df['hindi_sentence'] = df['hindi_sentence'].apply(lambda x : re.sub(str_punct,'',x))

# # np.savetxt(r'./data_torch/data.en', df.english_sentence.head(30000), fmt='%s')
# # np.savetxt(r'./data_torch/data.hi', df.hindi_sentence.head(30000), fmt='%s')

# np.savetxt(r'./data_torch/data_sm.en', df.english_sentence.head(30), fmt='%s')
# np.savetxt(r'./data_torch/data_sm.hi', df.hindi_sentence.head(30), fmt='%s')


# # np.savetxt(r'./data_torch/data_val.en', df.english_sentence.tail(1000), fmt='%s')
# # np.savetxt(r'./data_torch/data_val.hi', df.hindi_sentence.tail(1000), fmt='%s')

# # df.sample(frac=0.1).shape
# # df = df.sample(frac=0.01)

# # from sklearn.model_selection import train_test_split
# # # create train and validation set 
# # train, val = train_test_split(df[['english_sentence', 'hindi_sentence']], test_size=0.2)
# # train.shape, val.shape
# # train.to_csv("./data_torch/train.csv", index=False)
# # val.to_csv("./data_torch/val.csv", index=False)

# # train.columns

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator, TabularDataset
import numpy as np
import spacy
import random
import sys
import time
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard
from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint
from inltk.inltk import tokenize
# from inltk.inltk import setup
# setup('hi')
from torchtext import datasets
from torchtext import data

In [10]:
# spacy_ger = spacy.load("de_core_news_sm")
# English spaCy pipeline; tokenize_eng below uses its tokenizer.
spacy_eng = spacy.load("en_core_web_sm")

def tokenize_hi(text):
    """Tokenize ``text`` by delegating to iNLTK's ``tokenize`` with language code "hi"."""
    hindi_tokens = tokenize(text, "hi")
    return hindi_tokens

def tokenize_eng(text):
    """Return the text of every token that the ``spacy_eng`` tokenizer yields for ``text``."""
    token_texts = []
    for token in spacy_eng.tokenizer(text):
        token_texts.append(token.text)
    return token_texts

In [41]:
# Source (Hindi) and target (English) fields; each one only configures its tokenizer here.
src = data.Field(tokenize=tokenize_hi)
trg = data.Field(tokenize=tokenize_eng)
# mt_train = datasets.TranslationDataset(
#      fields=(src, trg))

# Parallel dataset read from ./data_torch/data_val.hi and ./data_torch/data_val.en.
mt_test = datasets.TranslationDataset(
     path='./data_torch/data_val', exts=('.hi', '.en'),
     fields=(src, trg))

# Vocabulary building is disabled in this cell, so `src` and `trg` get no `.vocab` here.
# src.build_vocab(mt_train, max_size=20000, min_freq=2)
# trg.build_vocab(mt_train, max_size=20000, min_freq=2)

In [12]:
import pickle
# Restore the pickled source and target objects from the working directory.
# NOTE(review): pickle.load executes arbitrary code from the file; only load pickles you created.
with open('src.pickle', 'rb') as handle:
    src_loaded = pickle.load(handle)

with open('trg.pickle', 'rb') as handle:
    trg_loaded = pickle.load(handle)

In [13]:
class Encoder(nn.Module):
    """Embed a source token sequence and run it through a stacked LSTM.

    Args:
        input_size: number of rows in the embedding table.
        embedding_size: width of each token embedding.
        hidden_size: width of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        p: dropout probability, applied to the embeddings and inside the LSTM.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super().__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Sub-modules are created in the original order (embedding, then LSTM).
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )

    def forward(self, x):
        """Encode ``x`` of shape (seq_length, N) and return the final ``(hidden, cell)`` state."""
        # (seq_length, N) -> (seq_length, N, embedding_size)
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)

        # The per-step outputs are not needed; only the final LSTM state is returned.
        _, final_state = self.rnn(embedded)
        hidden, cell = final_state
        return hidden, cell

class Decoder(nn.Module):
    """Single-step LSTM decoder that maps the previous token to scores over the output vocabulary.

    Args:
        input_size: number of rows in the embedding table.
        embedding_size: width of each token embedding.
        hidden_size: width of the LSTM hidden state.
        output_size: number of output features of the final linear layer.
        num_layers: number of stacked LSTM layers.
        p: dropout probability, applied to the embeddings and inside the LSTM.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
        super().__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Sub-modules are created in the original order (embedding, LSTM, linear).
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        """Run one decoding step and return ``(predictions, hidden, cell)``."""
        # Add a leading length-1 sequence axis so the LSTM sees a one-step sequence.
        step_input = x.unsqueeze(0)

        # (1, N) -> (1, N, embedding_size)
        embedded = self.dropout(self.embedding(step_input))

        # outputs: (1, N, hidden_size)
        outputs, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        # Project to output_size and drop the length-1 sequence axis again.
        predictions = self.fc(outputs).squeeze(0)
        return predictions, hidden, cell


class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target sequence one step at a time.

    Args:
        encoder: module called as ``encoder(source)`` returning ``(hidden, cell)``.
        decoder: module called as ``decoder(x, hidden, cell)`` returning
            ``(predictions, hidden, cell)``; it must expose its output layer as ``fc``.
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Return per-step decoder outputs for ``target``.

        Args:
            source: source token tensor; dimension 1 is the batch.
            target: target token tensor; dimension 0 is the sequence length and
                ``target[0]`` is fed to the decoder as the first input.
            teacher_force_ratio: probability of feeding the ground-truth token
                instead of the decoder's own best guess at each step.

        Returns:
            Tensor of shape (target_len, batch_size, vocab_size). Row 0 is left as zeros.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        # Take the vocabulary size from the decoder's output layer rather than from the
        # notebook-level `trg.vocab`, which may not exist when the vocab was not built.
        target_vocab_size = self.decoder.fc.out_features

        # Allocate on the input's device instead of relying on a global `device`.
        outputs = torch.zeros(
            target_len, batch_size, target_vocab_size, device=source.device
        )

        hidden, cell = self.encoder(source)

        # Grab the first input to the Decoder which will be <SOS> token
        x = target[0]

        for t in range(1, target_len):
            # Use previous hidden, cell as context from encoder at start
            output, hidden, cell = self.decoder(x, hidden, cell)

            outputs[t] = output
            best_guess = output.argmax(1)

            # Teacher forcing: feed the true token with probability teacher_force_ratio.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

In [14]:
# Optimisation settings (learning_rate feeds the Adam optimizer created further down).
num_epochs = 1000
learning_rate = 0.001
batch_size = 64

# Model hyperparameters
# load_model = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Vocabulary sizes are hard-coded here instead of being read from a vocab object
# (see the commented-out lines below).
# NOTE(review): presumably these must match the saved checkpoint — confirm.
input_size_encoder = 12800
input_size_decoder = 15042
output_size = 15042

# input_size_encoder = len(src.vocab)
# input_size_decoder = len(trg.vocab)
# output_size = len(trg.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024  # Needs to be the same for both RNN's
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

# Tensorboard to get nice loss plot
# writer = SummaryWriter(f"runs/loss_plot")
step = 0

In [15]:
# print(f'input_size_encoder is {input_size_encoder}')
# print(f'input_size_decoder is {input_size_decoder}')
# print(f'output_size is {output_size}')

# sys.exit(0)

# Build the encoder and decoder from the hyperparameters above and move them to `device`.
encoder_net = Encoder(
    input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout
).to(device)

decoder_net = Decoder(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    output_size,
    num_layers,
    dec_dropout,
).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# pad_idx = trg.vocab.stoi["<pad>"]
# Hard-coded index that the loss ignores.
# NOTE(review): presumably equals trg.vocab.stoi["<pad>"] from the training run — confirm.
pad_idx = 1
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [16]:
load_model = True
if load_model:
    # map_location remaps tensors saved on another device (e.g. a GPU checkpoint opened on a
    # CPU-only machine) onto the device selected for this session.
    load_checkpoint(
        torch.load("my_checkpoint.pth.tar", map_location=device), model, optimizer
    )

# Switch to evaluation mode; as the last expression, the returned module is shown as output.
model.eval()

=> Loading checkpoint


Seq2Seq(
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(12800, 300)
    (rnn): LSTM(300, 1024, num_layers=2, dropout=0.5)
  )
  (decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(15042, 300)
    (rnn): LSTM(300, 1024, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=1024, out_features=15042, bias=True)
  )
)

In [17]:
import pandas as pd

In [23]:
# Load the Hindi-English parallel corpus.
df = pd.read_csv('./data_torch/pmindia.v1.hi-en.csv')
# Regex character class: ASCII punctuation plus the Devanagari danda.
str_punct = '''[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~।]'''

# Length proxy: number of spaces in each sentence.
df['eng_len'] = df['english_sentence'].str.count(' ')
df['hi_len'] = df['hindi_sentence'].str.count(' ')
# df.query('hi_len < 50 & eng_len < 50').shape

# Keep pairs under 50 spaces each whose lengths are within a factor of 1.5 of each other.
df = df.query('hi_len < 50 & eng_len < 50')
df = df.query('hi_len < eng_len * 1.5 & hi_len * 1.5 > eng_len')
# Strip punctuation with the vectorised string accessor. This needs no `re` import (the
# original called re.sub, and no visible cell imports `re`), and .assign() builds a new frame
# rather than writing columns onto a filtered one. Row labels and column order are preserved.
df = df.assign(
    english_sentence=df['english_sentence'].str.replace(str_punct, '', regex=True),
    hindi_sentence=df['hindi_sentence'].str.replace(str_punct, '', regex=True),
)



In [30]:
# Show the cleaned Hindi sentence stored under index label 10.
df['hindi_sentence'][10]

'इस प्रकार एक स्वायशासी निकाय के रूप में जेएसके को बंद किया जा सकता है क्योंकि निधि के तौर पर उसका कामकाज विभाग द्वारा संभव है'

In [24]:
# Preview the first rows of the filtered frame, including the two length columns.
df.head()

Unnamed: 0,english_sentence,hindi_sentence,eng_len,hi_len
0,An advance is placed with the Medical Superint...,अग्रिम धन राशि इन अस्पतालों को चिकित्सा निरीक्...,20,19
1,Since the DoHFW provides funds to the hospital...,चूंकि स्वास्थ्य एवं परिवार कल्याण विभाग अस्पता...,19,22
3,Managing Committee of RAN Society will meet to...,आरएएन सोसायटी की प्रबंध समिति सोसायटी पंजीकरण ...,21,21
4,In addition to this Health Minister’s Cancer P...,इसके अलावा स्वास्थ्य मंत्री के कैंसर रोगी निधि...,16,15
5,The timeline required for this is one year,इसके लिए एक वर्ष का समय रखा गया है,7,8


In [39]:
# Translate the Hindi sentence stored under index label 10 and print it next to the original.
org_sentence = df.loc[10, 'hindi_sentence']
tr_sentence = ' '.join(
    translate_sentence(model, org_sentence, src_loaded, trg_loaded, device)
)
print(f'Original sentence is \n \n{org_sentence}\n \n translated sentence is \n \n{tr_sentence}')

Original sentence is 
 
इस प्रकार एक स्वायशासी निकाय के रूप में जेएसके को बंद किया जा सकता है क्योंकि निधि के तौर पर उसका कामकाज विभाग द्वारा संभव है
 
 translated sentence is 
 
an area of a Autonomous data can a can be administered as the of the the as a provisions of the the 6th Committee of the 6th of the Vice Committee profession for the profession of the <unk>


In [40]:
# Translate the Hindi sentence stored under index label 100 and print it next to the original.
org_sentence = df.loc[100, 'hindi_sentence']
tr_sentence = ' '.join(
    translate_sentence(model, org_sentence, src_loaded, trg_loaded, device)
)
print(f'Original sentence is \n \n{org_sentence}\n \n translated sentence is \n \n{tr_sentence}')

Original sentence is 
 
प्रधानमंत्री ने कहा कि भारत में केंद्र सरकार बुनियादी ढांचे पर ध्यान केंद्रित कर रही है
 
 translated sentence is 
 
conservation transparency in India Digital India initiative in the Prime Minister added India in India based India procurement infrastructure crop City based etc by the country infrastructure system etc in turn builds in a <unk>


In [41]:
# Translate the Hindi sentence stored under index label 1000 and print it next to the original.
org_sentence = df.loc[1000, 'hindi_sentence']
tr_sentence = ' '.join(
    translate_sentence(model, org_sentence, src_loaded, trg_loaded, device)
)
print(f'Original sentence is \n \n{org_sentence}\n \n translated sentence is \n \n{tr_sentence}')

Original sentence is 
 
आपका स्नेह मुझे कड़ी मेहनत करते रहने की प्रेरणा देता है”
 
 translated sentence is 
 
all have been given their words to their dear <unk>


In [44]:
# Translate the Hindi sentence stored under index label 4000 and print it next to the original.
org_sentence = df.loc[4000, 'hindi_sentence']
tr_sentence = ' '.join(
    translate_sentence(model, org_sentence, src_loaded, trg_loaded, device)
)
print(f'Original sentence is \n \n{org_sentence}\n \n translated sentence is \n \n{tr_sentence}')

Original sentence is 
 
इसी कार्यक्रम में श्री नरेन्द्र मोदी मिर्जापुर मेडिकल कॉलेज की आधारशिला रखेंगे
 
 translated sentence is 
 
to <unk>


In [45]:
# Translate the Hindi sentence stored under index label 6000 and print it next to the original.
org_sentence = df.loc[6000, 'hindi_sentence']
tr_sentence = ' '.join(
    translate_sentence(model, org_sentence, src_loaded, trg_loaded, device)
)
print(f'Original sentence is \n \n{org_sentence}\n \n translated sentence is \n \n{tr_sentence}')

Original sentence is 
 
6 ब्रिक्‍स शिखर सम्‍मेलन के पहले चक्र में सामूहिक रूप से हमारी अर्थव्‍यवस्‍थाओं ने आर्थिक संकट वैश्‍विक वित्‍तीय संकट से उबरने के इंजन के रूप में अपनी स्थिति मजबूत की है
 
 translated sentence is 
 
of the launch of BRICS <unk>


In [46]:
# Translate the Hindi sentence stored under index label 7000 and print it next to the original.
org_sentence = df.loc[7000, 'hindi_sentence']
tr_sentence = ' '.join(
    translate_sentence(model, org_sentence, src_loaded, trg_loaded, device)
)
print(f'Original sentence is \n \n{org_sentence}\n \n translated sentence is \n \n{tr_sentence}')

Original sentence is 
 
हम सहकारी संघवाद की बात करते हैं लेकिन प्रतिस्‍पर्धी संघवाद की भी चर्चा करते हैं जहां राज्‍यों को निवेश और नौकरियों के लिए एकदूसरे से स्‍पर्धा करके श्रेष्‍ठता साबित करनी होगी
 
 translated sentence is 
 
we are also do do we continue but each other we so that we issues and investments continue to support with differences and support to cooperative and with the common common investments there heritage and other cooperative cooperative heritage there be shared their exports and achieving shared than the top


In [48]:
# (rows, columns) of the filtered frame.
df.shape

(49846, 4)

In [49]:
# Shows the first 10000 rows; used here to read off which index labels survived filtering.
# NOTE(review): a smaller preview (or df.index[:n]) would serve the same purpose.
df.head(10000)

Unnamed: 0,english_sentence,hindi_sentence,eng_len,hi_len
0,An advance is placed with the Medical Superint...,अग्रिम धन राशि इन अस्पतालों को चिकित्सा निरीक्...,20,19
1,Since the DoHFW provides funds to the hospital...,चूंकि स्वास्थ्य एवं परिवार कल्याण विभाग अस्पता...,19,22
3,Managing Committee of RAN Society will meet to...,आरएएन सोसायटी की प्रबंध समिति सोसायटी पंजीकरण ...,21,21
4,In addition to this Health Minister’s Cancer P...,इसके अलावा स्वास्थ्य मंत्री के कैंसर रोगी निधि...,16,15
5,The timeline required for this is one year,इसके लिए एक वर्ष का समय रखा गया है,7,8
...,...,...,...,...
11258,They are helping me stay in close touch with p...,ये लोगों के करीब बने रहने में मेरी मदद कर रहे हैं,9,11
11259,I learn a great deal from their suggestions an...,मैंने उनके सुझावों एवं शिकायतों से काफी कुछ सी...,9,9
11260,We want to free our citizens from the burden o...,हम प्रत्‍येक कार्यालय में अपने नागरिकों को अत्...,15,16
11262,We will set up a digital locker for every citi...,हम निजी दस्‍तावेजों को स्‍टोर करने के लिए प्रत...,19,25


In [50]:
# Translate the Hindi sentence stored under index label 11258 and print it next to the original.
org_sentence = df.loc[11258, 'hindi_sentence']
tr_sentence = ' '.join(
    translate_sentence(model, org_sentence, src_loaded, trg_loaded, device)
)
print(f'Original sentence is \n \n{org_sentence}\n \n translated sentence is \n \n{tr_sentence}')

Original sentence is 
 
ये लोगों के करीब बने रहने में मेरी मदद कर रहे हैं
 
 translated sentence is 
 
it of social security in this field of adversity in the society also also be an live in the also in their life as festival as to the festival of life child as child it as child and savings to be it as to the people savings to the women


In [39]:
# Disabled: one-off dump of the fitted `src` / `trg` fields to disk.
# import pickle
# with open('src.pickle', 'wb') as handle:
#     pickle.dump(src, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('trg.pickle', 'wb') as handle:
#     pickle.dump(trg, handle, protocol=pickle.HIGHEST_PROTOCOL)


# Reload both pickled objects. The `import pickle` above is commented out, so this cell
# depends on `pickle` having been imported by an earlier cell.
with open('src.pickle', 'rb') as handle:
    src_loaded = pickle.load(handle)

with open('trg.pickle', 'rb') as handle:
    trg_loaded = pickle.load(handle)

In [50]:
# sentence = 'अग्रिम धन राशि इन अस्पतालों को चिकित्सा निरीक्षकों को दी जाएगी जो हर मामले को देखते हुए सहायता प्रदान करेंगे'

st = time.time()

# Echo the model size settings and padding index currently in the kernel.
print(f'input_size_encoder is {input_size_encoder}')
print(f'input_size_decoder is {input_size_decoder}')
print(f'output_size is {output_size}')
print(f'pad_idx is {pad_idx}')

# The timer only spans the print calls above; no data is loaded inside this cell.
print(f'Total time taken for data loading was >> {time.time() -st}')

input_size_encoder is 12800
input_size_decoder is 15042
output_size is 15042
pad_idx is 1
Total time taken for data loading was >> 0.0006196498870849609


In [20]:
# Rebind mt_test to the parallel files ./data_torch/data_sm.hi and ./data_torch/data_sm.en.
mt_test = datasets.TranslationDataset(
     path='./data_torch/data_sm', exts=('.hi', '.en'),
     fields=(src, trg))

In [59]:
from torchtext.data.metrics import bleu_score

In [67]:
def bleu(data, model, german, english, device):
    """Translate every example in ``data`` and score the translations with ``bleu_score``.

    Args:
        data: iterable of examples exposing ``src`` and ``trg`` token lists as attributes.
        model: model handed to ``translate_sentence``.
        german: source-language field handed to ``translate_sentence`` (historical name).
        english: target-language field handed to ``translate_sentence`` (historical name).
        device: device handed to ``translate_sentence``.

    Returns:
        The value returned by ``bleu_score``, or 0 if scoring raises.
        Examples that fail to translate are printed and left out of the score.
    """

    def _whitespace_free(tokens):
        # Split every token on whitespace and drop whitespace-only tokens.
        # NOTE(review): tokens containing spaces are the suspected cause of the
        # "index 4 is out of bounds for dimension 0 with size 4" error raised by
        # bleu_score — confirm against the installed torchtext version.
        return [piece for token in tokens for piece in token.split()]

    targets = []
    outputs = []
    for example in data:
        try:
            source_tokens = vars(example)["src"]
            target_tokens = vars(example)["trg"]

            prediction = translate_sentence(model, source_tokens, german, english, device)
            prediction = prediction[:-1]  # remove <eos> token

            cleaned_target = _whitespace_free(target_tokens)
            cleaned_prediction = _whitespace_free(prediction)
        except Exception as e:
            # Best effort: report the failing example and keep going.
            print(e, example)
        else:
            # Append only after both sides were computed so the two lists stay aligned.
            targets.append([cleaned_target])
            outputs.append(cleaned_prediction)

    scores = 0
    try:
        scores = bleu_score(outputs, targets)

    except Exception as e:
        print(f'got the exception from bleu score , >>{e}')
    return scores

In [None]:
st = time.time()
# The slice starts at 1, so the example at position 0 is not scored.
# NOTE(review): confirm that skipping the first example is intended.
score_test = bleu(mt_test[1:400], model, src_loaded, trg_loaded, device)

In [68]:
# `st` is set in the scoring cell, so the elapsed time also includes any pause between the cells.
print(f"Bleu score for test data is {score_test*100:.2f}")
print(f'Total time taken for score_test was >> {time.time() -st}')

got the exception from bleu score , >>index 4 is out of bounds for dimension 0 with size 4


TypeError: unsupported operand type(s) for *: 'NoneType' and 'int'

In [28]:
st = time.time()
# NOTE(review): no cell above this one creates `mt_train` or builds vocabularies for `src`/`trg`;
# this cell appears to rely on state from another session — confirm before re-running.
score_train = bleu(mt_train[1:100], model, src, trg, device)
print(f"Bleu score for train data is {score_train*100:.2f}")
print(f'Total time taken for score_train was >> {time.time() -st}')

IndexError: index 4 is out of bounds for dimension 0 with size 4

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator, TabularDataset
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard
from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint
from inltk.inltk import tokenize
# from inltk.inltk import setup
# setup('hi')
from torchtext import datasets
from torchtext import data


# The German pipeline is never referenced in this notebook; loading it only adds a hard
# dependency on de_core_news_sm being installed, so it is disabled.
# spacy_ger = spacy.load("de_core_news_sm")
# English spaCy pipeline; tokenize_eng below uses its tokenizer.
spacy_eng = spacy.load("en_core_web_sm")

def tokenize_hi(text):
    """Tokenize ``text`` by delegating to iNLTK's ``tokenize`` with language code "hi"."""
    hindi_tokens = tokenize(text, "hi")
    return hindi_tokens

def tokenize_eng(text):
    """Return the text of every token that the ``spacy_eng`` tokenizer yields for ``text``."""
    token_texts = []
    for token in spacy_eng.tokenizer(text):
        token_texts.append(token.text)
    return token_texts


# Source (Hindi) and target (English) fields with their tokenizers.
src = data.Field(tokenize=tokenize_hi)
trg = data.Field(tokenize=tokenize_eng)
# Training pairs read from ./data_torch/data_sm.hi and ./data_torch/data_sm.en.
mt_train = datasets.TranslationDataset(
     path='./data_torch/data_sm', exts=('.hi', '.en'),
     fields=(src, trg))

# Build each vocabulary from the training set: at most 20000 entries, minimum frequency 2.
src.build_vocab(mt_train, max_size=20000, min_freq=2)
trg.build_vocab(mt_train, max_size=20000, min_freq=2)

class Encoder(nn.Module):
    """Embed a source token sequence and run it through a stacked LSTM.

    Args:
        input_size: number of rows in the embedding table.
        embedding_size: width of each token embedding.
        hidden_size: width of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        p: dropout probability, applied to the embeddings and inside the LSTM.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super().__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Sub-modules are created in the original order (embedding, then LSTM).
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )

    def forward(self, x):
        """Encode ``x`` of shape (seq_length, N) and return the final ``(hidden, cell)`` state."""
        # (seq_length, N) -> (seq_length, N, embedding_size)
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)

        # The per-step outputs are not needed; only the final LSTM state is returned.
        _, final_state = self.rnn(embedded)
        hidden, cell = final_state
        return hidden, cell

class Decoder(nn.Module):
    """Single-step LSTM decoder that maps the previous token to scores over the output vocabulary.

    Args:
        input_size: number of rows in the embedding table.
        embedding_size: width of each token embedding.
        hidden_size: width of the LSTM hidden state.
        output_size: number of output features of the final linear layer.
        num_layers: number of stacked LSTM layers.
        p: dropout probability, applied to the embeddings and inside the LSTM.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
        super().__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Sub-modules are created in the original order (embedding, LSTM, linear).
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        """Run one decoding step and return ``(predictions, hidden, cell)``."""
        # Add a leading length-1 sequence axis so the LSTM sees a one-step sequence.
        step_input = x.unsqueeze(0)

        # (1, N) -> (1, N, embedding_size)
        embedded = self.dropout(self.embedding(step_input))

        # outputs: (1, N, hidden_size)
        outputs, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        # Project to output_size and drop the length-1 sequence axis again.
        predictions = self.fc(outputs).squeeze(0)
        return predictions, hidden, cell


class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target sequence one step at a time.

    Args:
        encoder: module called as ``encoder(source)`` returning ``(hidden, cell)``.
        decoder: module called as ``decoder(x, hidden, cell)`` returning
            ``(predictions, hidden, cell)``; it must expose its output layer as ``fc``.
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Return per-step decoder outputs for ``target``.

        Args:
            source: source token tensor; dimension 1 is the batch.
            target: target token tensor; dimension 0 is the sequence length and
                ``target[0]`` is fed to the decoder as the first input.
            teacher_force_ratio: probability of feeding the ground-truth token
                instead of the decoder's own best guess at each step.

        Returns:
            Tensor of shape (target_len, batch_size, vocab_size). Row 0 is left as zeros.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        # Take the vocabulary size from the decoder's output layer rather than from the
        # notebook-level `trg.vocab`, so the model does not depend on kernel globals.
        target_vocab_size = self.decoder.fc.out_features

        # Allocate on the input's device instead of relying on a global `device`.
        outputs = torch.zeros(
            target_len, batch_size, target_vocab_size, device=source.device
        )

        hidden, cell = self.encoder(source)

        # Grab the first input to the Decoder which will be <SOS> token
        x = target[0]

        for t in range(1, target_len):
            # Use previous hidden, cell as context from encoder at start
            output, hidden, cell = self.decoder(x, hidden, cell)

            outputs[t] = output
            best_guess = output.argmax(1)

            # Teacher forcing: feed the true token with probability teacher_force_ratio.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

# Optimisation settings.
num_epochs = 1000
learning_rate = 0.001
batch_size = 64

# Model hyperparameters
load_model = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Embedding-table and output-layer sizes follow the vocabularies built above.
input_size_encoder = len(src.vocab)
input_size_decoder = len(trg.vocab)
output_size = len(trg.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024  # Needs to be the same for both RNN's
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

# Tensorboard to get nice loss plot
writer = SummaryWriter(f"runs/loss_plot")
# Global step counter used as the x-axis when logging the training loss.
step = 0

# Batches of 32 examples, bucketed by a key that interleaves source and target lengths.
# NOTE(review): the `batch_size` variable defined above (64) is not used here.
train_iter = data.BucketIterator(
     dataset=mt_train, batch_size=32,
     sort_key=lambda x: data.interleave_keys(len(x.src), len(x.trg)))

# Build the encoder and decoder from the hyperparameters above and move them to `device`.
encoder_net = Encoder(
    input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout
).to(device)

decoder_net = Decoder(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    output_size,
    num_layers,
    dec_dropout,
).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Targets equal to the padding index are ignored by the loss.
pad_idx = trg.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

load_model = False
if load_model:
    # map_location remaps tensors saved on another device (e.g. a GPU checkpoint opened on a
    # CPU-only machine) onto the device selected for this session.
    load_checkpoint(
        torch.load("my_checkpoint.pth.tar", map_location=device), model, optimizer
    )


# Fixed Hindi sentence, translated at the start of every epoch as a progress check.
sentence = 'अग्रिम धन राशि इन अस्पतालों को चिकित्सा निरीक्षकों को दी जाएगी जो हर मामले को देखते हुए सहायता प्रदान करेंगे'

In [4]:
# Training loop: per epoch, save a checkpoint, print a sample translation, then train one pass.
for epoch in range(num_epochs):
    print(f'[Epoch is {epoch}/{num_epochs}]')
    # The checkpoint is written before this epoch's training, so it holds the previous epoch's weights.
    checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
    save_checkpoint(checkpoint)
    
    # Evaluation mode while translating the fixed sample sentence.
    model.eval()
    translated_sentence = translate_sentence(
        model=model, sentence= sentence, german=src, english=trg, device=device, max_length=50
    )
    translated_sentence =  ' '.join(translated_sentence)
    print(f'Translated sentence is: \n {translated_sentence}')
    
    model.train()
    
    for batch_idx, batch in enumerate(train_iter):
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)
        
        output = model(inp_data, target)
        
        # Drop position 0 (Seq2Seq.forward never fills it) and flatten the time and batch
        # dimensions so the loss sees (N, vocab) scores against (N,) targets.
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)
        optimizer.zero_grad()
        loss = criterion(output, target)
        
        loss.backward()
        # Clip the gradient norm to 1 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        
        writer.add_scalar("training_loss", loss, global_step=step)
        step+=1
        


[Epoch is 0/1000]
=> Saving checkpoint
Translated sentence is: 
 <unk>
[Epoch is 1/1000]
=> Saving checkpoint
Translated sentence is: 
 <unk>
[Epoch is 2/1000]
=> Saving checkpoint
Translated sentence is: 
 <unk>
[Epoch is 3/1000]
=> Saving checkpoint


KeyboardInterrupt: 

In [129]:
# Scratch cell: sample sentence used to try out the punctuation-stripping pattern.
txt = '''An advance is placed with  [ the Medical Superintendents of such hospitals who then provide assistance on a case to case basis.'''


# NOTE(review): `re` is not imported by any visible cell. Only the last expression of the
# cell is displayed, so the results of the next two expressions are discarded.
re.sub('''[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~।]''','',txt)

df.english_sentence[0]


df.head(1)['hindi_sentence'][0]

# df = df.sample(frac=0.01)

# from sklearn.model_selection import train_test_split
# # create train and validation set 
# train, val = train_test_split(df[['english_sentence', 'hindi_sentence']], test_size=0.2)

# train.to_csv("./data_torch/train.csv", index=False)
# val.to_csv("./data_torch/val.csv", index=False)

In [106]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator, TabularDataset
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard
from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint
from inltk.inltk import tokenize

# from inltk.inltk import setup
# setup('hi')

from torchtext import datasets

from torchtext import data

In [107]:
# The German pipeline is never referenced in this notebook; loading it only adds a hard
# dependency on de_core_news_sm being installed, so it is disabled.
# spacy_ger = spacy.load("de_core_news_sm")
# English spaCy pipeline; tokenize_eng uses its tokenizer.
spacy_eng = spacy.load("en_core_web_sm")

In [108]:
def tokenize_hi(text):
    """Tokenize ``text`` by delegating to iNLTK's ``tokenize`` with language code "hi"."""
    hindi_tokens = tokenize(text, "hi")
    return hindi_tokens

def tokenize_eng(text):
    """Return the text of every token that the ``spacy_eng`` tokenizer yields for ``text``."""
    token_texts = []
    for token in spacy_eng.tokenizer(text):
        token_texts.append(token.text)
    return token_texts

In [109]:
# Source (Hindi) and target (English) fields with their tokenizers.
src = data.Field(tokenize=tokenize_hi)
trg = data.Field(tokenize=tokenize_eng)
# Training pairs read from ./data_torch/data.hi and ./data_torch/data.en.
mt_train = datasets.TranslationDataset(
     path='./data_torch/data', exts=('.hi', '.en'),
     fields=(src, trg))

In [112]:
# Leftover scratch expression: evaluates the built-in `print` without calling it.
print

<function print>

In [111]:
# Build each vocabulary from the training set: at most 10000 entries, minimum frequency 2.
src.build_vocab(mt_train, max_size=10000, min_freq=2)
trg.build_vocab(mt_train, max_size=10000, min_freq=2)

In [114]:
# Number of training examples.
len(list(mt_train))

20000

In [115]:
class Encoder(nn.Module):
    """Embed a source token sequence and run it through a stacked LSTM.

    Args:
        input_size: number of rows in the embedding table.
        embedding_size: width of each token embedding.
        hidden_size: width of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        p: dropout probability, applied to the embeddings and inside the LSTM.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super().__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Sub-modules are created in the original order (embedding, then LSTM).
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )

    def forward(self, x):
        """Encode ``x`` of shape (seq_length, N) and return the final ``(hidden, cell)`` state."""
        # (seq_length, N) -> (seq_length, N, embedding_size)
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)

        # The per-step outputs are not needed; only the final LSTM state is returned.
        _, final_state = self.rnn(embedded)
        hidden, cell = final_state
        return hidden, cell

class Decoder(nn.Module):
    """Single-step LSTM decoder that maps the previous token to scores over the output vocabulary.

    Args:
        input_size: number of rows in the embedding table.
        embedding_size: width of each token embedding.
        hidden_size: width of the LSTM hidden state.
        output_size: number of output features of the final linear layer.
        num_layers: number of stacked LSTM layers.
        p: dropout probability, applied to the embeddings and inside the LSTM.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
        super().__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Sub-modules are created in the original order (embedding, LSTM, linear).
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        """Run one decoding step and return ``(predictions, hidden, cell)``."""
        # x arrives as (N); the LSTM expects a sequence axis, so make it (1, N) because
        # a single word is decoded per call.
        step_input = x.unsqueeze(0)

        # (1, N) -> (1, N, embedding_size)
        embedded = self.dropout(self.embedding(step_input))

        # outputs: (1, N, hidden_size)
        outputs, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        # (1, N, length_target_vocabulary) -> (N, length_target_vocabulary), the form
        # the loss function is given.
        predictions = self.fc(outputs).squeeze(0)
        return predictions, hidden, cell


class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target sequence one step at a time.

    Args:
        encoder: module called as ``encoder(source)`` returning ``(hidden, cell)``.
        decoder: module called as ``decoder(x, hidden, cell)`` returning
            ``(predictions, hidden, cell)``; it must expose its output layer as ``fc``.
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Return per-step decoder outputs for ``target``.

        Args:
            source: source token tensor; dimension 1 is the batch.
            target: target token tensor; dimension 0 is the sequence length and
                ``target[0]`` is fed to the decoder as the first input.
            teacher_force_ratio: probability of feeding the ground-truth token
                instead of the decoder's own best guess at each step.

        Returns:
            Tensor of shape (target_len, batch_size, vocab_size). Row 0 is left as zeros.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        # Take the vocabulary size from the decoder's output layer rather than from the
        # notebook-level `trg.vocab`, so the model does not depend on kernel globals.
        target_vocab_size = self.decoder.fc.out_features

        # Allocate on the input's device instead of relying on a global `device`.
        outputs = torch.zeros(
            target_len, batch_size, target_vocab_size, device=source.device
        )

        hidden, cell = self.encoder(source)

        # Grab the first input to the Decoder which will be <SOS> token
        x = target[0]

        for t in range(1, target_len):
            # Use previous hidden, cell as context from encoder at start
            output, hidden, cell = self.decoder(x, hidden, cell)

            # Store next output prediction
            outputs[t] = output

            # Get the best word the Decoder predicted (index in the vocabulary)
            best_guess = output.argmax(1)

            # With probability of teacher_force_ratio we take the actual next word
            # otherwise we take the word that the Decoder predicted it to be.
            # Teacher Forcing is used so that the model gets used to seeing
            # similar inputs at training and testing time, if teacher forcing is 1
            # then inputs at test time might be completely different than what the
            # network is used to.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

In [117]:
# Optimisation settings; a single epoch in this run.
num_epochs = 1
learning_rate = 0.001
batch_size = 64

# Model hyperparameters
load_model = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Embedding-table and output-layer sizes follow the vocabularies built above.
input_size_encoder = len(src.vocab)
input_size_decoder = len(trg.vocab)
output_size = len(trg.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024  # Needs to be the same for both RNN's
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

# Tensorboard to get nice loss plot
writer = SummaryWriter(f"runs/loss_plot")
# Global step counter used as the x-axis when logging the training loss.
step = 0

In [118]:
# Batches of 32 examples, bucketed by a key that interleaves source and target lengths.
# NOTE(review): the `batch_size` variable defined above (64) is not used here.
train_iter = data.BucketIterator(
     dataset=mt_train, batch_size=32,
     sort_key=lambda x: data.interleave_keys(len(x.src), len(x.trg)))

In [119]:
# Encoder sized to the source vocabulary, moved to `device`.
encoder_net = Encoder(
    input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout
).to(device)

In [120]:
# Decoder whose embedding table and output layer are both sized to the target vocabulary.
decoder_net = Decoder(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    output_size,
    num_layers,
    dec_dropout,
).to(device)

In [121]:
# Wrap encoder and decoder into one module and optimise all of its parameters with Adam.
model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [122]:
# Targets equal to the padding index are ignored by the loss.
pad_idx = trg.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [123]:
# usage
# Pull one batch from the iterator to inspect its tensor sizes.
next(iter(train_iter))


[torchtext.data.batch.Batch of size 32]
	[.src]:[torch.LongTensor of size 66x32]
	[.trg]:[torch.LongTensor of size 48x32]

In [124]:
# Start from freshly initialised weights; the checkpoint load below is skipped.
load_model = False

In [125]:
if load_model:
    # map_location remaps tensors saved on another device (e.g. a GPU checkpoint opened on a
    # CPU-only machine) onto the device selected for this session.
    load_checkpoint(
        torch.load("my_checkpoint.pth.tar", map_location=device), model, optimizer
    )


# Fixed Hindi sentence, translated at the start of every epoch as a progress check.
sentence = 'अग्रिम धन राशि इन अस्पतालों को चिकित्सा निरीक्षकों को दी जाएगी जो हर मामले को देखते हुए सहायता प्रदान करेंगे'

In [126]:
# Training loop: per epoch, save a checkpoint, print a sample translation, then train one pass.
for epoch in range(num_epochs):
    print(f'[Epoch is {epoch}/{num_epochs}]')
    # The checkpoint is written before this epoch's training, so it holds the previous epoch's weights.
    checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
    save_checkpoint(checkpoint)
    
    # Evaluation mode while translating the fixed sample sentence.
    model.eval()
    translated_sentence = translate_sentence(
        model=model, sentence= sentence, german=src, english=trg, device=device, max_length=50
    )
    
    print(f'Translated sentence is: \n {translated_sentence}')
    
    model.train()
    
    for batch_idx, batch in enumerate(train_iter):
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)
        
        output = model(inp_data, target)
        
        # Drop position 0 (Seq2Seq.forward never fills it) and flatten the time and batch
        # dimensions so the loss sees (N, vocab) scores against (N,) targets.
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)
        optimizer.zero_grad()
        loss = criterion(output, target)
        
        loss.backward()
        # Clip the gradient norm to 1 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        
        writer.add_scalar("training_loss", loss, global_step=step)
        step+=1
    

[Epoch is 0/1]
=> Saving checkpoint
Translated sentence is: 
 ['exchanging', 'Munde', 'Design', 'transferee', 'Said', 'Said', 'Said', 'Said', 'Said', 'before', 'ultimate', 'customer', 'stepping', 'stepping', 'Partnership', 'allows', 'allows', 'repository', 'repository', 'BARC', 'discretion', 'Sinchai', 'food', 'Sinchai', 'amounts', 'Italian', 'benami', 'Colleagues', 'Cyclone', 'Braille', 'Said', 'Said', 'Said', 'Said', 'Said', 'before', 'ultimate', 'customer', 'stepping', 'stepping', 'Partnership', 'allows', 'allows', 'repository', 'repository', 'BARC', 'discretion', 'Sinchai', 'food', 'Sinchai']


In [67]:
# Hindi sentence used for a manual translation check.
test_sent = 'अग्रिम धन राशि इन अस्पतालों को चिकित्सा निरीक्षकों को दी जाएगी जो हर मामले को देखते हुए सहायता प्रदान करेंगे'

In [68]:
# Evaluation mode, then translate the test sentence with the fields of this session.
model.eval()
translate_sentence(model, test_sent, src, trg, device)

['<unk>']

In [53]:
# Rebuild a readable sentence from the first attribute of training example 0 by stripping the
# leading '▁' marker from every token and joining the tokens with spaces.
' '.join([i.lstrip('▁') for i in list(mt_train[0].__dict__.values())[0]])

'अग्रिम धन राशि इन अस्पतालों को चिकित्सा निरीक्षक ों को दी जाएगी , जो हर मामले को देखते हुए सहायता प्रदान करेंगे ।'

In [None]:
# Leftover scratch expression: evaluates the bound method without calling it.
''.lstrip

In [54]:
# NOTE(review): src_test and trg_test are created but not used; the dataset below is built
# with the `src` / `trg` fields instead. Confirm whether these two fields can be removed.
src_test = data.Field(tokenize=tokenize_hi)
trg_test = data.Field(tokenize=tokenize_eng)
# Validation pairs read from ./data_torch/data_val.hi and ./data_torch/data_val.en.
mt_test = datasets.TranslationDataset(
     path='./data_torch/data_val', exts=('.hi', '.en'),
     fields=(src, trg))

In [26]:
# Number of validation examples.
len(list(mt_test))

100

In [21]:
# import pandas as pd
# raw_data = './data_torch/pmindia.v1.hi-en.csv'

# df = pd.read_csv(raw_data)
# # remove very long sentences and sentences where translations are 
# # not of roughly equal length
# df['eng_len'] = df['english_sentence'].str.count(' ')
# df['hi_len'] = df['hindi_sentence'].str.count(' ')

# df = df.query('hi_len < 50 & eng_len < 50')
# df = df.query('hi_len < eng_len * 1.5 & hi_len * 1.5 > eng_len')
# import numpy as np

# np.savetxt(r'./data_torch/data_val.en', df.english_sentence.tail(100), fmt='%s')
# np.savetxt(r'./data_torch/data_val.hi', df.hindi_sentence.tail(100), fmt='%s')
# # df.sample(frac=0.1).shape

# # df = df.sample(frac=0.01)

In [56]:
# Inspect the tokenised source and target of the first validation example.
mt_test[0].__dict__.values()

dict_values([['▁31.', '▁दोनों', '▁नेताओं', '▁ने', '▁डब्ल्यू', 'टी', 'ओ', '▁संबंधी', '▁मुद्दों', '▁पर', '▁एक', '▁द्विपक्ष', 'ीय', '▁सलाहकार', '▁तंत्र', '▁शुरू', '▁करने', '▁के', '▁फैसले', '▁का', '▁स्वागत', '▁करते', '▁हुए', '▁इसे', '▁वैश्विक', '▁व्यापार', '▁बातचीत', '▁के', '▁संदर्भ', '▁में', '▁सहयोग', '▁बढ़ाने', '▁के', '▁लिए', '▁एक', '▁सकारात्मक', '▁कदम', '▁बताया', '।'], ['31', '.', 'The', 'leaders', 'welcomed', 'the', 'decision', 'to', 'launch', 'a', 'bilateral', 'consultative', 'mechanism', 'on', 'WTO', '-', 'related', 'issues', 'as', 'a', 'positive', 'step', 'for', 'enhancing', 'coordination', 'in', 'the', 'context', 'of', 'global', 'trade', 'talks', '.']])

In [55]:
# The slice starts at 1, so the example at position 0 is not scored.
# NOTE(review): confirm that skipping the first example is intended.
score = bleu(mt_test[1:100], model, src, trg, device)
print(f"Bleu score {score*100:.2f}")

Bleu score 0.00


In [None]:
# Training loop. The original cell used names that are never defined in this notebook
# (`german`, `english`, `train_iterator`, `test_data`); they are replaced with the notebook's
# own `src`, `trg`, `train_iter` and `mt_test` so the cell can run.
for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    # The checkpoint is written before this epoch's training.
    checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
    save_checkpoint(checkpoint)

    model.eval()

    translated_sentence = translate_sentence(
        model, sentence, src, trg, device, max_length=50
    )

    print(f"Translated example sentence: \n {translated_sentence}")

    model.train()

    for batch_idx, batch in enumerate(train_iter):
        # Get input and targets and get to cuda
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)

        # Forward prop
        output = model(inp_data, target)

        # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
        # doesn't take input in that form. For example if we have MNIST we want to have
        # output to be: (N, 10) and targets just (N). Here we can view it in a similar
        # way: we have output_words * batch_size predictions to send into the cost
        # function, so we need to do some reshaping. The start token is removed as well.
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        # Back prop
        loss.backward()

        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Plot to tensorboard
        writer.add_scalar("Training loss", loss, global_step=step)
        step += 1


# Score the validation examples with the same fields used for training.
score = bleu(mt_test[1:100], model, src, trg, device)
print(f"Bleu score {score*100:.2f}")