In [None]:
# !pip install torchtext==0.9.1

In [None]:
# Mount Google Drive at /content/drive/. The training cell saves checkpoints
# under /content/drive/MyDrive/ChatBot/ and a later cell loads one from there.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
import torch
import time
import torch.nn as nn
import torch.optim as optim
import random
# from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint
from torch.utils.tensorboard import SummaryWriter
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator
import spacy
import numpy as np
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from torchtext.data.metrics import bleu_score


In [None]:
# Tutorial link: https://www.youtube.com/watch?v=EoGUlvhRYpk&list=RDCMUCkzW5JSFwvKRjXABI-UTAkQ


# Download the English and German language packages from spaCy
!python -m spacy download en
!python -m spacy download de


Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 2.7 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
Collecting de_core_news_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9 MB)
[K     |████████████████████████████████| 14.9 MB 5.1 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/de_core_news_sm -->
/usr/local/lib/python3.7/dist-pa

In [None]:
# from torchtext.data import Field, BucketIterator

# Load the German and English spaCy models through the "de"/"en" shortcut
# links created by the download cell above.
spacy_ger = spacy.load("de")
spacy_eng = spacy.load("en")


def tokenize_ger(text):
    """Split German ``text`` into a list of token strings with ``spacy_ger``."""
    tokens = []
    for tok in spacy_ger.tokenizer(text):
        # ``tok.text`` is the plain-string form of the spaCy token object.
        tokens.append(tok.text)
    return tokens

def tokenize_eng(text):
    """Split English ``text`` into a list of token strings with ``spacy_eng``."""
    tokens = []
    for tok in spacy_eng.tokenizer(text):
        # ``tok.text`` is the plain-string form of the spaCy token object.
        tokens.append(tok.text)
    return tokens


# Field class models common text processing datatypes that can be represented by tensors.
# Both fields are configured with lower=True and use "<sos>" / "<eos>" as the
# init and end-of-sequence tokens.
german = Field(tokenize=tokenize_ger, lower=True, init_token="<sos>", eos_token="<eos>")

english = Field(tokenize=tokenize_eng, lower=True, init_token="<sos>", eos_token="<eos>")

# Multi30k splits: the ".de" side is bound to the `german` field and the ".en"
# side to the `english` field (read elsewhere in this notebook as .src / .trg).
train_data, valid_data, test_data = Multi30k.splits(
    exts=(".de", ".en"), fields=(german, english)
)


# Creating a vocab object for both languages using build_vocab.
# Only the training split is used, with max_size=10000 and min_freq=2.
german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)


In [None]:
class Encoder(nn.Module):
    """LSTM encoder: maps a batch of source token indices to final LSTM states.

    Args:
        input_size: number of rows in the embedding table (source vocab size).
        embedding_size: width of each token embedding.
        hidden_size: width of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        p: dropout probability, applied to the embeddings and passed to the LSTM.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )

    def forward(self, x):
        """Encode ``x`` of shape (seq_length, N) and return ``(hidden, cell)``."""
        # (seq_length, N) -> (seq_length, N, embedding_size)
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)

        # The per-step outputs are not needed, only the final states.
        _, final_states = self.rnn(embedded)
        hidden, cell = final_states
        return hidden, cell


In [None]:

class Decoder(nn.Module):
    """Single-step LSTM decoder that predicts the next target token.

    Args:
        input_size: number of rows in the embedding table (target vocab size).
        embedding_size: width of each token embedding.
        hidden_size: width of the LSTM hidden state.
        output_size: number of output classes (size of the output vocabulary).
        num_layers: number of stacked LSTM layers.
        p: dropout probability, applied to the embeddings and passed to the LSTM.
    """

    def __init__(
        self, input_size, embedding_size, hidden_size, output_size, num_layers, p
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        """Run one decoding step.

        ``x`` has shape (N,), one token index per batch element. Returns
        ``(prediction, hidden, cell)`` where ``prediction`` is (N, output_size).
        """
        # Add a sequence axis of length 1: (N,) -> (1, N).
        step_input = x.unsqueeze(0)

        # (1, N) -> (1, N, embedding_size)
        embedded = self.dropout(self.embedding(step_input))

        # (1, N, hidden_size) plus the updated recurrent states.
        step_output, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        # (1, N, output_size) -> (N, output_size)
        prediction = self.fc(step_output).squeeze(0)
        return prediction, hidden, cell


In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes a target sequence step by step.

    Args:
        encoder: module whose forward(source) returns ``(hidden, cell)``.
        decoder: module whose forward(x, hidden, cell) returns
            ``(prediction, hidden, cell)`` and which exposes its output layer
            as ``decoder.fc`` (an ``nn.Linear``).
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Return logits of shape (target_len, batch_size, target_vocab_size).

        Args:
            source: batch of input sentences, shape (sentence_len, batch_size).
            target: batch of target sentences, shape (target_len, batch_size).
            teacher_force_ratio: probability of feeding the ground-truth token
                (rather than the model's own prediction) as the next input.

        Row 0 of the result is left as zeros because position 0 is the start
        token, which is never predicted.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        # Read the vocabulary size from the decoder's output layer and the
        # device from the input, so the module does not depend on notebook
        # globals (`english`, `device`).
        target_vocab_size = self.decoder.fc.out_features

        hidden, cell = self.encoder(source)

        outputs = torch.zeros(
            target_len, batch_size, target_vocab_size, device=source.device
        )

        # Grab the start token as the first decoder input.
        x = target[0]

        for i in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)

            # output: (N, target_vocab_size)
            outputs[i] = output

            # Choose between the ground-truth word and the predicted word with
            # probability teacher_force_ratio.
            best_guess = output.argmax(1)
            x = target[i] if random.random() < teacher_force_ratio else best_guess

        return outputs

In [None]:
# Vocabulary sizes: the encoder embeds German tokens; the decoder both embeds
# and predicts English tokens.
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
# Model dimensions shared by the encoder and decoder.
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
n_layers = 2
# Dropout probabilities passed to Encoder / Decoder.
enc_dropout = 0.5
dec_dropout = 0.5
# Batch size used by the data iterators.
batch_size = 64


In [None]:
# One iterator per split, with batches of `batch_size` examples placed on
# `device`; the sort key is the length of the source sentence.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device,
)

# This automatically adds padding
# x.src is input and x.trg is output

In [None]:
# Build the encoder and decoder on `device`, then wrap them in the Seq2Seq model.
encoder_net = Encoder(
    input_size=input_size_encoder,
    embedding_size=encoder_embedding_size,
    hidden_size=hidden_size,
    num_layers=n_layers,
    p=enc_dropout,
).to(device)

decoder_net = Decoder(
    input_size=input_size_decoder,
    embedding_size=decoder_embedding_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=n_layers,
    p=dec_dropout,
).to(device)

model = Seq2Seq(encoder=encoder_net, decoder=decoder_net).to(device)

In [None]:
# Get the index of the "<pad>" token in the English (target) vocabulary;
# it is passed to the loss below as ignore_index.
pad_idx = english.vocab.stoi["<pad>"]

In [None]:
# Optimisation settings.
num_epochs = 100
learning_rate = 0.0001

# ignore_index=pad_idx: target positions holding the pad index are left out of the loss.
criterion = nn.CrossEntropyLoss(ignore_index = pad_idx)
optimizer = optim.Adam(model.parameters(),lr = learning_rate)

In [None]:
# Tensorboard to get nice loss plot
# `step` is the global_step counter passed to writer.add_scalar during training.
writer = SummaryWriter(f"runs/loss_plot")
step = 0

In [None]:
# Loss history: the training loop appends one entry per epoch.
losses = []

In [None]:
## Training the model

# Training resumes at epoch 111 (earlier epochs were run in a previous session)
# and stops before epoch num_epochs + 50.
start_epoch = 111
end_epoch = num_epochs + 50

for epoch in range(start_epoch, end_epoch):
    epoch_start = time.time()
    running_loss = 0.0
    num_batches = 0
    model.train()

    for batch in train_iterator:
        src_batch = batch.src.to(device)
        trg_batch = batch.trg.to(device)

        output = model(src_batch, trg_batch)
        # output shape = (target_len, batch_size, output_vocab_len)

        # [1:] because position 0 is the start token <sos>, which is never
        # predicted; reshape because the loss expects (N, C) logits and (N,)
        # class indices.
        output = output[1:].reshape(-1, output.shape[2])
        trg_flat = trg_batch[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, trg_flat)
        loss.backward()

        # To avoid exploding-gradient problems.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        optimizer.step()

        # .item() converts the loss to a Python float so no tensor is kept
        # alive in the logs or in `losses`.
        batch_loss = loss.item()
        writer.add_scalar("Training loss", batch_loss, global_step=step)
        step += 1
        running_loss += batch_loss
        num_batches += 1

    # Report the mean loss over the epoch rather than the last batch's loss.
    epoch_loss = running_loss / max(num_batches, 1)
    losses.append(epoch_loss)

    if epoch % 10 == 0:
        torch.save(model.state_dict(), f'/content/drive/MyDrive/ChatBot/MTCheckpoint{epoch}.pth')

    print(f'Epoch {epoch} in total {num_epochs+50} Epochs Training-loss:{epoch_loss} Time taken: {time.time() - epoch_start}')

Epoch 111 in total 150 Epochs Training-loss:0.01803998090326786 Time taken: 58.29538655281067
Epoch 112 in total 150 Epochs Training-loss:0.015289335511624813 Time taken: 59.751731395721436
Epoch 113 in total 150 Epochs Training-loss:0.016247805207967758 Time taken: 60.308337688446045
Epoch 114 in total 150 Epochs Training-loss:0.026776328682899475 Time taken: 60.20340156555176
Epoch 115 in total 150 Epochs Training-loss:0.04024677723646164 Time taken: 60.347891330718994
Epoch 116 in total 150 Epochs Training-loss:0.06525977700948715 Time taken: 60.36409616470337
Epoch 117 in total 150 Epochs Training-loss:0.0648874044418335 Time taken: 60.49260473251343
Epoch 118 in total 150 Epochs Training-loss:0.03940778970718384 Time taken: 60.34889221191406
Epoch 119 in total 150 Epochs Training-loss:0.02775331400334835 Time taken: 60.29300594329834
Epoch 120 in total 150 Epochs Training-loss:0.058879490941762924 Time taken: 60.79063105583191
Epoch 121 in total 150 Epochs Training-loss:0.06393774

In [None]:
# Re-wrap the existing encoder/decoder and restore the weights saved at epoch 140.
# map_location=device remaps the stored tensors to the current device, so a
# checkpoint written on GPU also loads on a CPU-only runtime.
model = Seq2Seq(encoder_net,decoder_net).to(device)
model.load_state_dict(
    torch.load('/content/drive/MyDrive/ChatBot/MTCheckpoint140.pth', map_location=device)
)

<All keys matched successfully>

In [None]:
def bleu(data, model, german, english, device):
    """Compute the corpus BLEU score of ``model`` over ``data``.

    Args:
        data: iterable of examples exposing ``src`` and ``trg`` token lists.
        model: model passed through to ``translate_sentence``.
        german: source-language field passed to ``translate_sentence``.
        english: target-language field; its ``eos_token`` is stripped from
            the end of each prediction.
        device: device passed through to ``translate_sentence``.

    Returns:
        The value returned by ``bleu_score(outputs, targets)``.
    """
    targets = []
    outputs = []

    for example in data:
        src = vars(example)["src"]
        trg = vars(example)["trg"]

        prediction = translate_sentence(model, src, german, english, device)
        # Remove the trailing <eos> only when it is actually there: a
        # translation cut off at max_length has no <eos>, and slicing blindly
        # would drop a real word.
        if prediction and prediction[-1] == english.eos_token:
            prediction = prediction[:-1]

        # bleu_score takes a list of reference translations per candidate.
        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)

# NOTE(review): bleu() calls translate_sentence, which is defined in a later
# cell; on a fresh kernel that cell must be run before this one.
# The slice test_data[1:100] starts at index 1, so test example 0 is not scored.
score = bleu(test_data[1:100], model, german, english, device)
print(f"Bleu score {score*100:.2f}")

Bleu score 18.98


In [None]:
def translate_sentence(model, sentence, german, english, device, max_length=50):
    """Greedily translate one German sentence into a list of English tokens.

    Args:
        model: object exposing ``encoder`` and ``decoder`` sub-modules.
        sentence: a raw string (tokenised with ``spacy_ger``) or an iterable
            of token strings.
        german: source field providing ``init_token``, ``eos_token`` and
            ``vocab.stoi``.
        english: target field providing ``vocab.stoi`` and ``vocab.itos``.
        device: device on which the input tensors are created.
        max_length: maximum number of tokens to generate.

    Returns:
        The predicted tokens without the leading ``<sos>``. The list ends with
        ``<eos>`` unless generation stopped at ``max_length``.
    """
    # Create tokens in lower case, which is what the vocab contains.
    if isinstance(sentence, str):
        tokens = [token.text.lower() for token in spacy_ger(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Add <sos> and <eos> at the beginning and end respectively.
    tokens = [german.init_token, *tokens, german.eos_token]

    # Convert each German token to its vocabulary index.
    text_to_indices = [german.vocab.stoi[token] for token in tokens]

    # Shape (seq_length, 1): a batch holding a single sentence.
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    eos_idx = english.vocab.stoi["<eos>"]
    outputs = [english.vocab.stoi["<sos>"]]

    # Inference must run in eval mode, otherwise the dropout layers stay active
    # and randomly corrupt the translation. The previous mode is restored
    # afterwards so that an ongoing training run is not affected.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Build the encoder hidden and cell state.
            hidden, cell = model.encoder(sentence_tensor)

            for _ in range(max_length):
                previous_word = torch.LongTensor([outputs[-1]]).to(device)
                output, hidden, cell = model.decoder(previous_word, hidden, cell)
                best_guess = output.argmax(1).item()
                outputs.append(best_guess)

                # The model predicts the end of the sentence.
                if best_guess == eos_idx:
                    break
    finally:
        model.train(was_training)

    translated_sentence = [english.vocab.itos[idx] for idx in outputs]

    # Remove the start token.
    return translated_sentence[1:]


In [None]:
# Qualitative check: translate one German sentence with the loaded model and
# print the predicted English tokens.
sentence = "Hier sind einige erstaunliche Ideen für Essays und Reden, die Sie beim Schreiben eines perfekten Essays und perfekter Reden für den Wettbewerb unterstützen werden."

translated = translate_sentence(
        model, sentence, german, english, device, max_length=50
    )

print(translated)

['there', '<unk>', '<unk>', '<unk>', '<unk>', 'keeping', 'peace', 'and', 'and', 'a', 'going', '-', 'camera', 'the', 'is', 'a', 'for', 'the', '<unk>', 'for', 'the', 'picture', '.', '<eos>']
