<a href="https://colab.research.google.com/github/nnikolovskiii/Deep-learning-lab-exercises/blob/master/Shakespere_to_modern.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
!pip install torch



In [18]:
import torch

In [19]:
import torch
import torch.nn as nn
from tqdm import tqdm

def word_tokenize(li):
    i = 2
    d = {"<sos>": 0, "<eos>": 1}
    s = 0.0
    c = 0.0
    for sentence in li:
        for word in sentence.split():
            s += 1
            if word.lower() not in d:
                d[word.lower()] = i
                i = i + 1

        c += 1
    return d, int(s // c)


def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)


def train(model, data_loader, optimizer, criterion, clip):
    model.train()

    epoch_loss = 0

    for i, batch in enumerate(tqdm(data_loader, desc="Processing Batches", total=len(data_loader))):
        src = batch["src"]
        trg = batch["trg"]
        src = src.t()
        trg = trg.t()

        optimizer.zero_grad()

        output = model(src, trg)

        # trg = [trg len, batch size]
        # output = [trg len, batch size, output dim]

        output_dim = output.shape[-1]

        output = output[1:].reshape(-1, output_dim)

        # Reshape the target tensor to [(trg len - 1) * batch size]
        trg = trg[1:].reshape(-1)

        # trg = [(trg len - 1) * batch size]
        # output = [(trg len - 1) * batch size, output dim]

        loss = criterion(output, trg)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(data_loader)


def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


def evaluate(model, data_loader, criterion):
    model.eval()

    epoch_loss = 0

    with torch.no_grad():
        for i, batch in enumerate(tqdm(data_loader, desc="Processing Batches", total=len(data_loader))):
            src = batch["src"]
            trg = batch["trg"]

            output = model(src, trg, 0)  # turn off teacher forcing

            # trg = [trg len, batch size]
            # output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]

            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            # trg = [(trg len - 1) * batch size]
            # output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(data_loader)

def path_to_list(file):
    f = open(file, "r")
    text = f.read()

    li = str.splitlines(text)
    return li

In [20]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import random

class ShakespeareanDataset(Dataset):
    def __init__(self, shakespearean_sentences, modern_sentences, src_d, trg_d, avg_src, avg_trg):
        self.shakespearean_sentences = shakespearean_sentences
        self.modern_sentences = modern_sentences
        self.src_d = src_d
        self.trg_d = trg_d
        self.avg_src = avg_src
        self.avg_trg = avg_trg

    def pad_sequence(self, sequence, target_size, padding_value=0):
        if len(sequence) < target_size:
            sequence = sequence + [padding_value] * (target_size - len(sequence))
        elif len(sequence) > target_size:
            sequence = sequence[:target_size]
        return sequence

    def __len__(self):
        return len(self.shakespearean_sentences)

    def __getitem__(self, idx):
        src_sentence = self.shakespearean_sentences[idx]
        trg_sentence = self.modern_sentences[idx]

        src_text = [self.src_d.get(word.lower(), 0) for word in src_sentence.split()]
        src_text = [0] + src_text + [1]

        trg_text = [self.trg_d.get(word.lower(), 0) for word in trg_sentence.split()]
        trg_text = [0] + trg_text + [1]

        src_text = self.pad_sequence(src_text, self.avg_src)
        trg_text = self.pad_sequence(trg_text, self.avg_trg)

        return {
            "src": torch.tensor(src_text).to(device),
            "trg": torch.tensor(trg_text).to(device)
        }

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_dim, emb_dim)

        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)

        self.dropout = nn.Dropout(dropout)

    def forward(self, src):

        #src = [src len, batch size]

        embedded = self.dropout(self.embedding(src))

        #embedded = [src len, batch size, emb dim]

        outputs, (hidden, cell) = self.rnn(embedded)

        #outputs = [src len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]

        #outputs are always from the top hidden layer

        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(output_dim, emb_dim)

        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)

        self.fc_out = nn.Linear(hid_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):

        #input = [batch size]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]

        #n directions in the decoder will both always be 1, therefore:
        #hidden = [n layers, batch size, hid dim]
        #context = [n layers, batch size, hid dim]

        input = input.unsqueeze(0)

        #input = [1, batch size]

        embedded = self.dropout(self.embedding(input))

        #embedded = [1, batch size, emb dim]

        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        #output = [seq len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]

        #seq len and n directions will always be 1 in the decoder, therefore:
        #output = [1, batch size, hid dim]
        #hidden = [n layers, batch size, hid dim]
        #cell = [n layers, batch size, hid dim]

        prediction = self.fc_out(output.squeeze(0))

        #prediction = [batch size, output dim]

        return prediction, hidden, cell
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):

        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim


        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

        #last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)

        #first input to the decoder is the <sos> tokens
        input = trg[0,:]

        for t in range(1, trg_len):

            #insert input token embedding, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)

            #place predictions in a tensor holding predictions for each token
            outputs[t] = output

            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio

            #get the highest predicted token from our predictions
            top1 = output.argmax(1)

            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1

        return outputs

In [22]:
# -*- coding: utf-8 -*-
"""Shakespere.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1G-s-J5oXSa15CG1jxdBNZp97zkN4QLam
"""
import math

import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import time


# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    original_li = path_to_list("/content/train.original.nltktok")
    modern_li = path_to_list("/content/train.modern.nltktok")
    original_val = path_to_list("/content/valid.original.nltktok")
    modern_val = path_to_list("/content/valid.modern.nltktok")
    original_test = path_to_list("/content/test.original.nltktok")
    modern_test = path_to_list("/content/test.modern.nltktok")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    src_d, avg_src = word_tokenize(original_li)
    trg_d, avg_trg = word_tokenize(modern_li)
    print(f"Unique tokens in source (original) vocabulary: {len(src_d)}")
    print(f"Unique tokens in target (modern) vocabulary: {len(trg_d)}")

    dataset = ShakespeareanDataset(original_li, modern_li, src_d, trg_d, 11, 11)
    dataset_val = ShakespeareanDataset(original_val, modern_val, src_d, trg_d,11, 11)
    dataset_test = ShakespeareanDataset(original_test, modern_test, src_d, trg_d, 11, 11)

    batch_size = 32
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    data_val_loader = DataLoader(dataset_val, batch_size=batch_size, shuffle=True)
    data_test_loader = DataLoader(dataset_test, batch_size=batch_size, shuffle=True)

    INPUT_DIM = len(src_d)
    OUTPUT_DIM = len(trg_d)
    ENC_EMB_DIM = 256
    DEC_EMB_DIM = 256
    HID_DIM = 512
    N_LAYERS = 4
    ENC_DROPOUT = 0.5
    DEC_DROPOUT = 0.5
    N_EPOCHS = 10
    CLIP = 1

    best_valid_loss = float('inf')

    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

    model = Seq2Seq(enc, dec, device).to(device)

    model.apply(init_weights)

    criterion = nn.CrossEntropyLoss()

    optimizer = optim.Adam(model.parameters())

    for epoch in range(N_EPOCHS):
        start_time = time.time()

        train_loss = train(model, data_loader, optimizer, criterion, CLIP)
        valid_loss = evaluate(model, data_val_loader, criterion)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_valid_loss:
             best_valid_loss = valid_loss
             torch.save(model.state_dict(), 'tut1-model.pt')

        print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')


    model.load_state_dict(torch.load('tut1-model.pt'))
    test_loss = evaluate(model, data_test_loader, criterion)
    print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')


Unique tokens in source (original) vocabulary: 12396
Unique tokens in target (modern) vocabulary: 10058


Processing Batches: 100%|██████████| 575/575 [00:29<00:00, 19.43it/s]
Processing Batches: 100%|██████████| 39/39 [00:01<00:00, 34.44it/s]


Epoch: 01 | Time: 0m 30s
	Train Loss: 4.967 | Train PPL: 143.607
	 Val. Loss: 6.486 |  Val. PPL: 656.170


Processing Batches: 100%|██████████| 575/575 [00:29<00:00, 19.65it/s]
Processing Batches: 100%|██████████| 39/39 [00:00<00:00, 39.79it/s]


Epoch: 02 | Time: 0m 30s
	Train Loss: 4.582 | Train PPL:  97.703
	 Val. Loss: 7.279 |  Val. PPL: 1450.129


Processing Batches: 100%|██████████| 575/575 [00:30<00:00, 18.98it/s]
Processing Batches: 100%|██████████| 39/39 [00:00<00:00, 39.83it/s]


Epoch: 03 | Time: 0m 31s
	Train Loss: 4.427 | Train PPL:  83.655
	 Val. Loss: 7.550 |  Val. PPL: 1900.656


Processing Batches: 100%|██████████| 575/575 [00:29<00:00, 19.26it/s]
Processing Batches: 100%|██████████| 39/39 [00:01<00:00, 31.68it/s]


Epoch: 04 | Time: 0m 31s
	Train Loss: 4.278 | Train PPL:  72.129
	 Val. Loss: 7.652 |  Val. PPL: 2104.748


Processing Batches: 100%|██████████| 575/575 [00:29<00:00, 19.37it/s]
Processing Batches: 100%|██████████| 39/39 [00:00<00:00, 39.09it/s]


Epoch: 05 | Time: 0m 30s
	Train Loss: 4.167 | Train PPL:  64.497
	 Val. Loss: 7.304 |  Val. PPL: 1486.337


Processing Batches: 100%|██████████| 575/575 [00:29<00:00, 19.80it/s]
Processing Batches: 100%|██████████| 39/39 [00:00<00:00, 40.77it/s]


Epoch: 06 | Time: 0m 30s
	Train Loss: 4.052 | Train PPL:  57.490
	 Val. Loss: 7.643 |  Val. PPL: 2086.010


Processing Batches: 100%|██████████| 575/575 [00:29<00:00, 19.57it/s]
Processing Batches: 100%|██████████| 39/39 [00:00<00:00, 39.65it/s]


Epoch: 07 | Time: 0m 30s
	Train Loss: 3.943 | Train PPL:  51.598
	 Val. Loss: 7.459 |  Val. PPL: 1735.914


Processing Batches: 100%|██████████| 575/575 [00:29<00:00, 19.66it/s]
Processing Batches: 100%|██████████| 39/39 [00:00<00:00, 40.42it/s]


Epoch: 08 | Time: 0m 30s
	Train Loss: 3.826 | Train PPL:  45.859
	 Val. Loss: 7.848 |  Val. PPL: 2561.120


Processing Batches: 100%|██████████| 575/575 [00:29<00:00, 19.50it/s]
Processing Batches: 100%|██████████| 39/39 [00:01<00:00, 30.91it/s]


Epoch: 09 | Time: 0m 30s
	Train Loss: 3.726 | Train PPL:  41.527
	 Val. Loss: 7.808 |  Val. PPL: 2459.940


Processing Batches: 100%|██████████| 575/575 [00:29<00:00, 19.69it/s]
Processing Batches: 100%|██████████| 39/39 [00:00<00:00, 40.71it/s]


Epoch: 10 | Time: 0m 30s
	Train Loss: 3.621 | Train PPL:  37.364
	 Val. Loss: 7.849 |  Val. PPL: 2562.198


Processing Batches: 100%|██████████| 46/46 [00:01<00:00, 40.12it/s]

| Test Loss: 6.658 | Test PPL: 778.778 |



