In [None]:
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False

if IN_COLAB:
  !python -m spacy download en_core_web_lg
  !wget -P dataset/PennTreeBank https://raw.githubusercontent.com/BrownFortress/NLU-2024-Labs/main/labs/dataset/PennTreeBank/ptb.test.txt
  !wget -P dataset/PennTreeBank https://raw.githubusercontent.com/BrownFortress/NLU-2024-Labs/main/labs/dataset/PennTreeBank/ptb.valid.txt
  !wget -P dataset/PennTreeBank https://raw.githubusercontent.com/BrownFortress/NLU-2024-Labs/main/labs/dataset/PennTreeBank/ptb.train.txt
  !pip install wandb


Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
--2024-04-11 17:13:17--  https://raw.githubusercontent.com/BrownFortress/NLU-2024-Labs/main/labs/dataset/PennTreeBank/ptb.test.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, 

In [None]:
from google.colab import userdata

!wandb login --relogin {userdata.get('wandb')}


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math
import numpy as np
import wandb
import random

In [None]:
if IN_COLAB:
  from google.colab import drive
  drive.mount('/content/drive')

save_path = "drive/MyDrive/NLU/models/"



Mounted at /content/drive


In [None]:
#@title LM_RNN


class LM_RNN(nn.Module):
    def __init__(self, emb_size, hidden_size, output_size, pad_index=0, out_dropout=0.1,
                 emb_dropout=0.1, n_layers=1):
        super(LM_RNN, self).__init__()
        # Token ids to vectors, we will better see this in the next lab
        self.embedding = nn.Embedding(output_size, emb_size, padding_idx=pad_index)
        # Pytorch's RNN layer: https://pytorch.org/docs/stable/generated/torch.nn.RNN.html
        self.rnn = nn.RNN(emb_size, hidden_size, n_layers, bidirectional=False, batch_first=True)
        self.pad_token = pad_index
        # Linear layer to project the hidden layer to our output space
        self.output = nn.Linear(hidden_size, output_size)

    def forward(self, input_sequence):
        emb = self.embedding(input_sequence)
        rnn_out, _  = self.rnn(emb)
        output = self.output(rnn_out).permute(0,2,1)

        return output

In [None]:
#@title LM_LSTM


class LM_LSTM(nn.Module):
    def __init__(self, emb_size, hidden_size, output_size, pad_index=0, out_dropout=0.1,
                 emb_dropout=0.1, n_layers=1):
        super(LM_LSTM, self).__init__()
        # Token ids to vectors, we will better see this in the next lab
        self.embedding = nn.Embedding(output_size, emb_size, padding_idx=pad_index)
        # Pytorch's RNN layer: https://pytorch.org/docs/stable/generated/torch.nn.RNN.html
        self.rnn = nn.LSTM(emb_size, hidden_size, n_layers, bidirectional=False,batch_first=True)
        self.pad_token = pad_index
        # Linear layer to project the hidden layer to our output space
        self.output = nn.Linear(hidden_size, output_size)


    def forward(self, input_sequence):
        emb = self.embedding(input_sequence)
        rnn_out, _  = self.rnn(emb)
        output = self.output(rnn_out).permute(0,2,1)
        return output

In [None]:
#@title LM_LSTM_DROP


class LM_LSTM_DROP(nn.Module):
    def __init__(self, emb_size, hidden_size, output_size, pad_index=0, out_dropout=0.1,
                 emb_dropout=0.1,ins_dropout=0.1, n_layers=1):
        super(LM_LSTM_DROP, self).__init__()
        # Token ids to vectors, we will better see this in the next lab
        self.embedding = nn.Embedding(output_size, emb_size, padding_idx=pad_index)
        # Pytorch's RNN layer: https://pytorch.org/docs/stable/generated/torch.nn.RNN.html
        self.rnn = nn.LSTM(emb_size, hidden_size, n_layers, dropout=ins_drop, bidirectional=False, batch_first=True)
        self.pad_token = pad_index
        # Linear layer to project the hidden layer to our output space
        self.output = nn.Linear(hidden_size, output_size)

        #dropout layers
        self.emb_drop = nn.Dropout(p=emb_dropout)
        self.out_drop = nn.Dropout(p=out_dropout)

    def forward(self, input_sequence):
        emb = self.embedding(input_sequence)
        drop1 = self.emb_drop(emb)
        rnn_out, _  = self.rnn(drop1)
        drop2 = self.out_drop(rnn_out)
        output = self.output(drop2).permute(0,2,1)
        return output

In [None]:
class VarDropout(nn.Module):
    def __init__(self, p=0.1):
        super(VarDropout, self).__init__()
        self.p = p

    def forward(self, input_sequence):
        rand_mask = torch.rand((input_sequence.shape[::2]),requires_grad=True, device="cuda")
        batch_mask = (rand_mask > self.p).int()
        expanded_mask = batch_mask.unsqueeze(1)
        full_mask = expanded_mask.repeat(1,input_sequence.shape[1],1)
        return (input_sequence*full_mask) * (1.0/(1.0-self.p))

In [None]:
#@title LM_LSTM_TWO
import pdb










class LM_LSTM_TWO(nn.Module):
    def __init__(self,
                 emb_size,
                 hidden_size,
                 output_size,
                 tie = True,
                 var_drop = True,
                 pad_index=0,
                 emb_dropout=0.1,
                 out_dropout=0.1,
                 n_layers=1):
        super(LM_LSTM_TWO, self).__init__()

        if tie:
          if hidden_size != emb_size:
            print("WARNING: hidden size and emb size not tied")
          hidden_size = emb_size



        self.embedding = nn.Embedding(output_size, emb_size, padding_idx=pad_index)

        self.rnn = nn.LSTM(emb_size, hidden_size, n_layers, bidirectional=False, batch_first=True)

        self.pad_token = pad_index

        # Linear layer to project the hidden layer to our output space
        self.decoder = nn.Linear(hidden_size, output_size)

        if tie:
          self.decoder.weight = self.embedding.weight


        if var_drop:
          self.drop_emb = VarDropout(p=emb_dropout)
          self.drop_dec = VarDropout(p=out_dropout)
        else:
          self.drop_emb = nn.Dropout(p=emb_dropout)
          self.drop_dec = nn.Dropout(p=out_dropout)

    def forward(self, input_sequence):
        embedded = self.embedding(input_sequence)

        dropped_1 = self.drop_emb(embedded)

        rnn_out, _  = self.rnn(dropped_1)

        dropped_2 = self.drop_dec(rnn_out)

        output = self.decoder(dropped_2).permute(0,2,1)

        return output

In [None]:
DEVICE = 'cuda:0' # it can be changed with 'cpu' if you do not have a gpu

In [None]:
#@title Corpus


# Loading the corpus

def read_file(path, eos_token="<eos>"):
    output = []
    with open(path, "r") as f:
        for line in f.readlines():
            output.append(line.strip() + " " + eos_token)
    return output

# Vocab with tokens to ids
def get_vocab(corpus, special_tokens=[]):
    output = {}
    i = 0
    for st in special_tokens:
        output[st] = i
        i += 1
    for sentence in corpus:
        for w in sentence.split():
            if w not in output:
                output[w] = i
                i += 1
    return output

In [None]:

train_raw = read_file("dataset/PennTreeBank/ptb.train.txt")
dev_raw = read_file("dataset/PennTreeBank/ptb.valid.txt")
test_raw = read_file("dataset/PennTreeBank/ptb.test.txt")


In [None]:
# Vocab is computed only on training set
# We add two special tokens end of sentence and padding
vocab = get_vocab(train_raw, ["<pad>", "<eos>"])

In [None]:
len(vocab)

10001

In [None]:
#@title Vocab


# This class computes and stores our vocab
# Word to ids and ids to word
class Lang():
    def __init__(self, corpus, special_tokens=[]):
        self.word2id = self.get_vocab(corpus, special_tokens)
        self.id2word = {v:k for k, v in self.word2id.items()}
    def get_vocab(self, corpus, special_tokens=[]):
        output = {}
        i = 0
        for st in special_tokens:
            output[st] = i
            i += 1
        for sentence in corpus:
            for w in sentence.split():
                if w not in output:
                    output[w] = i
                    i += 1
        return output


In [None]:
lang = Lang(train_raw, ["<pad>", "<eos>"])

In [None]:
#@title Dataset


import torch
import torch.utils.data as data

class PennTreeBank (data.Dataset):
    # Mandatory methods are __init__, __len__ and __getitem__
    def __init__(self, corpus, lang):
        self.source = []
        self.target = []

        for sentence in corpus:
            self.source.append(sentence.split()[0:-1]) # We get from the first token till the second-last token
            self.target.append(sentence.split()[1:]) # We get from the second token till the last token
            # See example in section 6.2

        self.source_ids = self.mapping_seq(self.source, lang)
        self.target_ids = self.mapping_seq(self.target, lang)

    def __len__(self):
        return len(self.source)

    def __getitem__(self, idx):
        src= torch.LongTensor(self.source_ids[idx])
        trg = torch.LongTensor(self.target_ids[idx])
        sample = {'source': src, 'target': trg}
        return sample

    # Auxiliary methods

    def mapping_seq(self, data, lang): # Map sequences of tokens to corresponding computed in Lang class
        res = []
        for seq in data:
            tmp_seq = []
            for x in seq:
                if x in lang.word2id:
                    tmp_seq.append(lang.word2id[x])
                else:
                    print('OOV found!')
                    print('You have to deal with that') # PennTreeBank doesn't have OOV but "Trust is good, control is better!"
                    break
            res.append(tmp_seq)
        return res

In [None]:
train_dataset = PennTreeBank(train_raw, lang)
dev_dataset = PennTreeBank(dev_raw, lang)
test_dataset = PennTreeBank(test_raw, lang)

In [None]:
#@title DataLoaders

from functools import partial
from torch.utils.data import DataLoader

def collate_fn(data, pad_token):
    def merge(sequences):
        '''
        merge from batch * sent_len to batch * max_len
        '''
        lengths = [len(seq) for seq in sequences]
        max_len = 1 if max(lengths)==0 else max(lengths)
        # Pad token is zero in our case
        # So we create a matrix full of PAD_TOKEN (i.e. 0) with the shape
        # batch_size X maximum length of a sequence
        padded_seqs = torch.LongTensor(len(sequences),max_len).fill_(pad_token)
        for i, seq in enumerate(sequences):
            end = lengths[i]
            padded_seqs[i, :end] = seq # We copy each sequence into the matrix
        padded_seqs = padded_seqs.detach()  # We remove these tensors from the computational graph
        return padded_seqs, lengths

    # Sort data by seq lengths

    data.sort(key=lambda x: len(x["source"]), reverse=True)
    new_item = {}
    for key in data[0].keys():
        new_item[key] = [d[key] for d in data]

    source, _ = merge(new_item["source"])
    target, lengths = merge(new_item["target"])

    new_item["source"] = source.to(DEVICE)
    new_item["target"] = target.to(DEVICE)
    new_item["number_tokens"] = sum(lengths)
    return new_item

# Dataloader instantiation
# You can reduce the batch_size if the GPU memory is not enough
train_loader = DataLoader(train_dataset, batch_size=256, collate_fn=partial(collate_fn, pad_token=lang.word2id["<pad>"]),  shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=1024, collate_fn=partial(collate_fn, pad_token=lang.word2id["<pad>"]))
test_loader = DataLoader(test_dataset, batch_size=1024, collate_fn=partial(collate_fn, pad_token=lang.word2id["<pad>"]))

In [None]:
#@title Loops and Init

import math
def train_loop(data, optimizer, criterion, model, clip=5):
    model.train()
    loss_array = []
    number_of_tokens = []

    for sample in data:

        optimizer.zero_grad() # Zeroing the gradient
        output = model(sample['source'])
        loss = criterion(output, sample['target'])
        loss_array.append(loss.item() * sample["number_tokens"])
        number_of_tokens.append(sample["number_tokens"])
        loss.backward() # Compute the gradient, deleting the computational graph
        # clip the gradient to avoid explosioning gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step() # Update the weights

    ppl = math.exp(sum(loss_array) / sum(number_of_tokens))
    loss_to_return = sum(loss_array)/sum(number_of_tokens)
    return ppl, loss_to_return

def eval_loop(data, eval_criterion, model):
    model.eval()
    loss_to_return = []
    loss_array = []
    number_of_tokens = []
    # softmax = nn.Softmax(dim=1) # Use Softmax if you need the actual probability
    with torch.no_grad(): # It used to avoid the creation of computational graph
        for sample in data:
            output = model(sample['source'])
            loss = eval_criterion(output, sample['target'])
            loss_array.append(loss.item())
            number_of_tokens.append(sample["number_tokens"])

    ppl = math.exp(sum(loss_array) / sum(number_of_tokens))
    loss_to_return = sum(loss_array) / sum(number_of_tokens)
    return ppl, loss_to_return

def init_weights(mat):
    for m in mat.modules():
        if type(m) in [nn.GRU, nn.LSTM, nn.RNN]:
            for name, param in m.named_parameters():
                if 'weight_ih' in name:
                    for idx in range(4):
                        mul = param.shape[0]//4
                        torch.nn.init.xavier_uniform_(param[idx*mul:(idx+1)*mul])
                elif 'weight_hh' in name:
                    for idx in range(4):
                        mul = param.shape[0]//4
                        torch.nn.init.orthogonal_(param[idx*mul:(idx+1)*mul])
                elif 'bias' in name:
                    param.data.fill_(0)
        else:
            if type(m) in [nn.Linear]:
                torch.nn.init.uniform_(m.weight, -0.01, 0.01)
                if m.bias != None:
                    m.bias.data.fill_(0.01)

In [None]:
#@title Single Experiment

import torch.optim as optim
import matplotlib.pyplot as plt
from tqdm import tqdm
import copy
from time import strftime, localtime, time
import os

emb_size = 200
hid_size = 200
lr = 0.001
clip = 5
n_layers = 1
emb_drop = 0.5
ins_drop = 0.1
out_drop = 0.1
tying = True
var_drop = True

device = 'cuda:0'

vocab_len = len(lang.word2id)

#model = LM_RNN(emb_size, hid_size, vocab_len, pad_index=lang.word2id["<pad>"]).to(device)
# model = LM_LSTM(emb_size, hid_size, vocab_len, pad_index=lang.word2id["<pad>"]).to(device)
# model = LM_LSTM_DROP( emb_size,
#                       hid_size,
#                       vocab_len,
#                       out_dropout=out_drop,
#                       ins_dropout=ins_drop,
#                       emb_dropout=emb_drop,
#                       n_layers=n_layers,
#                       pad_index=lang.word2id["<pad>"]).to(device)

model = LM_LSTM_TWO( emb_size,
                      hid_size,
                      vocab_len,
                      tie=tying,
                      out_dropout=out_drop,
                      ins_dropout=ins_drop,
                      emb_dropout=emb_drop,
                      n_layers=n_layers,
                      pad_index=lang.word2id["<pad>"]).to(device)

model.apply(init_weights)

#optimizer = optim.SGD(model.parameters(), lr=lr)
optimizer = optim.AdamW(model.parameters(), lr=lr)

criterion_train = nn.CrossEntropyLoss(ignore_index=lang.word2id["<pad>"])
criterion_eval = nn.CrossEntropyLoss(ignore_index=lang.word2id["<pad>"], reduction='sum')

# build run folder
run_name = strftime('run_%Y%m%d_%H%M%S', localtime(time()))
run_path = save_path + run_name + "/"
os.mkdir(run_path)

# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="NLU_assignment",
    name= run_name,
    config={
        "model":str(type(model).__name__),
        "lr":lr,
        "optim":str(type(optimizer).__name__),
        "clip":clip,
        "hid_size" : hid_size,
        "emb_size" : emb_size,
        "layers":n_layers,
        "tie":tying,
        "var_drop":var_drop,
        "dropout": [emb_drop, out_drop]

    }
)

EPOCHS = 100
PAT = 3
SAVE_RATE = 1
losses_train = []
losses_dev = []
sampled_epochs = []
best_ppl = math.inf
best_model = None
best_epoch = -1
pbar = tqdm(range(1,EPOCHS))


for epoch in pbar:
    ppl_train, loss = train_loop(train_loader, optimizer, criterion_train, model, clip)

    if epoch % 1 == 0:
        sampled_epochs.append(epoch)
        losses_train.append(np.asarray(loss).mean())

        ppl_dev, loss_dev = eval_loop(dev_loader, criterion_eval, model)
        losses_dev.append(np.asarray(loss_dev).mean())

        if  ppl_dev < best_ppl: # the lower, the better
            best_ppl = ppl_dev
            best_epoch = epoch
            patience = PAT
            best_model = copy.deepcopy(model).to('cpu')
        else:
            patience -= 1

        pbar.set_description("PPL: " + str(round(ppl_dev,2)) + " best: " + str(round(best_ppl,2))+ " P: " + str(patience))
        wandb.log({"ppl": ppl_dev,"ppl_train":ppl_train, "loss": loss_dev})

    if epoch % SAVE_RATE == 0:
      checkpoint_path = run_path + "epoch_" +( "0000" + str(epoch))[-4:] + ".pt"
      torch.save(model.state_dict(), checkpoint_path)

    if patience <= 0: # Early stopping with patience
          break # Not nice but it keeps the code clean



checkpoint_path = run_path + "epoch_" +( "0000" + str(epoch))[-4:] + ".pt"
torch.save(model.state_dict(), checkpoint_path)


best_model.to(device)
final_ppl,  _ = eval_loop(test_loader, criterion_eval, best_model)

print('Best ppl: ', best_ppl)
print('Test ppl: ', final_ppl)



TypeError: LM_LSTM_TWO.__init__() got an unexpected keyword argument 'ins_dropout'

In [21]:
#@title Multiple Experiments

import torch.optim as optim
import matplotlib.pyplot as plt
from tqdm import tqdm
import copy
from time import strftime, localtime, time
import os


defaults = { "emb_size" : 300,
          "hid_size" : 300,
          "lr" : 1.5,
          "clip" : 5,
          "n_layers" : 1,
          "emb_drop" : 0.1,
          "out_drop" : 0.1,
          "tying" : False,
          "var_drop" : False,
          "EPOCHS": 99,
          "OPT":"SGD"}

experiments = [

              {
                "lr":1.8,
                "emb_drop" : 0.5,
                "out_drop" : 0.5,
                "var_drop" : True
                },
                {
                "lr":2.1,
                "emb_drop" : 0.5,
                "out_drop" : 0.5,
                "var_drop" : True
                },
              {
                "emb_drop" : 0.1,
                "out_drop" : 0.1,
                "var_drop" : True
                },
                            {
                "emb_drop" : 0.25,
                "out_drop" : 0.0,
                "var_drop" : True
                },
              ]


for exp in experiments:

  args = defaults | exp

  print(args)

  emb_size = args["emb_size"]
  hid_size = args["hid_size"]
  lr = args["lr"]
  clip = args["clip"]
  n_layers = args["n_layers"]
  emb_drop = args["emb_drop"]
  out_drop = args["out_drop"]
  tying = args["tying"]
  var_drop = args["var_drop"]
  OPT = args["OPT"]
  device = 'cuda:0'

  vocab_len = len(lang.word2id)

  model = LM_LSTM_TWO( emb_size,
                        hid_size,
                        vocab_len,
                        tie=tying,
                        out_dropout=out_drop,
                        emb_dropout=emb_drop,
                        n_layers=n_layers,
                        var_drop=var_drop,
                        pad_index=lang.word2id["<pad>"]).to(device)

  model.apply(init_weights)

  if OPT == "SGD":
    print("using SGD")
    optimizer = optim.SGD(model.parameters(), lr=lr)
  else:
    print("using AdamW")
    optimizer = optim.AdamW(model.parameters(), lr=lr)

  criterion_train = nn.CrossEntropyLoss(ignore_index=lang.word2id["<pad>"])
  criterion_eval = nn.CrossEntropyLoss(ignore_index=lang.word2id["<pad>"], reduction='sum')

  # build run folder
  run_name = strftime('run_%Y%m%d_%H%M%S', localtime(time()))
  run_path = save_path + run_name + "/"
  os.mkdir(run_path)

  # start a new wandb run to track this script
  wandb.init(
      # set the wandb project where this run will be logged
      project="NLU_assignment",
      name= run_name,
      config={
          "model":str(type(model).__name__),
          "lr":lr,
          "optim":str(type(optimizer).__name__),
          "clip":clip,
          "hid_size" : hid_size,
          "emb_size" : emb_size,
          "layers":n_layers,
          "tie":tying,
          "var_drop":var_drop,
          "dropout": [emb_drop, out_drop]

      }
  )

  EPOCHS = args["EPOCHS"]
  PAT = 3
  SAVE_RATE = 1
  losses_train = []
  losses_dev = []
  sampled_epochs = []
  best_ppl = math.inf
  best_model = None
  best_epoch = -1
  pbar = tqdm(range(1,EPOCHS))


  for epoch in pbar:
      ppl_train, loss = train_loop(train_loader, optimizer, criterion_train, model, clip)

      if epoch % 1 == 0:
          sampled_epochs.append(epoch)
          losses_train.append(np.asarray(loss).mean())

          ppl_dev, loss_dev = eval_loop(dev_loader, criterion_eval, model)
          losses_dev.append(np.asarray(loss_dev).mean())

          if  ppl_dev < best_ppl: # the lower, the better
              best_ppl = ppl_dev
              best_epoch = epoch
              patience = PAT
              best_model = copy.deepcopy(model).to('cpu')
          else:
              patience -= 1

          pbar.set_description("PPL: " + str(round(ppl_dev,2)) + " best: " + str(round(best_ppl,2))+ " P: " + str(patience))
          wandb.log({"ppl": ppl_dev,"ppl_train":ppl_train, "loss": loss_dev})

      if epoch % SAVE_RATE == 0:
        checkpoint_path = run_path + "epoch_" +( "0000" + str(epoch))[-4:] + ".pt"
        torch.save(model.state_dict(), checkpoint_path)

      if patience <= 0: # Early stopping with patience
            break # Not nice but it keeps the code clean



  checkpoint_path = run_path + "epoch_" +( "0000" + str(epoch))[-4:] + ".pt"
  torch.save(model.state_dict(), checkpoint_path)


  best_model.to(device)
  final_ppl,  _ = eval_loop(test_loader, criterion_eval, best_model)

  print('Best ppl: ', best_ppl)
  print('Test ppl: ', final_ppl)

from google.colab import runtime
runtime.unassign()

{'emb_size': 300, 'hid_size': 300, 'lr': 1.8, 'clip': 5, 'n_layers': 1, 'emb_drop': 0.5, 'out_drop': 0.5, 'tying': False, 'var_drop': True, 'EPOCHS': 99, 'OPT': 'SGD'}
using SGD


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█▆▆▅▅▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
ppl,█▅▅▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
ppl_train,█▄▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
loss,5.45467
ppl,233.8476
ppl_train,205.82501


PPL: 211.34 best: 211.34 P: 3:  70%|███████   | 69/98 [34:37<14:33, 30.11s/it]


KeyboardInterrupt: 

In [None]:

# To load the model you need to initialize it
print('Best ppl: ', best_ppl)
test_ep = best_epoch

if test_ep == -1:
  paths = os.listdir(run_path)
  paths.sort()
  checkpoint_path = run_path + paths[-1]
else:
  checkpoint_path = run_path + "epoch_" +( "0000" + str(test_ep))[-4:] + ".pt"

model.load_state_dict(torch.load(checkpoint_path))
model.to(DEVICE)
final_ppl,  _ = eval_loop(test_loader, criterion_eval, model)
print('Test ppl: ', final_ppl)


# Mandatory Exam Exercise
## Part 1 (4 points)
In this, you have to modify the baseline LM_RNN by adding a set of techniques that might improve the performance. In this, you have to add one modification at a time incrementally. If adding a modification decreases the performance, you can remove it and move forward with the others. However, in the report, you have to provide and comment on this unsuccessful experiment.  For each of your experiments, you have to print the performance expressed with Perplexity (PPL).
<br>
One of the important tasks of training a neural network is  hyperparameter optimization. Thus, you have to play with the hyperparameters to minimise the PPL and thus print the results achieved with the best configuration (in particular <b>the learning rate</b>).
These are two links to the state-of-the-art papers which use vanilla RNN [paper1](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5947611), [paper2](https://www.fit.vutbr.cz/research/groups/speech/publi/2010/mikolov_interspeech2010_IS100722.pdf).

**Mandatory requirements**: For the following experiments the perplexity must be below 250 (***PPL < 250***).

1. Replace RNN with a Long-Short Term Memory (LSTM) network --> [link](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html)
2. Add two dropout layers: --> [link](https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html)
    - one after the embedding layer,
    - one before the last linear layer
3. Replace SGD with AdamW --> [link](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html)

## Part 2 (11 points)
**Mandatory requirements**: For the following experiments the perplexity must be below 250 (***PPL < 250***) and it should be lower than the one achieved in Part 1.1 (i.e. base LSTM).

Starting from the `LM_RNN` in which you replaced the RNN with a LSTM model, apply the following regularisation techniques:
- Weight Tying
- Variational Dropout (no DropConnect)
- Non-monotonically Triggered AvSGD

These techniques are described in [this paper](https://openreview.net/pdf?id=SyyGPP0TZ).
