This notebook was inspired by the following repositories:
- [02456 Deep Learning @ DTU - Technical University of Denmark - awd-inspired-lstm](https://github.com/mikkelbrusen/awd-inspired-lstm)
- [pytorch/examples](https://github.com/pytorch/examples)
- [AWD-LSTM paper original code](https://github.com/salesforce/awd-lstm-lm)

All the code has been written by hand and tested thoroughly.

##### google colab

In [None]:
# Colab-only cell: copy the PTB archive from Google Drive into the runtime
# and unpack it in the current working directory.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# copy the archive out of Drive before unmounting
!cp '/content/drive/My Drive/ptbdataset.zip' '/content'
drive.flush_and_unmount()

# extracts the ptb.*.txt files that Corpus opens by relative path
!unzip ptbdataset.zip

Mounted at /content/drive
Archive:  ptbdataset.zip
  inflating: README                  
  inflating: ptb.char.test.txt       
  inflating: ptb.char.train.txt      
  inflating: ptb.char.valid.txt      
  inflating: ptb.test.txt            
  inflating: ptb.train.txt           
  inflating: ptb.valid.txt           


##### imports & params





In [None]:
import argparse
import time
import math
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from collections import Counter
from tqdm.notebook import trange, tqdm

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Device selection: run on the GPU when one is visible, otherwise on the CPU.
_cuda = torch.cuda.is_available()
_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#################
# hyperparameters
#################

_train_batch_size = 20  # number of parallel token streams produced by batchify for training
_bptt = 70  # sequence length - backpropagation through time window
_embed_size = 400  # embedding width (also the last LSTM layer's width when weights are tied)
_hidden_size = 1150  # width of the remaining LSTM layers
_num_layers = 3  # number of stacked single-layer LSTMs

_num_epochs = 50
_learning_rate = 30  # learning rate passed to SGD and, after the switch, to ASGD
# non-monotone interval: the training loop switches from SGD to ASGD once the
# current validation loss is worse than the best loss recorded more than
# `_ntasgd_n` epochs ago | 5
_ntasgd_n = 5

_lock_dropout_input = 0.4  # locked dropout prob. on input | 0.4
_lock_dropout_hidden = 0.25  # locked dropout prob. on hidden | 0.25
_lock_dropout_output = 0.4  # locked dropout prob. on output | 0.4
_embedding_dropout= 0.1  # embedding dropout prob. | 0.1
_weight_dropout = 0.5  # dropout prob. applied to the hidden-to-hidden LSTM weights (WeightDropout) | 0.5
_gradient_clip = 0.25  # max gradient norm passed to clip_grad_norm_ | 0.25
_log_interval = 100  # number of batches between progress-bar statistic updates

_tie_weights = True  # tie embedding and softmax weight together or not

_seed = 1  # fix seed for reproducibility
np.random.seed(_seed)
torch.manual_seed(_seed)
if _cuda:
  torch.cuda.manual_seed(_seed)

_save = "model_long_run.pt"  # name of the saved best model

##### load data

In [None]:
class Dictionary(object):
  """Vocabulary mapping words to consecutive integer ids and back.

  Attributes:
    word2idx: dict from word to its id.
    idx2word: list where position ``i`` holds the word with id ``i``.
    total: number of ``add_word`` calls so far (i.e. tokens seen,
      duplicates included).
  """

  def __init__(self):
    self.word2idx = {}
    self.idx2word = []
    self.total = 0

  def add_word(self, word):
    """Register ``word`` (if new) and return its id.

    Ids are handed out in first-seen order starting at 0. Every call,
    including calls for already-known words, increments ``total``.
    """
    token_id = self.word2idx.get(word)
    if token_id is None:
      # first occurrence: the next free id is the current vocabulary size
      token_id = len(self.idx2word)
      self.idx2word.append(word)
      self.word2idx[word] = token_id
    self.total += 1
    return token_id

  def __len__(self):
    """Return the number of distinct words."""
    return len(self.idx2word)


class Corpus(object):
  """Tokenised train/valid/test splits sharing one ``Dictionary``.

  Args:
    data_dir: directory containing ``ptb.train.txt``, ``ptb.valid.txt`` and
      ``ptb.test.txt``. Defaults to the current working directory, which
      matches the previous hard-coded behaviour.

  Attributes:
    dictionary: vocabulary built while tokenising (train first, then
      valid, then test).
    train, valid, test: 1-D ``torch.long`` tensors of token ids.
  """

  def __init__(self, data_dir='.'):
    self.dictionary = Dictionary()
    self.train = self.tokenize(os.path.join(data_dir, 'ptb.train.txt'))
    self.valid = self.tokenize(os.path.join(data_dir, 'ptb.valid.txt'))
    self.test = self.tokenize(os.path.join(data_dir, 'ptb.test.txt'))

  def tokenize(self, path):
    """Read ``path`` and return its token ids as a 1-D ``torch.long`` tensor.

    Each line is split on whitespace and terminated with an ``'<eos>'``
    token. Unknown words are added to ``self.dictionary`` on the fly, so the
    file only has to be read once.
    """
    ids = []
    with open(path, 'r', encoding='utf-8') as f:
      for line in f:
        # add_word returns the id of both new and already-known words
        ids.extend(self.dictionary.add_word(word)
                   for word in line.split() + ['<eos>'])
    return torch.tensor(ids, dtype=torch.long)

##### utils

In [None]:
# Cut the hidden state out of the autograd graph so that backpropagation
# stops at the current window instead of reaching the start of the dataset.
def detach_hidden(h):
  """Return ``h`` detached from its graph.

  A tensor is detached directly; any other iterable is walked recursively
  and returned as a (nested) tuple of detached tensors.
  """
  if not isinstance(h, torch.Tensor):
    return tuple(map(detach_hidden, h))
  return h.detach()


# https://pytorch.org/tutorials/beginner/transformer_tutorial.html
# Arrange a flat token stream into `batch_size` parallel columns.
def batchify(data, batch_size):
  """Reshape 1-D ``data`` to ``(rows, batch_size)`` and move it to ``_device``.

  Trailing elements that do not fill a complete row are dropped. Each
  column holds one contiguous slice of the original stream.
  """
  rows = data.size(0) // batch_size
  usable = rows * batch_size  # drop the remainder that would not fit cleanly
  columns = data[:usable].view(batch_size, rows).t().contiguous()
  return columns.to(_device)


def get_batch(source, i):
  """Return the ``(input, target)`` pair for the window starting at row ``i``.

  The window is at most ``_bptt`` rows long and is shortened near the end so
  that a target exists for every input row. Targets are the inputs shifted
  by one row, flattened to 1-D.
  """
  remaining = len(source) - 1 - i
  window = _bptt if _bptt < remaining else remaining
  stop = i + window
  inputs = source[i:stop]
  labels = source[i + 1:stop + 1].reshape(-1)
  return inputs, labels


def model_save(fn):
  """Write the global ``model``'s state dict to file ``fn``.

  The checkpoint is a dict with the single key ``'state_dict'``.
  """
  with open(fn, 'wb') as handle:
    checkpoint = {'state_dict': model.state_dict()}
    torch.save(checkpoint, handle)


def model_load(fn):
  """Load the checkpoint at ``fn`` into the global ``model`` in place.

  Expects the format written by ``model_save``: a dict whose
  ``'state_dict'`` entry holds the parameters. Tensors are remapped onto
  ``_device`` while loading, so a checkpoint saved on a GPU runtime can be
  restored on a CPU-only one (and vice versa).
  """
  with open(fn, 'rb') as f:
    checkpoint = torch.load(f, map_location=_device)
  # `model` is only mutated, never rebound, so no `global` declaration is needed
  model.load_state_dict(checkpoint['state_dict'])

##### load and divide data

In [None]:
# batch sizes (number of parallel streams) used for the validation and test splits
eval_batch_size = 10
test_batch_size = 10

# builds the shared vocabulary and tokenises the three PTB splits
corpus = Corpus()

# reshape each flat token stream into (rows, batch_size) and move it to _device
train_data = batchify(corpus.train, _train_batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, test_batch_size)

##### functions

**embedding dropout**

equivalent to performing dropout on the embedding matrix at a word level

remaining non-dropped word embeddings are scaled by $\frac{1}{1-p_e}$, where $p_e$ is the probability of dropout

In [None]:
def embedded_dropout(embed, words, dropout=0.1):
  """Look up ``words`` in ``embed`` with whole-word dropout on the weights.

  One Bernoulli sample is drawn per vocabulary row: a dropped row is zeroed
  for every occurrence of that word in ``words``, a kept row is scaled by
  ``1 / (1 - dropout)``.

  Args:
    embed: an ``nn.Embedding`` whose weight and lookup options are used.
    words: integer tensor of token ids.
    dropout: probability of dropping a row; a falsy value disables the
      mask entirely (no random numbers are drawn).

  Returns:
    Tensor of shape ``words.shape + (embedding_dim,)``.
  """
  weight = embed.weight
  if dropout:
    keep_prob = 1 - dropout
    # one keep/drop decision per vocabulary row, broadcast over the embedding dim
    mask = weight.new_empty((weight.size(0), 1)).bernoulli_(keep_prob)
    weight = mask.expand_as(weight) / keep_prob * weight

  # Pass padding_idx through unchanged. Substituting -1 for None is wrong:
  # F.embedding interprets a negative index relative to the end of the
  # table, so -1 would mark the LAST vocabulary row as padding and block
  # its gradient.
  return torch.nn.functional.embedding(
    words, weight, embed.padding_idx, embed.max_norm, embed.norm_type,
    embed.scale_grad_by_freq, embed.sparse)

**weight dropout**

apply dropout to the weights rather than to the activations

In [None]:
class WeightDropout(nn.Module):
  """Wrap ``module`` and apply dropout to selected weights on every forward.

  Each weight named in ``weights`` is re-registered on the wrapped module as
  the trainable parameter ``<name>_raw``. Before each forward pass the
  attribute ``<name>`` is rebuilt from the raw parameter as a plain tensor
  (with dropout in training mode, a copy otherwise), so gradients flow back
  to ``<name>_raw``.

  Args:
    module: the module to wrap (e.g. an ``nn.LSTM``).
    weights: list of parameter names of ``module`` to drop, e.g.
      ``['weight_hh_l0']``.
    dropout: dropout probability applied to those weights while training.
  """

  def __init__(self, module, weights, dropout=0):
    super(WeightDropout, self).__init__()
    self.module = module
    self.weights = weights
    self.dropout = dropout
    self._setup()

  @staticmethod
  def dummy(*args, **kwargs):
    """No-op stand-in for ``RNNBase.flatten_parameters``."""
    return

  def _setup(self):
    """Move every targeted weight to its ``<name>_raw`` parameter."""
    if isinstance(self.module, torch.nn.RNNBase):
      # the dropped weights are rebuilt every step and cannot stay part of
      # one flattened buffer, so disable flattening on the wrapped RNN
      self.module.flatten_parameters = self.dummy

    for name_w in self.weights:
      print(f'apply weight drop of {self.dropout} to {name_w}')
      w = getattr(self.module, name_w)
      del self.module._parameters[name_w]
      self.module.register_parameter(name_w + '_raw', nn.Parameter(w.data))

  def _setweights(self):
    """Recompute each dropped weight from its raw parameter."""
    for name_w in self.weights:
      raw_w = getattr(self.module, name_w + '_raw')
      w = torch.nn.functional.dropout(raw_w, p=self.dropout, training=self.training)
      if isinstance(w, nn.Parameter):
        # dropout was a no-op (eval mode or p == 0) and returned the raw
        # parameter itself; assigning a Parameter would register it a second
        # time under `name_w`, so use a differentiable copy instead
        w = w.clone()
      # Do NOT wrap `w` in nn.Parameter: that creates a fresh leaf tensor and
      # cuts the graph, so `<name>_raw` would never receive a gradient.
      self.module._parameters.pop(name_w, None)
      setattr(self.module, name_w, w)

  def forward(self, *args, **kwargs):
    """Refresh the dropped weights, then run the wrapped module."""
    self._setweights()
    return self.module(*args, **kwargs)

**locked dropout**

sample the binary mask once per forward call, then reuse that (locked) mask for the whole sequence

used to apply the same dropout mask every time step

In [None]:
class LockedDropout(nn.Module):
  """Dropout that shares a single mask along the first dimension of ``x``."""

  def __init__(self):
    super().__init__()

  def forward(self, x, dropout=0.5):
    """Apply the shared mask to 3-D ``x``.

    In eval mode, or when ``dropout`` is falsy, ``x`` is returned untouched.
    Otherwise a mask of shape ``(1, x.size(1), x.size(2))`` is sampled,
    scaled by ``1 / (1 - dropout)`` and broadcast over dimension 0.
    """
    if self.training and dropout:
      keep_prob = 1 - dropout
      sample = x.new_empty((1, x.size(1), x.size(2))).bernoulli_(keep_prob)
      scaled = sample / keep_prob
      return scaled.expand_as(x) * x
    return x

**LSTM module**

In [None]:
class LSTMModel(nn.Module):
  """Embedding -> stack of single-layer LSTMs -> linear decoder.

  Args:
    num_tokens: vocabulary size of the embedding.
    hidden_size: width of every LSTM layer except (when tying) the last.
    embed_size: embedding width; also the last layer's width when
      ``tie_weights`` is set.
    output_size: number of decoder outputs.
    dropout: locked-dropout probability on the final LSTM output.
    n_layers: number of stacked LSTMs.
    wdrop: if truthy, wrap each LSTM in ``WeightDropout`` on
      ``weight_hh_l0`` with this probability.
    dropouth: locked-dropout probability between LSTM layers.
    dropouti: locked-dropout probability on the embedded input.
    dropoute: embedding (word-level) dropout probability.
    tie_weights: share the decoder weight with the embedding weight.
  """

  def __init__(self, num_tokens, hidden_size, embed_size, output_size, dropout=0.5, n_layers=1, wdrop=0, dropouth=0.5, dropouti=0.5, dropoute=0.1, tie_weights=False):
    super(LSTMModel, self).__init__()
    self.embed_size = embed_size
    self.hidden_size = hidden_size
    self.n_layers = n_layers

    self.tie_weights = tie_weights
    self.lockdrop = LockedDropout()
    self.dropouti = dropouti
    self.dropouth = dropouth
    self.dropoute = dropoute
    self.dropout = dropout
    self.encoder = nn.Embedding(num_tokens, embed_size)

    # with tied weights the last layer must emit embed_size features so the
    # decoder can reuse the embedding matrix
    top_size = embed_size if tie_weights else hidden_size
    layers = []
    for layer_idx in range(n_layers):
      in_size = hidden_size if layer_idx else embed_size
      out_size = top_size if layer_idx == n_layers - 1 else hidden_size
      layers.append(nn.LSTM(in_size, out_size, num_layers=1))
    if wdrop:
      layers = [WeightDropout(lstm, ['weight_hh_l0'], dropout=wdrop) for lstm in layers]
    self.lstms = nn.ModuleList(layers)

    self.decoder = nn.Linear(top_size, output_size)

    if tie_weights:
      self.decoder.weight = self.encoder.weight

    initrange = 0.1
    self.encoder.weight.data.uniform_(-initrange, initrange)
    self.decoder.bias.data.zero_()
    self.decoder.weight.data.uniform_(-initrange, initrange)

  def forward(self, inp, hidden):
    """Run the network on ``inp`` starting from ``hidden``.

    Args:
      inp: integer tensor of token ids (presumably ``(seq_len, batch)`` as
        produced by ``get_batch`` -- confirm against callers).
      hidden: per-layer sequence of ``(h, c)`` states, as returned by
        ``init_hidden`` or by a previous call.

    Returns:
      ``(decoded, new_hidden)``: decoder outputs shaped
      ``(output.size(0), output.size(1), output_size)`` and the list of
      per-layer ``(h, c)`` states after this call.
    """
    embed_p = self.dropoute if self.training else 0
    output = embedded_dropout(self.encoder, inp, dropout=embed_p)
    output = self.lockdrop(output, self.dropouti)

    last_layer = self.n_layers - 1
    new_hidden = []
    for i, lstm in enumerate(self.lstms):
      output, layer_state = lstm(output, hidden[i])
      new_hidden.append(layer_state)
      # hidden-to-hidden dropout only between layers, not after the last one
      if i != last_layer:
        output = self.lockdrop(output, self.dropouth)

    output = self.lockdrop(output, self.dropout)

    dim0 = output.size(0)
    dim1 = output.size(1)
    flat = output.view(dim0 * dim1, output.size(2))
    decoded = self.decoder(flat)
    return decoded.view(dim0, dim1, decoded.size(1)), new_hidden

  def init_hidden(self,bsz):
    """Return zeroed ``(h, c)`` pairs, one per layer, for batch size ``bsz``.

    Tensors take dtype and device from the model's first parameter and have
    shape ``(1, bsz, layer_width)``.
    """
    weight = next(self.parameters()).data
    top_size = self.embed_size if self.tie_weights else self.hidden_size
    states = []
    for l in range(self.n_layers):
      width = top_size if l == self.n_layers - 1 else self.hidden_size
      states.append((weight.new_zeros(1, bsz, width),
                     weight.new_zeros(1, bsz, width)))
    return states

evaluate

In [None]:
def evaluate(data_source, name='train'):
  """Return the average per-position loss of the global ``model`` on ``data_source``.

  Args:
    data_source: batchified tensor ``(rows, batch)`` as produced by
      ``batchify``.
    name: label shown in the progress-bar description.

  Returns:
    Sum over windows of ``window_length * criterion(...)`` divided by
    ``len(data_source) - 1``.
  """
  # Turn on evaluation mode which disables dropout.
  model.eval()
  total_loss = 0.
  ntokens = len(corpus.dictionary)
  # size the hidden state from the data itself rather than from a global
  # batch-size constant, so any batchified split can be evaluated
  hidden = model.init_hidden(data_source.size(1))
  with torch.no_grad():
    with tqdm(enumerate(range(0, data_source.size(0) - 1, _bptt)), desc=f'{name} evaluation', total=(len(data_source)//_bptt)+1) as tqdm_:
      for batch, i in tqdm_:
        data, targets = get_batch(data_source, i)
        output, hidden = model(data, hidden)
        output_flat = output.view(-1, ntokens)
        # criterion averages over the window, so weight by its length
        total_loss += len(data) * criterion(output_flat, targets).item()
        hidden = detach_hidden(hidden)
        # refresh the running perplexity every _log_interval windows (same
        # schedule as train()), normalised by the rows processed so far
        if batch % _log_interval == 0 and batch > 0:
          tqdm_.set_postfix({'ppl':math.exp(total_loss / (i + len(data)))})
  return total_loss / (len(data_source) - 1)

train

In [None]:
def train(epoch):
  """Run one training epoch over the global ``train_data``.

  Uses the global ``model``, ``optimizer`` and ``criterion``. ``epoch`` is
  only used to label the progress bar.
  """
  # Turn on training mode which enables dropout.
  model.train()
  running_loss = 0.
  vocab_size = len(corpus.dictionary)
  hidden = model.init_hidden(_train_batch_size)
  offsets = range(0, train_data.size(0) - 1, _bptt)
  n_steps = (len(train_data)//_bptt)+1
  with tqdm(enumerate(offsets), desc=f'epoch: {epoch}', total=n_steps) as progress:
    for step, offset in progress:
      inputs, labels = get_batch(train_data, offset)
      # carry the state over from the previous window, but without its graph
      hidden = detach_hidden(hidden)
      optimizer.zero_grad()
      logits, hidden = model(inputs, hidden)

      loss = criterion(logits.view(-1, vocab_size), labels)
      loss.backward()

      # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
      torch.nn.utils.clip_grad_norm_(model.parameters(), _gradient_clip)
      optimizer.step()

      running_loss += loss.item()

      # every _log_interval steps: show the mean loss and reset the accumulator
      if step > 0 and step % _log_interval == 0:
        mean_loss = running_loss / _log_interval
        progress.set_postfix({'lr':_learning_rate, 'loss':mean_loss, 'ppl':math.exp(mean_loss)})
        running_loss = 0

##### define model and loss function

In [None]:
# vocabulary size: used for both the embedding table and the decoder output
ntokens = len(corpus.dictionary)

# keyword arguments make the mapping from hyperparameter to constructor
# parameter explicit
model = LSTMModel(
  num_tokens=ntokens,
  hidden_size=_hidden_size,
  embed_size=_embed_size,
  output_size=ntokens,
  dropout=_lock_dropout_output,
  n_layers=_num_layers,
  wdrop=_weight_dropout,
  dropouth=_lock_dropout_hidden,
  dropouti=_lock_dropout_input,
  dropoute=_embedding_dropout,
  tie_weights=_tie_weights,
).to(_device)
criterion = nn.CrossEntropyLoss()

apply weight drop of 0.5 to weight_hh_l0
apply weight drop of 0.5 to weight_hh_l0
apply weight drop of 0.5 to weight_hh_l0


##### do actual training

In [None]:
# Training driver: plain SGD first, then a one-way switch to averaged SGD
# (ASGD) once the validation loss stops improving. The checkpoint with the
# best validation loss is written to `_save`.
best_val_loss = float('inf')
stored_losses = []  # validation loss of every finished epoch

optimizer = torch.optim.SGD(model.parameters(), lr=_learning_rate)
for epoch in range(1, _num_epochs+1):
  train(epoch)
  if 't0' in optimizer.param_groups[0]:  # if ASGD
    # Evaluate (and possibly save) with the averaged weights `ax` kept by
    # ASGD, then restore the raw weights so training continues from them.
    tmp = {}
    for prm in model.parameters():
      tmp[prm] = prm.data.clone()
      if 'ax' in optimizer.state[prm]:
        prm.data = optimizer.state[prm]['ax'].clone()

    val_loss = evaluate(val_data, 'valid')

    if val_loss < best_val_loss:
      print('new best validation result - saving model')
      model_save(_save)
      best_val_loss = val_loss

    for prm in model.parameters():
      if prm in tmp:
        prm.data = tmp[prm].clone()

  else:
    val_loss = evaluate(val_data, 'valid')

    if val_loss < best_val_loss:
      print('new best validation result - saving model')
      model_save(_save)
      best_val_loss = val_loss

    # non-monotone trigger: the current loss is worse than the best loss
    # recorded more than `_ntasgd_n` epochs ago
    elif len(stored_losses) > _ntasgd_n and val_loss > min(stored_losses[:-_ntasgd_n]):
      print('switching to ASGD')
      optimizer = torch.optim.ASGD(model.parameters(), lr=_learning_rate, t0=0, lambd=0.)

  stored_losses.append(val_loss)

epoch: 1:   0%|          | 0/664 [00:00<?, ?it/s]

valid evaluation:   0%|          | 0/106 [00:00<?, ?it/s]

epoch: 2:   0%|          | 0/664 [00:00<?, ?it/s]

valid evaluation:   0%|          | 0/106 [00:00<?, ?it/s]

load the best saved model and run it on the test data

In [None]:
# Load the best saved model.
model_load(_save)

test_loss = evaluate(test_data)
print(f'test ppl based on best validation results: {math.exp(test_loss)}')

In [None]:
# Persist the per-epoch validation losses next to the model checkpoint.
losses_array = np.array(stored_losses)
with open('model_long_run.npy', 'wb') as losses_file:
    np.save(losses_file, losses_array)

In [None]:
# Colab-only cell: copy the saved checkpoint and the loss history back to
# Google Drive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

!cp  '/content/model_long_run.pt' '/content/drive/My Drive/'
!cp  '/content/model_long_run.npy' '/content/drive/My Drive/'

# flush pending writes to Drive before the mount goes away
drive.flush_and_unmount()

In [None]:
# Read the stored validation losses back as a list of NumPy scalars.
with open('model_long_run.npy', 'rb') as losses_file:
  loaded_losses = np.load(losses_file)
  temp = [*loaded_losses]

---