# Recurrent Language Model

<sup> Partly adapted from https://github.com/pytorch/examples/tree/master/word_language_model </sup>

Before we begin, let's make sure we have all the saved files this notebook needs. Please run the cells below to download them.

In [None]:
# Shell escape: run the project's download script to fetch the saved files used below.
! bash pyfiles/download_reqs.sh

In [None]:
import os

# Directory with the helper modules; it is appended to sys.path before they are imported.
path_to_utils = 'pyfiles'

# Dataset tag: used in saved-file names and to pick hyper-parameter overrides.
dataset = 'amazon'

# Training / validation text files, joined onto github_repo_path.
github_repo_path = '../../'
path_to_train_data = os.path.join(github_repo_path, 'data/train.txt')
path_to_valid_data = os.path.join(github_repo_path, 'data/valid.txt')

# Directory holding the pickled corpus and the model checkpoints.
path_to_saved_models = 'saved_models'

In [None]:

import sys

sys.path.append(path_to_utils)

import loading_text_and_tokenization
import torch
import numpy as np
import torch.nn as nn
import random
import math
import torch.nn.functional as F

import pickle

In [None]:
# Ask for the GPU, but fall back to the CPU when CUDA is not available so the
# notebook still runs on machines without one (previously "cuda" was forced
# and the first `.to(device)` call failed there).
USE_CUDA = True
device = torch.device("cuda" if USE_CUDA and torch.cuda.is_available() else "cpu")

In [None]:
# Number of parallel token streams (columns) per batch; used for both training and validation.
batch_size = 64

### Loading Data and turning into batches

In [None]:
# Pickled cache of the tokenized corpus, so the raw text is parsed only once.
corpus_filename = os.path.join(path_to_saved_models, dataset+'_obj.p')

# Set to True to ignore the cache and rebuild the Corpus from the text files.
create_dataset_obj_again = False
if os.path.exists(corpus_filename) and (not create_dataset_obj_again):
    # NOTE(security): unpickling executes code stored in the file. Only load
    # caches that were produced locally or come from a trusted source.
    with open(corpus_filename, "rb") as corpus_file:
        corpus = pickle.load(corpus_file)
else:
    corpus = loading_text_and_tokenization.Corpus(path_to_train_data,
                                                  path_to_valid_data)
    # Make sure the target directory exists, and close the file handle
    # deterministically so the cache is fully flushed to disk.
    os.makedirs(path_to_saved_models, exist_ok=True)
    with open(corpus_filename, "wb") as corpus_file:
        pickle.dump(corpus, corpus_file)

### Aside: torch.Tensor.narrow

In [None]:
# Small 1-D tensor holding 0..9, used to illustrate narrow().
dummy_tensor = torch.arange(0, 10);
print(dummy_tensor)

In [None]:
# Along dim 0, start at index 1 and take 5 entries: same as dummy_tensor[1:6].
dummy_tensor.narrow(0, 1, 5)

In [None]:
# Along dim 0, start at index 5 and take 4 entries: same as dummy_tensor[5:9].
dummy_tensor.narrow(0, 5, 4)

All we need to understand is that .narrow() is just a way to do indexing. When we do dummy_tensor.narrow(0, i, j) we are indexing dummy_tensor[i:i+j]

In [None]:
def batchify(data, bsz, random_start_idx=False):
    """Reshape a 1-D stream into ``bsz`` parallel columns.

    The stream is trimmed to a multiple of ``bsz`` (so no padding is needed),
    cut into ``bsz`` contiguous pieces, and laid out so that each column of
    the result is one of those pieces.

    Args:
        data: 1-D tensor (e.g. token ids) indexed along dimension 0.
        bsz: number of columns (parallel streams) in the result.
        random_start_idx: when True, skip a random number of leading elements
            instead of always starting at 0. The skip is at most the number
            of leftover elements, so the trimmed window always fits.

    Returns:
        Tensor of shape ``(data.size(0) // bsz, bsz)`` moved to the
        module-level ``device``.
    """
    # Number of rows that fit cleanly, and how many elements are left over.
    nbatch = data.size(0) // bsz
    leftover = data.size(0) - nbatch * bsz

    if random_start_idx:
        # random.randint is inclusive on both ends and every offset in
        # [0, leftover] keeps the window inside `data`. The previous upper
        # bound `leftover - 1` raised ValueError whenever the length was an
        # exact multiple of bsz (randint(0, -1)).
        start_idx = random.randint(0, leftover)
    else:
        start_idx = 0

    # Trim off the elements that would not fit; every column then has the
    # same length.
    data = data.narrow(0, start_idx, nbatch * bsz)

    # Evenly divide the data across the bsz columns.
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)


In [None]:

def get_batch(source, i, max_seq_len):
    """Cut one (input, target) chunk out of a batchified tensor.

    The chunk starts at row ``i`` and spans at most ``max_seq_len`` rows; it
    is shortened near the end so that a target row exists for every input row.

    Returns:
        ``(data, target)`` where ``data`` is ``source[i:i+seq_len]`` and
        ``target`` is the same window shifted down by one row and flattened.
    """
    rows_left = len(source) - 1 - i
    seq_len = max_seq_len if max_seq_len < rows_left else rows_left
    stop = i + seq_len
    inputs = source[i:stop]
    shifted = source[i + 1:stop + 1]
    return inputs, shifted.view(-1)


### Train Step

In [None]:
# Maximum total gradient norm passed to clip_grad_norm_ before each update.
clip = 0.25
# Training statistics are printed every `log_interval` batches.
log_interval = 200
# Cross-entropy on the raw (un-normalised) decoder scores.
criterion = nn.CrossEntropyLoss()

def repackage_hidden(h):
    """
        Detach hidden state(s) from the graph that produced them.

        A tensor is returned detached; any other iterable is processed
        element by element and returned as a tuple.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()
    
def train_step(model, lr, epoch):
    """Train ``model`` for one pass over the training data with plain SGD.

    Uses the module-level ``corpus``, ``batch_size``, ``max_seq_len``,
    ``vocab_size``, ``criterion``, ``clip`` and ``log_interval``.

    Args:
        model: module exposing ``init_hidden(bsz)`` and
            ``forward(data, hidden) -> (output, hidden)``.
        lr: learning rate of the manual SGD update.
        epoch: epoch number, used only in the progress printout.

    Returns:
        The same ``model`` object, updated in place.
    """
    model.train()
    total_loss = 0.
    hidden = model.init_hidden(batch_size)

    # Re-batch every epoch with a random start offset, so the chunk
    # boundaries differ between epochs (the token order is not shuffled).
    train_data = batchify(corpus.train, batch_size, random_start_idx=True)

    for batch, i in enumerate(range(0, train_data.size(0) - 1, max_seq_len)):
        data, targets = get_batch(train_data, i, max_seq_len)

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()

        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, vocab_size), targets)
        loss.backward()

        # `clip_grad_norm_` helps prevent the exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        # Manual SGD step: p <- p - lr * grad. Uses the keyword form
        # add_(tensor, alpha=...) instead of the deprecated
        # add_(number, tensor) overload; no_grad keeps the update out of
        # the autograd graph.
        with torch.no_grad():
            for p in model.parameters():
                # Parameters not used in the forward pass have no gradient.
                if p.grad is not None:
                    p.add_(p.grad, alpha=-lr)

        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval

            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // max_seq_len, lr,
                cur_loss, math.exp(cur_loss)))
            total_loss = 0

    return model


### Perplexity

1. Let's say we have a vocabulary size of $|V|$. Our model is completely random - every conditional distribution is uniform over the vocabulary. What will be the (expected) perplexity of the model?
2. We have models $M_1$ and $M_2$ with vocabulary sizes $|V_1| \gg |V_2|$. Suppose $M_2$ has a much lower perplexity. Can we conclude that $M_2$ is a better model than $M_1$?

In [None]:
# perplexity evaluation for a given corpus
def evaluate(model, data_source, max_seq_len, eval_batch_size=32):
    """Return the average cross-entropy of ``model`` on ``data_source``.

    Uses the module-level ``criterion`` and ``vocab_size``. The exponential
    of the returned value is the perplexity.

    Args:
        model: module exposing ``init_hidden(bsz)`` and
            ``forward(data, hidden) -> (output, hidden)``.
        data_source: batchified tensor whose rows are time steps.
        max_seq_len: maximum number of rows fed to the model per chunk.
        eval_batch_size: batch size used to build the initial hidden state;
            it should match the number of columns of ``data_source``.

    Returns:
        Loss averaged over all predicted rows (0.0 when there is nothing to
        predict).
    """
    model.eval()
    total_loss = 0.
    hidden = model.init_hidden(eval_batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, max_seq_len):
            data, targets = get_batch(data_source, i, max_seq_len)

            output, hidden = model(data, hidden)

            output_flat = output.view(-1, vocab_size)

            # criterion returns the mean over the chunk; weight it by the
            # chunk length so short final chunks count proportionally.
            total_loss += len(data) * criterion(output_flat, targets).item()
            hidden = repackage_hidden(hidden)

    # The chunk lengths add up to len(data_source) - 1, because the last row
    # is only ever a target. Dividing by len(data_source) instead (as before)
    # under-estimated the loss, badly so for short inputs.
    # max(..., 1) keeps the single-row case at 0.0 rather than dividing by 0.
    predicted_rows = max(len(data_source) - 1, 1)
    return total_loss / predicted_rows

### Train for $n$ epochs

In [None]:
def train_for_n_epochs(model, filename, num_epochs = 10, lr=20):
    """Train ``model`` for ``num_epochs`` epochs, checkpointing the best one.

    After every epoch the validation loss is printed. If it improved, the
    whole model object is saved to ``filename``; otherwise the learning rate
    is divided by 4. Uses the module-level ``corpus``, ``batch_size`` and
    ``max_seq_len``.

    Args:
        model: model to train (updated in place).
        filename: path the best checkpoint is written to.
        num_epochs: number of passes over the training data.
        lr: initial learning rate.

    Returns:
        The model as it is after the last epoch. This is not necessarily the
        checkpoint with the best validation loss, which is the one on disk.
    """
    best_val_loss = np.inf
    # Validation always starts at offset 0 so that every epoch (and every
    # run) scores exactly the same tokens; a random offset made the reported
    # numbers irreproducible.
    val_data = batchify(corpus.valid, batch_size)
    for epoch in range(1, num_epochs+1):
        model = train_step(model, lr, epoch)
        val_loss = evaluate(model, val_data, max_seq_len, batch_size)
        print('-' * 89)
        print('| end of epoch {:3d} | valid loss {:5.2f} | '
                    'valid ppl {:8.2f}'.format(epoch,
                                               val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        # best_val_loss starts at +inf, so the first epoch always saves.
        if val_loss < best_val_loss:
            with open(filename, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
    return model

## RNN Model

In [None]:
## Common Model Parameters

# Width of the word embeddings and of the recurrent / memory state.
embed_size = 200
hidden_size = 200
# Number of stacked LSTM layers (the attention models store it but do not use it).
num_layers = 2
num_epochs = 20
# Initial SGD learning rate. NOTE: train_for_n_epochs has its own default (lr=20),
# so this value only takes effect where it is passed explicitly.
lr = 20.0
dropout = 0.2
# Maximum number of time steps per training / evaluation chunk.
max_seq_len = 35

# Dataset-specific overrides.
if dataset == 'amazon':
    num_epochs = 5;
    max_seq_len = 70


# Number of entries in the corpus dictionary, i.e. the size of the output layer.
vocab_size = len(corpus.dictionary)

In [None]:
# Show how many entries the corpus dictionary holds.
print('vocab size: ', vocab_size)

In [None]:
# Checkpoint path for the LSTM language model.
filename_rnn = os.path.join(path_to_saved_models, 'rnn_'+dataset+'.pth')

In [None]:
class RNNModel(nn.Module):
    """LSTM language model: embedding -> stacked LSTM -> linear decoder.

    Dropout is applied to the embeddings, between LSTM layers, and to the
    LSTM output before decoding. The decoder returns raw scores (no softmax).
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout=0.5):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)

        self.encoder = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, hidden_size, num_layers, dropout=dropout)
        self.decoder = nn.Linear(hidden_size, vocab_size)

        self.init_weights()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

    def init_weights(self):
        """Uniform(-0.1, 0.1) for embedding and decoder weights, zero decoder bias."""
        bound = 0.1
        nn.init.uniform_(self.encoder.weight.data, -bound, bound)
        nn.init.zeros_(self.decoder.bias.data)
        nn.init.uniform_(self.decoder.weight.data, -bound, bound)

    def forward(self, input, hidden):
        """Score the next token at every position.

        Args:
            input: index tensor fed to the embedding; the LSTM treats its
                first dimension as time.
            hidden: ``(h, c)`` state as produced by ``init_hidden``.

        Returns:
            ``(scores, hidden)`` where ``scores`` has the LSTM output's first
            two dimensions followed by the vocabulary size.
        """
        embedded = self.drop(self.encoder(input))
        rnn_out, hidden = self.rnn(embedded, hidden)
        rnn_out = self.drop(rnn_out)
        n_steps, n_cols, n_feat = rnn_out.size(0), rnn_out.size(1), rnn_out.size(2)
        # Flatten time and batch so the linear layer sees a 2-D input.
        flat_scores = self.decoder(rnn_out.view(n_steps * n_cols, n_feat))
        return flat_scores.view(n_steps, n_cols, flat_scores.size(1)), hidden

    def init_hidden(self, bsz):
        """Zero ``(h, c)`` state on the same device and dtype as the parameters."""
        reference = next(self.parameters())
        state_shape = (self.num_layers, bsz, self.hidden_size)
        return (reference.new_zeros(state_shape),
                reference.new_zeros(state_shape))

Note that in our decoder output we haven't applied softmax. Why?

We use nn.CrossEntropyLoss() to train. From the PyTorch documentation for nn.CrossEntropyLoss() ( https://pytorch.org/docs/stable/nn.html ): this criterion combines nn.LogSoftmax() and nn.NLLLoss() in one single class. So feeding it the raw decoder scores is exactly the same as applying a softmax and then minimizing the negative log-likelihood (equivalently, maximizing the log-likelihood).

In [None]:
# Build a freshly initialised LSTM language model and move it to the selected device.
model_rnn = RNNModel(vocab_size, embed_size, hidden_size, num_layers, dropout).to(device)

### Training RNN Model or Loading Saved Model

In [None]:
# Set to True to retrain even when a saved checkpoint exists.
train_again = False

In [None]:
# Reuse the saved checkpoint when there is one (and retraining was not
# requested); otherwise train from scratch.
if os.path.exists(filename_rnn) and (not train_again):
    # map_location lets a checkpoint saved on a GPU load on whatever `device`
    # this session uses.
    # NOTE(security): the checkpoint is a pickled model object; only load
    # files from a trusted source.
    # NOTE(review): PyTorch >= 2.6 defaults torch.load to weights_only=True,
    # which rejects whole-model pickles; pass weights_only=False there.
    model_rnn = torch.load(filename_rnn, map_location=device)
else:
    model_rnn = train_for_n_epochs(model = model_rnn,
                               filename = filename_rnn,
                               num_epochs = num_epochs)


### Checking Validation Performance

In [None]:
def validation_performance(model, model_name):
    """Print the validation loss and perplexity of ``model``.

    Uses the module-level ``corpus``, ``batch_size`` and ``max_seq_len``.

    Args:
        model: trained model to score.
        model_name: label shown at the start of the printed line.
    """
    # Always start at offset 0 so repeated calls, and different models, are
    # scored on exactly the same tokens. A random offset made the reported
    # number change from call to call.
    val_data = batchify(corpus.valid, batch_size)
    val_loss = evaluate(model, val_data, max_seq_len, batch_size)
    print('| {} | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(model_name, val_loss, math.exp(val_loss)))

In [None]:
# Print validation loss and perplexity for the LSTM model.
validation_performance(model_rnn, 'RNN')

### Scoring The Sentence

In [None]:
def score_sentences(model, sent_list):
    """Compute a perplexity for every sentence in ``sent_list``.

    Each sentence is tokenized with the module-level ``corpus``, turned into
    a single-column batch, and evaluated as one chunk.

    Returns:
        List of ``(sentence, perplexity)`` tuples in input order.
    """
    def _perplexity(sentence):
        token_ids = corpus.tokenize_sentence(sentence)
        column = batchify(token_ids, 1)
        return math.exp(evaluate(model, column, len(token_ids), 1))

    return [(sentence, _perplexity(sentence)) for sentence in sent_list]

In [None]:
# Probe sentences for comparing the models' perplexities.
test_sentences = [
    'i like pandas',
    'this cloth is nice.',
    'i like this',
    'i will definitely recommend this',
    'i like eating',
    'i like dress',
    'i like this dress',
    'cho likes dress',
    'roberta likes dress',
    'roberta likes this dress',
    'this purse is nice',
    'my wife really likes the color of this dress',
]

# (sentence, perplexity) pairs under the LSTM model.
score_sentences(model_rnn, test_sentences)

## Generation

In [None]:
def generate_words(model, n_words = 100, input_token = None, temperature = 1.0):
    """Sample text from ``model`` one word at a time.

    Uses the module-level ``corpus`` and ``device``.

    Args:
        model: module exposing ``init_hidden(bsz)`` and
            ``forward(input, hidden) -> (scores, hidden)``.
        n_words: number of words to generate after the start word.
        input_token: start word; when None a start word is drawn uniformly at
            random from the vocabulary. It is looked up in
            ``corpus.dictionary.word2idx``, so an unknown word fails there.
        temperature: divides the scores before sampling. 1.0 (the default)
            samples from the model's own distribution; lower values make the
            output more conservative, higher values more diverse.

    Returns:
        The start word and the generated words, each followed by one space.

    Raises:
        ValueError: if ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(1)

    # 1x1 index tensor holding the word currently fed to the model; it is
    # overwritten in place at every step.
    if input_token is None:
        current = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)
    else:
        current = torch.tensor([[corpus.dictionary.word2idx[input_token]]], dtype=torch.long).to(device)

    words = [corpus.dictionary.idx2word[current.item()]]
    model.eval()
    with torch.no_grad():  # no tracking history
        for _ in range(n_words):
            output, hidden = model(current, hidden)
            # softmax(scores / T) is the same distribution as normalised
            # exp(scores / T), but cannot overflow for large scores.
            probs = torch.softmax(output.reshape(-1) / temperature, dim=0).cpu()
            word_idx = torch.multinomial(probs, 1).item()

            current.fill_(word_idx)
            words.append(corpus.dictionary.idx2word[word_idx])

    # Every word, including the last, is followed by a single space.
    return ' '.join(words) + ' '

In [None]:
# Sample 100 words from the LSTM model, starting from a random vocabulary word.
generate_words(model_rnn)

In [None]:
# Same, but starting from the word 'a'.
generate_words(model_rnn, input_token = 'a')

# Self Attention

In [None]:
class SelfAttn_Model(nn.Module):
    """Language model that attends over a sliding window of past embeddings.

    At every time step the current word embedding is combined with an
    attention-weighted summary of the memory (the most recent embeddings,
    newest first) and decoded into vocabulary scores.

    The memory stores embeddings but is initialised and scored with
    ``hidden_size``-wide vectors, so ``embed_size`` must equal
    ``hidden_size``.

    Fix: the memory update used to be
    ``cat([new, memory[:, :-25, :]])``. On a one-slot memory that slice is
    always empty, so the memory never grew beyond one entry and the attention
    reduced to copying the previous embedding. The memory now really keeps
    the newest ``memory_size`` entries. Models unpickled from checkpoints
    written before this fix have no ``memory_size`` attribute and keep their
    original one-slot behaviour; retrain to benefit from the window.

    Args:
        vocab_size: number of entries in the vocabulary.
        embed_size: embedding width.
        hidden_size: width of the memory vectors (must equal ``embed_size``).
        num_layers: stored for interface parity; not used by this model.
        dropout: probability of the (currently unused) ``drop`` layer.
        idropout: dropout applied around the attention query projection.
        self_attention: when False, use the newest memory slot directly
            instead of attending.
        memory_size: maximum number of past steps kept in memory.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1,
                 dropout = 0.5, idropout = 0.5, self_attention = True,
                 memory_size = 25):
        super(SelfAttn_Model, self).__init__()

        if embed_size != hidden_size:
            raise ValueError("SelfAttn_Model requires embed_size == hidden_size")
        if memory_size < 1:
            raise ValueError("memory_size must be at least 1")

        # `drop` and `rnn` are not used in forward(); they are kept so the
        # parameter set matches previously saved checkpoints.
        self.drop = nn.Dropout(dropout)

        self.encoder = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRUCell(hidden_size, hidden_size, bias=True)

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Maps [current embedding ; newest memory slot] to the attention query.
        self.projector_summ = nn.Sequential(nn.Dropout(idropout),
                                            nn.Linear(embed_size + hidden_size, hidden_size),
                                            nn.Dropout(idropout))

        self.decoder = nn.Linear(embed_size + hidden_size, vocab_size)
        self.vocab_size = vocab_size

        self.self_attention = self_attention
        self.memory_size = memory_size

    def forward(self, input, memory):
        """Score the next token for every time step of ``input``.

        Args:
            input: index tensor whose first dimension is time and second is
                batch.
            memory: tensor of shape ``(batch, slots, hidden_size)``, newest
                slot first, as returned by ``init_hidden`` or a previous call.

        Returns:
            ``(scores, memory)``: scores of shape
            ``(time, batch, vocab_size)`` and the updated memory.
        """
        emb = F.relu(self.encoder(input))

        # One-slot fallback for models unpickled from older checkpoints.
        memory_size = getattr(self, 'memory_size', 1)

        return_scores = torch.empty(emb.size(0), emb.size(1), self.vocab_size,
                                    device=input.device)

        for t in range(emb.size(0)):
            current_vec = emb[t]

            if self.self_attention:
                selected_memory, _ = self.attention(current_vec, memory)
            else:
                selected_memory = memory[:, 0, :]

            # No recurrence here: the new memory entry is the embedding itself.
            mem_out = current_vec

            # Push the new entry to the front and drop the oldest ones beyond
            # the window.
            memory = torch.cat([mem_out[:, None, :], memory], dim=1)[:, :memory_size, :]

            scores = self.decoder(torch.cat([mem_out, selected_memory], dim=1))
            return_scores[t] = scores

        return return_scores.contiguous(), memory

    def attention(self, input, memory):
        """Summarise ``memory`` with scaled dot-product attention.

        The query is a projection of the current input concatenated with the
        newest memory slot.

        Returns:
            ``(selected_memory, weights)``: the weighted sum of memory slots,
            shape ``(batch, hidden_size)``, and the attention weights, shape
            ``(batch, slots, 1)``.
        """
        concat_vec = torch.cat([input, memory[:, 0, :]], dim=1)
        projected_vec = self.projector_summ(concat_vec)

        dot_product_values = torch.bmm(memory, projected_vec.unsqueeze(-1)).squeeze(-1) / math.sqrt(self.hidden_size)

        weights = F.softmax(dot_product_values, dim=1).unsqueeze(-1)

        selected_memory = torch.sum(memory * weights, dim=1)
        return selected_memory, weights

    def init_hidden(self, bsz):
        """Return a one-slot, all-zero memory on the model's device."""
        return torch.zeros(bsz, 1, self.hidden_size).to(self.decoder.weight.device)


In [None]:
# Checkpoint path for the self-attention model.
filename_selfattn = os.path.join(path_to_saved_models, 'selfattn_'+dataset+'.pth');

In [None]:
# Build a freshly initialised self-attention model and move it to the selected device.
model_selfattn = SelfAttn_Model(vocab_size, embed_size, hidden_size, num_layers, dropout).to(device)

In [None]:
# Set to True to retrain even when a saved checkpoint exists.
train_again = False
if os.path.exists(filename_selfattn) and (not train_again):
    # map_location lets a checkpoint saved on a GPU load on whatever `device`
    # this session uses.
    # NOTE(security): the checkpoint is a pickled model object; only load
    # files from a trusted source.
    # NOTE(review): PyTorch >= 2.6 defaults torch.load to weights_only=True,
    # which rejects whole-model pickles; pass weights_only=False there.
    model_selfattn = torch.load(filename_selfattn, map_location=device)
else:
    model_selfattn = train_for_n_epochs(model = model_selfattn,
                               filename = filename_selfattn,
                               num_epochs = num_epochs)


### Checking Validation Performance

In [None]:
# Print validation loss and perplexity for the self-attention model.
validation_performance(model_selfattn, 'Self Attention')

### Scoring Sentences

In [None]:
# (sentence, perplexity) pairs under the self-attention model.
score_sentences(model_selfattn, test_sentences)

### Generating Sentences

In [None]:
# Sample 100 words from the self-attention model, starting from a random vocabulary word.
generate_words(model_selfattn)

In [None]:
# Same, but starting from the word 'a'.
generate_words(model_selfattn, input_token = 'a')

## RNN + Self Attention

In [None]:
class RNN_SelfAttn_Model(nn.Module):
    """GRU language model whose previous state is chosen by self-attention.

    At every time step an attention-weighted summary of the memory (the most
    recent GRU states, newest first) is used as the previous hidden state of
    a GRU cell fed with the current word embedding. The new state is decoded
    into vocabulary scores and pushed onto the memory.

    Fix: the memory update used to be
    ``cat([new, memory[:, :-3, :]])``. On a one-slot memory that slice is
    always empty, so the memory never grew beyond one entry and the attention
    reduced to using the previous state. The memory now really keeps the
    newest ``memory_size`` entries. Models unpickled from checkpoints written
    before this fix have no ``memory_size`` attribute and keep their original
    one-slot behaviour; retrain to benefit from the window.

    Args:
        vocab_size: number of entries in the vocabulary.
        embed_size: embedding width.
        hidden_size: width of the GRU state and of the memory vectors.
        num_layers: stored for interface parity; not used by this model.
        dropout: probability of the (currently unused) ``drop`` layer.
        idropout: dropout applied around the attention query projection.
        self_attention: when False, use the newest memory slot directly
            instead of attending.
        memory_size: maximum number of past states kept in memory.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1,
                 dropout = 0.5, idropout = 0.5, self_attention = True,
                 memory_size = 3):
        super(RNN_SelfAttn_Model, self).__init__()

        if memory_size < 1:
            raise ValueError("memory_size must be at least 1")

        # `drop` and `rnn` are not used in forward(); they are kept so the
        # parameter set matches previously saved checkpoints.
        self.drop = nn.Dropout(dropout)

        self.encoder = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRUCell(hidden_size, hidden_size, bias=True)

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Recurrent cell: input is the embedding, previous state is the
        # attention summary of the memory.
        self.memory_rnn = nn.GRUCell(embed_size, hidden_size)
        # Maps [current embedding ; newest memory slot] to the attention query.
        self.projector_summ = nn.Sequential(nn.Dropout(idropout),
                                            nn.Linear(embed_size + hidden_size, hidden_size),
                                            nn.Dropout(idropout))

        self.decoder = nn.Linear(hidden_size, vocab_size)
        self.vocab_size = vocab_size

        self.self_attention = self_attention
        self.memory_size = memory_size

    def forward(self, input, memory):
        """Score the next token for every time step of ``input``.

        Args:
            input: index tensor whose first dimension is time and second is
                batch.
            memory: tensor of shape ``(batch, slots, hidden_size)``, newest
                slot first, as returned by ``init_hidden`` or a previous call.

        Returns:
            ``(scores, memory)``: scores of shape
            ``(time, batch, vocab_size)`` and the updated memory.
        """
        emb = F.relu(self.encoder(input))

        # One-slot fallback for models unpickled from older checkpoints.
        memory_size = getattr(self, 'memory_size', 1)

        return_scores = torch.empty(emb.size(0), emb.size(1), self.vocab_size,
                                    device=input.device)

        for t in range(emb.size(0)):
            current_vec = emb[t]

            if self.self_attention:
                selected_memory, _ = self.attention(current_vec, memory)
            else:
                selected_memory = memory[:, 0, :]

            # Recurrent step, using the attended memory as previous state.
            mem_out = self.memory_rnn(current_vec, selected_memory)

            # Push the new state to the front and drop the oldest ones beyond
            # the window.
            memory = torch.cat([mem_out[:, None, :], memory], dim=1)[:, :memory_size, :]

            scores = self.decoder(mem_out)
            return_scores[t] = scores

        return return_scores.contiguous(), memory

    def attention(self, input, memory):
        """Summarise ``memory`` with scaled dot-product attention.

        The query is a projection of the current input concatenated with the
        newest memory slot.

        Returns:
            ``(selected_memory, weights)``: the weighted sum of memory slots,
            shape ``(batch, hidden_size)``, and the attention weights, shape
            ``(batch, slots, 1)``.
        """
        concat_vec = torch.cat([input, memory[:, 0, :]], dim=1)
        projected_vec = self.projector_summ(concat_vec)

        dot_product_values = torch.bmm(memory, projected_vec.unsqueeze(-1)).squeeze(-1) / math.sqrt(self.hidden_size)

        weights = F.softmax(dot_product_values, dim=1).unsqueeze(-1)

        selected_memory = torch.sum(memory * weights, dim=1)
        return selected_memory, weights

    def init_hidden(self, bsz):
        """Return a one-slot, all-zero memory on the model's device."""
        return torch.zeros(bsz, 1, self.hidden_size).to(self.decoder.weight.device)

In [None]:
# Checkpoint path for the RNN + self-attention model.
filename_rnn_selfattn = os.path.join(path_to_saved_models, 'rnn_selfattn_'+dataset+'.pth');

In [None]:
# Build a freshly initialised RNN + self-attention model and move it to the selected device.
model_rnn_selfattn = RNN_SelfAttn_Model(vocab_size, embed_size, hidden_size, num_layers, dropout).to(device)

In [None]:
# train_again: retrain even when a saved checkpoint exists.
# continue_training: when retraining, start from the saved checkpoint (if
# any) instead of from the freshly initialised model.
train_again = False
continue_training = True
if os.path.exists(filename_rnn_selfattn) and (not train_again):
    # map_location lets a checkpoint saved on a GPU load on whatever `device`
    # this session uses.
    # NOTE(security): the checkpoint is a pickled model object; only load
    # files from a trusted source.
    # NOTE(review): PyTorch >= 2.6 defaults torch.load to weights_only=True,
    # which rejects whole-model pickles; pass weights_only=False there.
    model_rnn_selfattn = torch.load(filename_rnn_selfattn, map_location=device)
else:
    if continue_training and os.path.exists(filename_rnn_selfattn):
        model_rnn_selfattn = torch.load(filename_rnn_selfattn, map_location=device)
    model_rnn_selfattn = train_for_n_epochs(model = model_rnn_selfattn,
                               filename = filename_rnn_selfattn,
                               num_epochs = num_epochs,
                               lr = lr)


### Checking Validation Performance

In [None]:
# Print validation loss and perplexity for the RNN + self-attention model.
validation_performance(model_rnn_selfattn, 'RNN + Self Attention')

### Scoring Sentences

In [None]:
# (sentence, perplexity) pairs under the RNN + self-attention model.
score_sentences(model_rnn_selfattn, test_sentences)

### Generating Sentences

In [None]:
# Sample 100 words from the RNN + self-attention model, starting from a random vocabulary word.
generate_words(model_rnn_selfattn)

In [None]:
# Same, but starting from the word 'a'.
generate_words(model_rnn_selfattn, input_token = 'a')