# Neural and N-Gram Language Model and for CPSC 503 Assignment 2
### The neural model notebook is modified from Yunjey Choi's Github repository - pytorch-tutorial.
### pytorch-tutorial/tutorials/02-intermediate/language_model/
### https://github.com/yunjey/pytorch-tutorial/tree/master/tutorials/02-intermediate/language_model

### The N-gram model notebook is from Josh Loehr's Github repository - ngram-language-model.
### https://github.com/joshualoehr/ngram-language-model

### ========================================================================================

### First of the first, let's load a number of dependencies:

In [1]:
import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_
import torch.nn.functional as F
import numpy as np
import os
import math
from collections import defaultdict

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Here are two classes needed for the data loading and formating:

In [2]:
class Dictionary(object):
    # < MISSING CLASS DESCRIPTION >
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0
    
    def add_word(self, word):
        if not word in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def return_dict(self):
        return self.idx2word
        
    def __len__(self):
        return len(self.word2idx)


class Corpus(object):
    # < MISSING CLASS DESCRIPTION >
    def __init__(self):
        self.dictionary = Dictionary()

    def get_data(self, path, n_gram=2):
        with open(path, 'r') as f:
            tokens = 0;
            sample_list = []
            for line in f:
                words = ['<start>'] * n_gram + line.split() # < MISSING COMMENT >
                tokens += len(words)
                sample_list.append(words)
                for word in words: 
                    self.dictionary.add_word(word)  

        # < MISSING COMMENT FOR THE FOLLOWING PIECE OF CODE >
        ids_list = [[0]*len(s) for s in sample_list if len(s) > n_gram] 
        with open(path, 'r') as f:
            sample_num = 0
            for line in f:
                token = 0
                words = ['<start>'] * n_gram + line.split()
                if len(words) > n_gram:
                    for word in words:
                        ids_list[sample_num][token] = self.dictionary.word2idx[word]
                        token += 1
                    sample_num += 1

        # < MISSING COMMENT FOR THE FOLLOWING PIECE OF CODE >
        for n in range(len(ids_list)):
            flat_ids = ids_list[n]
            ids_list[n] = torch.LongTensor([flat_ids[i:i+n_gram+1] for i in range(len(flat_ids)-n_gram)])
        return ids_list

### Hyper-parameters: 

In [3]:
n_gram = 3

embed_size = 128
intermediate_size = 1024
num_epochs = 10
learning_rate = 0.002

### Load the "Penn Treebank" dataset and split it into train/dev/test

In [4]:
corpus = Corpus()
ids = corpus.get_data('data/train_mini.txt', n_gram-1)
train_ids = ids[:-200]
dev_ids = ids[-200:-100]
test_ids = ids[-100:]
vocab_size = len(corpus.dictionary)

### The class of n-gram language model:

In [6]:
class NGramLM(object):
    def __init__(self, vocab_size, x_n, x_m, laplace=1):
        self.vocab_size = vocab_size
        self.laplace = laplace
        
        # Dictionaries for tracking the count of n-grams
        self.n_gram_count = self.count_ngrams(x_n)
        self.m_gram_count = self.count_ngrams(x_m)
    
    def count_ngrams(self, x):
        """
        Populate the dictionary with the number of occurrences of each n-gram
        """
        count_list = defaultdict(int)
        for example in x:
            for n_gram in example.tolist():
                count_list[tuple(n_gram)] += 1 
        return count_list
    
    def compute_mle(self, n_gram):
        """
        Compute the MLE of P(w_n|w_{n−1}, ...) with add-one Laplacian smoothing 
        """
        n_count = self.n_gram_count[n_gram]
        m_gram = n_gram[:-1]
        m_count = self.m_gram_count[m_gram]
        prob = (n_count + self.laplace) / (m_count + self.laplace * self.vocab_size) 
        return prob


### Train the n-gram model based on MLE

In [7]:
# Populate the dictionaries with counts from training corpus
m_gram = n_gram - 1
m_gram_train_ids = corpus.get_data('data/train_mini.txt', n_gram-2)[:-200]
n_gram_train_ids = train_ids

model = NGramLM(vocab_size, n_gram_train_ids, m_gram_train_ids)

# Compute average perplexity on training set
train_ppl = 0
for i in range(0, len(train_ids)):

    probabilities = list(map(lambda x: model.compute_mle(tuple(x)), train_ids[i].tolist()))
    perplexity = np.exp(sum(-np.log(probabilities)) / len(train_ids[i]))
    train_ppl += perplexity
    
print('The average training perplexity for N-gram LM: '+str(train_ppl/len(train_ids)))

The average training perplexity for N-gram LM: 1464.8837207362549


In [8]:
# Compute average perplexity on training set
test_ppl = 0
for i in range(0, len(test_ids)):

    probabilities = list(map(lambda x: model.compute_mle(tuple(x)), test_ids[i].tolist()))
    perplexity = np.exp(sum(-np.log(probabilities)) / len(test_ids[i]))
    print('Perplexity for test sample '+str(i)+' :', perplexity)
    test_ppl += perplexity
    
print('The average testing perplexity for N-gram LM: '+str(test_ppl/len(test_ids)))

Perplexity for test sample 0 : 3117.1487062216083
Perplexity for test sample 1 : 2889.9450889166437
Perplexity for test sample 2 : 2064.8644143110246
Perplexity for test sample 3 : 3579.142800151963
Perplexity for test sample 4 : 3192.5345501363786
Perplexity for test sample 5 : 2236.3169582936957
Perplexity for test sample 6 : 3181.4940906216416
Perplexity for test sample 7 : 3468.124739622231
Perplexity for test sample 8 : 3152.878827231253
Perplexity for test sample 9 : 3579.0000000000005
Perplexity for test sample 10 : 3217.8170238617386
Perplexity for test sample 11 : 2855.8295692385764
Perplexity for test sample 12 : 2809.6022165773124
Perplexity for test sample 13 : 2510.8634250307455
Perplexity for test sample 14 : 1960.5092731715915
Perplexity for test sample 15 : 2608.9912879228987
Perplexity for test sample 16 : 2675.1672745072965
Perplexity for test sample 17 : 3118.549302583625
Perplexity for test sample 18 : 2513.006687888638
Perplexity for test sample 19 : 2752.323920225

### The class of neural language model:

In [9]:
class LM(nn.Module):
    def __init__(self, vocab_size, embed_size, intermediate_size, n_gram):
        super(LM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.intermediate = nn.Linear(n_gram * embed_size, intermediate_size)
        self.linear = nn.Linear(intermediate_size, vocab_size)
        
    def forward(self, x):
        x = self.embed(x) # < MISSING COMMENT >
        conc_emb = x.view(x.size(0), x.size(1)*x.size(2))
        #conc_emb = torch.cat([x[:,0,:], x[:,1,:]],1)
        intermediate_output = self.intermediate(conc_emb) # < MISSING COMMENT >
        final_out = self.linear(intermediate_output) # < MISSING COMMENT >
        return final_out

In [10]:
model = LM(vocab_size, embed_size, intermediate_size, n_gram-1).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

### Train the neural model

In [11]:
for epoch in range(num_epochs):
    avg_ppl = 0; avg_loss = 0;
    for i in range(0, len(train_ids)):
        inputs = train_ids[i][:, 0:n_gram-1].to(device)
        targets = train_ids[i][:, n_gram-1:].to(device)
        
        # < MISSING COMMENT FOR THE FOLLOWING PIECE OF CODE >
        outputs = model(inputs)
        loss = criterion(outputs, targets.reshape(-1))
        avg_loss += loss.item();
        
        # < MISSING COMMENT FOR THE FOLLOWING PIECE OF CODE >
        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
    
    # < MISSING COMMENT FOR THE FOLLOWING PIECE OF CODE >
    for i in range(0, len(dev_ids)):
        dev_inputs = dev_ids[i][:, 0:n_gram-1].to(device)
        dev_targets = dev_ids[i][:, n_gram-1:].to(device)
        dev_outputs = model(dev_inputs)
        ce = criterion(dev_outputs, dev_targets.reshape(-1))
        avg_ppl += np.exp(ce.item());
    
    print ('Epoch [{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
        .format(epoch + 1, num_epochs, avg_loss/len(train_ids), avg_ppl/len(dev_ids)))

Epoch [1/10], Loss: 7.4975, Perplexity: 4435.34
Epoch [2/10], Loss: 5.2731, Perplexity: 46597.55
Epoch [3/10], Loss: 4.7165, Perplexity: 151783.69
Epoch [4/10], Loss: 4.3831, Perplexity: 504945.50
Epoch [5/10], Loss: 4.2320, Perplexity: 1279958.53
Epoch [6/10], Loss: 4.0733, Perplexity: 3732354.98
Epoch [7/10], Loss: 3.9003, Perplexity: 6607131.58
Epoch [8/10], Loss: 3.7443, Perplexity: 20519255.33
Epoch [9/10], Loss: 3.6646, Perplexity: 44540035.69
Epoch [10/10], Loss: 3.5143, Perplexity: 61763915.74


### Save the trained model

In [12]:
torch.save(model, 'model.ckpt')

### Neural Model testing

In [13]:
model = torch.load('model.ckpt')
model.eval()
test_ppl = 0
with torch.no_grad():
    for i in range(0, len(test_ids)):
        inputs = test_ids[i][:, 0:n_gram-1].to(device)
        gold = test_ids[i][:, n_gram-1:].to(device)
        output = model(inputs)
        cross_entropy = criterion(output, gold.reshape(-1)).item()
        print('Perplexity for test sample '+str(i)+' :', np.exp(cross_entropy))
        test_ppl += np.exp(cross_entropy)
    print('The average testing perplexity for neural LM: '+str(test_ppl/len(test_ids)))

Perplexity for test sample 0 : 43991473.02316608
Perplexity for test sample 1 : 38956968.219748355
Perplexity for test sample 2 : 432663714.7798215
Perplexity for test sample 3 : 2278079121.956286
Perplexity for test sample 4 : 3461183.9717070693
Perplexity for test sample 5 : 35432045.63429264
Perplexity for test sample 6 : 327742.8383638747
Perplexity for test sample 7 : 706207.7372765793
Perplexity for test sample 8 : 66998.13633510543
Perplexity for test sample 9 : 86565921.38277905
Perplexity for test sample 10 : 21023245.193479586
Perplexity for test sample 11 : 199948.54056460116
Perplexity for test sample 12 : 258972.59883020335
Perplexity for test sample 13 : 1483864.2984453
Perplexity for test sample 14 : 17368948.932059865
Perplexity for test sample 15 : 6719689.085955047
Perplexity for test sample 16 : 2857813.0067902347
Perplexity for test sample 17 : 449770.9792981786
Perplexity for test sample 18 : 27700551.774743408
Perplexity for test sample 19 : 226362.9626812696
Perp