## Neural and N-Gram Language Model for CPSC 503 Assignment 2
#### The neural model notebook is modified from Yunjey Choi's Github repository - pytorch-tutorial.
#### Familiarize yourself with pytorch, start with: https://pytorch.org/tutorials/beginner/basics/intro.html

#### The N-gram model notebook is from Josh Loehr's Github repository - ngram-language-model.
#### https://github.com/joshualoehr/ngram-language-model

#### ========================================================================================

### Let's load a number of dependencies:

In [28]:
import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_
import torch.nn.functional as F
import numpy as np
import os
import math
from collections import defaultdict
from tqdm import tqdm

# check if GPU is available to pytorch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Here are two classes needed for the data loading and formating:

In [29]:
class Dictionary(object):
    # use to generate and return the word-to-index (index-to-word) vocabulary dictionary
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0
    
    def add_word(self, word):
        if not word in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def return_dict(self):
        return self.idx2word
        
    def __len__(self):
        return len(self.word2idx)


class Corpus(object):
    # load and prepare the corpus  the language models input format
    def __init__(self):
        self.dictionary = Dictionary()

    def get_data(self, path, n_gram=2):
        with open(path, 'r') as f:
            tokens = 0;
            sample_list = []
            for line in f:
                # add <start> tokens based on the number of n-grams.
                words = ['<start>'] * (n_gram - 1) + line.split() + ['<end>']
                tokens += len(words)
                sample_list.append(words)
                for word in words: 
                    self.dictionary.add_word(word)  

        # # Read corpus and store the each line (word sequence) into its corresponding index sequence.
        ids_list = [[0]*len(s) for s in sample_list if len(s) > n_gram] 
        with open(path, 'r') as f:
            sample_num = 0
            for line in f:
                token = 0
                words = ['<start>'] * (n_gram - 1) + line.split() + ['<end>']
                if len(words) >= n_gram:
                    for word in words:
                        ids_list[sample_num][token] = self.dictionary.word2idx[word]
                        token += 1
                    sample_num += 1

        # FOR THE NEURAL MODEL Convert the flat index sequences into the n-gram tensors which are ready for neural model.
        for n in range(len(ids_list)):
            flat_ids = ids_list[n]
            ids_list[n] = torch.LongTensor([flat_ids[i:i+n_gram] for i in range(len(flat_ids)-(n_gram - 1))])
        return ids_list

### Hyper-parameters for both Language Models: 

In [30]:
# Trigram model
n_gram = 2

# m_gram is the number of preceding/conditioning words 
m_gram = n_gram - 1

### Load the "Penn Treebank" dataset and split it into train/dev/test

In [31]:
corpus = Corpus()
#ids = corpus.get_data('data/train_mini.txt', n_gram)
ids = corpus.get_data('data/train.txt', n_gram)


# Use 70% for training, 15% for development, and 15% for testing 
n_train = round(len(ids) * .75)
n_dev = round(len(ids) * .15)

train_ids = ids[:n_train]
dev_ids = ids[n_train:n_train + n_dev]
test_ids = ids[n_train + n_dev:]

print(f"Number of sentences: {len(train_ids)} train, {len(dev_ids)} dev, {len(test_ids)} test")
vocab_size = len(corpus.dictionary)
print(f"Vocab size: {vocab_size}")

Number of sentences: 31551 train, 6310 dev, 4207 test
698238
tensor([[ 0,  1],
        [ 1,  2],
        [ 2,  3],
        [ 3,  4],
        [ 4,  5],
        [ 5,  6],
        [ 6,  7],
        [ 7,  8],
        [ 8,  9],
        [ 9, 10],
        [10, 11],
        [11, 12],
        [12, 13],
        [13, 14],
        [14, 15],
        [15, 16],
        [16, 17],
        [17, 18],
        [18, 19],
        [19, 20],
        [20, 21],
        [21, 22],
        [22, 23],
        [23, 24],
        [24, 25]])


### The class of count-based language model:

In [33]:
class CountLM(object):
    def __init__(self, vocab_size, x_n, x_m, laplace=1):
        self.vocab_size = vocab_size
        self.laplace = laplace
        
        # Dictionaries for tracking the count of n-grams
        self.n_gram_count = self.count_ngrams(x_n)
        self.m_gram_count = self.count_ngrams(x_m)
    
    def count_ngrams(self, x):
        """
        Populate the dictionary with the number of occurrences of each n-gram
        """
        count_list = defaultdict(int)
        for example in x:
            for n_gram in example.tolist():
                count_list[tuple(n_gram)] += 1 
        return count_list
    
    def compute_mle(self, n_gram):
        """
        Compute the MLE of P(w_n|w_{n−1}, ...) with add-one Laplacian smoothing
        
        Please see chapter 3.5.1 of J&M 3rd Ed. for more information
        """
        n_count = self.n_gram_count[n_gram]
        m_gram = n_gram[:-1]
        m_count = self.m_gram_count[m_gram]
        prob = (n_count + self.laplace) / (m_count + self.laplace * self.vocab_size) 
        return prob


### Train the n-gram model based on MLE

In [34]:
#m_gram_train_ids = corpus.get_data('data/train_mini.txt', m_gram)[:-200]
m_gram_train_ids = corpus.get_data('data/train.txt', m_gram)[:n_train]
n_gram_train_ids = train_ids

# Populate the dictionaries with counts from training corpus
count_model = CountLM(vocab_size, n_gram_train_ids, m_gram_train_ids)

# Compute average perplexity on training set
train_ppl = 0
for i in range(0, len(train_ids)):

    probabilities = list(map(lambda x: count_model.compute_mle(tuple(x)), train_ids[i].tolist()))
    perplexity = np.exp(sum(-np.log(probabilities)) / len(train_ids[i]))
    train_ppl += perplexity
    
print('The average training perplexity for count-based LM: '+str(train_ppl/len(train_ids)))

The average training perplexity for count-based LM: 872.2299472899585


In [55]:
# Compute average perplexity on testing set
test_ppl = 0
for i in range(0, len(test_ids)):

    probabilities = list(map(lambda x: count_model.compute_mle(tuple(x)), test_ids[i].tolist()))
    perplexity = np.exp(sum(-np.log(probabilities)) / len(test_ids[i]))
    print('Perplexity for test sample '+str(i)+' :', perplexity)
    test_ppl += perplexity
    
print('The average testing perplexity for count-based LM: '+str(test_ppl/len(test_ids)))

The average testing perplexity for count-based LM: 1184.6602509874976


### Hyper-parameters for the neural language model

In [49]:
# FOR THE NEURAL MODEL
embed_size = 128
intermediate_size = 1024
num_epochs = 2
learning_rate = 2e-3

### The class of neural language model:

In [50]:
class NeuralLM(nn.Module):
    def __init__(self, vocab_size, embed_size, intermediate_size, m_gram):
        super(NeuralLM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.intermediate = nn.Linear(m_gram * embed_size, intermediate_size)
        self.final = nn.Linear(intermediate_size, vocab_size)
        
    def forward(self, x):
        x = self.embed(x) # Embed word id(s) to vectors
        conc_emb = x.view(x.size(0), x.size(1)*x.size(2))
        intermediate_output = self.intermediate(conc_emb) # one layer of MLP
        intermediate_output = F.relu(intermediate_output) # ReLU non-linear function
        final_out = self.final(intermediate_output) # Map to the vocabulary size output
        return final_out

In [51]:
neural_model = NeuralLM(vocab_size, embed_size, intermediate_size, m_gram).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(neural_model.parameters(), lr=learning_rate, momentum=0.9)

### Train the neural model

In [52]:
# Reduce batch size if you are running out of memory
batch_size = 64
training_data = torch.cat(train_ids, dim=0)
neural_model.train()

for epoch in range(num_epochs):
    total_loss = 0
    for i in tqdm(range(0, len(training_data), batch_size)):
        batch = training_data[i:i + batch_size]
        inputs = batch[:, 0:n_gram-1].to(device)
        targets = batch[:, n_gram-1:].to(device)
        
        # Forward pass
        outputs = neural_model(inputs)
        loss = criterion(outputs, targets.reshape(-1))
        total_loss += loss.item();
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    
    # Calculate the performance (perplexity) of the current trained model on dev set.
    total_ppl = 0
    for i in range(0, len(dev_ids)):
        dev_inputs = dev_ids[i][:, 0:n_gram-1].to(device)
        dev_targets = dev_ids[i][:, n_gram-1:].to(device)
        dev_outputs = neural_model(dev_inputs)
        ce = criterion(dev_outputs, dev_targets.reshape(-1))
        total_ppl += np.exp(ce.item());
    
    print ('Epoch [{}/{}], Training Loss: {:.4f}, Dev Perplexity: {:5.2f}'
        .format(epoch + 1, num_epochs, total_loss/len(train_ids), total_ppl/len(dev_ids)))

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 10910/10910 [00:25<00:00, 423.33it/s]


Epoch [1/2], Training Loss: 2.1705, Dev Perplexity: 539.15


100%|████████████████████████████████████████████████████████████████████████████████████████████████| 10910/10910 [00:25<00:00, 422.89it/s]


Epoch [2/2], Training Loss: 2.0152, Dev Perplexity: 472.67


### Save the trained model

In [53]:
torch.save(neural_model, 'model.ckpt')

### Neural Model testing

In [56]:
neural_model = torch.load('model.ckpt')
neural_model.eval()
test_ppl = 0
with torch.no_grad():
    for i in range(0, len(test_ids)):
        inputs = test_ids[i][:, 0:n_gram-1].to(device)
        gold = test_ids[i][:, n_gram-1:].to(device)
        output = neural_model(inputs)
        cross_entropy = criterion(output, gold.reshape(-1)).item()
        print('Perplexity for test sample '+str(i)+' :', np.exp(cross_entropy))
        test_ppl += np.exp(cross_entropy)
    print('The average testing perplexity for neural LM: '+str(test_ppl/len(test_ids)))

The average testing perplexity for neural LM: 449.3337552729946


### Text generation with n-gram language models

In [61]:
np.random.seed(100)
start_tokens = ['we', 'a', 'they']


print('Generating from count-based language model:')
for token in start_tokens:
    
    sentence = [token]
    token_id = corpus.dictionary.word2idx[token]
    
    context = tuple([corpus.dictionary.word2idx['<start>'] for _ in range(m_gram - 1)] + [token_id])
    while sentence[-1] != '<end>':
        filtered_ngrams = [(n_gram_ids[-1], count) for n_gram_ids, count in count_model.n_gram_count.items() if n_gram_ids[:-1] == context]
        candidates = [x[0] for x in filtered_ngrams]
        total_count = sum([x[1] for x in filtered_ngrams])
        probs = [x[1] / total_count for x in filtered_ngrams]
    
        sampled_id = np.random.choice(candidates, 1, p=probs)[0]
        sampled_token = corpus.dictionary.idx2word[sampled_id]
        sentence.append(sampled_token)
        
        context = context[1:] + (sampled_id,)
    print(' '.join(sentence[:-1]))

    
neural_model.eval()
print('\nGenerating from neural language model:')
for token in start_tokens:
    
    sentence = [token]
    token_id = corpus.dictionary.word2idx[token]
    
    context = torch.LongTensor([[corpus.dictionary.word2idx['<start>'] for _ in range(m_gram - 1)] + [token_id]]).to(device)
    
    while sentence[-1] != '<end>':
        output = neural_model(context)
        softmax_output = F.softmax(output, dim=-1)
        prob = softmax_output.squeeze(0).tolist()
        prob = list(map(lambda x: x / sum(prob), prob))
        sampled_id = np.random.choice(output.shape[1], 1, p=prob)[0]
        sampled_token = corpus.dictionary.idx2word[sampled_id]
        sentence.append(sampled_token)
        context = torch.cat([context[:, 1:], torch.LongTensor([[sampled_id]]).to(device)], dim=1)
    print(' '.join(sentence[:-1]))

Generating from count-based language model:
we keep up
a life including the pentagon in the buy-back is n't be an early termination by the indictment against index tumble quickly tumbled N billion
they are less <unk> on it 's one of luzon

Generating from neural language model:
we to performed idea anything price you <unk> a business publicly was in east tailspin gain to move at a
a rate
they can be able to N southeast misleading bills including to N N N N tons machinists much return to
