## N-gram Language Models

In [1]:
import nltk
import re
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/Michael/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Text Normalization

In [2]:
# Toy corpus containing punctuation, quotes, a hyphenated word and an emoticon.
corpus = "Learning% makes 'me' happy. I am happy be-cause I am learning! :)"
# Displayed only: the lower-cased string is not assigned back to `corpus` here.
corpus.lower()

"learning% makes 'me' happy. i am happy be-cause i am learning! :)"

In [36]:
# regular expression
# delete every run of characters that is NOT a letter, a digit, '.', '?', '!' or a space
corpus = re.sub(r"[^a-zA-Z0-9.?! ]+", "", corpus)
# lower-case the cleaned text and store it back in `corpus`
corpus = corpus.lower()

In [37]:
# split the cleaned corpus into tokens with NLTK's word tokenizer
token_list = nltk.word_tokenize(corpus)
token_list

['learning',
 'makes',
 'me',
 'happy',
 '.',
 'i',
 'am',
 'happy',
 'because',
 'i',
 'am',
 'learning',
 '!']

### Sentence to n-gram

In [38]:
def sentence_to_trigram(token_list):
    """Print every window of three consecutive tokens, one list per line.

    Nothing is returned; a token_list with fewer than three items prints nothing.
    """
    # `stop` is the exclusive end of each three-token window
    for stop in range(3, len(token_list) + 1):
        print(token_list[stop - 3:stop])
sentence_to_trigram(token_list)

['learning', 'makes', 'me']
['makes', 'me', 'happy']
['me', 'happy', '.']
['happy', '.', 'i']
['.', 'i', 'am']
['i', 'am', 'happy']
['am', 'happy', 'because']
['happy', 'because', 'i']
['because', 'i', 'am']
['i', 'am', 'learning']
['am', 'learning', '!']


In [39]:
# add the start-of-sentence prefix (n-1 copies of '<s>') and the end marker '</s>'
# bi-gram n = 2, trigram n = 3
# NOTE: token_list is rebuilt from its own previous value, so running this cell
# a second time pads the list again.
n = 3
token_list = ['<s>'] * (n-1) + token_list + ['</s>']
print(token_list)

['<s>', '<s>', 'learning', 'makes', 'me', 'happy', '.', 'i', 'am', 'happy', 'because', 'i', 'am', 'learning', '!', '</s>']


In [40]:
# print the trigrams again, this time for the token list with the '<s>' / '</s>' padding
sentence_to_trigram(token_list)

['<s>', '<s>', 'learning']
['<s>', 'learning', 'makes']
['learning', 'makes', 'me']
['makes', 'me', 'happy']
['me', 'happy', '.']
['happy', '.', 'i']
['.', 'i', 'am']
['i', 'am', 'happy']
['am', 'happy', 'because']
['happy', 'because', 'i']
['because', 'i', 'am']
['i', 'am', 'learning']
['am', 'learning', '!']
['learning', '!', '</s>']


### Constructing Matrix and Count frequency

In [13]:
import numpy as np
import pandas as pd
from collections import defaultdict  # default dict with default value 

In [21]:
def n_gram_matrix(corpus, n = 3):
    """
    Build an n-gram count matrix from a tokenized corpus.

    Args:
        corpus: Pre-processed and tokenized corpus (a sequence of hashable tokens).
        n : order of the n-gram, default value = 3 (trigram). Must be >= 1.

    Returns:
        grams: list of all distinct (n-1)-gram prefixes (tuples), in order of
               first appearance; the row index
        vocabulary: list of all distinct words that end an n-gram, in order of
                    first appearance; the column index
        count_matrix: pandas dataframe with the (n-1)-gram prefixes as rows,
                      vocabulary words as columns
                      and the counts of the prefix/word combinations
                      (i.e. the n-grams) as float values

    Raises:
        ValueError: if n is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")

    # dicts keep insertion order, so they provide both the first-appearance
    # ordering of the labels and a constant-time label -> position lookup
    gram_index = {}
    vocabulary_index = {}
    n_gram_counts = defaultdict(int)

    for i in range(len(corpus)-n+1):
        n_gram = tuple(corpus[i:i+n])
        n_minus_1_gram, last_word = n_gram[:-1], n_gram[-1]
        # setdefault only assigns a new position the first time a label is seen
        gram_index.setdefault(n_minus_1_gram, len(gram_index))
        vocabulary_index.setdefault(last_word, len(vocabulary_index))
        n_gram_counts[(n_minus_1_gram, last_word)] += 1

    grams = list(gram_index)
    vocabulary = list(vocabulary_index)

    # start from an all-zero matrix so prefix/word pairs never seen stay at 0
    count_matrix = np.zeros((len(grams), len(vocabulary)))
    for (n_minus_1_gram, last_word), count in n_gram_counts.items():
        count_matrix[gram_index[n_minus_1_gram], vocabulary_index[last_word]] = count

    count_matrix = pd.DataFrame(count_matrix, index=grams, columns=vocabulary)

    return grams, vocabulary, count_matrix
        

In [41]:
# trigram counts (default n = 3) for token_list
test_g, test_v, test_matrix = n_gram_matrix(token_list)
print(test_matrix)

                   learning  makes   me  happy    .    i   am  because    !  \
(<s>, <s>)              1.0    0.0  0.0    0.0  0.0  0.0  0.0      0.0  0.0   
(<s>, learning)         0.0    1.0  0.0    0.0  0.0  0.0  0.0      0.0  0.0   
(learning, makes)       0.0    0.0  1.0    0.0  0.0  0.0  0.0      0.0  0.0   
(makes, me)             0.0    0.0  0.0    1.0  0.0  0.0  0.0      0.0  0.0   
(me, happy)             0.0    0.0  0.0    0.0  1.0  0.0  0.0      0.0  0.0   
(happy, .)              0.0    0.0  0.0    0.0  0.0  1.0  0.0      0.0  0.0   
(., i)                  0.0    0.0  0.0    0.0  0.0  0.0  1.0      0.0  0.0   
(i, am)                 1.0    0.0  0.0    1.0  0.0  0.0  0.0      0.0  0.0   
(am, happy)             0.0    0.0  0.0    0.0  0.0  0.0  0.0      1.0  0.0   
(happy, because)        0.0    0.0  0.0    0.0  0.0  1.0  0.0      0.0  0.0   
(because, i)            0.0    0.0  0.0    0.0  0.0  0.0  1.0      0.0  0.0   
(am, learning)          0.0    0.0  0.0    0.0  0.0 

In [42]:
# same token list with n = 2: bigram counts, rows are one-word prefixes
test2_g, test2_v, test2_matrix = n_gram_matrix(token_list, n=2)
print(test2_matrix)

             <s>  learning  makes   me  happy    .    i   am  because    !  \
(<s>,)       1.0       1.0    0.0  0.0    0.0  0.0  0.0  0.0      0.0  0.0   
(learning,)  0.0       0.0    1.0  0.0    0.0  0.0  0.0  0.0      0.0  1.0   
(makes,)     0.0       0.0    0.0  1.0    0.0  0.0  0.0  0.0      0.0  0.0   
(me,)        0.0       0.0    0.0  0.0    1.0  0.0  0.0  0.0      0.0  0.0   
(happy,)     0.0       0.0    0.0  0.0    0.0  1.0  0.0  0.0      1.0  0.0   
(.,)         0.0       0.0    0.0  0.0    0.0  0.0  1.0  0.0      0.0  0.0   
(i,)         0.0       0.0    0.0  0.0    0.0  0.0  0.0  2.0      0.0  0.0   
(am,)        0.0       1.0    0.0  0.0    1.0  0.0  0.0  0.0      0.0  0.0   
(because,)   0.0       0.0    0.0  0.0    0.0  0.0  1.0  0.0      0.0  0.0   
(!,)         0.0       0.0    0.0  0.0    0.0  0.0  0.0  0.0      0.0  0.0   

             </s>  
(<s>,)        0.0  
(learning,)   0.0  
(makes,)      0.0  
(me,)         0.0  
(happy,)      0.0  
(.,)          0.0  
(

In [43]:
# axis=0 aggregates vertically, down the rows (one result per column)
# axis=1 aggregates horizontally, across the columns (one result per row)
# here: total count of each prefix = sum over its row
row_sum = test_matrix.sum(axis=1)
row_sum

(<s>, <s>)           1.0
(<s>, learning)      1.0
(learning, makes)    1.0
(makes, me)          1.0
(me, happy)          1.0
(happy, .)           1.0
(., i)               1.0
(i, am)              2.0
(am, happy)          1.0
(happy, because)     1.0
(because, i)         1.0
(am, learning)       1.0
(learning, !)        1.0
dtype: float64

In [44]:
# divide every row by its own total so the counts become probabilities of word given prefix
prob_matrix = test_matrix.div(row_sum, axis=0)
prob_matrix

Unnamed: 0,learning,makes,me,happy,.,i,am,because,!,</s>
"(<s>, <s>)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(<s>, learning)",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(learning, makes)",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(makes, me)",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
"(me, happy)",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
"(happy, .)",0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
"(., i)",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
"(i, am)",0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
"(am, happy)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
"(happy, because)",0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [45]:
# find the probability
# this is the population
corpus

'learning makes me happy. i am happy because i am learning! '

In [47]:
# look up the probability of 'happy' following the prefix ('i', 'am')
trigram = ('i', 'am', 'happy')
bigram = trigram[:-1]  # prefix = everything but the last word
word = trigram[-1]     # the word being predicted
# select the word's column first, then the prefix's row
prob_matrix[word][bigram]

0.5

### Model Evaluation

In [2]:
import random
def sample_split(data, train_percent, validation_percent, seed=87):
    """
    Shuffle the data reproducibly and split it into train / validation / test parts.

    Input:
        data: list of sentence (left unmodified; a shuffled copy is split)
        train_percent: e.g. 60%
        validation_percent: e.g. 30%
        seed: seed of the private random generator used for shuffling (default 87)
    Note: train_percent + validation_percent <= 100
    Output:
        train_data: list of sentences
        validation_data
        test_data: everything left after the train and validation parts
    Raises:
        ValueError: if a percentage is negative or the two add up to more than 100
    """
    if train_percent < 0 or validation_percent < 0:
        raise ValueError("train_percent and validation_percent must be non-negative")
    if train_percent + validation_percent > 100:
        raise ValueError("train_percent + validation_percent must not exceed 100")

    # shuffle a copy with a dedicated generator: the caller's list keeps its
    # order and the module-level random state is not reset
    shuffled = list(data)
    random.Random(seed).shuffle(shuffled)

    train_size = int(len(shuffled) * train_percent / 100)
    validation_size = int(len(shuffled) * validation_percent / 100)
    validation_end = train_size + validation_size

    train_data = shuffled[:train_size]
    validation_data = shuffled[train_size:validation_end]
    test_data = shuffled[validation_end:]

    return train_data, validation_data, test_data

# quick check: a 60/20 split of 100 items, then the size of the training part
data = [x for x in range(0, 100)]
train_data, _, _ = sample_split(data, 60, 20)
len(train_data)

60

### Out of vocabulary words

We replace out-of-vocabulary (OOV) words with the special token \<UNK\>.

In [3]:
from collections import Counter

In [4]:
# Caveat: if many words in the train and test sets are replaced by <UNK>, the
# perplexity can come out very low even though the model itself would not be helpful.
training_set = ['i', 'am', 'happy', 'because','i', 'am', 'learning', '.']
training_set_unk = ['i', 'am', '<UNK>', '<UNK>','i', 'am', '<UNK>', '<UNK>']

test_set = ['i', 'am', 'learning']
test_set_unk = ['i', 'am', '<UNK>']

# initialize the results
M = len(test_set)  # number of words in the test set, used as the root of the perplexity
prob = 1           # running product of bigram probabilities, original tokens
prob_unk = 1       # running product of bigram probabilities, <UNK> version

# pre-calculated probabilities
bigram_probabilities = {('i', 'am'): 1.0, ('am', 'happy'): 0.5, 
                        ('happy', 'because'): 1.0, 
                        ('because', 'i'): 1.0, 
                        ('am', 'learning'): 0.5, 
                        ('learning', '.'): 1.0}
bigram_probabilities_unk = {('i', 'am'): 1.0, 
                            ('am', '<UNK>'): 1.0, 
                            ('<UNK>', '<UNK>'): 0.5, ('<UNK>', 'i'): 0.25}

# multiply the probabilities of every consecutive word pair in the test set
for i in range(len(test_set) -1):
    bigram = tuple(test_set[i:i+2])
    prob = prob * bigram_probabilities[bigram]  # joint probability of the sequence so far
    
    bigram_unk = tuple(test_set_unk[i:i+2])
    prob_unk = prob_unk * bigram_probabilities_unk[bigram_unk]
    
# calculate perplexity: the inverse probability of the sequence, normalised by the M-th root
perplexity = prob ** (-1/M)
perplexity_unk = prob_unk ** (-1/M)

# NOTE: the labels below say "training set", but both values are computed over test_set
print(f"perplexity for the training set: {perplexity}")
print(f"perplexity for the training set with <UNK>: {perplexity_unk}")

perplexity for the training set: 1.2599210498948732
perplexity for the training set with <UNK>: 1.0


__comment__: the lower the perplexity, the better the model predicts the text, i.e. the closer it is to natural human language.

In [None]:
# smoothing 
def add_k_smoothing_prob(k, vocabulary_size, n_gram_count, n_gram_prefix_count):
    """Return the add-k smoothed probability of an n-gram.

    k is added to the n-gram count, and k * vocabulary_size is added to the
    prefix count so the denominator grows with the vocabulary size.
    """
    return (n_gram_count + k) / (n_gram_prefix_count + k * vocabulary_size)

In [12]:
# back-off: toy probability tables, one per n-gram order
trigram_p = {('i', 'am', 'happy'): 0}
bigram_p = {('am', 'happy'): 0.3}
unigram_p = {'happy': 0.4}  # unigram keys are plain strings, not tuples

# the trigram we want a probability for
trigram_test = ('are', 'you', 'happy')

# shorter contexts to fall back on: the last two words, then the last word alone
bigram_back_off = trigram_test[-2:]
unigram_back_off = trigram_test[-1]

print(f"besides the trigram {trigram_test} we also use back-off {bigram_back_off} and {unigram_back_off}")

besides the trigram ('are', 'you', 'happy') we also use back-off ('you', 'happy') and happy


In [13]:
lambda_factor = 0.4  # discount applied once for every back-off step

# a missing entry and a stored probability of 0 are both treated as "not found"
trigram_score = trigram_p.get(trigram_test, 0)

if trigram_score != 0:
    # the trigram itself is known: use its probability unchanged
    trigram_estimated_prob = trigram_score
else:
    print(f"probability for {trigram_test} Not found")
    bigram_score = bigram_p.get(bigram_back_off, 0)

    if bigram_score != 0:
        # one back-off step: discount the bigram probability once
        trigram_estimated_prob = lambda_factor * bigram_score
    else:
        print(f"probability for {bigram_back_off} Not found")

        if unigram_back_off in unigram_p:
            print(f"probability of unigram {unigram_back_off} Found\n")
            # two back-off steps: discount the unigram probability twice
            trigram_estimated_prob = lambda_factor * lambda_factor * unigram_p[unigram_back_off]
        else:
            # nothing to fall back on
            trigram_estimated_prob = 0

print(f"probability for trigram {trigram_test} estimated as {trigram_estimated_prob}")

probability for ('are', 'you', 'happy') Not found
probability for ('you', 'happy') Not found
probability of unigram happy Found

probability for trigram ('are', 'you', 'happy') estimated as 0.06400000000000002


In [None]:
# Interpolation
# combine the trigram, bigram and unigram probabilities as a weighted average
# (the weights should sum to 1)