# Pipeline 2: Analysis

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import os
import re
import math

import nltk
from nltk.util import pad_sequence
from nltk.util import ngrams

import itertools
import pickle

from itertools import permutations

## Load pkl files

Load n-gram training dictionaries

In [None]:
def load_pkl(folder, file):
    """Unpickle and return the object stored in ``file`` inside ``folder``.

    Parameters
    -------------
    folder: str. Directory that contains the pickle file.
    file: str. Name of the pickle file within ``folder``.

    Return
    -------------
    Whatever object was pickled into the file.

    NOTE(review): pickle can execute arbitrary code while loading, so this
    should only be pointed at files produced by a trusted source.
    """
    pkl_path = os.path.join(folder, file)
    with open(pkl_path, "rb") as handle:
        return pickle.load(handle)

#### Training data

In [None]:
# Folder holding the pickled artefacts, and the file names of the training ones.
data_folder = "output_data/UNK 5-55/"
(unigram_pkl, bigram_pkl, trigram_pkl, fourgram_pkl) = (
    "unigram_dictionary_training.pkl",
    "bigram_dictionary_training.pkl",
    "trigram_dictionary_training.pkl",
    "fourgram_dictionary_training.pkl",
)
training_sentences_unk = "training_sentences_unk.pkl"

In [None]:
# Load the four training count tables, lowest n-gram order first.
unigram_dict, bigram_dict, trigram_dict, fourgram_dict = (
    load_pkl(data_folder, pkl_name)
    for pkl_name in (unigram_pkl, bigram_pkl, trigram_pkl, fourgram_pkl)
)

Create a dictionary *ngram_dict* that combines the unigram, bigram, trigram, and fourgram dicts.

In [None]:
# Fold the per-order count tables into a single lookup, lowest order first.
# The `|` operator is applied left to right, one table at a time.
ngram_dict = unigram_dict | bigram_dict
ngram_dict = ngram_dict | trigram_dict
ngram_dict = ngram_dict | fourgram_dict

#### Testing data

In [None]:
# File names of the pickled test-set artefacts.
(
    unigram_pkl_test,
    bigram_pkl_test,
    trigram_pkl_test,
    fourgram_pkl_test,
) = (
    "unigram_dictionary_test.pkl",
    "bigram_dictionary_test.pkl",
    "trigram_dictionary_test.pkl",
    "fourgram_dictionary_test.pkl",
)
test_sentences_unk = "test_sentences_unk.pkl"

In [None]:
# Load the four test-set count tables, lowest n-gram order first.
(
    unigram_dict_test,
    bigram_dict_test,
    trigram_dict_test,
    fourgram_dict_test,
) = (
    load_pkl(data_folder, pkl_name)
    for pkl_name in (
        unigram_pkl_test,
        bigram_pkl_test,
        trigram_pkl_test,
        fourgram_pkl_test,
    )
)

## Helper functions

In [None]:
def ngram_probability(ngram, k = None): 
    """ Computes the conditional probability of the last word of the given
    ngram given the words before it, using the module-level count tables
    `ngram_dict` (all n-gram counts) and `unigram_dict` (vocabulary).
    
    Parameters
    -------------
    ngram: tuple. representing an n-gram of length n>=2
    k: float or None. If None is provided, no smoothing is applied. 
        If a float is provided, add-k smoothing is applied.
    
    Return
    -------------
    The probability of the ngram (with or without add-k smoothing):
        unsmoothed: count(ngram) / count(prefix)
        add-k:      (count(ngram) + k) / (count(prefix) + k*V)
    where V is the number of distinct unigrams.

    Raises
    -------------
    ValueError: if the ngram has fewer than 2 elements.
    KeyError: in the unsmoothed case, if the ngram or its prefix has no
        recorded count. With add-k smoothing a missing count is treated as 0,
        so unseen n-grams still receive probability mass.

    """
    
    n = len(ngram)
    if n < 2:
        # There is no history to condition on; fail loudly instead of
        # continuing with an undefined prefix.
        raise ValueError("ngram must be of length 2 or greater")
    
    # Obtain prefix: a bigram's history is looked up under the bare word,
    # longer histories are looked up under the tuple of the first n-1 words.
    if n == 2:
        prefix = ngram[0]
    else:
        prefix = ngram[:(n-1)]
            
    # No smoothing applied
    if k is None: 
        return ngram_dict[ngram]/float(ngram_dict[prefix])
        
    # Apply add-k smoothing; unseen n-grams and prefixes count as 0
    V = float(len(unigram_dict)) # Vocabulary size
    ngram_count = ngram_dict.get(ngram, 0)
    prefix_count = ngram_dict.get(prefix, 0)
    return (ngram_count + k)/(prefix_count + (k*V))

## I. Summary statistics for the training/ testing datasets

#### Training data

In [None]:
# Adding up every unigram count gives the total token count.
print(
    "Total number of tokens in the training set:",
    np.sum(list(unigram_dict.values())),
)

Total number of tokens in the training set: 859703


In [None]:
# Count recorded under the '<UNK>' key of the training unigram table.
print(
    "Number of UNK tags in training set:",
    unigram_dict["<UNK>"],
)

Number of UNK tags in training set: 52997


In [None]:
# Report how many distinct n-grams of each order the training tables hold.
print("Training data statistics:")
for _label, _table in (
    ("Number unique unigrams: ", unigram_dict),
    ("Number unique bigrams: ", bigram_dict),
    ("Number unique trigrams: ", trigram_dict),
    ("Number unique 4-grams: ", fourgram_dict),
):
    print(_label, len(_table))

Training data statistics:
Number unique unigrams:  10247
Number unique bigrams:  264407
Number unique trigrams:  567807
Number unique 4-grams:  711575


#### Testing data

In [None]:
# Adding up every test-set unigram count gives the total token count.
print(
    "Total number of tokens in the testing set:",
    np.sum(list(unigram_dict_test.values())),
)

Total number of tokens in the testing set: 199642


In [None]:
# Count recorded under the '<UNK>' key of the test-set unigram table.
print(
    "Number of UNK tags in testing set:",
    unigram_dict_test["<UNK>"],
)

Number of UNK tags in testing set: 11210


In [None]:
# Report how many distinct n-grams of each order the test-set tables hold.
print("Testing data statistics:")
for _label, _table in (
    ("Number unique unigrams: ", unigram_dict_test),
    ("Number unique bigrams: ", bigram_dict_test),
    ("Number unique trigrams: ", trigram_dict_test),
    ("Number unique 4-grams: ", fourgram_dict_test),
):
    print(_label, len(_table))

Testing data statistics:
Number unique unigrams:  10767
Number unique bigrams:  90140
Number unique trigrams:  154378
Number unique 4-grams:  179327


## II. Code for generating tables in Section 3: Analysis

#### Computation of n-gram counts (smoothed and unsmoothed) and probabilities (smoothed and unsmoothed) 

Bigram computations (counts and probabilities)

In [None]:
# Raw counts are taken as-is; a copy keeps the loaded table untouched.
bigram_unsmoothed_count_dict = bigram_dict.copy()

# Smoothed counts: every observed bigram count is incremented by one.
bigram_smoothed_count_dict = {
    pair: count + 1 for pair, count in bigram_dict.items()
}

# Probabilities of every observed bigram, without and with add-1 smoothing.
bigram_unsmoothed_prob_dict = {
    pair: ngram_probability(pair, k=None) for pair in bigram_dict
}
bigram_smoothed_prob_dict = {
    pair: ngram_probability(pair, k=1) for pair in bigram_dict
}

Trigram probability computations

In [None]:
# Probabilities of every observed trigram, with and without add-1 smoothing.
trigram_smoothed_prob_dict = {
    triple: ngram_probability(triple, k=1) for triple in trigram_dict
}
trigram_unsmoothed_prob_dict = {
    triple: ngram_probability(triple, k=None) for triple in trigram_dict
}

Fourgram probability computations

In [None]:
# Probabilities of every observed 4-gram, with and without add-1 smoothing.
fourgram_smoothed_prob_dict = {
    quad: ngram_probability(quad, k=1) for quad in fourgram_dict
}
fourgram_unsmoothed_prob_dict = {
    quad: ngram_probability(quad, k=None) for quad in fourgram_dict
}

#### Table Generation

In [None]:
# Rank the unigrams from highest to lowest count. The sort is stable, so
# unigrams with equal counts keep their original relative order.
sorted_unigrams_tpls = sorted(
    unigram_dict.items(),
    key=lambda entry: entry[1],
    reverse=True,
)
# Dicts preserve insertion order, so this one iterates in ranked order.
sorted_unigrams_dct = dict(sorted_unigrams_tpls)
# sorted_unigrams_dct

We selected 8 unigrams that appeared frequently in the `sorted_unigrams_dct`

In [None]:
# Eight frequently occurring unigrams from the vocabulary
words = ['the', 'people', 'said', 'of', 'last', 'two', 'financial', 'years']

# Every ordered pair of two distinct words, followed by each word paired
# with itself, so that all 8 x 8 table cells are covered.
all_tples = list(permutations(words, 2)) + [(w, w) for w in words]

Obtain the bigram counts and probabilities with and without smoothing for the given set of 8 words to be used for generating the tables in figure XX and figure XX

In [None]:
# Look up counts and probabilities for every cell of the 8 x 8 table.
# Bigrams that never occurred in training are absent from the bigram
# dictionaries, so each lookup supplies the value an unseen bigram should get.
tble_unsmoothed_counts = {}
tble_smoothed_counts = {}
tble_unsmoothed_prob = {}
tble_smoothed_prob = {}
for tple in all_tples: 
    tble_unsmoothed_counts[tple] = bigram_unsmoothed_count_dict.get(tple, 0)
    # Add-1 smoothing turns an unseen count of 0 into 1
    tble_smoothed_counts[tple] = bigram_smoothed_count_dict.get(tple, 1)
    tble_unsmoothed_prob[tple] = bigram_unsmoothed_prob_dict.get(tple, 0)
    if tple in bigram_smoothed_prob_dict:
        tble_smoothed_prob[tple] = bigram_smoothed_prob_dict[tple]
    else:
        # An unseen bigram still has non-zero probability under add-1
        # smoothing: (0 + 1) / (count(first word) + V), the same formula
        # ngram_probability applies with k=1 (V = vocabulary size).
        tble_smoothed_prob[tple] = 1 / (
            unigram_dict.get(tple[0], 0) + float(len(unigram_dict))
        )

Generate the tables

In [None]:
def generate_tble(words, dct): 
    """ Create a table containing the frequency or probability of a given bigram based on the provided words and dictionary 

    Parameters
    -------------
    words: list of str. Used, in order, as both the row and column labels.
    dct: dict. Maps (first_word, second_word) tuples to a count or probability.
        Entries whose words are not both in `words` are ignored.

    Return
    -------------
    pandas.DataFrame with the first word of each bigram as the row label and
    the second word as the column label. Values are rounded to 5 decimal
    places; cells without an entry in `dct` are 0.
    """
    labels = set(words)

    # Gather all cell values first and build the frame in one go. Writing
    # floats cell by cell into a frame created with integer zeros relies on
    # silent dtype upcasting, which pandas has deprecated.
    cells_by_column = {col: {row: 0 for row in words} for col in words}
    for key, value in dct.items():
        row, col = key[0], key[1]
        if row in labels and col in labels:
            cells_by_column[col][row] = round(value, 5)

    return pd.DataFrame(cells_by_column, index=words, columns=words)

In [None]:
# Unsmoothed bigram counts (rows: first word, columns: second word)
generate_tble(words=words, dct=tble_unsmoothed_counts)

Unnamed: 0,the,people,said,of,last,two,financial,years
the,13,76,0,0,212,263,445,27
people,5,0,4,25,0,1,0,0
said,424,2,0,17,24,3,0,0
of,6021,94,1,1,32,49,13,8
last,1,0,1,3,0,16,1,30
two,0,7,1,36,0,0,1,124
financial,0,0,0,0,0,0,0,1
years,33,1,11,93,0,0,0,0


In [None]:
# Smoothed bigram counts (rows: first word, columns: second word)
generate_tble(words=words, dct=tble_smoothed_counts)

Unnamed: 0,the,people,said,of,last,two,financial,years
the,14,77,1,1,213,264,446,28
people,6,1,5,26,1,2,1,1
said,425,3,1,18,25,4,1,1
of,6022,95,2,2,33,50,14,9
last,2,1,2,4,1,17,2,31
two,1,8,2,37,1,1,2,125
financial,1,1,1,1,1,1,1,2
years,34,2,12,94,1,1,1,1


In [None]:
# Unsmoothed bigram probabilities (rows: first word, columns: second word)
generate_tble(words=words, dct=tble_unsmoothed_prob)

Unnamed: 0,the,people,said,of,last,two,financial,years
the,0.00026,0.00151,0.0,0.0,0.00423,0.00524,0.00887,0.00054
people,0.00437,0.0,0.0035,0.02185,0.0,0.00087,0.0,0.0
said,0.1009,0.00048,0.0,0.00405,0.00571,0.00071,0.0,0.0
of,0.25123,0.00392,4e-05,4e-05,0.00134,0.00204,0.00054,0.00033
last,0.00083,0.0,0.00083,0.00248,0.0,0.01322,0.00083,0.02479
two,0.0,0.00554,0.00079,0.0285,0.0,0.0,0.00079,0.09818
financial,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00166
years,0.02322,0.0007,0.00774,0.06545,0.0,0.0,0.0,0.0


In [None]:
# Smoothed bigram probabilities (rows: first word, columns: second word)
generate_tble(words=words, dct=tble_smoothed_prob)

Unnamed: 0,the,people,said,of,last,two,financial,years
the,0.00023,0.00127,0.0,0.0,0.00353,0.00437,0.00738,0.00046
people,0.00053,0.0,0.00044,0.00228,0.0,0.00018,0.0,0.0
said,0.02941,0.00021,0.0,0.00125,0.00173,0.00028,0.0,0.0
of,0.17601,0.00278,6e-05,6e-05,0.00096,0.00146,0.00041,0.00026
last,0.00017,0.0,0.00017,0.00035,0.0,0.00148,0.00017,0.00271
two,0.0,0.0007,0.00017,0.00321,0.0,0.0,0.00017,0.01086
financial,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00018
years,0.00291,0.00017,0.00103,0.00806,0.0,0.0,0.0,0.0
