# <u> ICS2203 – Statistical Natural Language Processing <u>
## <u> Building a Language Model – Part I <u>

## Imports

In [1]:
# Standard Library Imports
import os
import re
import time
import maths
import xml.etree.ElementTree as ET
from datetime import datetime
from collections import Counter

# Third-Party Library Imports
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split

# System Library Imports
import psutil

## Extracting and Pre-processing the Selected Corpus

In [2]:
# Set directory path for corpus
corpus_dir_path = 'corpus'

# Create an empty list for tokens
tokens = []

# Instantiate a WordNetLemmatizer object
lemmatizer = WordNetLemmatizer()

# Instantiate a RegexpTokenizer object for tokenizing text
tokenizer = RegexpTokenizer(r"\w+(?:[-']\w+)*|[^\w\s]")

# Record start time of token generation
generation_start = datetime.now()

# Traverse all files in the specified directory and subdirectories, and tokenize text in them
for root_dir, subdirs, files in os.walk(corpus_dir_path):
    for filename in files:
        # Select only XML files
        if filename.endswith('.xml'):
            # Parse XML file
            tree = ET.parse(os.path.join(root_dir, filename))
            root = tree.getroot()
            text = ''
            for element in root.iter():
                if element.text is not None:
                    text += element.text + ' '

            # Remove URLs from text
            text = re.sub(r'https?://\S+|www\.\S+', '', text)  

            # Convert text to lowercase
            text = text.lower()

            # Tokenize text using RegexpTokenizer object
            file_tokens = tokenizer.tokenize(text)

            # Remove punctuation, numbers, and lemmatize tokens
            file_tokens = [lemmatizer.lemmatize(token) for token in file_tokens 
                           if not token.isdigit() and 
                           (token.isalpha() or len(token) > 1)]

            # Add the tokens to the tokens list
            tokens.extend(file_tokens)

# Output list of the first 100 tokens
print()
print(tokens[:100])


['oxford', 'art', 'journal', 'sample', 'containing', 'about', 'word', 'from', 'a', 'periodical', 'domain', 'art', 'data', 'capture', 'and', 'transcription', 'oxford', 'university', 'press', 'bnc', 'xml', 'edition', 'december', 'token', 'w-units', 's-units', 'distributed', 'under', 'licence', 'by', 'oxford', 'university', 'computing', 'service', 'on', 'behalf', 'of', 'the', 'bnc', 'consortium', 'this', 'material', 'is', 'protected', 'by', 'international', 'copyright', 'law', 'and', 'may', 'not', 'be', 'copied', 'or', 'redistributed', 'in', 'any', 'way', 'consult', 'the', 'bnc', 'web', 'site', 'at', 'for', 'full', 'licencing', 'and', 'distribution', 'condition', 'a6u', 'artjnl', 'oxford', 'art', 'journal', 'oxford', 'university', 'press', 'oxford', 'w', 'ac', 'humanity', 'art', 'art', 'tag', 'usage', 'updated', 'for', 'bnc-xml', 'last', 'check', 'for', 'bnc', 'world', 'first', 'release', 'redo', 'tagusage', 'table', 'check']


## Computation

In [3]:
# Define a function to get RAM usage
def RAMusage():
    pid = os.getpid()
    py = psutil.Process(pid)
    memoryUse = py.memory_info()[0] / 2.**30
    return memoryUse

# Print the total size of the token list
print("Total size of corpus: "+str(len(tokens))+" words")

Total size of corpus: 4003540 words


## Split the corpus tokens into test set and train set

In [4]:
# Split the words into train and test sets
train_set, test_set = train_test_split(tokens, test_size=0.2)

# Define the vocabulary from the training set
vocab = set()
for sentence in train_set:
    for word in sentence.split():
        vocab.add(word)
vocab = list(vocab)
vocab.sort()

# Print the size of the train and test sets
print("Train set size:", len(train_set))
print("Test set size:", len(test_set))

Train set size: 3202832
Test set size: 800708


## Vanilla Language Model

In [5]:
# Define function to create n-grams from a list of tokens
def create_vanilla_ngrams(tokens, n):
    return [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

#-------------------------------------------------------------------------------------------------------------------------------

# Create a vanilla unigram model by iterating over the train_set and counting the frequency of each word
vanilla_unigram_freq = {}
vanilla_total_unigrams = len(train_set)
for word in train_set:
    if word in vanilla_unigram_freq:
        vanilla_unigram_freq[word] += 1
    else:
        vanilla_unigram_freq[word] = 1

# Calculate vanilla unigram probabilities by dividing the frequency of each word by the total number of unigrams
vanilla_unigram_probs = {}
for word, count in vanilla_unigram_freq.items():
    vanilla_unigram_probs[word] = count / vanilla_total_unigrams
    
#-------------------------------------------------------------------------------------------------------------------------------

# Create a vanilla bigram model by creating bigrams from the train_set and counting the frequency of each bigram
vanilla_bigram_freq = {}
vanilla_total_bigrams = len(create_vanilla_ngrams(train_set, 2))
vanilla_bigram_tokens = create_vanilla_ngrams(train_set, 2)
for tokens in vanilla_bigram_tokens:
    if tokens in vanilla_bigram_freq:
        vanilla_bigram_freq[tokens] += 1
    else:
        vanilla_bigram_freq[tokens] = 1

# Calculate vanilla bigram probabilities by dividing the frequency of each bigram by the frequency of its first word
vanilla_bigram_probs = {}
for tokens, count in vanilla_bigram_freq.items():
    vanilla_bigram_probs[tokens] = count / vanilla_unigram_freq[tokens[0]]
    
#-------------------------------------------------------------------------------------------------------------------------------

# Create a vanilla trigram model by creating trigrams from the train_set and counting the frequency of each trigram
vanilla_trigram_freq = {}
vanilla_total_trigrams = len(create_vanilla_ngrams(train_set, 3))
vanilla_trigram_tokens = create_vanilla_ngrams(train_set, 3)
for tokens in vanilla_trigram_tokens:
    if tokens in vanilla_trigram_freq:
        vanilla_trigram_freq[tokens] += 1
    else:
        vanilla_trigram_freq[tokens] = 1

# Calculate vanilla trigram probabilities by dividing the frequency of each trigram by the frequency of its first two words
vanilla_trigram_probs = {}
for tokens, count in vanilla_trigram_freq.items():
    vanilla_trigram_probs[tokens] = count / vanilla_bigram_freq[(tokens[0], tokens[1])]

In [6]:
## Testing Vanilla Language Model by printing subsets of the model and probability dictionaries

# Print the frequency of the first 10 unigrams in the model
print("\nVanilla unigram model (subset):")
print({k: vanilla_unigram_freq[k] for k in list(vanilla_unigram_freq)[:10]})

# Print the probability of the first 10 unigrams in the model
print("\nVanilla unigram probabilities (subset):")
print({k: vanilla_unigram_probs[k] for k in list(vanilla_unigram_probs)[:10]})
print("-"*127)

# Print the frequency of the first 10 bigrams in the model
print("\nVanilla bigram model (subset):")
print({k: vanilla_bigram_freq[k] for k in list(vanilla_bigram_freq)[:10]})

# Print the probability of the first 10 bigrams in the model
print("\nVanilla bigram probabilities (subset):")
print({k: vanilla_bigram_probs[k] for k in list(vanilla_bigram_probs)[:10]})
print("-"*127)

# Print the frequency of the first 10 trigrams in the model
print("\nVanilla trigram model (subset):")
print({k: vanilla_trigram_freq[k] for k in list(vanilla_trigram_freq)[:10]})

# Print the probability of the first 10 trigrams in the model
print("\nVanilla trigram probabilities (subset):")
print({k: vanilla_trigram_probs[k] for k in list(vanilla_trigram_probs)[:10]})
print("-"*127)


Vanilla unigram model (subset):
{'go': 6404, 'unsure': 14, 'you': 39636, 'treat': 151, 'recollection': 13, 'first': 3675, 'constant': 247, 'be': 20016, 'the': 169361, 'need': 2495}

Vanilla unigram probabilities (subset):
{'go': 0.001999480459793083, 'unsure': 4.371131548579507e-06, 'you': 0.012375297861392668, 'treat': 4.71457759882504e-05, 'recollection': 4.058907866538114e-06, 'first': 0.0011474220315021207, 'constant': 7.711924946422416e-05, 'be': 0.006249469219740529, 'the': 0.052878515014212424, 'need': 0.0007789980866932764}
-------------------------------------------------------------------------------------------------------------------------------

Vanilla bigram model (subset):
{('go', 'unsure'): 1, ('unsure', 'you'): 1, ('you', 'treat'): 2, ('treat', 'recollection'): 1, ('recollection', 'first'): 1, ('first', 'constant'): 1, ('constant', 'be'): 2, ('be', 'the'): 1084, ('the', 'need'): 140, ('need', 'people'): 8}

Vanilla bigram probabilities (subset):
{('go', 'unsure'): 0.

## Laplace Language Model

In [7]:
# Define function to create n-grams from a list of tokens
def create_laplace_ngrams(tokens, n):
    return [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

#-------------------------------------------------------------------------------------------------------------------------------

# Create a Laplace unigram model by iterating over the train_set and counting the frequency of each word
laplace_unigram_freq = {}
laplace_total_unigrams = len(train_set)
for word in train_set:
    if word in laplace_unigram_freq:
        laplace_unigram_freq[word] += 1
    else:
        laplace_unigram_freq[word] = 1

# Calculate Laplace unigram probabilities by adding 1 to the frequency of each word and dividing by the total number of unigrams plus the vocabulary size
laplace_unigram_probs = {}
for word, count in laplace_unigram_freq.items():
    laplace_unigram_probs[word] = (count + 1) / (laplace_total_unigrams + len(laplace_unigram_freq))
    
#-------------------------------------------------------------------------------------------------------------------------------
    
# Create a Laplace bigram model by creating bigrams from the train_set and counting the frequency of each bigram
laplace_bigram_freq = {}
laplace_total_bigrams = len(create_laplace_ngrams(train_set, 2))
laplace_bigram_tokens = create_laplace_ngrams(train_set, 2)
for tokens in laplace_bigram_tokens:
    if tokens in laplace_bigram_freq:
        laplace_bigram_freq[tokens] += 1
    else:
        laplace_bigram_freq[tokens] = 1

# Calculate Laplace bigram probabilities by adding 1 to the frequency of each bigram and dividing by the frequency of its first word plus the vocabulary size
laplace_bigram_probs = {}
for tokens, count in laplace_bigram_freq.items():
    laplace_bigram_probs[tokens] = (count + 1) / (laplace_unigram_freq[tokens[0]] + len(laplace_unigram_freq))
    
#-------------------------------------------------------------------------------------------------------------------------------
    
# Create a Laplace trigram model by creating trigrams from the train_set and counting the frequency of each trigram
laplace_trigram_freq = {}
laplace_total_trigrams = len(create_laplace_ngrams(train_set, 3))
laplace_trigram_tokens = create_laplace_ngrams(train_set, 3)
for tokens in laplace_trigram_tokens:
    if tokens in laplace_trigram_freq:
        laplace_trigram_freq[tokens] += 1
    else:
        laplace_trigram_freq[tokens] = 1

# Calculate Laplace trigram probabilities by adding 1 to the frequency of each trigram and dividing by the frequency of its first two words plus the vocabulary size
laplace_trigram_probs = {}
for tokens, count in laplace_trigram_freq.items():
    laplace_trigram_probs[tokens] = (count + 1) / (laplace_bigram_freq[(tokens[0], tokens[1])] + len(laplace_unigram_freq))

In [8]:
## Testing Laplace Language Model by printing subsets of the model and probability dictionaries

# Print the frequency of the first 10 unigrams in the model
print("\nLaplace unigram model (subset):")
print({k: laplace_unigram_freq[k] for k in list(laplace_unigram_freq)[:10]})

# Print the probability of the first 10 unigrams in the model
print("\nLaplace unigram probabilities (subset):")
print({k: laplace_unigram_probs[k] for k in list(laplace_unigram_probs)[:10]})
print("-"*127)

# Print the frequency of the first 10 bigrams in the model
print("\nLaplace bigram model (subset):")
print({k: laplace_bigram_freq[k] for k in list(laplace_bigram_freq)[:10]})

# Print the probability of the first 10 bigrams in the model
print("\nLaplace bigram probabilities (subset):")
print({k: laplace_bigram_probs[k] for k in list(laplace_bigram_probs)[:10]})
print("-"*127)

# Print the frequency of the first 10 trigrams in the model
print("\nLaplace trigram model (subset):")
print({k: laplace_trigram_freq[k] for k in list(laplace_trigram_freq)[:10]})

# Print the probability of the first 10 trigrams in the model
print("\nLaplace trigram probabilities (subset):")
print({k: laplace_trigram_probs[k] for k in list(laplace_trigram_probs)[:10]}) 
print("-"*127)


Laplace unigram model (subset):
{'go': 6404, 'unsure': 14, 'you': 39636, 'treat': 151, 'recollection': 13, 'first': 3675, 'constant': 247, 'be': 20016, 'the': 169361, 'need': 2495}

Laplace unigram probabilities (subset):
{'go': 0.001959877737900241, 'unsure': 4.58987760632375e-06, 'you': 0.012128598578790298, 'treat': 4.651075974408066e-05, 'recollection': 4.283885765902167e-06, 'first': 0.0011248260053897403, 'constant': 7.588597642455266e-05, 'be': 0.0061250386697188335, 'the': 0.05182339007748019, 'need': 0.0007637556336922719}
-------------------------------------------------------------------------------------------------------------------------------

Laplace bigram model (subset):
{('go', 'unsure'): 1, ('unsure', 'you'): 1, ('you', 'treat'): 2, ('treat', 'recollection'): 1, ('recollection', 'first'): 1, ('first', 'constant'): 1, ('constant', 'be'): 2, ('be', 'the'): 1084, ('the', 'need'): 140, ('need', 'people'): 8}

Laplace bigram probabilities (subset):
{('go', 'unsure'): 2.

## UNK Language model

In [9]:
# Create a new list with <UNK> tokens for words with count less than or equal to 2
unk_train_set = []
word_freq = {}
for word in train_set:
    if word in word_freq:
        word_freq[word] += 1
    else:
        word_freq[word] = 1
    if word_freq[word] <= 2:
        unk_train_set.append('<UNK>')
    else:
        unk_train_set.append(word)

# Create UNK Laplace unigram model
unk_laplace_unigram_freq = {}
unk_laplace_total_unigrams = len(unk_train_set)
for word in unk_train_set:
    if word in unk_laplace_unigram_freq:
        unk_laplace_unigram_freq[word] += 1
    else:
        unk_laplace_unigram_freq[word] = 1

# Calculate UNK Laplace unigram probabilities
unk_laplace_unigram_probs = {}
for word, count in unk_laplace_unigram_freq.items():
    unk_laplace_unigram_probs[word] = (count + 1) / (unk_laplace_total_unigrams + len(unk_laplace_unigram_freq))

#-------------------------------------------------------------------------------------------------------------------------------

# Create UNK Laplace bigram model
unk_laplace_bigram_freq = {}
unk_laplace_total_bigrams = len(create_laplace_ngrams(unk_train_set, 2))
unk_laplace_bigram_tokens = create_laplace_ngrams(unk_train_set, 2)
for tokens in unk_laplace_bigram_tokens:
    if tokens in unk_laplace_bigram_freq:
        unk_laplace_bigram_freq[tokens] += 1
    else:
        unk_laplace_bigram_freq[tokens] = 1

# Calculate UNK Laplace bigram probabilities
unk_laplace_bigram_probs = {}
for tokens, count in unk_laplace_bigram_freq.items():
    unk_laplace_bigram_probs[tokens] = (count + 1) / (unk_laplace_unigram_freq[tokens[0]] + len(unk_laplace_unigram_freq))

#-------------------------------------------------------------------------------------------------------------------------------

# Create UNK Laplace trigram model
unk_laplace_trigram_freq = {}
unk_laplace_total_trigrams = len(create_laplace_ngrams(unk_train_set, 3))
unk_laplace_trigram_tokens = create_laplace_ngrams(unk_train_set, 3)
for tokens in unk_laplace_trigram_tokens:
    if tokens in unk_laplace_trigram_freq:
        unk_laplace_trigram_freq[tokens] += 1
    else:
        unk_laplace_trigram_freq[tokens] = 1

# Calculate UNK Laplace trigram probabilities
unk_laplace_trigram_probs = {}
for tokens, count in unk_laplace_trigram_freq.items():
    unk_laplace_trigram_probs[tokens] = (count + 1) / (unk_laplace_bigram_freq[(tokens[0], tokens[1])] + len(unk_laplace_unigram_freq))

In [10]:
## Testing UNK Language Model by printing subsets of the model and probability dictionaries

# Print the frequency of the first 10 unigrams in the model
print("\nUNK Laplace unigram model (subset):")
print({k: unk_laplace_unigram_freq[k] for k in list(unk_laplace_unigram_freq)[:10]})

# Print the probability of the first 10 unigrams in the model
print("\nUNK Laplace unigram probabilities (subset):")
print({k: unk_laplace_unigram_probs[k] for k in list(unk_laplace_unigram_probs)[:10]})
print("-"*127)

# Print the frequency of the first 10 bigrams in the model
print("\nUNK Laplace bigram model (subset):")
print({k: unk_laplace_bigram_freq[k] for k in list(unk_laplace_bigram_freq)[:10]})

# Print the probability of the first 10 bigrams in the model
print("\nUNK Laplace bigram probabilities (subset):")
print({k: unk_laplace_bigram_probs[k] for k in list(unk_laplace_bigram_probs)[:10]})
print("-"*127)

# Print the frequency of the first 10 trigrams in the model
print("\nUNK Laplace trigram model (subset):")
print({k: unk_laplace_trigram_freq[k] for k in list(unk_laplace_trigram_freq)[:10]})

# Print the probability of the first 10 trigrams in the model
print("\nUNK Laplace trigram probabilities (subset):")
print({k: unk_laplace_trigram_probs[k] for k in list(unk_laplace_trigram_probs)[:10]})
print("-"*127)


UNK Laplace unigram model (subset):
{'<UNK>': 103033, 'wa': 29642, 'the': 169359, 'that': 41636, 'you': 39634, 'to': 76120, 'of': 80989, 'point': 1760, 'no': 12998, 'in': 55586}

UNK Laplace unigram probabilities (subset):
{'<UNK>': 0.03187985446543523, 'wa': 0.009171870702087626, 'the': 0.05240184941151572, 'that': 0.012882946409702882, 'you': 0.01226350555872358, 'to': 0.023552675832864835, 'of': 0.025059198062344464, 'point': 0.0005448727964907841, 'no': 0.004022033777162806, 'in': 0.01719923006163158}
-------------------------------------------------------------------------------------------------------------------------------

UNK Laplace bigram model (subset):
{('<UNK>', '<UNK>'): 7007, ('<UNK>', 'wa'): 998, ('wa', '<UNK>'): 964, ('<UNK>', 'the'): 5528, ('the', '<UNK>'): 5441, ('<UNK>', 'that'): 1279, ('that', '<UNK>'): 1350, ('<UNK>', 'you'): 1251, ('you', '<UNK>'): 1252, ('<UNK>', 'to'): 2448}

UNK Laplace bigram probabilities (subset):
{('<UNK>', '<UNK>'): 0.05303144958682689

## Linear Interpolation

In [11]:
def linear_interpolation(sentence, lm_type):
    # Set the lambda values for each n-gram model
    lambda_3 = 0.6
    lambda_2 = 0.3
    lambda_1 = 0.1
    
    # Tokenize the sentence
    sentence_tokens = sentence.split()
    
    # Calculate the probabilities of each n-gram model for the sentence
    if lm_type == "vanilla":
        unigram_probs = [vanilla_unigram_probs[token] for token in sentence_tokens]
        bigram_probs = [vanilla_bigram_probs.get((sentence_tokens[i-1], sentence_tokens[i]), 0) for i in range(1, len(sentence_tokens))]
        trigram_probs = [vanilla_trigram_probs.get((sentence_tokens[i-2], sentence_tokens[i-1], sentence_tokens[i]), 0) for i in range(2, len(sentence_tokens))]
    elif lm_type == "laplace":
        unigram_probs = [(laplace_unigram_freq.get(token, 0) + 1) / (laplace_total_unigrams + len(laplace_unigram_freq)) for token in sentence_tokens]
        bigram_probs = [(laplace_bigram_freq.get((sentence_tokens[i-1], sentence_tokens[i]), 0) + 1) / (laplace_unigram_freq.get(sentence_tokens[i-1], 0) + len(laplace_unigram_freq)) for i in range(1, len(sentence_tokens))]
        trigram_probs = [(laplace_trigram_freq.get((sentence_tokens[i-2], sentence_tokens[i-1], sentence_tokens[i]), 0) + 1) / (laplace_bigram_freq.get((sentence_tokens[i-2], sentence_tokens[i-1]), 0) + len(laplace_unigram_freq)) for i in range(2, len(sentence_tokens))]
    elif lm_type == "unk":
        # Replace infrequent words with <UNK>
        unk_sentence_tokens = ['<UNK>' if word_freq[token] <= 2 else token for token in sentence_tokens]
        # Calculate the probabilities for the UNK Laplace model
        unk_laplace_unigram_probs = {word: (count + 1) / (unk_laplace_total_unigrams + len(unk_laplace_unigram_freq)) for word, count in unk_laplace_unigram_freq.items()}
        unk_laplace_bigram_probs = {(word1, word2): (count + 1) / (unk_laplace_unigram_freq.get(word1, 0) + len(unk_laplace_unigram_freq)) for (word1, word2), count in unk_laplace_bigram_freq.items()}
        unk_laplace_trigram_probs = {(word1, word2, word3): (count + 1) / (unk_laplace_bigram_freq.get((word1, word2), 0) + len(unk_laplace_unigram_freq)) for (word1, word2, word3), count in unk_laplace_trigram_freq.items()}
        unk_laplace_trigram_probs = {(word1, word2, word3): (count + 1) / (unk_laplace_bigram_freq.get((word1, word2), 0) + len(unk_laplace_unigram_freq)) for (word1, word2, word3), count in unk_laplace_trigram_freq.items()}
        # Replace tokens in sentence with <UNK> where appropriate
        unigram_probs = [unk_laplace_unigram_probs.get(token, 1 / (unk_laplace_total_unigrams + len(unk_laplace_unigram_freq))) for token in unk_sentence_tokens]
        bigram_probs = [(unk_laplace_bigram_freq.get((unk_sentence_tokens[i-1], unk_sentence_tokens[i]), 0) + 1) / (unk_laplace_unigram_freq.get(unk_sentence_tokens[i-1], 0) + len(unk_laplace_unigram_freq)) for i in range(1, len(unk_sentence_tokens))]
        trigram_probs = [(unk_laplace_trigram_probs.get((unk_sentence_tokens[i-2], unk_sentence_tokens[i-1], unk_sentence_tokens[i]), 0) + 1) / (unk_laplace_bigram_freq.get((unk_sentence_tokens[i-2], unk_sentence_tokens[i-1]), 0) + len(unk_laplace_unigram_freq)) for i in range(2, len(unk_sentence_tokens))]
        
    # Calculate the probability of the sentence using linear interpolation
    sentence_prob = 1
    for i in range(len(sentence_tokens)):
        if i == 0:
            sentence_prob *= unigram_probs[i]**lambda_1
        elif i == 1:
            sentence_prob *= (lambda_2*bigram_probs[i-1] + lambda_1*unigram_probs[i])**lambda_1
        else:
            sentence_prob *= (lambda_3*trigram_probs[i-2] + lambda_2*bigram_probs[i-1] + lambda_1*unigram_probs[i])**lambda_1

    return sentence_prob

# Define some test sentences
sentences = ["this is a test sentence"]

# Test the vanilla language model
for sentence in sentences:
    prob = linear_interpolation(sentence, "vanilla")
    print(f"\nVanilla LM: Probability of '{sentence}': {prob:.6f}")

# Test the laplace language model
for sentence in sentences:
    prob = linear_interpolation(sentence, "laplace")
    print(f"Laplace LM: Probability of '{sentence}': {prob:.6f}")

# Test the unk language model
for sentence in sentences:
    prob = linear_interpolation(sentence, "unk")
    print(f"UNK LM: Probability of '{sentence}': {prob:.6f}")


Vanilla LM: Probability of 'this is a test sentence': 0.043030
Laplace LM: Probability of 'this is a test sentence': 0.023936
UNK LM: Probability of 'this is a test sentence': 0.027513


## Evaluation & Perplexity

In [12]:
# This function calculates the perplexity of a given test set using the specified n-gram language model probabilities
def calculate_perplexity(test_set, lm_probs, ngram_type):
    # Get the length of the test set
    N = len(test_set)
    # Initialize the sum of the log probabilities to 0
    log_prob_sum = 0

    # Iterate over the test set and calculate the log probability of each n-gram in the test set
    for i in range(len(test_set)):
        if ngram_type == "unigram":
            # Calculate the log probability of the unigram
            log_prob_sum += math.log(lm_probs.get(test_set[i], 1))
        elif ngram_type == "bigram" and i > 0:
            # Calculate the log probability of the bigram
            log_prob_sum += math.log(lm_probs.get((test_set[i-1], test_set[i]), 1))
        elif ngram_type == "trigram" and i > 1:
            # Calculate the log probability of the trigram
            log_prob_sum += math.log(lm_probs.get((test_set[i-2], test_set[i-1], test_set[i]), 1))

    # Calculate the entropy and perplexity of the test set using the log probabilities
    entropy = -(log_prob_sum / N)
    perplexity = math.pow(2, entropy)

    # Return the perplexity
    return perplexity

# Calculate perplexities for each model and n-gram
vanilla_unigram_perplexity = calculate_perplexity(test_set, vanilla_unigram_probs, "unigram")
vanilla_bigram_perplexity = calculate_perplexity(test_set, vanilla_bigram_probs, "bigram")
vanilla_trigram_perplexity = calculate_perplexity(test_set, vanilla_trigram_probs, "trigram")

laplace_unigram_perplexity = calculate_perplexity(test_set, laplace_unigram_probs, "unigram")
laplace_bigram_perplexity = calculate_perplexity(test_set, laplace_bigram_probs, "bigram")
laplace_trigram_perplexity = calculate_perplexity(test_set, laplace_trigram_probs, "trigram")

unk_unigram_perplexity = calculate_perplexity(test_set, unk_laplace_unigram_probs, "unigram")
unk_bigram_perplexity = calculate_perplexity(test_set, unk_laplace_bigram_probs, "bigram")
unk_trigram_perplexity = calculate_perplexity(test_set, unk_laplace_trigram_probs, "trigram")

# Print the results in a table format
print("\n{:<15} {:<15} {:<15} {:<15}".format("Model", "Unigram", "Bigram", "Trigram"))
print("-" * 60)
print("{:<15} {:<15.2f} {:<15.2f} {:<15.2f}".format("Vanilla", vanilla_unigram_perplexity, vanilla_bigram_perplexity, vanilla_trigram_perplexity))
print("{:<15} {:<15.2f} {:<15.2f} {:<15.2f}".format("Laplace", laplace_unigram_perplexity, laplace_bigram_perplexity, laplace_trigram_perplexity))
print("{:<15} {:<15.2f} {:<15.2f} {:<15.2f}".format("UNK", unk_unigram_perplexity, unk_bigram_perplexity, unk_trigram_perplexity))


Model           Unigram         Bigram          Trigram        
------------------------------------------------------------
Vanilla         120.60          13.61           1.52           
Laplace         121.13          36.27           2.53           
UNK             110.60          27.69           2.34           


## Genetarion

In [13]:
# Define a function to generate a sentence using a language model and a given phrase
def generate_sentence(model, phrase):
    # Print a message indicating which model is being used
    print(f"\nGenerating {model} model...")
    print("-" * 127)
    # Loop through each n-gram type and generate a sentence for each type
    for ngram_type in ["unigram", "bigram", "trigram"]:
        # Generate a sentence using the selected model, phrase, and n-gram type
        generated_sentence = generate_ngram_sentence(model, phrase, ngram_type)
        # Print the generated sentence along with its n-gram type
        print(f"\n{ngram_type.capitalize()}: {generated_sentence}\n")
        print("-" * 127)

# Define a function to generate a sentence using a specific n-gram type and language model
def generate_ngram_sentence(model, phrase, ngram_type):
    # Define start and end tokens for the sentence
    start_token = '<s>'
    end_token = '</s>'
    # Set a maximum length for the sentence
    max_length = 20
    # Initialize the sentence with the start token and the words in the input phrase
    sentence = [start_token] + phrase.split()
    # Initialize the model probabilities to None
    model_probs = None
    
    # Convert the model name to lowercase for easier comparison
    model = model.lower()
    
    # Determine which set of probabilities to use based on the selected model
    if model == 'vanilla':
        model_probs = [vanilla_unigram_probs, vanilla_bigram_probs, vanilla_trigram_probs]
    elif model == 'laplace':
        model_probs = [laplace_unigram_probs, laplace_bigram_probs, laplace_trigram_probs]
    elif model == 'unk':
        model_probs = [unk_laplace_unigram_probs, unk_laplace_bigram_probs, unk_laplace_trigram_probs]
    else:
        # If an invalid model was selected, print an error message and return None
        print('Invalid model selection')
        return

    # Generate the sentence by adding words one at a time until the end token is reached or the maximum length is reached
    while sentence[-1] != end_token and len(sentence) < max_length:
        # Determine the next word to add based on the selected n-gram type
        if ngram_type == "unigram":
            next_word = generate_word(model_probs[0])
        elif ngram_type == "bigram":
            next_word = generate_word(model_probs[1], sentence[-1:])
        elif ngram_type == "trigram":
            if len(sentence) < 2:
                next_word = generate_word(model_probs[1], sentence[-1:])
            else:
                next_word = generate_word(model_probs[2], sentence[-2:])
        # Add the next word to the sentence
        sentence.append(next_word)

    # Return the generated sentence, excluding the start token
    return ' '.join(sentence[1:])

# Define a function to generate a word given a probability distribution
def generate_word(model_prob, context=None):
    # If no context is provided, use the full vocabulary and corresponding probabilities
    if context is None:
        words, probs = zip(*[(word, prob) for word, prob in model_prob.items()])
    # If a context is provided, use only the vocabulary and probabilities corresponding to that context
    else:
        words, probs = zip(*[(word, model_prob.get(tuple(context + [word]), 0)) for word in vocab])
        
    # Cast the probabilities to a numpy array of float64 data type    
    probs = np.array(probs, dtype=np.float64)
    epsilon = 1e-10
    probs = np.add(probs, epsilon)  # Add a small constant to the probabilities
    probs /= probs.sum()
    next_word = np.random.choice(words, p=probs)
    return next_word

# Ask the user for the model and phrase input
model_input = input('\nWhich language model would you like to use? (Vanilla, Laplace, UNK): ').lower()  # Convert to lowercase
phrase_input = input('Please enter a phrase: ')

# Generate the sentence using the selected model and input phrase
generate_sentence(model_input, phrase_input)


Which language model would you like to use? (Vanilla, Laplace, UNK): unk
Please enter a phrase: lets go 

Generating unk model...
-------------------------------------------------------------------------------------------------------------------------------

Unigram: lets go false supportive if the of david go to ve than look that innate my ε make touch

-------------------------------------------------------------------------------------------------------------------------------

Bigram: lets go gi have he september mcillvanney the that he are few this junior bum at eating in spur

-------------------------------------------------------------------------------------------------------------------------------

Trigram: lets go entertainingly manacle caithness commend communication punctuate carton intradermal crested mansuetae unfallen piercy psst super upholsterer strittmatter puddling

---------------------------------------------------------------------------------------------------

In [14]:
## Test the sentence generation function with different models and phrases
#models = ['Vanilla', 'Laplace', 'UNK']
#phrases = ['this is a', 'i love', 'the weather is']
#
#for model in models:
#    for phrase in phrases:
#        generate_sentence(model, phrase)
#        print("-" * 125)

## Sen_Probability

In [15]:
def sen_probability(sentence, model_name, ngram_type):
    # Tokenize sentence into words
    sentence_tokens = sentence.lower().split()

    # Initialize dictionaries to store sentence probabilities
    sentence_probs = {
        "Vanilla": {"unigram": 1, "bigram": None, "trigram": None},
        "Laplace": {"unigram": 1, "bigram": None, "trigram": None},
        "UNK": {"unigram": 1, "bigram": None, "trigram": None}
    }

    # Calculate probabilities for each model and n-gram type
    for i in range(len(sentence_tokens)):
        # Iterate through each model and corresponding probabilities
        for model_name, model_probs in [("Vanilla", (vanilla_unigram_probs, vanilla_bigram_probs, vanilla_trigram_probs)),
                                        ("Laplace", (laplace_unigram_probs, laplace_bigram_probs, laplace_trigram_probs)),
                                        ("UNK", (unk_laplace_unigram_probs, unk_laplace_bigram_probs, unk_laplace_trigram_probs))]:
            # For the first word, calculate the unigram probability and multiply it by the running total
            if i == 0:
                sentence_probs[model_name]["unigram"] *= model_probs[0].get(sentence_tokens[i], 1)
            # For the second word, calculate the bigram probability and multiply it by the running total
            elif i == 1:
                if sentence_probs[model_name]["bigram"] is None:
                    sentence_probs[model_name]["bigram"] = 1
                sentence_probs[model_name]["bigram"] *= model_probs[1].get((sentence_tokens[i-1], sentence_tokens[i]), 1)
            # For all subsequent words, calculate the trigram probability and multiply it by the running total
            else:
                if sentence_probs[model_name]["trigram"] is None:
                    sentence_probs[model_name]["trigram"] = 1
                sentence_probs[model_name]["trigram"] *= model_probs[2].get((sentence_tokens[i-2], sentence_tokens[i-1], sentence_tokens[i]), 1)

    # Print probabilities for the specified model and n-gram type
    print(f"\nSentence: {sentence}")
    print(f"{model_name} {ngram_type} sentence probability:", sentence_probs[model_name][ngram_type])

# Test the sen_probability function with a sample sentence, model, and n-gram type
sentence = "lets go to the bank"
model_name = "Vanilla"
ngram_type = "bigram"

prob = sen_probability(sentence, model_name, ngram_type)


Sentence: lets go to the bank
UNK bigram sentence probability: 1


## Computation

In [16]:
# Record end time of token generation
generation_end = datetime.now()

# Calculate the time taken to generate tokens
generation_time = generation_end - generation_start

# Print the time taken to generate tokens
print('\nGeneration Time(HH::MM:SS:ms) - {}\n\n'.format(generation_time))

# Print the memory used in GB
print("Memory Use: {:.6f} GB".format(RAMusage()))


Generation Time(HH::MM:SS:ms) - 0:22:24.800334


Memory Use: 2.711678 GB
