# <u> ICS2203 – Statistical Natural Language Processing <u>
## <u> Building a Language Model – Part I <u>

## Imports

In [1]:
# standard library imports
import os
import re
import math
import time
from datetime import datetime
from collections import Counter

# third-party library imports
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from spellchecker import SpellChecker
from sklearn.model_selection import train_test_split
import xml.etree.ElementTree as ET

## Extracting and Preprocessing the Selected Corpus

In [2]:
corpus_dir_path = 'corpus'
tokens = []
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')

generation_start = datetime.now()

for root_dir, subdirs, files in os.walk(corpus_dir_path):
    for filename in files:
        if filename.endswith('.xml'):
            tree = ET.parse(os.path.join(root_dir, filename))
            root = tree.getroot()
            text = ''
            for element in root.iter():
                if element.text is not None:
                    text += element.text + ' '

            text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
            text = text.lower()
            file_tokens = tokenizer.tokenize(text)

            # Remove stopwords and lemmatize tokens
            file_tokens = [lemmatizer.lemmatize(token) for token in file_tokens if token not in stop_words]

            tokens.extend(file_tokens)

generation_end = datetime.now()
generation_time = generation_end - generation_start

def RAMusage():
    pid = os.getpid()
    py = psutil.Process(pid)
    memoryUse = py.memory_info()[0] / 2.**30
    return memoryUse

# Output list of the first 100 tokens
print(tokens[:100])

['oxford', 'art', 'journal', 'sample', 'containing', '26310', 'word', 'periodical', 'domain', 'art', 'data', 'capture', 'transcription', 'oxford', 'university', 'press', 'bnc', 'xml', 'edition', 'december', '2006', '26310', 'token', '26548', 'w', 'unit', '1082', 'unit', 'distributed', 'licence', 'oxford', 'university', 'computing', 'service', 'behalf', 'bnc', 'consortium', 'material', 'protected', 'international', 'copyright', 'law', 'may', 'copied', 'redistributed', 'way', 'consult', 'bnc', 'web', 'site', 'full', 'licencing', 'distribution', 'condition', 'a6u', 'artjnl', 'oxford', 'art', 'journal', 'oxford', 'university', 'press', 'oxford', '1991', 'w', 'ac', 'humanity', 'art', 'art', 'tag', 'usage', 'updated', 'bnc', 'xml', 'last', 'check', 'bnc', 'world', 'first', 'release', 'redo', 'tagusage', 'table', 'check', 'tagcounts', 'resequenced', 'unit', 'added', 'header', 'added', 'date', 'info', 'updated', 'catrefs', 'updated', 'source', 'title', 'updated', 'title', 'corrected']
Total si

## Computation

In [3]:
print("Total size of corpus: "+str(len(tokens))+" words")
print('Generation Time(HH::MM:SS:ms) - {}\n\n'.format(generation_time))
print("Memory Use: {:.6f} GB".format(RAMusage()))

Total size of corpus: 2093188 words
Generation Time(HH::MM:SS:ms) - 0:00:20.412599


Memory Use: 0.362892 GB


## Split the corpus tokens into test set and train set

In [4]:
# Split the words into train and test sets
train_set, test_set = train_test_split(tokens, test_size=0.2)

# Define the vocabulary from the training set
vocab = set()
for sentence in train_set:
    for word in sentence.split():
        vocab.add(word)
vocab = list(vocab)
vocab.sort()

# Print the size of the train and test sets
print("Train set size:", len(train_set))
print("Test set size:", len(test_set))

Train set size: 1674550
Test set size: 418638


## Vanilla Language Model

In [5]:
# Define function to create n-grams from tokens
def create_vanilla_ngrams(tokens, n):
    return [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

#-------------------------------------------------------------------------------------------------------------------------------

# Create vanilla unigram model
vanilla_unigram_freq = {}
vanilla_total_unigrams = len(train_set)
for word in train_set:
    if word in vanilla_unigram_freq:
        vanilla_unigram_freq[word] += 1
    else:
        vanilla_unigram_freq[word] = 1

# Calculate vanilla unigram probabilities
vanilla_unigram_probs = {}
for word, count in vanilla_unigram_freq.items():
    vanilla_unigram_probs[word] = count / vanilla_total_unigrams
    
#-------------------------------------------------------------------------------------------------------------------------------

# Create vanilla bigram model
vanilla_bigram_freq = {}
vanilla_total_bigrams = len(create_vanilla_ngrams(train_set, 2))
vanilla_bigram_tokens = create_vanilla_ngrams(train_set, 2)
for tokens in vanilla_bigram_tokens:
    if tokens in vanilla_bigram_freq:
        vanilla_bigram_freq[tokens] += 1
    else:
        vanilla_bigram_freq[tokens] = 1

# Calculate vanilla bigram probabilities
vanilla_bigram_probs = {}
for tokens, count in vanilla_bigram_freq.items():
    vanilla_bigram_probs[tokens] = count / vanilla_unigram_freq[tokens[0]]
    
#-------------------------------------------------------------------------------------------------------------------------------

# Create vanilla trigram model
vanilla_trigram_freq = {}
vanilla_total_trigrams = len(create_vanilla_ngrams(train_set, 3))
vanilla_trigram_tokens = create_vanilla_ngrams(train_set, 3)
for tokens in vanilla_trigram_tokens:
    if tokens in vanilla_trigram_freq:
        vanilla_trigram_freq[tokens] += 1
    else:
        vanilla_trigram_freq[tokens] = 1

# Calculate vanilla trigram probabilities
vanilla_trigram_probs = {}
for tokens, count in vanilla_trigram_freq.items():
    vanilla_trigram_probs[tokens] = count / vanilla_bigram_freq[(tokens[0], tokens[1])]

In [6]:
# Testing Vanilla Language Model
print("Vanilla unigram model (subset):")
print({k: vanilla_unigram_freq[k] for k in list(vanilla_unigram_freq)[:10]})
print("\nVanilla unigram probabilities (subset):")
print({k: vanilla_unigram_probs[k] for k in list(vanilla_unigram_probs)[:10]})
print("\nVanilla bigram model (subset):")
print({k: vanilla_bigram_freq[k] for k in list(vanilla_bigram_freq)[:10]})
print("\nVanilla bigram probabilities (subset):")
print({k: vanilla_bigram_probs[k] for k in list(vanilla_bigram_probs)[:10]})
print("\nVanilla trigram model (subset):")
print({k: vanilla_trigram_freq[k] for k in list(vanilla_trigram_freq)[:10]})
print("\nVanilla trigram probabilities (subset):")
print({k: vanilla_trigram_probs[k] for k in list(vanilla_trigram_probs)[:10]})

Vanilla unigram model (subset):
{'cage': 35, 'could': 6133, 'forgotten': 155, 'house': 2053, 'show': 1177, 'scarcity': 12, 'second': 1502, 'exchequer': 14, 'round': 1601, 'tag': 191}

Vanilla unigram probabilities (subset):
{'cage': 2.090113761906184e-05, 'could': 0.0036624764862201787, 'forgotten': 9.256218088441671e-05, 'house': 0.001226001015198113, 'show': 0.0007028753993610224, 'scarcity': 7.166104326535487e-06, 'second': 0.0008969573915380252, 'exchequer': 8.360455047624735e-06, 'round': 0.0009560777522319429, 'tag': 0.00011406049386402317}

Vanilla bigram model (subset):
{('cage', 'could'): 1, ('could', 'forgotten'): 1, ('forgotten', 'house'): 1, ('house', 'show'): 1, ('show', 'scarcity'): 1, ('scarcity', 'second'): 1, ('second', 'exchequer'): 1, ('exchequer', 'round'): 1, ('round', 'tag'): 1, ('tag', 'beside'): 1}

Vanilla bigram probabilities (subset):
{('cage', 'could'): 0.02857142857142857, ('could', 'forgotten'): 0.00016305233980107615, ('forgotten', 'house'): 0.00645161290

## Laplace Language Model

In [7]:
# Define function to create n-grams from tokens
def create_laplace_ngrams(tokens, n):
    return [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

#-------------------------------------------------------------------------------------------------------------------------------

# Create Laplace unigram model
laplace_unigram_freq = {}
laplace_total_unigrams = len(train_set)
for word in train_set:
    if word in laplace_unigram_freq:
        laplace_unigram_freq[word] += 1
    else:
        laplace_unigram_freq[word] = 1

# Calculate Laplace unigram probabilities
laplace_unigram_probs = {}
for word, count in laplace_unigram_freq.items():
    laplace_unigram_probs[word] = (count + 1) / (laplace_total_unigrams + len(laplace_unigram_freq))
    
#-------------------------------------------------------------------------------------------------------------------------------
    
# Create Laplace bigram model
laplace_bigram_freq = {}
laplace_total_bigrams = len(create_laplace_ngrams(train_set, 2))
laplace_bigram_tokens = create_laplace_ngrams(train_set, 2)
for tokens in laplace_bigram_tokens:
    if tokens in laplace_bigram_freq:
        laplace_bigram_freq[tokens] += 1
    else:
        laplace_bigram_freq[tokens] = 1

# Calculate Laplace bigram probabilities
laplace_bigram_probs = {}
for tokens, count in laplace_bigram_freq.items():
    laplace_bigram_probs[tokens] = (count + 1) / (laplace_unigram_freq[tokens[0]] + len(laplace_unigram_freq))
    
#-------------------------------------------------------------------------------------------------------------------------------
    
# Create Laplace trigram model
laplace_trigram_freq = {}
laplace_total_trigrams = len(create_laplace_ngrams(train_set, 3))
laplace_trigram_tokens = create_laplace_ngrams(train_set, 3)
for tokens in laplace_trigram_tokens:
    if tokens in laplace_trigram_freq:
        laplace_trigram_freq[tokens] += 1
    else:
        laplace_trigram_freq[tokens] = 1

# Calculate Laplace trigram probabilities
laplace_trigram_probs = {}
for tokens, count in laplace_trigram_freq.items():
    laplace_trigram_probs[tokens] = (count + 1) / (laplace_bigram_freq[(tokens[0], tokens[1])] + len(laplace_unigram_freq))

In [8]:
# Testing Laplace Language Model
print("Laplace unigram model (subset):")
print({k: laplace_unigram_freq[k] for k in list(laplace_unigram_freq)[:10]})
print("\nLaplace unigram probabilities (subset):")
print({k: laplace_unigram_probs[k] for k in list(laplace_unigram_probs)[:10]})
print("\nLaplace bigram model (subset):")
print({k: laplace_bigram_freq[k] for k in list(laplace_bigram_freq)[:10]})
print("\nLaplace bigram probabilities (subset):")
print({k: laplace_bigram_probs[k] for k in list(laplace_bigram_probs)[:10]})
print("\nLaplace trigram model (subset):")
print({k: laplace_trigram_freq[k] for k in list(laplace_trigram_freq)[:10]})
print("\nLaplace trigram probabilities (subset):")
print({k: laplace_trigram_probs[k] for k in list(laplace_trigram_probs)[:10]})

Laplace unigram model (subset):
{'cage': 35, 'could': 6133, 'forgotten': 155, 'house': 2053, 'show': 1177, 'scarcity': 12, 'second': 1502, 'exchequer': 14, 'round': 1601, 'tag': 191}

Laplace unigram probabilities (subset):
{'cage': 2.078286752769317e-05, 'could': 0.0035411697059686085, 'forgotten': 9.005909262000373e-05, 'house': 0.0011857780528300493, 'show': 0.0006800616096561821, 'scarcity': 7.5049243850003115e-06, 'second': 0.0008676847192811899, 'exchequer': 8.659528136538822e-06, 'round': 0.0009248376049823461, 'tag': 0.00011084196014769691}

Laplace bigram model (subset):
{('cage', 'could'): 1, ('could', 'forgotten'): 1, ('forgotten', 'house'): 1, ('house', 'show'): 1, ('show', 'scarcity'): 1, ('scarcity', 'second'): 1, ('second', 'exchequer'): 1, ('exchequer', 'round'): 1, ('round', 'tag'): 1, ('tag', 'beside'): 1}

Laplace bigram probabilities (subset):
{('cage', 'could'): 3.467346266534908e-05, ('could', 'forgotten'): 3.1358284074695435e-05, ('forgotten', 'house'): 3.4601477

## UNK Language model

In [9]:
# Create a new list with <UNK> tokens for words with count less than or equal to 2
unk_train_set = []
word_freq = {}
for word in train_set:
    if word in word_freq:
        word_freq[word] += 1
    else:
        word_freq[word] = 1
    if word_freq[word] <= 2:
        unk_train_set.append('<UNK>')
    else:
        unk_train_set.append(word)

# Create UNK Laplace unigram model
unk_laplace_unigram_freq = {}
unk_laplace_total_unigrams = len(unk_train_set)
for word in unk_train_set:
    if word in unk_laplace_unigram_freq:
        unk_laplace_unigram_freq[word] += 1
    else:
        unk_laplace_unigram_freq[word] = 1

# Calculate UNK Laplace unigram probabilities
unk_laplace_unigram_probs = {}
for word, count in unk_laplace_unigram_freq.items():
    unk_laplace_unigram_probs[word] = (count + 1) / (unk_laplace_total_unigrams + len(unk_laplace_unigram_freq))

#-------------------------------------------------------------------------------------------------------------------------------

# Create UNK Laplace bigram model
unk_laplace_bigram_freq = {}
unk_laplace_total_bigrams = len(create_laplace_ngrams(unk_train_set, 2))
unk_laplace_bigram_tokens = create_laplace_ngrams(unk_train_set, 2)
for tokens in unk_laplace_bigram_tokens:
    if tokens in unk_laplace_bigram_freq:
        unk_laplace_bigram_freq[tokens] += 1
    else:
        unk_laplace_bigram_freq[tokens] = 1

# Calculate UNK Laplace bigram probabilities
unk_laplace_bigram_probs = {}
for tokens, count in unk_laplace_bigram_freq.items():
    unk_laplace_bigram_probs[tokens] = (count + 1) / (unk_laplace_unigram_freq[tokens[0]] + len(unk_laplace_unigram_freq))

#-------------------------------------------------------------------------------------------------------------------------------

# Create UNK Laplace trigram model
unk_laplace_trigram_freq = {}
unk_laplace_total_trigrams = len(create_laplace_ngrams(unk_train_set, 3))
unk_laplace_trigram_tokens = create_laplace_ngrams(unk_train_set, 3)
for tokens in unk_laplace_trigram_tokens:
    if tokens in unk_laplace_trigram_freq:
        unk_laplace_trigram_freq[tokens] += 1
    else:
        unk_laplace_trigram_freq[tokens] = 1

# Calculate UNK Laplace trigram probabilities
unk_laplace_trigram_probs = {}
for tokens, count in unk_laplace_trigram_freq.items():
    unk_laplace_trigram_probs[tokens] = (count + 1) / (unk_laplace_bigram_freq[(tokens[0], tokens[1])] + len(unk_laplace_unigram_freq))

In [10]:
# Testing UNK Language Model
print("UNK Laplace unigram model (subset):")
print({k: unk_laplace_unigram_freq[k] for k in list(unk_laplace_unigram_freq)[:10]})
print("\nUNK Laplace unigram probabilities (subset):")
print({k: unk_laplace_unigram_probs[k] for k in list(unk_laplace_unigram_probs)[:10]})
print("\nUNK Laplace bigram model (subset):")
print({k: unk_laplace_bigram_freq[k] for k in list(unk_laplace_bigram_freq)[:10]})
print("\nUNK Laplace bigram probabilities (subset):")
print({k: unk_laplace_bigram_probs[k] for k in list(unk_laplace_bigram_probs)[:10]})
print("\nUNK Laplace trigram model (subset):")
print({k: unk_laplace_trigram_freq[k] for k in list(unk_laplace_trigram_freq)[:10]})
print("\nUNK Laplace trigram probabilities (subset):")
print({k: unk_laplace_trigram_probs[k] for k in list(unk_laplace_trigram_probs)[:10]})

UNK Laplace unigram model (subset):
{'<UNK>': 93967, 'well': 9356, 'let': 1643, 'house': 2051, 'day': 2934, 'said': 10439, 'would': 8178, 'could': 6131, 'one': 13665, 'home': 2667}

UNK Laplace unigram probabilities (subset):
{'<UNK>': 0.05517494063738189, 'well': 0.0054941248035925245, 'let': 0.000965303107524432, 'house': 0.00120486738238451, 'day': 0.0017233361439076692, 'said': 0.006130027033184349, 'would': 0.004802441676668084, 'could': 0.0036005101309852902, 'one': 0.008024228873131927, 'home': 0.0015665624640360005}

UNK Laplace bigram model (subset):
{('<UNK>', '<UNK>'): 12144, ('<UNK>', 'well'): 524, ('well', '<UNK>'): 533, ('<UNK>', 'let'): 102, ('let', '<UNK>'): 99, ('<UNK>', 'house'): 105, ('house', '<UNK>'): 122, ('<UNK>', 'day'): 147, ('day', '<UNK>'): 156, ('<UNK>', 'said'): 591}

UNK Laplace bigram probabilities (subset):
{('<UNK>', '<UNK>'): 0.0991355737129517, ('<UNK>', 'well'): 0.0042853994400411394, ('well', '<UNK>'): 0.014090453322075043, ('<UNK>', 'let'): 0.00084

## Linear Interpolation

In [11]:
def linear_interpolation(sentence, lm_type):
    # Set the lambda values for each n-gram model
    lambda_3 = 0.6
    lambda_2 = 0.3
    lambda_1 = 0.1
    
    # Tokenize the sentence
    sentence_tokens = sentence.split()
    
    # Calculate the probabilities of each n-gram model for the sentence
    if lm_type == "vanilla":
        unigram_probs = [vanilla_unigram_probs[token] for token in sentence_tokens]
        bigram_probs = [vanilla_bigram_probs.get((sentence_tokens[i-1], sentence_tokens[i]), 0) for i in range(1, len(sentence_tokens))]
        trigram_probs = [vanilla_trigram_probs.get((sentence_tokens[i-2], sentence_tokens[i-1], sentence_tokens[i]), 0) for i in range(2, len(sentence_tokens))]
    elif lm_type == "laplace":
        unigram_probs = [(laplace_unigram_freq.get(token, 0) + 1) / (laplace_total_unigrams + len(laplace_unigram_freq)) for token in sentence_tokens]
        bigram_probs = [(laplace_bigram_freq.get((sentence_tokens[i-1], sentence_tokens[i]), 0) + 1) / (laplace_unigram_freq.get(sentence_tokens[i-1], 0) + len(laplace_unigram_freq)) for i in range(1, len(sentence_tokens))]
        trigram_probs = [(laplace_trigram_freq.get((sentence_tokens[i-2], sentence_tokens[i-1], sentence_tokens[i]), 0) + 1) / (laplace_bigram_freq.get((sentence_tokens[i-2], sentence_tokens[i-1]), 0) + len(laplace_unigram_freq)) for i in range(2, len(sentence_tokens))]
    elif lm_type == "unk":
        unk_sentence_tokens = ['<UNK>' if word_freq[token] <= 2 else token for token in sentence_tokens]
        unk_laplace_unigram_probs = {word: (count + 1) / (unk_laplace_total_unigrams + len(unk_laplace_unigram_freq)) for word, count in unk_laplace_unigram_freq.items()}
        unk_laplace_bigram_probs = {(word1, word2): (count + 1) / (unk_laplace_unigram_freq.get(word1, 0) + len(unk_laplace_unigram_freq)) for (word1, word2), count in unk_laplace_bigram_freq.items()}
        unk_laplace_trigram_probs = {(word1, word2, word3): (count + 1) / (unk_laplace_bigram_freq.get((word1, word2), 0) + len(unk_laplace_unigram_freq)) for (word1, word2, word3), count in unk_laplace_trigram_freq.items()}
        unk_laplace_trigram_probs = {(word1, word2, word3): (count + 1) / (unk_laplace_bigram_freq.get((word1, word2), 0) + len(unk_laplace_unigram_freq)) for (word1, word2, word3), count in unk_laplace_trigram_freq.items()}
        unigram_probs = [unk_laplace_unigram_probs.get(token, 1 / (unk_laplace_total_unigrams + len(unk_laplace_unigram_freq))) for token in unk_sentence_tokens]
        bigram_probs = [(unk_laplace_bigram_freq.get((unk_sentence_tokens[i-1], unk_sentence_tokens[i]), 0) + 1) / (unk_laplace_unigram_freq.get(unk_sentence_tokens[i-1], 0) + len(unk_laplace_unigram_freq)) for i in range(1, len(unk_sentence_tokens))]
        trigram_probs = [(unk_laplace_trigram_probs.get((unk_sentence_tokens[i-2], unk_sentence_tokens[i-1], unk_sentence_tokens[i]), 0) + 1) / (unk_laplace_bigram_freq.get((unk_sentence_tokens[i-2], unk_sentence_tokens[i-1]), 0) + len(unk_laplace_unigram_freq)) for i in range(2, len(unk_sentence_tokens))]
        
    # Calculate the probability of the sentence using linear interpolation
    sentence_prob = 1
    for i in range(len(sentence_tokens)):
        if i == 0:
            sentence_prob *= unigram_probs[i]**lambda_1
        elif i == 1:
            sentence_prob *= (lambda_2*bigram_probs[i-1] + lambda_1*unigram_probs[i])**lambda_1
        else:
            sentence_prob *= (lambda_3*trigram_probs[i-2] + lambda_2*bigram_probs[i-1] + lambda_1*unigram_probs[i])**lambda_1

    return sentence_prob

## Evaluation

In [12]:
# Iterate through sentences in test set
for sentence in test_set:
    # Tokenize sentence into words
    sentence_tokens = sentence.lower().split()

    # Initialize dictionaries to store sentence probabilities
    sentence_probs = {
        "Vanilla": {"unigram": 1, "bigram": None, "trigram": None},
        "Laplace": {"unigram": 1, "bigram": None, "trigram": None},
        "UNK": {"unigram": 1, "bigram": None, "trigram": None}
    }

    # Calculate probabilities for each model and n-gram type
    for i in range(len(sentence_tokens)):
        for model_name, model_probs in [("Vanilla", (vanilla_unigram_probs, vanilla_bigram_probs, vanilla_trigram_probs)),
                                        ("Laplace", (laplace_unigram_probs, laplace_bigram_probs, laplace_trigram_probs)),
                                        ("UNK", (unk_laplace_unigram_probs, unk_laplace_bigram_probs, unk_laplace_trigram_probs))]:
            if i == 0:
                sentence_probs[model_name]["unigram"] *= model_probs[0].get(sentence_tokens[i], 1)
            elif i == 1:
                if sentence_probs[model_name]["bigram"] is None:
                    sentence_probs[model_name]["bigram"] = 1
                sentence_probs[model_name]["bigram"] *= model_probs[1].get((sentence_tokens[i-1], sentence_tokens[i]), 1)
            else:
                if sentence_probs[model_name]["trigram"] is None:
                    sentence_probs[model_name]["trigram"] = 1
                sentence_probs[model_name]["trigram"] *= model_probs[2].get((sentence_tokens[i-2], sentence_tokens[i-1], sentence_tokens[i]), 1)

    # Print probabilities for each model for current sentence
    #print("Sentence:", sentence)
    #for model_name in sentence_probs:
    #    print(f"{model_name} unigram sentence probability:", sentence_probs[model_name]["unigram"])
    #    if sentence_probs[model_name]["bigram"] is not None:
    #        print(f"{model_name} bigram sentence probability:", sentence_probs[model_name]["bigram"])
    #    else:
    #        print(f"{model_name} bigram sentence probability: N/A")
    #    if sentence_probs[model_name]["trigram"] is not None:
    #        print(f"{model_name} trigram sentence probability:", sentence_probs[model_name]["trigram"])
    #    else:
    #        print(f"{model_name} trigram sentence probability: N/A")
    #print()

## Perplexity

In [13]:
def calculate_perplexity(test_set, lm_probs, ngram_type):
    N = len(test_set)
    log_prob_sum = 0

    for i in range(len(test_set)):
        if ngram_type == "unigram":
            log_prob_sum += math.log(lm_probs.get(test_set[i], 1))
        elif ngram_type == "bigram" and i > 0:
            log_prob_sum += math.log(lm_probs.get((test_set[i-1], test_set[i]), 1))
        elif ngram_type == "trigram" and i > 1:
            log_prob_sum += math.log(lm_probs.get((test_set[i-2], test_set[i-1], test_set[i]), 1))

    entropy = -(log_prob_sum / N)
    perplexity = math.pow(2, entropy)
    return perplexity

# Calculate perplexities for each model and n-gram
vanilla_unigram_perplexity = calculate_perplexity(test_set, vanilla_unigram_probs, "unigram")
vanilla_bigram_perplexity = calculate_perplexity(test_set, vanilla_bigram_probs, "bigram")
vanilla_trigram_perplexity = calculate_perplexity(test_set, vanilla_trigram_probs, "trigram")

laplace_unigram_perplexity = calculate_perplexity(test_set, laplace_unigram_probs, "unigram")
laplace_bigram_perplexity = calculate_perplexity(test_set, laplace_bigram_probs, "bigram")
laplace_trigram_perplexity = calculate_perplexity(test_set, laplace_trigram_probs, "trigram")

unk_unigram_perplexity = calculate_perplexity(test_set, unk_laplace_unigram_probs, "unigram")
unk_bigram_perplexity = calculate_perplexity(test_set, unk_laplace_bigram_probs, "bigram")
unk_trigram_perplexity = calculate_perplexity(test_set, unk_laplace_trigram_probs, "trigram")

# Print the results in a table format
print("{:<15} {:<15} {:<15} {:<15}".format("Model", "Unigram", "Bigram", "Trigram"))
print("-" * 60)
print("{:<15} {:<15.2f} {:<15.2f} {:<15.2f}".format("Vanilla", vanilla_unigram_perplexity, vanilla_bigram_perplexity, vanilla_trigram_perplexity))
print("{:<15} {:<15.2f} {:<15.2f} {:<15.2f}".format("Laplace", laplace_unigram_perplexity, laplace_bigram_perplexity, laplace_trigram_perplexity))
print("{:<15} {:<15.2f} {:<15.2f} {:<15.2f}".format("UNK", unk_unigram_perplexity, unk_bigram_perplexity, unk_trigram_perplexity))

Model           Unigram         Bigram          Trigram        
------------------------------------------------------------
Vanilla         335.19          3.08            1.00           
Laplace         337.23          5.38            1.01           
UNK             292.09          4.76            1.01           


## Genetarion

In [14]:
def generate_sentence(model, phrase):
    print(f"Generating {model} model...")
    print("GENERATED", model.upper(), "SENTENCES:")
    for ngram_type in ["unigram", "bigram", "trigram"]:
        generated_sentence = generate_ngram_sentence(model, phrase, ngram_type)
        print(f"{ngram_type.capitalize()}: {generated_sentence}")

def generate_ngram_sentence(model, phrase, ngram_type):
    start_token = '<s>'
    end_token = '</s>'
    max_length = 20
    sentence = [start_token] + phrase.split()
    model_probs = None
    
    if model == 'Vanilla':
        model_probs = [vanilla_unigram_probs, vanilla_bigram_probs, vanilla_trigram_probs]
    elif model == 'Laplace':
        model_probs = [laplace_unigram_probs, laplace_bigram_probs, laplace_trigram_probs]
    elif model == 'UNK':
        model_probs = [unk_laplace_unigram_probs, unk_laplace_bigram_probs, unk_laplace_trigram_probs]
    else:
        print('Invalid model selection')
        return

    while sentence[-1] != end_token and len(sentence) < max_length:
        if ngram_type == "unigram":
            next_word = generate_word(model_probs[0])
        elif ngram_type == "bigram":
            next_word = generate_word(model_probs[1], sentence[-1:])
        elif ngram_type == "trigram":
            if len(sentence) < 2:
                next_word = generate_word(model_probs[1], sentence[-1:])
            else:
                next_word = generate_word(model_probs[2], sentence[-2:])
        sentence.append(next_word)

    return ' '.join(sentence[1:])

def generate_word(model_prob, context=None):
    if context is None:
        words, probs = zip(*[(word, prob) for word, prob in model_prob.items()])
    else:
        words, probs = zip(*[(word, model_prob.get(tuple(context + [word]), 0)) for word in vocab])

    probs = np.array(probs, dtype=np.float64)  # Cast to float64 data type
    epsilon = 1e-10
    probs = np.add(probs, epsilon)  # Add a small constant to the probabilities
    probs /= probs.sum()
    next_word = np.random.choice(words, p=probs)
    return next_word

# Ask the user for the model and phrase input
model_input = input('Which language model would you like to use? (Vanilla, Laplace, UNK): ')
phrase_input = input('Please enter a phrase: ')

# Generate the sentence using the selected model and input phrase
generate_sentence(model_input, phrase_input)

Which language model would you like to use? (Vanilla, Laplace, UNK): Vanilla
Please enter a phrase: they all went
Generating Vanilla model...
GENERATED VANILLA SENTENCES:
Unigram: they all went er hall funny princess cut data political red plaintiff preece open ireland darlington looked consent location
Bigram: they all went equal love arsenal u heard dodgy avoiding different unit solve want especially mum employer rookie school
Trigram: they all went queueing 22lb demoiselle turban disintegrated ramos alistair unladen encrusting crenistria valdez fossil fart offer chuba 191


In [15]:
# Test the sentence generation function with different models and phrases
#models = ['Vanilla', 'Laplace', 'UNK']
#phrases = ['this is a', 'i love', 'the weather is']
#
#for model in models:
#    for phrase in phrases:
#        generate_sentence(model, phrase)
#        print()

## Sen_Probability

In [16]:
def sen_probability(sentence, model):
    unigram_probs, bigram_probs, trigram_probs = None, None, None
    if model == 'Vanilla':
        unigram_probs, bigram_probs, trigram_probs = vanilla_unigram_probs, vanilla_bigram_probs, vanilla_trigram_probs
    elif model == 'Laplace':
        unigram_probs, bigram_probs, trigram_probs = laplace_unigram_probs, laplace_bigram_probs, laplace_trigram_probs
    elif model == 'UNK':
        unigram_probs, bigram_probs, trigram_probs = unk_laplace_unigram_probs, unk_laplace_bigram_probs, unk_laplace_trigram_probs
    else:
        print('Invalid model selection')
        return

    tokens = ['<s>'] + sentence.split() + ['</s>']
    prob = 1.0

    for i in range(1, len(tokens)):
        if i == 1:
            prob *= unigram_probs.get(tokens[i], 0)
        else:
            bigram = (tokens[i-1], tokens[i])
            bigram_prob = bigram_probs.get(bigram, 0)

            if i >= 2:
                trigram = (tokens[i-2], tokens[i-1], tokens[i])
                trigram_prob = trigram_probs.get(trigram, 0)

                # Apply linear interpolation
                lambda1, lambda2, lambda3 = 1/3, 1/3, 1/3
                prob *= (lambda1 * unigram_probs.get(tokens[i], 0) + lambda2 * bigram_prob + lambda3 * trigram_prob)

    return prob

## Example usage
#models = ['Vanilla', 'Laplace', 'UNK']
#sentence_input = 'this is an example sentence'
#
#for model in models:
#    probability = sen_probability(sentence_input, model)
#    print(f"The probability of the sentence '{sentence_input}' using the {model} model is {probability:.10f}")

The probability of the sentence 'this is an example sentence' using the Vanilla model is 0.0000000000
The probability of the sentence 'this is an example sentence' using the Laplace model is 0.0000000000
The probability of the sentence 'this is an example sentence' using the UNK model is 0.0000000000
