# <u> ICS2203 – Statistical Natural Language Processing <u>
## <u> Building a Language Model – Part I <u>

## Imports

In [1]:
# standard library imports
import os
import re
import math
import time
from datetime import datetime
from collections import Counter

# third-party library imports
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from spellchecker import SpellChecker
from sklearn.model_selection import train_test_split
import xml.etree.ElementTree as ET
import psutil

## Extracting and Preprocessing the Selected Corpus

In [2]:
corpus_dir_path = 'corpus/aca'
sentences = []
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

generation_start = datetime.now()

for root_dir, subdirs, files in os.walk(corpus_dir_path):
    for filename in files:
        if filename.endswith('.xml'):
            tree = ET.parse(os.path.join(root_dir, filename))
            root = tree.getroot()
            text = ''
            for element in root.iter():
                if element.text is not None:
                    # Concatenate text from all elements in the file
                    text += element.text + ' '

            # Remove URLs, numbers, brackets, commas, semicolons, and colons
            text = re.sub(r'https?://\S+|www\.\S+|\d+|[{};:\[\],()]', '', text)
            
            # Convert text to lowercase and split into sentences
            text = text.lower()
            file_sentences = sent_tokenize(text)

            # Process each sentence
            for sentence in file_sentences:
                # Tokenize sentence into words
                sentence_tokens = word_tokenize(sentence)

                # Remove stopwords, numbers, and lemmatize tokens
                sentence_tokens = [lemmatizer.lemmatize(token) for token in sentence_tokens if token not in stop_words and not token.isdigit()]

                # Append lemmatized tokens to list of sentences
                sentences.append(sentence_tokens)

generation_end = datetime.now()
generation_time = generation_end - generation_start

def RAMusage():
    pid = os.getpid()
    py = psutil.Process(pid)
    memoryUse = py.memory_info()[0] / 2.**30
    return memoryUse

# Output list of the first 50 sentences
# print(sentences[:50])

# Output the first 50 sentences and their words
for i, sentence in enumerate(sentences[:10]):
    print(f"Sentence {i+1}:")
    print(" ".join(sentence))
    print()

Sentence 1:
oxford art journal .

Sentence 2:
sample containing word periodical domain art data capture transcription oxford university press bnc xml edition december token w-units s-units distributed licence oxford university computing service behalf bnc consortium .

Sentence 3:
material protected international copyright law may copied redistributed way .

Sentence 4:
consult bnc web site full licencing distribution condition .

Sentence 5:
au artjnl oxford art journal .

Sentence 6:
oxford university press oxford w achumanities art art tag usage updated bnc-xml last check bnc world first release redo tagusage table check tagcounts resequenced s-units added header added date info updated catrefs updated source title updated title corrected tagusage po code revised bnc- header updated initial accession corpus drawn image guy brett certain image matter one desire answer question involuntary response ?

Sentence 7:
seem important answer ‘ objective ’ quality insight history society know

## Computation

In [3]:
# Calculate the total number of sentences in the corpus
num_sentences = len(sentences)

# Calculate the total number of words in the corpus
num_words = sum([len(doc) for doc in sentences])

# Print the total size of the corpus in sentences and words
print("Total size of corpus: {} words, {} sentences".format(num_words, num_sentences))

# Print the time taken to generate the tokens
print('Generation Time (HH:MM:SS:ms): {}\n'.format(generation_time))

# Print the current memory usage of the program
print("Memory Usage: {:.6f} GB".format(RAMusage()))

Total size of corpus: 587467 words, 40214 sentences
Generation Time (HH:MM:SS:ms): 0:00:18.489743

Memory Usage: 0.259476 GB


## Split the corpus tokens into test set and train set

In [4]:
# Split the sentences into train and test sets
train_set, test_set = train_test_split(sentences, test_size=0.2)

# Define the vocabulary from the training set
vocab = set()
for sentence in train_set:
    for word in sentence:
        # Add each unique word in the training set to the vocabulary
        vocab.add(word)
# Convert the vocabulary to a list and sort it alphabetically
vocab = list(set([word for sentence in train_set for word in sentence]))
vocab.sort()

# Print the size of the train and test sets
print("Train set size:", len(train_set))
print("Test set size:", len(test_set))

Train set size: 32171
Test set size: 8043


## Building frequency counts for n-grams

In [5]:
def generate_ngram_counts(sentences, n):
    # Initialize a dictionary to store the frequency counts
    ngram_counts = {}

    # Iterate through each sentence in the corpus
    for sentence in sentences:
        # Create n-grams from the words in the sentence
        ngrams = [tuple(sentence[i:i+n]) for i in range(len(sentence)-n+1)]

        # Iterate through each n-gram in the sentence
        for ngram in ngrams:
            # Update the frequency count for the n-gram
            if ngram in ngram_counts:
                ngram_counts[ngram] += 1
            else:
                ngram_counts[ngram] = 1

    return ngram_counts

## Vanilla Language Model

In [6]:
## Vanilla Language Model

unigram_counts_vanilla = generate_ngram_counts(sentences, 1)
#print("Unigram Counts:", unigram_counts_vanilla)

bigram_counts_vanilla = generate_ngram_counts(sentences, 2)
#print("Bigram Counts:", bigram_counts_vanilla)

trigram_counts_vanilla = generate_ngram_counts(sentences, 3)
#print("Trigram Counts:", trigram_counts_vanilla)

## Laplace Language Model

In [7]:
## Laplace Language Model

def laplace_language_model(vocab, unigram_counts_vanilla, bigram_counts_vanilla, trigram_counts_vanilla):
    # Define vocabulary size
    V = len(vocab)

    # Initialize Laplace-smoothed count dictionaries
    unigram_counts_smoothed = {}
    bigram_counts_smoothed = {}
    trigram_counts_smoothed = {}

    # Set the smoothing parameter
    k = 1

    # Laplace-smooth the unigram counts
    for word in vocab:
        if word in unigram_counts_vanilla:
            unigram_counts_smoothed[word] = unigram_counts_vanilla[word] + k
        else:
            unigram_counts_smoothed[word] = k

    # Laplace-smooth the bigram counts
    for bigram in bigram_counts_vanilla:
        if bigram[0] in unigram_counts_vanilla:
            bigram_counts_smoothed[bigram] = (bigram_counts_vanilla[bigram] + k) / (unigram_counts_vanilla[bigram[0]] + V*k)
        else:
            bigram_counts_smoothed[bigram] = k / V

    # Laplace-smooth the trigram counts
    for trigram in trigram_counts_vanilla:
        if (trigram[0], trigram[1]) in bigram_counts_vanilla:
            trigram_counts_smoothed[trigram] = (trigram_counts_vanilla[trigram] + k) / (bigram_counts_vanilla[(trigram[0], trigram[1])] + V*k)
        else:
            trigram_counts_smoothed[trigram] = k / V**2

    return unigram_counts_smoothed, bigram_counts_smoothed, trigram_counts_smoothed

# Build the Laplace Language Model
unigram_counts_laplace, bigram_counts_laplace, trigram_counts_laplace = laplace_language_model(vocab, unigram_counts_vanilla, bigram_counts_vanilla, trigram_counts_vanilla)

# Print the Laplace-smoothed trigram counts
#print("\nLaplace-smooothed Language Model:")
#print("Unigram Counts:", unigram_counts_laplace)
#print("Bigram Counts:", bigram_counts_laplace)
#print("Trigram Counts:", trigram_counts_laplace)

## UNK Language Model

In [8]:
def replace_rare_words(sentences, threshold):
    # Create a frequency distribution of all words in the corpus
    word_freq = Counter([word for sentence in sentences for word in sentence])

    # Identify words that occur less than the threshold number of times
    rare_words = set([word for word in word_freq if word_freq[word] < threshold])

    # Replace rare words with the UNK token
    new_sentences = []
    for sentence in sentences:
        new_sentence = []
        for word in sentence:
            if word in rare_words:
                new_sentence.append('<UNK>')
            else:
                new_sentence.append(word)
        new_sentences.append(new_sentence)

    return new_sentences

#-------------------------------------------------------------------------------------------------------------------------------

# Replace rare words in the training set with the UNK token
train_set_unk = replace_rare_words(train_set, threshold=2)

# Replace rare words in the test set with the UNK token
test_set_unk = replace_rare_words(test_set, threshold=2)

# Define the UNK vocabulary from the training set
vocab_unk = set()
for sentence in train_set_unk:
    for word in sentence:
        # Add each unique word in the training set to the vocabulary
        vocab_unk.add(word)
# Convert the vocabulary to a list and sort it alphabetically
vocab_unk = list(vocab_unk)
vocab_unk.sort()

# Print the size of the UNK vocabulary
print("UNK Vocabulary size:", len(vocab_unk))
print("-" *125)

#-------------------------------------------------------------------------------------------------------------------------------

def unk_language_model(vocab_unk, train_set_unk):
    # Define vocabulary size
    V = len(vocab_unk)

    # Initialize count dictionaries
    unigram_counts_unk = {}
    bigram_counts_unk = {}
    trigram_counts_unk = {}

    # Set the smoothing parameter
    k = 1

    # Generate n-gram counts for the UNK Language Model
    unigram_counts_unk = generate_ngram_counts(train_set_unk, 1)
    bigram_counts_unk = generate_ngram_counts(train_set_unk, 2)
    trigram_counts_unk = generate_ngram_counts(train_set_unk, 3)

    # Laplace-smooth the unigram counts
    for word in vocab_unk:
        if word in unigram_counts_unk:
            unigram_counts_unk[word] += k
        else:
            unigram_counts_unk[word] = k

    # Laplace-smooth the bigram counts
    for bigram in bigram_counts_unk:
        if bigram[0] in unigram_counts_unk:
            bigram_counts_unk[bigram] = (bigram_counts_unk[bigram] + k) / (unigram_counts_unk[bigram[0]] + V*k)
        else:
            bigram_counts_unk[bigram] = k / V

    # Laplace-smooth the trigram counts
    for trigram in trigram_counts_unk:
        if (trigram[0], trigram[1]) in bigram_counts_unk:
            trigram_counts_unk[trigram] = (trigram_counts_unk[trigram] + k) / (bigram_counts_unk[(trigram[0], trigram[1])] + V*k)
        else:
            trigram_counts_unk[trigram] = k / V**2

    return unigram_counts_unk, bigram_counts_unk, trigram_counts_unk

# Build the UNK Language Model
unigram_counts_unk, bigram_counts_unk, trigram_counts_unk = unk_language_model(vocab_unk, train_set_unk)

# Print the UNK Language Model counts
#print("\nUNK Language Model:\n")
#print("Unigram Counts:", unigram_counts_unk)
#print("Bigram Counts:", bigram_counts_unk)
#print("Trigram Counts:", trigram_counts_unk)

UNK Vocabulary size: 16514
-----------------------------------------------------------------------------------------------------------------------------


## Linear Interpolation

In [9]:
def linear_interpolation(sentence, unigram_counts, bigram_counts, trigram_counts, lambda1=0.1, lambda2=0.3, lambda3=0.6):
    # Initialize the probability to 1
    probability = 1.0

    # Iterate through each trigram in the sentence
    for i in range(2, len(sentence)):
        trigram = (sentence[i-2], sentence[i-1], sentence[i])

        # Calculate the interpolated probability of the trigram using the Laplace-smoothed counts
        unigram_prob = unigram_counts.get(trigram[2], 0) / sum(unigram_counts.values())
        bigram_prob = bigram_counts.get(trigram[1:], 0) / sum(bigram_counts.values())
        trigram_prob = trigram_counts.get(trigram, 0) / sum(trigram_counts.values())
        interpolated_prob = lambda1 * unigram_prob + lambda2 * bigram_prob + lambda3 * trigram_prob

        # Multiply the probability by the interpolated probability of the trigram
        probability *= interpolated_prob

    return probability

# Example sentence
test_sentence = ['oxford', 'art', 'journal']

# Calculate the probability of the test sentence using linear interpolation with the given lambdas
prob = linear_interpolation(test_sentence, unigram_counts_laplace, bigram_counts_laplace, trigram_counts_laplace, lambda1=0.1, lambda2=0.3, lambda3=0.6)

# Print the probability of the test sentence (trigram using the Laplace-smoothed counts)
print("Probability of sentence (trigram using the Laplace-smoothed counts):", prob)

Probability of sentence (trigram using the Laplace-smoothed counts): 6.691399023299299e-06


## Probabilities of Models

In [10]:
def unigram_probability(sentence, unigram_model):
    tokens = sentence.split()
    probability = 0
    total_unigram_count = sum(unigram_model.values())

    for token in tokens:
        # Check if the token exists in the unigram model
        if token in unigram_model:
            token_prob = unigram_model[token] / total_unigram_count
            probability += math.log(token_prob)
        else:
            # If the token is not in the unigram model, you can either skip it or assign a small probability value
            pass

    return probability

def bigram_probability(sentence, bigram_model):
    tokens = sentence.split()
    probability = 0

    for i in range(len(tokens) - 1):
        bigram = (tokens[i], tokens[i + 1])
        probability += math.log(bigram_model.get(bigram, 1e-10))

    return probability


def trigram_probability(sentence, trigram_model):
    tokens = sentence.split()
    probability = 0

    for i in range(len(tokens) - 2):
        trigram = (tokens[i], tokens[i + 1], tokens[i + 2])
        probability += math.log(trigram_model.get(trigram, 1e-10))

    return probability

## Evaluation

In [11]:
def calculate_probs(test_set, model):
    probs = []
    for sentence in test_set:
        probability = 1.0
        for i in range(2, len(sentence)):
            trigram = (sentence[i-2], sentence[i-1], sentence[i])
            if model == "vanilla":
                trigram_count = trigram_counts_vanilla.get(trigram, 0)
                bigram_count = bigram_counts_vanilla.get(trigram[1:], 0)
                if bigram_count != 0:
                    probability *= trigram_count / bigram_count
            elif model == "laplace":
                trigram_count = trigram_counts_laplace.get(trigram, 0)
                bigram_count = bigram_counts_laplace.get(trigram[1:], 0)
                unigram_count = unigram_counts_laplace.get(trigram[2], 0)
                probability *= (trigram_count + 1) / (bigram_count + len(vocab) + 1 * len(vocab))
                probability *= (bigram_count + 1) / (unigram_count + len(vocab) + 1 * len(vocab))
            elif model == "unk":
                trigram_count = trigram_counts_unk.get(trigram, 0)
                bigram_count = bigram_counts_unk.get(trigram[1:], 0)
                unigram_count = unigram_counts_unk.get(trigram[2], 0)
                probability *= (trigram_count + 1) / (bigram_count + len(vocab_unk) + 1 * len(vocab_unk))
                probability *= (bigram_count + 1) / (unigram_count + len(vocab_unk) + 1 * len(vocab_unk))
        probs.append(probability)
    return probs

def print_probs(vanilla_probs, laplace_probs, unk_probs, test_set, test_set_unk):
    for i in range(10):
        print("Sentence {}: {}".format(i + 1, ' '.join(test_set[i])))
        print("Vanilla Model Probability of Sentence {}: {}".format(i + 1, vanilla_probs[i]))
        print("Laplace Model Probability of Sentence {}: {}".format(i + 1, laplace_probs[i]))
        print("UNK Model Probability of Sentence {}: {}".format(i + 1, unk_probs[i]))
        print("-" * 125)

# Calculate the probabilities for each model
vanilla_probs = calculate_probs(test_set, "vanilla")
laplace_probs = calculate_probs(test_set, "laplace")
unk_probs = calculate_probs(test_set_unk, "unk")

# Print the probabilities
print_probs(vanilla_probs, laplace_probs, unk_probs, test_set, test_set_unk)

Sentence 1: place emphasis biotic others environmental factor .
Vanilla Model Probability of Sentence 1: 0.008547008547008546
Laplace Model Probability of Sentence 1: 2.5226909574604952e-48
UNK Model Probability of Sentence 1: 6.489163931678202e-46
-----------------------------------------------------------------------------------------------------------------------------
Sentence 2: ‘ trapline ’ moving directly one food site next apparently remembering previous day fast flier visiting plant producing flower long period .
Vanilla Model Probability of Sentence 2: 4.4288548752834467e-05
Laplace Model Probability of Sentence 2: 4.0501513289479383e-191
UNK Model Probability of Sentence 2: 1.8074007068658315e-181
-----------------------------------------------------------------------------------------------------------------------------
Sentence 3: fact whole point introducing d. assigning relative permittivity dielectric material problem taken care .
Vanilla Model Probability of Sentence 3

## Perplexity

In [24]:
def calculate_perplexity(test_set, unigram_model, bigram_model, trigram_model):
    model_perplexities = []

    for model_probability in [unigram_probability, bigram_probability, trigram_probability]:
        probability_sum = 0
        n = 0

        for sentence in test_set:
            sent = ' '.join(sentence)
            probability_sum += model_probability(sent, unigram_model if model_probability == unigram_probability else bigram_model if model_probability == bigram_probability else trigram_model)
            n += len(sentence)

        model_perplexity = math.exp(-probability_sum / n)
        model_perplexities.append(model_perplexity)

    return tuple(model_perplexities)

vanilla_perplexities = calculate_perplexity(test_set, unigram_counts_vanilla, bigram_counts_vanilla, trigram_counts_vanilla)
laplace_perplexities = calculate_perplexity(test_set, unigram_counts_laplace, bigram_counts_laplace, trigram_counts_laplace)
unk_perplexities = calculate_perplexity(test_set_unk, unigram_counts_unk, bigram_counts_unk, trigram_counts_unk)

print("Model           | Unigram   | Bigram    | Trigram")
print("-------------------------------------------------")
print(f"Vanilla         | {vanilla_perplexities[0]:7.2f}  | {vanilla_perplexities[1]:7.2f} | {vanilla_perplexities[2]:7.2f}")
print(f"Laplace         | {laplace_perplexities[0]:7.2f} | {laplace_perplexities[1]:7.2f} | {laplace_perplexities[2]:7.2f}")
print(f"UNK             | {unk_perplexities[0]:7.2f} | {unk_perplexities[1]:7.2f} | {unk_perplexities[2]:7.2f}")

Model           | Unigram   | Bigram    | Trigram
-------------------------------------------------
Vanilla         |    1.00  |    0.50 |    0.91
Laplace         | 21761.65 | 14262.84 | 3645.54
UNK             | 422616.68 | 3014759.52 | 143104084.28


## Generation

In [21]:
def generate_sentence(model, phrase):
    print(f"Generating {model} model...")
    print("GENERATED", model.upper(), "SENTENCES:")
    for ngram_type in ["unigram", "bigram", "trigram"]:
        generated_sentence = generate_ngram_sentence(model, phrase, ngram_type)
        print(f"{ngram_type.capitalize()}: {generated_sentence}")

def generate_ngram_sentence(model, phrase, ngram_type):
    start_token = '<s>'
    end_token = '</s>'
    max_length = 20
    sentence = [start_token] + phrase.split()
    model_counts = None

    if model == 'Vanilla':
        model_counts = [unigram_counts_vanilla, bigram_counts_vanilla, trigram_counts_vanilla]
    elif model == 'Laplace':
        model_counts = [unigram_counts_laplace, bigram_counts_laplace, trigram_counts_laplace]
    elif model == 'UNK':
        model_counts = [unigram_counts_unk, bigram_counts_unk, trigram_counts_unk]
    else:
        print('Invalid model selection')
        return

    while sentence[-1] != end_token and len(sentence) < max_length:
        if ngram_type == "unigram":
            next_word = generate_word(model_counts[0])
        elif ngram_type == "bigram":
            next_word = generate_word(model_counts[1], sentence[-1:])
        elif ngram_type == "trigram":
            if len(sentence) < 2:
                next_word = generate_word(model_counts[1], sentence[-1:])
            else:
                next_word = generate_word(model_counts[2], sentence[-2:])
        sentence.append(next_word)

    return ' '.join(sentence[1:])

def generate_word(model_count, context=None):
    if context is None:
        words, counts = zip(*[(word, count) for word, count in model_count.items()])
    else:
        vocab = sorted(model_count.keys())  # Sort vocab for consistent ordering
        words_and_counts = []
        for word in vocab:
            key = tuple(context + [word])
            if key in model_count:
                count = model_count[key]
                words_and_counts.append((word, count))

        if len(words_and_counts) == 0:  # Check if there are no words to choose from
            return '</s>'

        words, counts = zip(*words_and_counts)
        
    words = list(words)  # Convert words to a list from a tuple
    counts = np.array(counts, dtype=np.float64)
    epsilon = 1e-10
    counts = np.add(counts, epsilon)
    probs = counts / counts.sum()
    next_word = np.random.choice(words, p=probs)
    return next_word

# Ask the user for the model and phrase input
model_input = input('Which language model would you like to use? (Vanilla, Laplace, UNK): ')
phrase_input = input('Please enter a phrase: ')

# Generate the sentence using the selected model and input phrase
generate_sentence(model_input, phrase_input)

Which language model would you like to use? (Vanilla, Laplace, UNK): Vanilla
Please enter a phrase: they all
Generating Vanilla model...
GENERATED VANILLA SENTENCES:


ValueError: a must be 1-dimensional