a) Build a language model based on n-grams using the Laplace smoothing method for the following models:
- 1-gram
- 2-gram
- 3-gram

Import necessary libraries: nltk, gdown,...

In [172]:
import nltk
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize, TreebankWordTokenizer
from collections import Counter, defaultdict
import re
import math
import gdown
nltk.download('punkt')  


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Use gdown to download tedtalk.txt from the google drive link.

In [173]:
url=f"https://drive.google.com/uc?export=download&id=1tQ9zW0ihL3Uc6GIge9AIV1pQnHYI7ihI"
output_path="tedtalk.txt"
gdown.download(url,output_path, quiet=False)

Downloading...
From: https://drive.google.com/uc?export=download&id=1tQ9zW0ihL3Uc6GIge9AIV1pQnHYI7ihI
To: c:\HCMUT\Study\HK242\NLP\Exercise\Lab2\Inclass\tedtalk.txt
100%|██████████| 40.3M/40.3M [00:01<00:00, 35.9MB/s]


'tedtalk.txt'

Open the plain text file and read into `corpus`.

In [174]:
# Load text corpus from file
with open('tedtalk.txt', 'r', encoding='utf-8') as file:
    corpus = file.read()


Preprocess the text by lowering, and keeping only `[a-z0-9 .!?\']`.
Then use `nltk.sent_tokenize()` to tokenize text into list of sentences and tokenize each sentence into list of words using `nltk.tokenize()`.

In [175]:
# Preprocess text: lowercase, remove punctuation, tokenize
def preprocess(text):
    text = text.lower()
    cleaned_text = re.sub(r"[^a-z0-9 .!?\']", " ", text)
    return cleaned_text


def split_sentences_and_tokenize(text):
    tokenizer = TreebankWordTokenizer()
    sentences = sent_tokenize(text)
    tokenized_sentences = [tokenizer.tokenize(sentence) for sentence in sentences]
    
    return tokenized_sentences


Function to generate ngrams from sentences. Add `<s>` and `</s>` for 2-gram and 3-gram.

In [176]:
# Generate n-grams
def generate_ngrams_from_sentences(sentences, n):
    ngrams_list = []
    for sentence in sentences:
        padded_sentence=sentence
        if n>1:
            padded_sentence = ['<s>'] * (n - 1) + padded_sentence + ['</s>'] 
        ngrams_list.extend(list(ngrams(padded_sentence, n)))
    return ngrams_list


Function to precompute ngram probablity to optimize runtime.

In [177]:

# Function to compute n-gram probabilities with Laplace smoothing
def compute_ngram_probs(vocabulary_size,token_size, n,ngram_counts,n_1_gram_counts, alpha=1):
    probs = {}
    
    for ngram, count in ngram_counts.items():
        if n == 1:  # For unigrams, no need for prefix_count
            prefix_count = token_size 
        else:  # For bigrams or trigrams, use the prefix
            prefix = ngram[:-1]  
            prefix_count = n_1_gram_counts.get(prefix, 0)
        # Apply Laplace smoothing
        probs[ngram] = (count + alpha) / (prefix_count + alpha * vocabulary_size)
    
    return probs


Preprocess and tokenize based on sentences.

In [178]:
sentences=preprocess(corpus)
sentences = split_sentences_and_tokenize(sentences)
print(sentences[:20])

[['thank', 'you', 'so', 'much', 'chris', '.'], ['and', 'it', "'s", 'truly', 'a', 'great', 'honor', 'to', 'have', 'the', 'opportunity', 'to', 'come', 'to', 'this', 'stage', 'twice', 'i', "'m", 'extremely', 'grateful', '.'], ['i', 'have', 'been', 'blown', 'away', 'by', 'this', 'conference', 'and', 'i', 'want', 'to', 'thank', 'all', 'of', 'you', 'for', 'the', 'many', 'nice', 'comments', 'about', 'what', 'i', 'had', 'to', 'say', 'the', 'other', 'night', '.'], ['and', 'i', 'say', 'that', 'sincerely', 'partly', 'because', 'mock', 'sob', 'i', 'need', 'that', '.'], ['laughter', 'put', 'yourselves', 'in', 'my', 'position', '.'], ['laughter', 'i', 'flew', 'on', 'air', 'force', 'two', 'for', 'eight', 'years', '.'], ['laughter', 'now', 'i', 'have', 'to', 'take', 'off', 'my', 'shoes', 'or', 'boots', 'to', 'get', 'on', 'an', 'airplane', '!'], ['laughter', 'applause', 'i', "'ll", 'tell', 'you', 'one', 'quick', 'story', 'to', 'illustrate', 'what', 'that', "'s", 'been', 'like', 'for', 'me', '.'], ['lau

Calculate total number of tokens, calculate the vocabulary size, count unigram, bigram, trigram and use them to precompute the probabilities.

In [179]:
all_tokens = [token for sentence in sentences for token in sentence]
vocabulary_size = len(set(all_tokens))  
token_size=len(all_tokens)
print("Total tokens: ",token_size)
print("Vocabulary size: ", vocabulary_size)
unigram_counts = Counter(generate_ngrams_from_sentences(sentences, 1))
bigram_counts = Counter(generate_ngrams_from_sentences(sentences, 2)) 
trigram_counts = Counter(generate_ngrams_from_sentences(sentences, 3)) 

unigram_probs = compute_ngram_probs(vocabulary_size,token_size,1,unigram_counts,None)
bigram_probs = compute_ngram_probs(vocabulary_size,token_size,2,bigram_counts,unigram_counts)
trigram_probs = compute_ngram_probs(vocabulary_size,token_size,3,trigram_counts,bigram_counts)


Total tokens:  7835609
Vocabulary size:  69944


In [180]:
print("Sample Unigram Count:", list(unigram_counts.items())[:20])
print("Sample Bigram Count:", list(bigram_counts.items())[:15])
print("Sample Trigram Count:", list(trigram_counts.items())[:15])

print("Sample Unigram Probabilities:", list(unigram_probs.items())[:15])
print("Sample Bigram Probabilities:", list(bigram_probs.items())[:15])
print("Sample Trigram Probabilities:", list(trigram_probs.items())[:15])


Sample Unigram Count: [(('thank',), 5195), (('you',), 108403), (('so',), 57089), (('much',), 9220), (('chris',), 498), (('.',), 400768), (('and',), 240579), (('it',), 115807), (("'s",), 85344), (('truly',), 695), (('a',), 170846), (('great',), 4756), (('honor',), 277), (('to',), 210927), (('have',), 44958), (('the',), 336385), (('opportunity',), 1156), (('come',), 6146), (('this',), 74906), (('stage',), 952)]
Sample Bigram Count: [(('<s>', 'thank'), 3222), (('thank', 'you'), 5001), (('you', 'so'), 356), (('so', 'much'), 1751), (('much', 'chris'), 4), (('chris', '.'), 56), (('.', '</s>'), 400542), (('<s>', 'and'), 60110), (('and', 'it'), 8960), (('it', "'s"), 32904), (("'s", 'truly'), 25), (('truly', 'a'), 20), (('a', 'great'), 1219), (('great', 'honor'), 17), (('honor', 'to'), 27)]
Sample Trigram Count: [(('<s>', '<s>', 'thank'), 3222), (('<s>', 'thank', 'you'), 3194), (('thank', 'you', 'so'), 259), (('you', 'so', 'much'), 261), (('so', 'much', 'chris'), 3), (('much', 'chris', '.'), 4)

Function to calculate probability of a sentence, fall back to computing again using ngram counts if the precomputation is not found.

In [181]:
def calculate_sentence_probability(sentence, ngram_probs,n_1gram_count,token_size,vocabulary_size, n, alpha=1):
    tokens = [word for sentence in split_sentences_and_tokenize(preprocess(sentence)) for word in sentence]
    prob = 1.0

    if n > 1:
        tokens = ['<s>'] * (n - 1) + tokens + ['</s>']  # Add boundary tokens for bigram/trigram

    # Calculate the probability using precomputed smoothed n-gram probabilities
    for i in range(n - 1, len(tokens)):  
        ngram = tuple(tokens[i - n + 1:i + 1])  
        prob_word = ngram_probs.get(ngram, 0)  
        # Apply Laplace smoothing if the n-gram is unseen
        if prob_word == 0:  
            if n!=1:
                prefix = ngram[:-1]  
                prefix_count = n_1gram_count.get(prefix, 0)  
            else:
                prefix_count=token_size
            prob_word = (alpha) / (prefix_count + alpha * vocabulary_size)
        prob *= prob_word

    return prob


Function to calculate perplexity of a sentence, fall back to computing again using ngram counts if the precomputation is not found. The perplexity is calculated using logarith instead of inverse to avoid underflowing.

In [182]:
def calculate_perplexity(sentence, ngram_probs,n_1gram_count,token_size,vocabulary_size, n, alpha=1):
    tokens = [word for sentence in split_sentences_and_tokenize(preprocess(sentence)) for word in sentence]
    if n > 1:
        tokens = ['<s>'] * (n - 1) + tokens + ['</s>']  # Add boundary tokens for bigram/trigram
    total_log_prob = 0.0
    for i in range(n - 1, len(tokens)):  
        ngram = tuple(tokens[i - n + 1:i + 1])  
        prob_word = ngram_probs.get(ngram, 0)  
        
        # Apply Laplace smoothing if the n-gram is unseen
        if prob_word == 0:
            if n!=1:
                prefix = ngram[:-1]  # Prefix is the first (n-1) tokens
                prefix_count = n_1gram_count.get(prefix, 0)  # Get prefix count (smoothed)
            else:
                prefix_count=token_size
            prob_word = (alpha) / (prefix_count + alpha * vocabulary_size)
        # Perplexity calculation using logarith instead of inverse
        total_log_prob += math.log(prob_word) if prob_word > 0 else float('-inf')

    perplexity = math.exp(-total_log_prob / (len(tokens)+1-n)) if total_log_prob > float('-inf') else float('inf')
    return perplexity


b) Calculate the probability of a sentence and compute the Perplexity of a sentence based on 1-gram, 2-gram, and 3-gram models.

The sentence chosen is "I want to speak at ted talk." This will be the base sentence to compare below.

In [183]:
sentence = "I want to speak at ted talk."
unigram_prob = calculate_sentence_probability(sentence, unigram_probs, None,token_size, vocabulary_size, 1)
bigram_prob = calculate_sentence_probability(sentence, bigram_probs,unigram_counts,token_size, vocabulary_size, 2)
trigram_prob = calculate_sentence_probability(sentence, trigram_probs,bigram_counts,token_size, vocabulary_size, 3)

print("Unigram Probability:", unigram_prob)
print("Bigram Probability:", bigram_prob)
print("Trigram Probability:", trigram_prob)

print("Unigram Perplexity:", calculate_perplexity(sentence, unigram_probs, None,token_size, vocabulary_size, 1))
print("Bigram Perplexity:", calculate_perplexity(sentence, bigram_probs,unigram_counts,token_size, vocabulary_size, 2))
print("Trigram Perplexity:", calculate_perplexity(sentence, trigram_probs,bigram_counts,token_size, vocabulary_size, 3))


Unigram Probability: 1.5121885066224436e-21
Bigram Probability: 4.873652655324646e-18
Trigram Probability: 3.169912235179167e-26
Unigram Perplexity: 400.45088554805614
Bigram Perplexity: 83.86325485941741
Trigram Perplexity: 681.1095560648001


c) Analyze the results (Provide your own examples of spelling errors and calculate the probability of two similar sentences, where one has the correct word order and the other has an incorrect word order).



This is an example of spelling errors of the previous sentence, the perplexity is much higher indicating this sentence does not fit with the model.

In [187]:
#Example of spelling errors
sentence = "I went too speak ate ted take."
unigram_prob = calculate_sentence_probability(sentence, unigram_probs, None,token_size, vocabulary_size, 1)
bigram_prob = calculate_sentence_probability(sentence, bigram_probs,unigram_counts,token_size, vocabulary_size, 2)
trigram_prob = calculate_sentence_probability(sentence, trigram_probs,bigram_counts,token_size, vocabulary_size, 3)

print("Unigram Probability:", unigram_prob)
print("Bigram Probability:", bigram_prob)
print("Trigram Probability:", trigram_prob)

print("Unigram Perplexity:", calculate_perplexity(sentence, unigram_probs, None,token_size, vocabulary_size, 1))
print("Bigram Perplexity:", calculate_perplexity(sentence, bigram_probs,unigram_counts,token_size, vocabulary_size, 2))
print("Trigram Perplexity:", calculate_perplexity(sentence, trigram_probs,bigram_counts,token_size, vocabulary_size, 3))


Unigram Probability: 9.250038516821093e-26
Bigram Probability: 5.691648843830339e-30
Trigram Probability: 1.6917893350584108e-35
Unigram Perplexity: 1346.5796822248376
Bigram Perplexity: 1775.8978814480727
Trigram Perplexity: 7303.265341672125


This is an example of wrong order of the above sentence, the perplexity is higher on all 2gram and 3gram models compared with the base sentence but is not as high as the spelling error sentence. This sentence is worse than the base sentence but is better than the spelling error one.

In [185]:
#Example of spelling errors
sentence = "I speak at want to ted talk."
unigram_prob = calculate_sentence_probability(sentence, unigram_probs, None,token_size, vocabulary_size, 1)
bigram_prob = calculate_sentence_probability(sentence, bigram_probs,unigram_counts,token_size, vocabulary_size, 2)
trigram_prob = calculate_sentence_probability(sentence, trigram_probs,bigram_counts,token_size, vocabulary_size, 3)

print("Unigram Probability:", unigram_prob)
print("Bigram Probability:", bigram_prob)
print("Trigram Probability:", trigram_prob)

print("Unigram Perplexity:", calculate_perplexity(sentence, unigram_probs, None,token_size, vocabulary_size, 1))
print("Bigram Perplexity:", calculate_perplexity(sentence, bigram_probs,unigram_counts,token_size, vocabulary_size, 2))
print("Trigram Perplexity:", calculate_perplexity(sentence, trigram_probs,bigram_counts,token_size, vocabulary_size, 3))


Unigram Probability: 1.5121885066224436e-21
Bigram Probability: 8.419420677427076e-23
Trigram Probability: 1.2948934335583005e-34
Unigram Perplexity: 400.45088554805614
Bigram Perplexity: 283.62624884244127
Trigram Perplexity: 5825.152911373741
