In [8]:
import nltk
import string
import math
import re
import contractions
from collections import Counter

In [9]:
pip install contractions

In [10]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## DATA DOWNLOADING


In [11]:
file_path = "tedtalk.txt"
with open(file_path, 'r', encoding = 'utf-8') as f:
  data = f.read()

## PREPROCESSING

In [12]:
# preprocessing data
def preprocess_text(data):
  # convert to lowercase
  text = data.lower()
  # separate text into sentences
  sentences = nltk.sent_tokenize(text)
  processed_stc = []

  for sentence in sentences:
    # expand contractions
    sentence = contractions.fix(sentence)
    # remove punctuation except ., !, ?
    sentence = re.sub(r"[^a-z0-9.!?]", " ", sentence)
    # remove multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    # tokenize using nltk
    tokens = nltk.word_tokenize(sentence)
    # add <s> at the beginning and </s> at the end for sentence boundary
    processed_stc.append(["<s>"] + tokens + ["</s>"])

  return processed_stc

# processed_data
processed_data = preprocess_text(data)

In [13]:
# test first 5 sentence in processed_data
for sentence in processed_data[:5]:
  print(sentence)

['<s>', 'thank', 'you', 'so', 'much', 'chris', '.', '</s>']
['<s>', 'and', 'it', 'is', 'truly', 'a', 'great', 'honor', 'to', 'have', 'the', 'opportunity', 'to', 'come', 'to', 'this', 'stage', 'twice', 'i', 'am', 'extremely', 'grateful', '.', '</s>']
['<s>', 'i', 'have', 'been', 'blown', 'away', 'by', 'this', 'conference', 'and', 'i', 'want', 'to', 'thank', 'all', 'of', 'you', 'for', 'the', 'many', 'nice', 'comments', 'about', 'what', 'i', 'had', 'to', 'say', 'the', 'other', 'night', '.', '</s>']
['<s>', 'and', 'i', 'say', 'that', 'sincerely', 'partly', 'because', 'mock', 'sob', 'i', 'need', 'that', '.', '</s>']
['<s>', 'laughter', 'put', 'yourselves', 'in', 'my', 'position', '.', '</s>']


## a. Build a language model


In [14]:
# create n-grams from processed tokens
def get_ngrams(tokens, n):
  ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
  return ngrams

In [15]:
# laplace_smoothing formula
def laplace_smoothing(ngram_counts, context_counts, vocab_size, n):
    probs = {}
    for ngram in ngram_counts:
        # unigram: n=1
        if n == 1:
            # Unigram: N + V
            total_tokens = context_counts  # context_counts is total_tokens if n=1
            # calculate the laplace smoothing formula
            prob = (ngram_counts.get(ngram,0) + 1) / (total_tokens + vocab_size)
        else:
            # count(context) + V
            # extract the context (n-1) from the ngram
            context = ngram[:-1]
            # get the count of the context from context_counts
            count_context = context_counts.get(context, 0)
            # calculate the laplace smoothing formula
            prob = (ngram_counts.get(ngram,0) + 1) / (count_context + vocab_size)
        # store in the prob dictionary
        probs[ngram] = prob
    return probs

In [16]:
# models based on ngram with n (n=1,2,3)
def train_ngram_model (tokens, n):
  # generate n-grams from the processed train data
  ngrams = get_ngrams(tokens, n)
  # count the frequency of each n-gram in train data using the Counter class
  ngram_counts = Counter(ngrams)

  # If n is 1 (unigram), calculate the total number of tokens
  if n==1:
    context_counts = sum(ngram_counts.values())
  else:
    # with n>1: generate (n-1)-grams (context) from the processed train data
    context_ngrams = get_ngrams(tokens, n-1)
    # count the frequency of each (n-1)-gram (context) using the Counter class
    context_counts = Counter(context_ngrams)
  # calculate the vocab size (number of unique tokens in token list)
  vocab_size = len(set(tokens))
  # return the n-gram counts, context counts, and vocabulary size
  return ngram_counts, context_counts, vocab_size


### Train Model

In [17]:
# flatten the data
flat_data = [word for sentence in processed_data for word in sentence]

# unigram model: n=1
# trains a unigram model and calculates the smoothed probabilities for unigrams
uni_ngram_counts, uni_context_counts, vocab_size = train_ngram_model(flat_data, 1)
unigram_prob = laplace_smoothing(uni_ngram_counts, uni_context_counts, vocab_size, 1)

# bigram model: n=2
# trains a bigram model and calculates the smoothed probabilities for bigrams
bi_ngram_counts, bi_context_counts, vocab_size = train_ngram_model(flat_data, 2)
bigram_prob = laplace_smoothing(bi_ngram_counts, bi_context_counts, vocab_size, 2)

# trigram model: n=3
# trains a trigram model and calculates the smoothed probabilities for trigrams
tri_ngram_counts, tri_context_counts, vocab_size = train_ngram_model(flat_data, 3)
trigram_prob = laplace_smoothing(tri_ngram_counts, tri_context_counts, vocab_size, 3)

## b. Calculate the probability of a sentence and compute the Perplexity of a sentence based on 1-gram, 2-gram, and 3-gram models.

In [18]:
# calculate the probability of a sentence
def calculate_sentence_prob(sentence, ngram_probs,context_counts, n, vocab_size):
  prob = 1.0
  for i in range(len(sentence)-n+1):
    # extract the n-gram from the sentence
    ngram = tuple(sentence[i:i+n])
    # process if tokens are not in tokens list (train data)
    if n == 1:
        total_tokens = context_counts  # context_counts for n=1
        count_context = total_tokens
    else:
        context = ngram[:-1]
        count_context = context_counts.get(context, 0) # others n-grams

    if ngram in ngram_probs:
      prob *= ngram_probs[ngram] # multiply the probability by the n-gram's probability find in the prob dictionary of train data
    else:
      prob *= 1/(count_context + vocab_size) # apply Laplace smoothing if the n-gram is not found, count_ngrams = 0
  return prob

In [19]:
# perplexity
def perplexity(sentence, ngram_probs,context_counts, n,vocab_size):
  prob = calculate_sentence_prob(sentence, ngram_probs,context_counts, n,vocab_size)
  N = len(sentence) # the length of input sentence
  # calculates the perplexity from the prob
  return math.pow(prob, -1/N)

In [20]:
# calculate the probability and perplexity based on 1-gram, 2-gram and 3-gram models
ex = "I want to go to school"
ex = preprocess_text(ex)
ex = [word for sentence in ex for word in sentence]
print(ex)

vocab_size = len(set(flat_data))  # size of vocabulary
print(vocab_size)
# 1-gram
uni_prob = calculate_sentence_prob(ex, unigram_prob,uni_context_counts,1,vocab_size)
uni_perplexity = perplexity(ex, unigram_prob,uni_context_counts, 1,vocab_size)

# 2-gram
bi_prob = calculate_sentence_prob(ex, bigram_prob,bi_context_counts,2,vocab_size)
bi_perplexity = perplexity(ex, bigram_prob,bi_context_counts, 2, vocab_size)

# 3-gram
tri_prob = calculate_sentence_prob(ex, trigram_prob,tri_context_counts,3,vocab_size)
tri_perplexity = perplexity(ex, trigram_prob,tri_context_counts, 3, vocab_size)

print(f"1-gram probability: {uni_prob}, 1-gram perplexity: {uni_perplexity}")
print(f"2-gram probability: {bi_prob}, 2-gram perplexity: {bi_perplexity}")
print(f"3-gram probability: {tri_prob}, 3-gram perplexity: {tri_perplexity}")

['<s>', 'i', 'want', 'to', 'go', 'to', 'school', '</s>']
69371
1-gram probability: 1.3281553436964184e-17, 1-gram perplexity: 128.70454711879646
2-gram probability: 5.088333895898929e-16, 2-gram perplexity: 81.59772486118717
3-gram probability: 1.8867762518029128e-16, 3-gram perplexity: 92.37085208501146


### c. Analyze the results

### An example with spelling errors

In [21]:
# test an example with spelling errors
spelling_error_sentence = "i watn to conect with you"
correct_spelling_sentence = "i want to connect with you"
# preprocessing
spelling_error_sentence = preprocess_text(spelling_error_sentence)
spelling_error_sentence = [word for sentence in spelling_error_sentence for word in sentence]
print("spelling error sentence: ", spelling_error_sentence)

correct_spelling_sentence = preprocess_text(correct_spelling_sentence)
correct_spelling_sentence = [word for sentence in correct_spelling_sentence for word in sentence]
print("correct spelling sentence: ", correct_spelling_sentence)

# calculate the probability and perplexity
# 1-gram
uni_spelling_error_sentence_prob = calculate_sentence_prob(spelling_error_sentence, unigram_prob, uni_context_counts, 1, vocab_size)
uni_spelling_error_sentence_perplexity = perplexity(spelling_error_sentence, unigram_prob, uni_context_counts,1,vocab_size)

uni_correct_spelling_sentence_prob = calculate_sentence_prob(correct_spelling_sentence, unigram_prob,uni_context_counts,1,vocab_size)
uni_correct_spelling_sentence_perplexity = perplexity(correct_spelling_sentence, unigram_prob,uni_context_counts, 1, vocab_size)

# print the results
# 1-gram
print(f"1-gram spelling error sentence probability: {uni_spelling_error_sentence_prob}, spelling error sentence perplexity: {uni_spelling_error_sentence_perplexity}")
print(f"1-gram correct spelling sentence probability: {uni_correct_spelling_sentence_prob}, incorrect sentence perplexity: {uni_correct_spelling_sentence_perplexity}")

# 2-gram
bi_spelling_error_sentence_prob = calculate_sentence_prob(spelling_error_sentence, bigram_prob,bi_context_counts,2, vocab_size)
bi_spelling_error_sentence_perplexity = perplexity(spelling_error_sentence, bigram_prob, bi_context_counts, 2, vocab_size)

bi_correct_spelling_sentence_prob = calculate_sentence_prob(correct_spelling_sentence, bigram_prob,bi_context_counts,2, vocab_size)
bi_correct_spelling_sentence_perplexity = perplexity(correct_spelling_sentence, bigram_prob,bi_context_counts, 2, vocab_size)

# print the results
# 2-gram
print(f"2-gram spelling error sentence probability: {bi_spelling_error_sentence_prob}, spelling error sentence perplexity: {bi_spelling_error_sentence_perplexity}")
print(f"2-gram correct spelling sentence probability: {bi_correct_spelling_sentence_prob}, incorrect sentence perplexity: {bi_correct_spelling_sentence_perplexity}")

# 3-gram
tri_spelling_error_sentence_prob = calculate_sentence_prob(spelling_error_sentence, trigram_prob,tri_context_counts,3, vocab_size)
tri_spelling_error_sentence_perplexity = perplexity(spelling_error_sentence, trigram_prob,tri_context_counts, 3, vocab_size)

tri_correct_spelling_sentence_prob = calculate_sentence_prob(correct_spelling_sentence, trigram_prob,tri_context_counts,3, vocab_size)
tri_correct_spelling_sentence_prob = perplexity(correct_spelling_sentence, trigram_prob,tri_context_counts, 3, vocab_size)

# print the results
# 3-gram
print(f"3-gram spelling error sentence probability: {tri_spelling_error_sentence_prob}, spelling error sentence perplexity: {tri_spelling_error_sentence_perplexity}")
print(f"3-gram correct spelling sentence probability: {tri_correct_spelling_sentence_prob}, incorrect sentence perplexity: {tri_correct_spelling_sentence_prob}")


spelling error sentence:  ['<s>', 'i', 'watn', 'to', 'conect', 'with', 'you', '</s>']
correct spelling sentence:  ['<s>', 'i', 'want', 'to', 'connect', 'with', 'you', '</s>']
1-gram spelling error sentence probability: 6.929980831167371e-25, spelling error sentence perplexity: 1046.9079453620172
1-gram correct spelling sentence probability: 4.4441343372001544e-18, incorrect sentence perplexity: 147.57969304568198
2-gram spelling error sentence probability: 9.552223057173248e-30, spelling error sentence perplexity: 4241.182327565566
2-gram correct spelling sentence probability: 7.60112745573867e-18, incorrect sentence perplexity: 138.0035467647108
3-gram spelling error sentence probability: 6.263621601062258e-30, spelling error sentence perplexity: 4470.919093292865
3-gram correct spelling sentence probability: 296.7808113242535, incorrect sentence perplexity: 296.7808113242535


### Conclusion:
- Probability:
The correct spelling sentence has a significantly higher probability than the spelling error sentence.
- Perplexity:
The perplexity of the spelling error sentence is significantly higher than that of the correctly spelled sentence, indicating that the model finds the misspelled sentence much more difficult to predict."

### Test with two similar sentences, where one has the correct word order and the other has an incorrect word order

In [22]:
# test two similar sentences, where one has the correct word order and the other has an incorrect word order
correct_sentence = "I want to speak at ted talk."
incorrect_sentence = "I to want at speak talk ted."

# preprocessing
correct_sentence = preprocess_text(correct_sentence)
correct_sentence = [word for sentence in correct_sentence for word in sentence]
print("correct sentence: ", correct_sentence)

incorrect_sentence = preprocess_text(incorrect_sentence)
incorrect_sentence = [word for sentence in incorrect_sentence for word in sentence]
print("incorrect sentence: ", incorrect_sentence)

# calculate the probability and perplexity
# 1-gram
correct_uni_prob = calculate_sentence_prob(correct_sentence, unigram_prob, uni_context_counts, 1, vocab_size)
correct_uni_perplexity = perplexity(correct_sentence, unigram_prob, uni_context_counts,1,vocab_size)

wrong_uni_prob = calculate_sentence_prob(incorrect_sentence, unigram_prob,uni_context_counts,1,vocab_size)
wrong_uni_perplexity = perplexity(incorrect_sentence, unigram_prob,uni_context_counts, 1, vocab_size)

# 2-gram
correct_bi_prob = calculate_sentence_prob(correct_sentence, bigram_prob,bi_context_counts,2, vocab_size)
correct_bi_perplexity = perplexity(correct_sentence, bigram_prob, bi_context_counts, 2, vocab_size)

wrong_bi_prob = calculate_sentence_prob(incorrect_sentence, bigram_prob,bi_context_counts,2, vocab_size)
wrong_bi_perplexity = perplexity(incorrect_sentence, bigram_prob,bi_context_counts, 2, vocab_size)

# 3-gram
correct_tri_prob = calculate_sentence_prob(correct_sentence, trigram_prob,tri_context_counts,3, vocab_size)
correct_tri_perplexity = perplexity(correct_sentence, trigram_prob,tri_context_counts, 3, vocab_size)

wrong_tri_prob = calculate_sentence_prob(incorrect_sentence, trigram_prob,tri_context_counts,3, vocab_size)
wrong_tri_perplexity = perplexity(incorrect_sentence, trigram_prob,tri_context_counts, 3, vocab_size)

# print the results
# 1-gram
print(f"1-gram correct sentence probability: {correct_uni_prob}, correct sentence perplexity: {correct_uni_perplexity}")
print(f"1-gram incorrect sentence probability: {wrong_uni_prob}, incorrect sentence perplexity: {wrong_uni_perplexity}")

# 2-gram
print(f"2-gram correct sentence probability: {correct_bi_prob}, correct sentence perplexity: {correct_bi_perplexity}")
print(f"2-gram incorrect sentence probability: {wrong_bi_prob}, incorrect sentence perplexity: {wrong_bi_perplexity}")

# 3-gram
print(f"3-gram correct sentence probability: {correct_tri_prob}, correct sentence perplexity: {correct_tri_perplexity}")
print(f"3-gram incorrect sentence probability: {wrong_tri_prob}, incorrect sentence perplexity: {wrong_tri_perplexity}")


correct sentence:  ['<s>', 'i', 'want', 'to', 'speak', 'at', 'ted', 'talk', '.', '</s>']
incorrect sentence:  ['<s>', 'i', 'to', 'want', 'at', 'speak', 'talk', 'ted', '.', '</s>']
1-gram correct sentence probability: 1.6500270200665751e-24, correct sentence perplexity: 238.91911336091727
1-gram incorrect sentence probability: 1.6500270200665751e-24, incorrect sentence perplexity: 238.91911336091727
2-gram correct sentence probability: 7.018851426172904e-19, correct sentence perplexity: 65.36923387132117
2-gram incorrect sentence probability: 2.8407779798543517e-31, incorrect sentence perplexity: 1134.1128486744187
3-gram correct sentence probability: 8.270165684663899e-26, correct sentence perplexity: 322.2912975783582
3-gram incorrect sentence probability: 2.0222862409738123e-37, incorrect sentence perplexity: 4671.06313454491


### Conclusion:
- Probability: In both 2-gram and 3-gram models, the correct sentence has a significantly higher probability than the incorrect sentence. The 1-gram model does not show this difference because it does not consider word order.
- Perplexity: The perplexity is much higher for the incorrect sentence in both 2-gram and 3-gram models, indicates they are hard to predict due to unusual orders.