In [68]:
# Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
import math
import random

from nltk.tokenize import word_tokenize
from nltk import ngrams
from collections import Counter

In [2]:
# Downloads

# Agenda

1. Dataset Loading
2. Data Preprocessing
- Lowercasing
- Removing special characters
- Tokenization
3. Generation of N-grams
- Bigrams
- Trigrams
4. Probability Distribution using Maximum Likelihood Estimator (MLE)
5. Smoothing Techniques
- Laplace Smoothing
6. Evaluation using Perplexity
7. Sentence generation using N-grams

### 1. Dataset Loading


In [3]:
# Loading the data
with open("THE SEVEN HABITS OF HIGHLY EFFECTIVE PEOPLE.txt", encoding = "utf-8") as file:
    book = file.read()

In [9]:
# Printing the first 500 characters
print(book[:500])

the seven habits of highly effective people brought to you by flyheart
the seven habits of highly effective people
stephen r. covey
the seven habits of highly effective people brought to you by flyheart
stephen covey has written a remarkable book about the human condition, so elegantly written, so
understanding of our embedded concerns, so useful for our organization and personal lives, that it's
going to be my gift to everyone i know.
-- warren bennis, author of on becoming a leader
i've never 


### 2. Data Preprocessing
##### - Lowercasing


In [10]:
# Lowercasing the text
book = book.lower()

print(book[:500])

the seven habits of highly effective people brought to you by flyheart
the seven habits of highly effective people
stephen r. covey
the seven habits of highly effective people brought to you by flyheart
stephen covey has written a remarkable book about the human condition, so elegantly written, so
understanding of our embedded concerns, so useful for our organization and personal lives, that it's
going to be my gift to everyone i know.
-- warren bennis, author of on becoming a leader
i've never 


##### - Removing special characters


In [12]:
# Removing the special charachters
book = re.sub(r"[^a-zA-Z0-9\s]", "", book)

print(book[:500])

the seven habits of highly effective people brought to you by flyheart
the seven habits of highly effective people
stephen r covey
the seven habits of highly effective people brought to you by flyheart
stephen covey has written a remarkable book about the human condition so elegantly written so
understanding of our embedded concerns so useful for our organization and personal lives that its
going to be my gift to everyone i know
 warren bennis author of on becoming a leader
ive never known any t


##### - Tokenization


In [15]:
tokens = word_tokenize(book)

print(tokens[:100])

['the', 'seven', 'habits', 'of', 'highly', 'effective', 'people', 'brought', 'to', 'you', 'by', 'flyheart', 'the', 'seven', 'habits', 'of', 'highly', 'effective', 'people', 'stephen', 'r', 'covey', 'the', 'seven', 'habits', 'of', 'highly', 'effective', 'people', 'brought', 'to', 'you', 'by', 'flyheart', 'stephen', 'covey', 'has', 'written', 'a', 'remarkable', 'book', 'about', 'the', 'human', 'condition', 'so', 'elegantly', 'written', 'so', 'understanding', 'of', 'our', 'embedded', 'concerns', 'so', 'useful', 'for', 'our', 'organization', 'and', 'personal', 'lives', 'that', 'its', 'going', 'to', 'be', 'my', 'gift', 'to', 'everyone', 'i', 'know', 'warren', 'bennis', 'author', 'of', 'on', 'becoming', 'a', 'leader', 'ive', 'never', 'known', 'any', 'teacher', 'or', 'mentor', 'on', 'improving', 'personal', 'effectiveness', 'to', 'generate', 'such', 'an', 'overwhelmingly', 'positive', 'reaction', 'this']


##### - Bigrams


In [23]:
# Creating the bigrams
bigrams = list(ngrams(tokens, 2))

# Showing the first 10
bigrams[:10]

[('the', 'seven'),
 ('seven', 'habits'),
 ('habits', 'of'),
 ('of', 'highly'),
 ('highly', 'effective'),
 ('effective', 'people'),
 ('people', 'brought'),
 ('brought', 'to'),
 ('to', 'you'),
 ('you', 'by')]

In [27]:
# Counting the bigrams
bigrams_freq = Counter(bigrams)

# Showing the 15 most common
bigrams_freq.most_common(15)

[(('of', 'the'), 534),
 (('in', 'the'), 427),
 (('to', 'you'), 222),
 (('to', 'the'), 216),
 (('to', 'be'), 210),
 (('on', 'the'), 208),
 (('seven', 'habits'), 204),
 (('and', 'the'), 200),
 (('the', 'seven'), 197),
 (('habits', 'of'), 195),
 (('highly', 'effective'), 190),
 (('effective', 'people'), 187),
 (('of', 'highly'), 185),
 (('brought', 'to'), 175),
 (('you', 'by'), 173)]

##### - Trigrams


In [25]:
# Creating the trigrams
trigrams = list(ngrams(tokens,3))

# Showing the first 10
trigrams[:10]

[('the', 'seven', 'habits'),
 ('seven', 'habits', 'of'),
 ('habits', 'of', 'highly'),
 ('of', 'highly', 'effective'),
 ('highly', 'effective', 'people'),
 ('effective', 'people', 'brought'),
 ('people', 'brought', 'to'),
 ('brought', 'to', 'you'),
 ('to', 'you', 'by'),
 ('you', 'by', 'flyheart')]

In [30]:
#Counting the trigrams
trigrams_freq = Counter(trigrams)

# Showing the most common 15
trigrams_freq.most_common(15)

[(('the', 'seven', 'habits'), 197),
 (('habits', 'of', 'highly'), 184),
 (('of', 'highly', 'effective'), 183),
 (('highly', 'effective', 'people'), 183),
 (('seven', 'habits', 'of'), 182),
 (('brought', 'to', 'you'), 173),
 (('to', 'you', 'by'), 173),
 (('effective', 'people', 'brought'), 172),
 (('people', 'brought', 'to'), 172),
 (('you', 'by', 'flyheart'), 172),
 (('circle', 'of', 'influence'), 49),
 (('as', 'well', 'as'), 41),
 (('you', 'want', 'to'), 41),
 (('you', 'have', 'to'), 34),
 (('in', 'harmony', 'with'), 32)]

### 4. Probability Distribution using Maximum Likelihood Estimator (MLE)


In [33]:
# Defining a function for the probability calculation
def bigram_probability(bigram, bigram_freq, unigram_freq):
    return bigram_freq[bigram] / unigram_freq[bigram[0]]

In [43]:
# Counting the unigrams
unigrams_freq = Counter(tokens)

# Showing the most common 15
unigrams_freq.most_common(15)

[('the', 5530),
 ('to', 3846),
 ('and', 3470),
 ('of', 3223),
 ('a', 2103),
 ('you', 2017),
 ('in', 1889),
 ('i', 1399),
 ('that', 1380),
 ('is', 1290),
 ('it', 1153),
 ('we', 1064),
 ('your', 923),
 ('with', 848),
 ('or', 786)]

In [45]:
# Testing the model
test_bigram = ("the", "man")

# Calculating the probability
probability = bigram_probability(test_bigram, bigrams_freq, unigrams_freq)

# Printing the results
print(f"The probability that the word {test_bigram[1]} follows the word {test_bigram[0]} is {probability*100:.2f}%")

The probability that the word man follows the word the is 0.16%


### 5. Smoothing Techniques
##### - Laplace Smoothing

In [48]:
# Defining the function
def bigram_laplace_probability (bigram, bigram_freq, unigram_freq, vocab_size):
    return (bigram_freq[bigram] + 1 / (unigram_freq[bigram[0]] + vocab_size))

# Calculating the probability
vocab_size = len(unigram_freq)
prob_laplace = bigram_laplace_probability(test_bigram, bigrams_freq, unigrams_freq, vocab_size)

# Printing the result
print(f"The Laplace Smoothing Probability that the word {test_bigram[1]} follows the word {test_bigram[0]} is {prob_laplace:.2f}%")

The Laplace Smoothing Probability that the word man follows the word the is 9.00%


### 6. Evaluation using Perplexity


In [62]:
# Defining the function
def calculate_perplexity(test_data, bigram_freq, unigram_freq, vocab_size):
    perplexity = 0
    N = len(test_data)-1
    # For loop for each word in the bigram
    for i in range(N):
        bigram = test_data[i], test_data[i+1]
        
        # Calculating the probability using the laplace smoothing function
        prob = bigram_laplace_probability(bigram, bigram_freq, unigram_freq, vocab_size)
        
        # Calculating the perplexity in the for loop
        perplexity += math.log2(prob)
    
    
    perplexity = 2**(-perplexity/N)
    return perplexity

In [67]:
# Calculating the perplexity
perplexity_score = calculate_perplexity(tokens, bigrams_freq, unigrams_freq, vocab_size)

# Printing the result
print(f"Perplexity score: {perplexity_score}")

Perplexity score: 0.22997943577694507


### 7. Sentence generation using N-grams

In [72]:
# Defining the function
def generate_sentence(bigram_freq, start_word, max_len):
    current_word = start_word
    sentence = [(current_word)]
    for _ in range (max_len-1):
        next_word_candidates = [(bigram[1], freq) for bigram, freq in bigram_freq.items() if bigram[0] == current_word]
        if next_word_candidates:
            next_word = max(next_word_candidates, key = lambda x: x[1])[0]
            sentence.append(next_word)
            current_word = next_word
        else:
            break
    return " ".join(sentence)

In [76]:
# Generating a sentence
generate_sentence(bigrams_freq, "the", 10)

'the seven habits of the seven habits of the seven'