Without NLTK

In [None]:
from collections import defaultdict
import re
import numpy as np
import pandas as pd

In [None]:
# Input text data
raw_text = "There is a big garden. Children play in a garden. They play inside beautiful garden."

# Tag the end of every sentence: each '.' becomes the token ' (eos)'.
data = re.sub(r'\.', ' (eos)', raw_text)

# Split into sentences on the (eos) tag. Splitting on '. ' here would never
# match, because the periods have already been replaced, and would leave the
# whole text as a single "sentence". Each sentence keeps its trailing tag.
sentences = [f"{chunk.strip()} (eos)" for chunk in data.split('(eos)') if chunk.strip()]

In [None]:
# Sliding-window n-gram extraction using only built-ins (no NLTK)
def generate_ngrams(text, n):
    """Return every run of ``n`` consecutive whitespace-separated tokens.

    Args:
        text: String to tokenise; tokens are produced by ``str.split()``.
        n: Number of tokens per n-gram.

    Returns:
        A list of tuples, one per window position, in order of appearance.
        The list is empty when ``text`` has fewer than ``n`` tokens.
    """
    tokens = text.split()
    window_count = len(tokens) - n + 1
    return [tuple(tokens[start:start + n]) for start in range(window_count)]

# Build the 1-, 2- and 3-gram lists from the (eos)-tagged text.
unigrams, bigrams, trigrams = (generate_ngrams(data, size) for size in (1, 2, 3))

# Show each list on its own line.
print(unigrams, bigrams, trigrams, sep="\n")

[('There',), ('is',), ('a',), ('big',), ('garden',), ('(eos)',), ('Children',), ('play',), ('in',), ('a',), ('garden',), ('(eos)',), ('They',), ('play',), ('inside',), ('beautiful',), ('garden',), ('(eos)',)]
[('There', 'is'), ('is', 'a'), ('a', 'big'), ('big', 'garden'), ('garden', '(eos)'), ('(eos)', 'Children'), ('Children', 'play'), ('play', 'in'), ('in', 'a'), ('a', 'garden'), ('garden', '(eos)'), ('(eos)', 'They'), ('They', 'play'), ('play', 'inside'), ('inside', 'beautiful'), ('beautiful', 'garden'), ('garden', '(eos)')]
[('There', 'is', 'a'), ('is', 'a', 'big'), ('a', 'big', 'garden'), ('big', 'garden', '(eos)'), ('garden', '(eos)', 'Children'), ('(eos)', 'Children', 'play'), ('Children', 'play', 'in'), ('play', 'in', 'a'), ('in', 'a', 'garden'), ('a', 'garden', '(eos)'), ('garden', '(eos)', 'They'), ('(eos)', 'They', 'play'), ('They', 'play', 'inside'), ('play', 'inside', 'beautiful'), ('inside', 'beautiful', 'garden'), ('beautiful', 'garden', '(eos)')]


In [None]:
def create_bigram_table(bigrams):
    """Tabulate how often each word is followed by each other word.

    Args:
        bigrams: Iterable of 2-tuples ``(previous_word, current_word)``.

    Returns:
        A ``pandas.DataFrame`` of integer counts. Rows are the distinct first
        words and columns are the distinct second words, both sorted. The
        cell ``[p, c]`` is the number of times ``(p, c)`` occurs in
        ``bigrams``.
    """
    # followers_by_word[p][c] -> number of occurrences of the pair (p, c)
    followers_by_word = {}
    for first, second in bigrams:
        followers = followers_by_word.setdefault(first, {})
        followers[second] = followers.get(second, 0) + 1

    row_labels = sorted(followers_by_word)
    col_labels = sorted(set().union(*followers_by_word.values()))

    # Dense count matrix; pairs that never occurred are filled with 0.
    matrix = [
        [followers_by_word[row].get(col, 0) for col in col_labels]
        for row in row_labels
    ]
    return pd.DataFrame(matrix, index=row_labels, columns=col_labels)

# Build the bigram count table from the `bigrams` list and render it.
bigram_table_df = create_bigram_table(bigrams)
display(bigram_table_df)

Unnamed: 0,(eos),Children,They,a,beautiful,big,garden,in,inside,is,play
(eos),0,1,1,0,0,0,0,0,0,0,0
Children,0,0,0,0,0,0,0,0,0,0,1
There,0,0,0,0,0,0,0,0,0,1,0
They,0,0,0,0,0,0,0,0,0,0,1
a,0,0,0,0,0,1,1,0,0,0,0
beautiful,0,0,0,0,0,0,1,0,0,0,0
big,0,0,0,0,0,0,1,0,0,0,0
garden,3,0,0,0,0,0,0,0,0,0,0
in,0,0,0,1,0,0,0,0,0,0,0
inside,0,0,0,0,1,0,0,0,0,0,0


In [None]:
# Calculate sentence probabilities using the bigram table DataFrame
def calculate_sentence_probability(sentence, bigram_df):
    """Score a sentence with maximum-likelihood bigram probabilities.

    The result is the product of ``count(prev, curr) / count(prev, *)`` over
    every pair of adjacent tokens in ``sentence``. The first token is not
    scored because it has no predecessor.

    Args:
        sentence: Text to score; tokens are produced by ``str.split()``.
        bigram_df: Count table whose row labels are previous words and whose
            column labels are current words.

    Returns:
        The sentence probability. It is ``1.0`` for sentences with fewer than
        two tokens, and ``0.0`` as soon as a transition involves a word that
        is missing from the table or a previous word with no recorded
        followers.
    """
    words = sentence.split()
    probability = 1.0
    for prev_word, curr_word in zip(words, words[1:]):
        # An out-of-vocabulary word means the bigram was never observed. Its
        # MLE probability is 0, so return that instead of letting `.loc`
        # raise KeyError.
        if prev_word not in bigram_df.index or curr_word not in bigram_df.columns:
            return 0.0

        prev_word_count = bigram_df.loc[prev_word].sum()
        # A row of zeros would make the ratio undefined; treat it as unseen.
        if prev_word_count == 0:
            return 0.0

        bigram_count = bigram_df.loc[prev_word, curr_word]
        probability *= bigram_count / prev_word_count
    return probability

In [None]:
import math

sentence = "They play in a big garden"

sentence_prob = calculate_sentence_probability(sentence, bigram_table_df)
print(f"Probability of '{sentence}': {sentence_prob:.6f}")

# Normalise by the number of bigram transitions that were actually scored
# (tokens - 1; the first word has no predecessor). len(sentence) would count
# characters, not words, and would drive the perplexity towards 1.
N = len(sentence.split()) - 1
# A sentence containing an unseen bigram has probability 0; report infinite
# perplexity instead of dividing by zero.
perplexity = math.pow(1 / sentence_prob, 1 / N) if sentence_prob > 0 else math.inf

print("Perplexity:", perplexity)

Probability of 'They play in a big garden': 0.250000
Perplexity: 1.062127176862691


With NLTK

In [None]:
import nltk
nltk.download('punkt')
from nltk.util import ngrams
from nltk import word_tokenize

sentence = "The quick brown fox jumps over the lazy dog."

tokens = word_tokenize(sentence)
# ngrams() returns a one-shot generator. Materialise each result as a list so
# the variables still hold the n-grams after they have been printed.
unigrams_nltk = list(ngrams(tokens, 1))
bigrams_nltk = list(ngrams(tokens, 2))
trigrams_nltk = list(ngrams(tokens, 3))

print("Unigrams (NLTK):", unigrams_nltk)
print("Bigrams (NLTK):", bigrams_nltk)
print("Trigrams (NLTK):", trigrams_nltk)


Unigrams (NLTK): [('The',), ('quick',), ('brown',), ('fox',), ('jumps',), ('over',), ('the',), ('lazy',), ('dog',), ('.',)]
Bigrams (NLTK): [('The', 'quick'), ('quick', 'brown'), ('brown', 'fox'), ('fox', 'jumps'), ('jumps', 'over'), ('over', 'the'), ('the', 'lazy'), ('lazy', 'dog'), ('dog', '.')]
Trigrams (NLTK): [('The', 'quick', 'brown'), ('quick', 'brown', 'fox'), ('brown', 'fox', 'jumps'), ('fox', 'jumps', 'over'), ('jumps', 'over', 'the'), ('over', 'the', 'lazy'), ('the', 'lazy', 'dog'), ('lazy', 'dog', '.')]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
