<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/NLP-Projects/blob/main/Text%20Generation%20N-Grams%20-%20Probabilistic%20Model/Text_Generation_N_gram_Model_With_Chain_Rule_%7C_Markov_Assumptions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download Libraries

In [28]:
import requests
import re
import random
import nltk
from collections import defaultdict, Counter
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Download Dataset

In [29]:
url = "https://www.gutenberg.org/files/2701/2701-0.txt"
# Get text
text = requests.get(url).text

# Filter mobydick corpus
start = text.find("CHAPTER 1. Loomings.")       # Start of text
end = text.find("End of Project Gutenberg")     # End of text
text = text[start:end]                          # Full text

# Text Cleaning

In [30]:
text = re.sub(r"[^a-zA-Z\s]", "", text).lower()
tokens  = word_tokenize(text)
tokens = ['<s>', '<s>'] + tokens + ['</s>']
print(f"Total words: {len(tokens)}")

Total words: 212471


In [31]:
tokens[:10]

['<s>',
 '<s>',
 'chapter',
 'loomings',
 'chapter',
 'the',
 'carpetbag',
 'chapter',
 'the',
 'spouterinn']

# Creating `unigram`, `bigram`, `trigram`

In [32]:
unigrams = Counter(tokens)
bigrams = Counter([(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)])
trigrams = Counter([(tokens[i], tokens[i+1], tokens[i+2]) for i in range(len(tokens)-2)])
print(f"Bigram samples: ")
for pair, count in bigrams.most_common(10):
    print(f"{pair} -> {count}")

print("\n\nTigrams samples: ")
for triplet, count in bigrams.most_common(10):
    print(f"{triplet} -> {count}")

Bigram samples: 
('of', 'the') -> 1887
('in', 'the') -> 1179
('to', 'the') -> 729
('from', 'the') -> 441
('of', 'his') -> 372
('and', 'the') -> 370
('on', 'the') -> 356
('of', 'a') -> 333
('at', 'the') -> 329
('to', 'be') -> 327


Tigrams samples: 
('of', 'the') -> 1887
('in', 'the') -> 1179
('to', 'the') -> 729
('from', 'the') -> 441
('of', 'his') -> 372
('and', 'the') -> 370
('on', 'the') -> 356
('of', 'a') -> 333
('at', 'the') -> 329
('to', 'be') -> 327


## Calculating `Chain Rule Of Probability`

In [33]:
def trigram_prob(w1, w2, w3):
    bigram = (w1, w2)
    trigram = (w1, w2, w3)
    if bigrams[bigram] == 0:
        return 0
    return trigrams[trigram] / bigrams[bigram]

## Generating Text

In [35]:
def generate_sentence(start=('<s>', '<s>'), max_len=20):
    w1, w2 = start
    sentence = [w1, w2]

    for _ in range(max_len):
        candidates = [(w3, trigram_prob(w1, w2, w3))
                      for (w1_, w2_, w3) in trigrams
                      if w1_ == w1 and w2_ == w2]

        if not candidates:
            break

        w3 = max(candidates, key=lambda x: x[1])[0]

        if w3 == "</s>":
            break

        sentence.append(w3)
        w1, w2 = w2, w3

    return ' '.join(sentence[2:])

# Test
print("📝 Generated Sentence:")
print(generate_sentence())

📝 Generated Sentence:
chapter loomings chapter the pequod was as a general thing the mores the pity so if any strange face were
