<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/NLP-Projects/blob/main/Text%20Generation%20-%20Advanced%20N-Gram%20Model/Text_Generation_Advanced_N_Gram_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libs

In [40]:
import requests
import re
import math
import random
from collections import defaultdict, Counter
from nltk.tokenize import word_tokenize
import nltk
nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

# Utility

> *Load & Preprocess Moby Dick from Project Gutenberg*

In [41]:
def load_moby_dick():
    url = "https://www.gutenberg.org/files/2701/2701-0.txt"
    response = requests.get(url)
    text = response.text

    # Optional: Cut off metadata
    start = text.find("CHAPTER 1. Loomings.")       # Start of text
    end = text.find("End of Project Gutenberg")     # End of text
    text = text[start:end]                          # Full text
    return text

> *Preprocess Text*

In [42]:
def preprocess(text):
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # remove punctuation
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = ['<s>', '<s>'] + tokens + ['</s>']
    return tokens

> *Train Trigram Model*

In [49]:
def build_ngram_models(tokens):
    trigram_counts = defaultdict(int)
    bigram_counts = defaultdict(int)
    unigram_counts = defaultdict(int)

    for i in range(len(tokens) - 2):
        w1, w2, w3 = tokens[i], tokens[i+1], tokens[i+2]
        trigram_counts[(w1, w2, w3)] += 1
        bigram_counts[(w1, w2)] += 1
        unigram_counts[w1] += 1
    unigram_counts[tokens[-2]] += 1
    unigram_counts[tokens[-1]] += 1

    vocab = set(tokens)
    return trigram_counts, bigram_counts, unigram_counts, vocab

> *Interpolated Probability*

In [51]:
def interpolated_prob(w1, w2, w3, tri, bi, uni, V, l1=0.6, l2=0.3, l3=0.1):
    trigram = tri.get((w1, w2, w3), 0)
    bigram = bi.get((w2, w3), 0)
    unigram = uni.get(w3, 0)

    p_tri = trigram / bi.get((w1, w2), 1)
    p_bi = bigram / uni.get(w2, 1)
    p_uni = unigram / sum(uni.values())

    return l1 * p_tri + l2 * p_bi + l3 * p_uni


> *Compute Perplexity*

In [52]:
def compute_perplexity(tokens, tri, bi, uni, vocab):
    log_prob = 0
    N = 0
    V = len(vocab)

    for i in range(2, len(tokens)):
        w1, w2, w3 = tokens[i-2], tokens[i-1], tokens[i]
        prob = interpolated_prob(w1, w2, w3, tri, bi, uni, V)
        log_prob += math.log(prob + 1e-10)  # to prevent log(0)
        N += 1
    return math.exp(-log_prob / N)


> *Text Generation With Temperature*

In [53]:
def generate_text(tri, bi, uni, vocab,
                  max_words=30, temperature=1.0,
                  seed=['<s>', '<s>']):
    generated = seed.copy()
    V = len(vocab)
    for _ in range(max_words):
        w1, w2 = generated[-2], generated[-1]
        probs = []
        candidates = list(vocab)
        for w3 in candidates:
            p = interpolated_prob(w1, w2, w3, tri, bi, uni, V)
            probs.append(p ** (1 / temperature))
        total = sum(probs)
        probs = [p / total for p in probs]
        next_word = random.choices(candidates, weights=probs, k=1)[0]
        if next_word == '</s>':
            break
        generated.append(next_word)
    return ' '.join(generated[2:])

# Run

In [54]:
# Run the pipeline
text = load_moby_dick()
tokens = preprocess(text)
print(f"Total Tokens: {len(tokens)}")

trigram_counts, bigram_counts, unigram_counts, vocab = build_ngram_models(tokens)

Total Tokens: 212471


In [55]:
tri, bi, uni, vocab = build_ngram_models(tokens)
print("Perplexity:", compute_perplexity(tokens, tri, bi, uni, vocab))

Perplexity: 4.952905608522148


In [56]:
# Generate text
print("Sample generated texts:")
for temp in [0.7, 1.0, 1.5]:
    print(f"\nTemperature: {temp}")
    print(generate_text(tri, bi, uni, vocab, max_words=40, temperature=temp))

Sample generated texts:

Temperature: 0.7
chapter loomings call me that he can not be as it seemed madness it flew from right to them for theirs and that the modern to the captain for some few hands are wanted from the fiery dart that he

Temperature: 1.0
only consist in hard words were spoken is involuntary consternation commanded three soaked biscuits ye know not even of the tackles may hold by oh sir the hatchway and peered down from this if i touching plenty with my timber

Temperature: 1.5
awhaling fastened beloved fellowcreatures have after such escapes lord hammer the course the head his unmomentous matter this internal carekilling melted naturally conceit climbing the circumference duelled yarman captains overboarddown comes slowly stealthily ignited his legs coming but and plastertied
