
# Lesson 3 — N-gram Language Models & Cross-Entropy
**Goal:** Train unigram/bigram/trigram language models on your corpus; sample text; compute cross-entropy.

**What you'll learn**
- Conditional probabilities P(w_t | w_{t-1}, ...)
- Add-k smoothing to avoid zeros
- Sampling with temperature
- Measuring model quality with cross-entropy/perplexity


In [None]:

from pathlib import Path
import re, random, math, collections

# Read each themed corpus file, newline-terminate it, and glue the pieces
# together into one training string.
data_dir = Path("../data")
corpus_parts = []
for fname in ("space.txt", "animals.txt", "minecraft.txt"):
    file_text = (data_dir / fname).read_text(encoding="utf-8")
    corpus_parts.append(file_text + "\n")
text = "".join(corpus_parts)

# Tokenise the lower-cased corpus: runs of letters/apostrophes are words,
# and each of . , ! ? ; : is a token of its own.
tokens = re.findall(r"[a-zA-Z']+|[.,!?;:]", text.lower())


In [None]:

def ngrams(tokens, n):
    """Lazily yield each contiguous run of ``n`` tokens as a tuple.

    Windows are produced left to right, one per valid start position, so a
    sequence of length ``L`` yields ``L - n + 1`` tuples (none when the
    sequence is shorter than ``n``).
    """
    window_starts = range(len(tokens) - n + 1)
    yield from (tuple(tokens[start:start + n]) for start in window_starts)

def train_ngram(tokens, n=2, k=1.0):
    """Fit an add-k smoothed n-gram model over ``tokens``.

    Args:
        tokens: Sequence of training tokens.
        n: Model order (1 = unigram, 2 = bigram, 3 = trigram, ...).
        k: Additive smoothing constant added to every n-gram count.

    Returns:
        A pair ``(prob, vocab)``. ``vocab`` is the sorted list of distinct
        tokens. ``prob(context, w)`` returns the smoothed estimate
        ``(count(context + (w,)) + k) / (count(context) + k * V)`` where
        ``V = len(vocab)`` and ``context`` is a tuple of ``n - 1`` tokens.
        For ``n == 1`` the context is ignored and the denominator uses the
        total token count. When the denominator is zero (``k == 0`` with an
        unseen context, or an empty corpus) ``prob`` returns ``0.0``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be >= 1")

    counts = collections.Counter(ngrams(tokens, n))
    vocab = sorted(set(tokens))
    V = len(vocab)
    total_tokens = len(tokens)

    ctx_counts = None
    if n > 1:
        # Count each context as the prefix of an observed n-gram. Counting
        # raw (n-1)-grams instead would also count the final one, which has
        # no continuation, so that context's probabilities would sum to < 1.
        ctx_counts = collections.Counter()
        for gram, gram_count in counts.items():
            ctx_counts[gram[:-1]] += gram_count

    def prob(context, w):
        if n == 1:
            numerator = counts[(w,)] + k
            denominator = total_tokens + k * V
        else:
            numerator = counts[context + (w,)] + k
            denominator = ctx_counts[context] + k * V
        if denominator == 0:
            # Nothing observed and no smoothing mass: report zero instead of
            # raising ZeroDivisionError.
            return 0.0
        return numerator / denominator

    return prob, vocab

# Train one model per order; higher orders get a smaller smoothing constant.
unigram, V1 = train_ngram(tokens, n=1, k=1.0)
bigram, V2 = train_ngram(tokens, n=2, k=0.5)
trigram, V3 = train_ngram(tokens, n=3, k=0.1)

# Notebook display: the vocabulary size reported by each model.
tuple(len(vocabulary) for vocabulary in (V1, V2, V3))


In [None]:

import random

def sample(prob, vocab, n=2, max_len=30, temperature=1.0, seed=None):
    """Generate text by repeatedly sampling the next word from an n-gram model.

    Args:
        prob: Callable ``prob(context, w)`` returning the probability of word
            ``w`` after ``context`` (a tuple of ``n - 1`` words).
        vocab: Sequence of candidate words.
        n: Model order; determines how many previous words form the context.
        max_len: Number of words to generate.
        temperature: Each probability is raised to ``1 / temperature`` before
            renormalising; values below 1 sharpen the distribution, values
            above 1 flatten it. Must be non-zero.
        seed: Seed for a private random generator; the same seed reproduces
            the same text. ``None`` seeds from system entropy.

    Returns:
        The generated words joined by single spaces ("" for an empty vocab).
    """
    if not vocab:
        return ""

    # A private generator keeps sampling reproducible without reseeding the
    # module-level RNG that other code may rely on.
    rng = random.Random(seed)

    # Start from a random context of n-1 words (empty for a unigram model).
    ctx_len = max(n - 1, 0)
    context = tuple(rng.choice(vocab) for _ in range(ctx_len))

    exponent = 1.0 / temperature
    result = []
    for _ in range(max_len):
        # Temperature-scaled, unnormalised scores for every candidate word.
        scores = [prob(context, w) ** exponent for w in vocab]
        threshold = rng.random() * sum(scores)

        # Walk the cumulative scores until the threshold is reached. The last
        # word is the fallback in case rounding leaves the running sum just
        # below the threshold, so every step emits exactly one word.
        chosen = vocab[-1]
        cum = 0.0
        for w, sc in zip(vocab, scores):
            cum += sc
            if cum >= threshold:
                chosen = w
                break
        result.append(chosen)

        # Slide the context window forward by one word.
        if ctx_len:
            context = (context + (chosen,))[-ctx_len:]
    return " ".join(result)

# Generate one fixed-seed sample from the bigram model, then one from the
# trigram model, printing each as soon as it is produced.
bigram_text = sample(bigram, V2, n=2, temperature=0.8, seed=42)
print("Bigram sample:", bigram_text)
trigram_text = sample(trigram, V3, n=3, temperature=0.9, seed=7)
print("Trigram sample:", trigram_text)


In [None]:

def cross_entropy(prob, vocab, tokens, n, train_frac=0.8):
    """Return the average negative log2-probability (bits per token) of the tail of ``tokens``.

    Args:
        prob: Callable ``prob(context, w)`` giving the model probability of
            ``w`` after ``context`` (a tuple of ``n - 1`` tokens).
        vocab: Unused; kept so existing callers keep working.
        tokens: Full token sequence. Only ``tokens[int(train_frac * len):]``
            is scored.
        n: Model order; each scored token uses the ``n - 1`` tokens before it
            (within the scored tail) as context.
        train_frac: Fraction of ``tokens`` skipped at the front.

    Returns:
        Mean of ``-log2(p)`` over the scored positions, or ``0.0`` when there
        are none. Probabilities are floored at ``1e-12`` so a zero estimate
        yields a large finite penalty instead of a math error.

    Note:
        The tail is only a true held-out set if ``prob`` was trained without
        it; this function cannot check how the model was trained.
    """
    split = int(train_frac * len(tokens))
    test = tokens[split:]

    # The first ctx_len tokens of the tail lack a full context and are skipped.
    ctx_len = max(n - 1, 0)
    total_bits = 0.0
    count = 0
    for i in range(ctx_len, len(test)):
        context = tuple(test[i - ctx_len:i])
        p = max(prob(context, test[i]), 1e-12)
        total_bits += -math.log2(p)
        count += 1
    return total_bits / max(count, 1)

# Score each model order on the tail of the corpus and report cross-entropy
# (bits per token) together with the matching perplexity, 2 ** H.
trained_models = (unigram, bigram, trigram)
vocabularies = (V1, V2, V3)
for n, (p, V) in enumerate(zip(trained_models, vocabularies), start=1):
    H = cross_entropy(p, V, tokens, n)
    ppl = 2 ** H
    print(f"{n}-gram: cross-entropy={H:.2f}, perplexity={ppl:.2f}")



### Challenges
- Change the smoothing constant `k` and observe the effect on perplexity.
- Add more themed text files to `data/`, add their names to the file list in the loading cell, and re-train.
