In [5]:
import unicodedata, re
from pathlib import Path
from collections import Counter

## Normalizing Sentences in each text file

In [7]:
def normalize_sentence(s, replace_numbers = False):
    s = unicodedata.normalize("NFKC", s)
    s = s.replace("“", "\"").replace("”", "\"").replace("‘", "'").replace("’", "'")
    s = s.replace("—", "-").replace("–", "-").replace("…", "...")
    s = s.lower()
    if replace_numbers:
        s = re.sub(r"\d+([.,]\d+)*", "<num>", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def load_and_normalize(filepath: Path, replace_numbers=False):
    with filepath.open("r", encoding="utf-8", errors="replace") as f:
        lines = [normalize_sentence(line, replace_numbers) for line in f if line.strip()]
    # Add sentence boundary markers
    lines = [f"<s> {line} </s>" for line in lines if line]
    return lines

def build_vocab(sentences, min_freq=2):
    counter = Counter()
    for line in sentences:
        for token in line.split():
            counter[token] += 1
    vocab = {tok for tok, c in counter.items() if c >= min_freq}
    vocab.add("<UNK>")
    return vocab, counter


def replace_rare(sentences, vocab):
    new_sentences = []
    for line in sentences:
        tokens = line.split()
        new_line = " ".join(tok if tok in vocab else "<UNK>" for tok in tokens)
        new_sentences.append(new_line)
    return new_sentences

def save_sentences(sentences, out_path: Path):
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as f:
        for line in sentences:
            f.write(line + "\n")

base_raw = Path('/Users/nsumesh/Documents/GitHub/642HW2/ptbdataset')
base_proc = Path('/Users/nsumesh/Documents/GitHub/642HW2/normalizedptbdataset')
files = {
    "train": "ptb.train.txt",
    "valid": "ptb.valid.txt",
    "test":  "ptb.test.txt"
}

print("Normalizing & adding sentence boundaries...")
train_sentences = load_and_normalize(base_raw / files["train"], replace_numbers=True)
valid_sentences = load_and_normalize(base_raw / files["valid"], replace_numbers=True)
test_sentences  = load_and_normalize(base_raw / files["test"],  replace_numbers=True)

print("Building vocabulary...")
vocab, counter = build_vocab(train_sentences, min_freq=2)
print(f"Vocab size: {len(vocab)}")

print("Replacing rare words with <UNK>...")
train_sentences = replace_rare(train_sentences, vocab)
valid_sentences = replace_rare(valid_sentences, vocab)
test_sentences  = replace_rare(test_sentences, vocab)

print("Saving processed files...")
save_sentences(train_sentences, base_proc / "train.final.txt")
save_sentences(valid_sentences, base_proc / "valid.final.txt")
save_sentences(test_sentences,  base_proc / "test.final.txt")

with (base_proc / "vocab.txt").open("w", encoding="utf-8") as vf:
    for token in sorted(vocab):
        vf.write(token + "\n")



Normalizing & adding sentence boundaries...
Building vocabulary...
Vocab size: 9950
Replacing rare words with <UNK>...
Saving processed files...


# N Gram Models

In [8]:
def read_sentences(path):
    sents = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            sents.append(line.split())
    return sents

def build_n_gram_counts(sentences, n):
    ngram_counts = Counter() 
    context_counts = Counter()
    for sentence in sentences:
        length = len(sentence)
        if length<n:
            continue
        for i in range(length-n+1):
            ngram = tuple(sentence[i:i+n])
            ngram_counts[ngram]+=1
            if n>1:
                context = ngram[:-1]
                context_counts[context]+=1
    return ngram_counts, context_counts

def building_ngrams(sentences, max_n = 4):
    results = {}
    for i in range(1, max_n+1):
        results[i] = build_n_gram_counts(sentences,i)
    return results


path_dir = Path('/Users/nsumesh/Documents/GitHub/642HW2/normalizedptbdataset/train.final.txt')
sentences = read_sentences(path_dir)

all_counts = building_ngrams(sentences, max_n=4)
unigram_count, unigram_context = all_counts[1]
bigram_count, bigram_context = all_counts[2]
trigram_count, trigram_context = all_counts[3]
fourgram_count, fourgram_context = all_counts[4]



# MLE Probabilities

In [11]:
import math
def mle_probabilities(word, context, ngram_count, context_count):
    ngram = context + (word,)
    frequency = ngram_count.get(ngram,0)
    if(len(context)==0):
        denominator = sum(ngram_count.values())
    else:
        denominator = context_count.get(context,0)
    if denominator==0.0 or frequency==0.0:
        return 0.0
    return frequency/denominator

In [14]:
print("P('the') =", mle_probabilities("the", (), unigram_count, {}))
print("P('market'|'the') =", mle_probabilities("market", ("the",), bigram_count, bigram_context))


P('the') = 0.05225094863722486
P('market'|'the') = 0.00992712231632854


In [18]:
def compute_perplexity(sentences, ngram_count, context_count, n):
    total_log_probabilities = 0.0
    tokens = 0
    for sentence in sentences:
        for i in range(n-1, len(sentence)):
            context = tuple(sentence[i:i+n]) if n>1 else ()
            word = sentence[i]
            mle_probability = mle_probabilities(word, context, ngram_count, context_count)
            if mle_probability==0.0:
                return float("inf")
            total_log_probabilities+=math.log2(mle_probability)
            tokens+=1
    if tokens==0.0:
        return float("inf")
    average_log_probabilities = total_log_probabilities/tokens
    return 2**(-average_log_probabilities)

unigram_perplexity = compute_perplexity(sentences, unigram_count, unigram_context, n=1)
bigram_perplexity = compute_perplexity(sentences, bigram_count, bigram_context, n=2)
trigram_perplexity = compute_perplexity(sentences, trigram_count, trigram_context, n=3)
fourgram_perplexity = compute_perplexity(sentences, fourgram_count, fourgram_context, n=4)

print("Unigram Perplexity: ", unigram_perplexity)
print("Bigram Perplexity: ", bigram_perplexity)
print("Trigram Perplexity: ", trigram_perplexity)
print("Four gram Perplexity: ", fourgram_perplexity)

Unigram Perplexity:  616.5898425739796
Bigram Perplexity:  inf
Trigram Perplexity:  inf
Four gram Perplexity:  inf


In [19]:
def laplace_probability(word, context, ngram_count, context_count):
    ngram = context + (word,)
    frequency = ngram_count.get(ngram,0)+1
    if(len(context)==0):
        denominator = sum(ngram_count.values()) + len(vocab)
    else:
        denominator = context_count.get(context,0) + len(vocab)
    # if denominator==0.0 or frequency==0.0:
    #     return 0.0
    return frequency/denominator 


def compute_perplexity_with_laplace(sentences, ngram_count, context_count, n):
    total_log_probabilities = 0.0
    tokens = 0
    for sentence in sentences:
        for i in range(n-1, len(sentence)):
            context = tuple(sentence[i:i+n]) if n>1 else ()
            word = sentence[i]
            laplace_prob = laplace_probability(word, context, ngram_count, context_count)
            if laplace_prob==0.0:
                return float("inf")
            total_log_probabilities+=math.log2(laplace_prob)
            tokens+=1
    if tokens==0.0:
        return float("inf")
    average_log_probabilities = total_log_probabilities/tokens
    return 2**(-average_log_probabilities)


unigram_perplexity_with_laplace = compute_perplexity_with_laplace(sentences, unigram_count, unigram_context, n=1)
bigram_perplexity_with_laplace = compute_perplexity_with_laplace(sentences, bigram_count, bigram_context, n=2)
trigram_perplexity_with_laplace = compute_perplexity_with_laplace(sentences, trigram_count, trigram_context, n=3)
fourgram_perplexity_with_laplace = compute_perplexity_with_laplace(sentences, fourgram_count, fourgram_context, n=4)

print("Unigram Perplexity with Laplace: ", unigram_perplexity_with_laplace)
print("Bigram Perplexity with Laplace: ", bigram_perplexity_with_laplace)
print("Trigram Perplexity with Laplace: ", trigram_perplexity_with_laplace)
print("Fourgram Perplexity with Laplace: ", fourgram_perplexity_with_laplace)


Unigram Perplexity with Laplace:  616.7887358737605
Bigram Perplexity with Laplace:  9949.999998582753
Trigram Perplexity with Laplace:  9949.999998779924
Fourgram Perplexity with Laplace:  9949.99999899671
