In [5]:
import unicodedata, re
from pathlib import Path
from collections import Counter

## Normalizing Sentences in each text file

In [6]:
def normalize_sentence(s, replace_numbers = False):
    """Return a canonical, lower-cased, single-spaced version of ``s``.

    Steps, in order: NFKC Unicode normalization; curly quotes, long dashes
    and the ellipsis character mapped to ASCII; lower-casing; optional
    replacement of digit runs (with ``.``/``,`` separated groups) by the
    literal ``<num>``; whitespace runs collapsed to one space and the ends
    trimmed.

    Args:
        s: Input text.
        replace_numbers: When true, numeric spans are replaced by ``<num>``.

    Returns:
        The normalized string (may be empty).
    """
    text = unicodedata.normalize("NFKC", s)

    # Typographic punctuation and the ASCII text that stands in for it.
    ascii_equivalents = (
        ("“", "\""),
        ("”", "\""),
        ("‘", "'"),
        ("’", "'"),
        ("—", "-"),
        ("–", "-"),
        ("…", "..."),
    )
    for fancy, plain in ascii_equivalents:
        text = text.replace(fancy, plain)

    text = text.lower()

    if replace_numbers:
        # The placeholder is inserted after lower-casing, so it stays as written.
        text = re.sub(r"\d+([.,]\d+)*", "<num>", text)

    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

def load_and_normalize(filepath: Path, replace_numbers=False):
    """Read a text file line by line and return normalized, boundary-marked sentences.

    The file is decoded as UTF-8 with ``errors="replace"``, so undecodable
    bytes are substituted instead of raising. Whitespace-only lines are
    skipped, every remaining line goes through ``normalize_sentence``, and
    each non-empty result is wrapped as ``<s> ... </s>``.

    Args:
        filepath: Path of the file to read.
        replace_numbers: Forwarded to ``normalize_sentence``.

    Returns:
        List of strings of the form ``"<s> sentence </s>"``.
    """
    wrapped = []
    with filepath.open("r", encoding="utf-8", errors="replace") as handle:
        for raw in handle:
            if not raw.strip():
                continue
            cleaned = normalize_sentence(raw, replace_numbers)
            # Add sentence boundary markers (only to non-empty results).
            if cleaned:
                wrapped.append(f"<s> {cleaned} </s>")
    return wrapped

def build_vocab(sentences, min_freq=2, unk_token="<UNK>"):
    """Count whitespace-separated tokens and build a frequency-thresholded vocabulary.

    Args:
        sentences: Iterable of strings; each one is split on whitespace.
        min_freq: A token enters the vocabulary when its count is at least
            this value.
        unk_token: Placeholder for out-of-vocabulary words; it is always
            added to the vocabulary. Defaults to ``"<UNK>"``.

    Returns:
        ``(vocab, counter)`` where ``vocab`` is a set of tokens (including
        ``unk_token``) and ``counter`` is a ``Counter`` holding the count of
        every token seen, frequent or not.

    NOTE(review): ``normalize_sentence`` lower-cases its input, so an
    unknown-word marker already present in the corpus would surface in
    lower case and be a different token from the upper-case default here.
    Pass ``unk_token`` explicitly if the two should coincide.
    """
    counter = Counter()
    for line in sentences:
        counter.update(line.split())
    vocab = {tok for tok, count in counter.items() if count >= min_freq}
    vocab.add(unk_token)
    return vocab, counter


def replace_rare(sentences, vocab, unk_token="<UNK>"):
    """Replace every token that is not in ``vocab`` with ``unk_token``.

    Each sentence is split on whitespace and re-joined with single spaces,
    so runs of whitespace in the input are collapsed in the output.

    Args:
        sentences: Iterable of strings.
        vocab: Container supporting ``in`` (a set gives constant-time lookup).
        unk_token: Replacement for out-of-vocabulary tokens. Defaults to
            ``"<UNK>"``; use the same value that was given to the
            vocabulary builder.

    Returns:
        A new list of strings, one per input sentence.
    """
    return [
        " ".join(tok if tok in vocab else unk_token for tok in line.split())
        for line in sentences
    ]

def save_sentences(sentences, out_path: Path):
    """Write ``sentences`` to ``out_path`` as UTF-8, one per line.

    Missing parent directories are created first; an existing file is
    overwritten.

    Args:
        sentences: Iterable of strings (each gets a trailing newline).
        out_path: Destination file path.
    """
    target_dir = out_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as sink:
        sink.writelines(line + "\n" for line in sentences)

# Input directory with the raw PTB splits and output directory for the
# processed files.
# NOTE(review): both are absolute paths on one particular machine; consider
# deriving them from a configurable data directory.
base_raw = Path('/Users/nsumesh/Documents/GitHub/642HW2/ptbdataset')
base_proc = Path('/Users/nsumesh/Documents/GitHub/642HW2/normalizedptbdataset')
files = {
    "train": "ptb.train.txt",
    "valid": "ptb.valid.txt",
    "test":  "ptb.test.txt"
}

# Step 1: normalize every split (numbers become <num>) and wrap each sentence
# in <s> ... </s>. The splits are processed in train, valid, test order.
print("Normalizing & adding sentence boundaries...")
train_sentences, valid_sentences, test_sentences = (
    load_and_normalize(base_raw / files[split], replace_numbers=True)
    for split in ("train", "valid", "test")
)

# Step 2: the vocabulary is built from the training split only.
print("Building vocabulary...")
vocab, counter = build_vocab(train_sentences, min_freq=2)
print(f"Vocab size: {len(vocab)}")

# Step 3: apply the training vocabulary to all three splits.
print("Replacing rare words with <UNK>...")
train_sentences, valid_sentences, test_sentences = (
    replace_rare(split_sentences, vocab)
    for split_sentences in (train_sentences, valid_sentences, test_sentences)
)

# Step 4: write the processed splits and the sorted vocabulary to disk.
print("Saving processed files...")
save_sentences(train_sentences, base_proc / "train.final.txt")
save_sentences(valid_sentences, base_proc / "valid.final.txt")
save_sentences(test_sentences,  base_proc / "test.final.txt")

vocab_path = base_proc / "vocab.txt"
with vocab_path.open("w", encoding="utf-8") as vf:
    for token in sorted(vocab):
        vf.write(token + "\n")



Normalizing & adding sentence boundaries...
Building vocabulary...
Vocab size: 9950
Replacing rare words with <UNK>...
Saving processed files...


# N-gram Models

In [None]:
def read_sentences(path):
    """Load a UTF-8 text file as a list of token lists.

    Blank and whitespace-only lines are skipped; every other line is split
    on whitespace into its tokens.

    Args:
        path: Object with a ``pathlib.Path``-style ``open`` method.

    Returns:
        List of token lists, one per non-blank line, in file order.
    """
    with path.open("r", encoding="utf-8") as handle:
        return [raw.split() for raw in handle if raw.strip()]

def build_n_gram_counts(sentences, n):
    """Count all n-grams of order ``n`` and their (n-1)-token contexts.

    Args:
        sentences: Iterable of token sequences (e.g. lists of strings).
        n: N-gram order; must be at least 1.

    Returns:
        ``(ngram_counts, context_counts)``. ``ngram_counts`` maps each
        n-token tuple to its number of occurrences. ``context_counts`` maps
        each (n-1)-token prefix to the number of n-grams that start with it;
        it is left empty when ``n == 1``.

    Raises:
        ValueError: If ``n`` is smaller than 1. Without this check a
            non-positive order would silently yield meaningless counts.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    ngram_counts = Counter()
    context_counts = Counter()
    for sentence in sentences:
        # The range is empty for sentences shorter than n, so they add nothing.
        for start in range(len(sentence) - n + 1):
            ngram = tuple(sentence[start:start + n])
            ngram_counts[ngram] += 1
            if n > 1:
                # The context is the n-gram without its final token.
                context_counts[ngram[:-1]] += 1
    return ngram_counts, context_counts

def building_ngrams(sentences, max_n = 4):
    """Build n-gram count tables for every order from 1 up to ``max_n``.

    Args:
        sentences: Token sequences handed to ``build_n_gram_counts`` once
            per order.
        max_n: Highest n-gram order to compute (inclusive).

    Returns:
        Dict mapping each order to the pair returned by
        ``build_n_gram_counts`` for that order.
    """
    return {
        order: build_n_gram_counts(sentences, order)
        for order in range(1, max_n + 1)
    }

