In [1]:
import numpy
import torch
import matplotlib.pyplot as plt
%matplotlib inline

# E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. 
# Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?
# E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. 
# Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

In [2]:
words = open('names.txt', 'r').read().splitlines()

In [3]:
g = torch.Generator().manual_seed(2147483647)
perm = torch.randperm(len(words), generator=g)

n = len(words)
n_train = int(0.8 * n)
n_dev = int(0.1 * n)

train_words = [words[i] for i in perm[:n_train]]
dev_words = [words[i] for i in perm[n_train:n_train + n_dev]]
test_words = [words[i] for i in perm[n_train + n_dev:]]

In [4]:
chars = sorted(list(set(''.join(words))))
ctoi = {c: i+1 for i,c in enumerate(chars)}
ctoi['.'] = 0
N = torch.zeros((27, 27), dtype=torch.int32)
itoc = {i: c for c, i in ctoi.items()}

In [5]:
import torch.nn.functional as F
# NN trigram model
def make_trigram_tensors(words_list):
    xs, ys = [], []
    for w in words_list:
        chs = list(f"..{w}.")
        for i in range(len(chs)-2):
            ix1, ix2, y1 = ctoi[chs[i]], ctoi[chs[i+1]], ctoi[chs[i+2]]
            xs.append([ix1, ix2])
            ys.append(y1)
    xs = torch.tensor(xs).long()
    ys = torch.tensor(ys)
    xs = xs[:, 0] * 27 + xs[:, 1] # Index into a flatten 27*27 = 0,...,728 sequence of numbers.
    return xs, ys

In [6]:
def train_nn(xs, ys, reg_param=0.01):
    #num_examples = xs.shape[0] 
    g = torch.Generator().manual_seed(2147483647)
    W = torch.randn((27*27, 27), generator=g, requires_grad=True) # dim (729, 27)
    for _ in range(300):
        logits = W[xs] # (num_examples, 27)
        loss = F.cross_entropy(logits, ys)
        #counts = logits.exp() # softmax step 1
        #probs = counts / counts.sum(1, keepdim=True) # softmax step 2
        #loss = -probs[torch.arange(num_examples), ys].log().mean() + reg_param*(W**2).mean() # nll = (-1/n) * sum(log(probs)) + regularization

        # update
        W.grad = None    
        loss.backward()

        W.data += -100 * W.grad
    return W

In [7]:
def cross_entropy(logits, ys):
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdim=True)
    loss = -probs[torch.arange(ys.shape[0]), ys].log().mean()
    return loss

In [8]:
xs, ys = make_trigram_tensors(train_words)

In [9]:
# E03: Evaluate regularization based on dev loss for NN of trigram
min_loss, min_W = None, None 
for i in [0.0001, 0.001, 0.01, 1, 10]:
    W = train_nn(xs, ys, reg_param=i)
    train_loss_nn = F.cross_entropy(W[xs], ys)
    print(f"Train loss: {train_loss_nn.item()} for lambda: {i}")
    with torch.no_grad():
        x_dev, y_dev = make_trigram_tensors(dev_words)
        logits = W[x_dev]
        dev_loss_nn = F.cross_entropy(logits, y_dev)
        print(f"Dev loss: {dev_loss_nn.item()} for lambda: {i}")
        loss_val = dev_loss_nn.item()
        if min_loss is None or loss_val < min_loss:
            min_loss = loss_val
            min_W = W.detach().clone()

W = min_W

Train loss: 2.2626869678497314 for lambda: 0.0001
Dev loss: 2.2859621047973633 for lambda: 0.0001
Train loss: 2.2626869678497314 for lambda: 0.001
Dev loss: 2.2859621047973633 for lambda: 0.001
Train loss: 2.2626869678497314 for lambda: 0.01
Dev loss: 2.2859621047973633 for lambda: 0.01
Train loss: 2.2626869678497314 for lambda: 1
Dev loss: 2.2859621047973633 for lambda: 1
Train loss: 2.2626869678497314 for lambda: 10
Dev loss: 2.2859621047973633 for lambda: 10


In [10]:
# E02: Evaluate trigram nll on dev and test splits for NN:
with torch.no_grad():
    x_dev, y_dev = make_trigram_tensors(dev_words)
    logits = W[x_dev]
    dev_loss_nn = F.cross_entropy(logits, y_dev)

    x_test, y_test = make_trigram_tensors(test_words)
    logits = W[x_test]
    test_loss_nn = F.cross_entropy(logits, y_test)

    print(f"{test_loss_nn=}")
    print(f"{dev_loss_nn=}")


test_loss_nn=tensor(2.2784)
dev_loss_nn=tensor(2.2860)


In [11]:
# Sampling
g = torch.Generator().manual_seed(2147483647)
for i in range(5):
  out, ix1, ix2 = [] ,0, 0
  while True:
    ctx = torch.tensor([ix1 * 27 + ix2])
    xenc = F.one_hot(ctx, num_classes=27*27).float()
    logits = xenc @ W # prediction for log counts
    counts = logits.exp() # counts
    p = counts / counts.sum(1, keepdim=True) # probabilities for next char by the model
    ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itoc[ix])
    if ix == 0:
      break
    ix1, ix2 = ix2, ix
  print(''.join(out))




cexzdfzjglkuriana.
kayhdell.
imilea.
noluwan.
ka.


In [12]:
# Trigram Histogram Model
N = torch.zeros((27, 27, 27), dtype=torch.int32)
for w in train_words:
    chs = list(f"..{w}.")
    for i in range(len(chs)-2):
        ix1, ix2, y1 = ctoi[chs[i]], ctoi[chs[i+1]], ctoi[chs[i+2]]
        N[ix1, ix2, y1]+=1

In [13]:
P = (N+1).float() # laplace smoothing
P /= P.sum(2, keepdim=True)

In [14]:
log_likelihood = 0.0
n = 0

for w in train_words:
    chs = list(f"..{w}.")
    for i in range(len(chs)-2):
        ix1, ix2, y1 = ctoi[chs[i]], ctoi[chs[i+1]], ctoi[chs[i+2]]
        prob = P[ix1, ix2, y1]
        logprob = torch.log(prob)
        log_likelihood += logprob
        n+=1

nll = -log_likelihood/n
print(f"{nll=}")

nll=tensor(2.2147)


In [15]:
g = torch.Generator().manual_seed(2147483647)
for i in range(5):
  out = []
  ix1, ix2 = 0, 0
  while True:
    p = P[ix1, ix2]
    ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itoc[ix])
    if ix == 0:
      break
    ix1, ix2 = ix2, ix
  print(''.join(out))

ce.
bra.
jalius.
ilaziandrevonimilea.
nylanak.


In [16]:
def trigram_counting_nll(words_list, probs_table):
    log_likelihood, n = 0.0, 0
    for w in words_list:
        chs = list(f"..{w}.")
        for i in range(len(chs) - 2):
            ix1, ix2, ix3 = ctoi[chs[i]], ctoi[chs[i+1]], ctoi[chs[i+2]]
            log_likelihood += torch.log(probs_table[ix1,ix2,ix3])
            n+=1
    return -log_likelihood/n

In [17]:
# E02: Evaluate trigram nll on dev and test splits for counting model:
dev_loss_count = trigram_counting_nll(dev_words, P)
test_loss_count = trigram_counting_nll(test_words, P)
print(f"{dev_loss_count=}")
print(f"{test_loss_count=}")

dev_loss_count=tensor(2.2443)
test_loss_count=tensor(2.2364)
