In [1]:
words = open('names.txt', 'r').read().splitlines()

In [2]:
from string import ascii_lowercase
stoi = {s:i+1 for i,s in enumerate(ascii_lowercase)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
stoi
itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

In [3]:
tokens = [list(f'.{w}.') for w in words]
tokens[0]

['.', 'e', 'm', 'm', 'a', '.']

## E01
train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

In [6]:
import torch

BiLM = torch.zeros((27, 27), dtype=torch.int32)
TriLM = torch.zeros((27, 27, 27), dtype=torch.int32)

for chs in tokens:
    # bigram
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1, ix2 = stoi[ch1], stoi[ch2]
        BiLM[ix1, ix2] += 1
        
    # trigram
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1, ix2, ix3 = stoi[ch1], stoi[ch2], stoi[ch3]
        TriLM[ix1, ix2, ix3] += 1
        
BiP = (BiLM + 1).float()
BiP /= BiP.sum(1, keepdims=True)

TriP = (TriLM + 1).float()
TriP /= TriP.sum(2, keepdims=True)

In [7]:
bi_log_likelihood = 0.0
bi_n = 0

tri_log_likelihood = 0.0
tri_n = 0


for chs in tokens:
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1, ix2 = stoi[ch1], stoi[ch2]
        prob = BiP[ix1, ix2]
        logprob = torch.log(prob)
        
        bi_log_likelihood += logprob
        bi_n += 1
        
    # trigram
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1, ix2, ix3 = stoi[ch1], stoi[ch2], stoi[ch3]
        prob = TriP[ix1, ix2, ix3]
        logprob = torch.log(prob)
        
        tri_log_likelihood += logprob
        tri_n += 1

bi_loss = -bi_log_likelihood / bi_n
tri_loss = -tri_log_likelihood / tri_n

print(f'{bi_loss} vs {tri_loss} -> improved? {"YES" if tri_loss < bi_loss else "NO"}')
        

2.4543561935424805 vs 2.092747449874878 -> improved? YES


In [14]:
tri_n

196113

## E02
split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

In [6]:
import torch


train_data, dev_data, test_data = torch.utils.data.random_split(tokens, [0.8, 0.1, 0.1], generator=g)

# train models
BiLM = torch.zeros((27, 27), dtype=torch.int32)
TriLM = torch.zeros((27, 27, 27), dtype=torch.int32)

for chs in train_data:
    # bigram
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1, ix2 = stoi[ch1], stoi[ch2]
        BiLM[ix1, ix2] += 1
        
    # trigram
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1, ix2, ix3 = stoi[ch1], stoi[ch2], stoi[ch3]
        TriLM[ix1, ix2] += 1

BiP = (BiLM + 1).float()
BiP /= BiP.sum(1, keepdims=True)

TriP = (TriLM+1).float()
TriP /= TriP.sum(2, keepdims=True)

# evaluate models
def eval_nll(data):
    bi_lll = tri_lll = 0.0
    bi_n = tri_n = 0
    
    for chs in data:
        for ch1, ch2 in zip(chs, chs[1:]):
            ix1, ix2 = stoi[ch1], stoi[ch2]
            
            logprob = torch.log(BiP[ix1, ix2])
            bi_lll += logprob
            bi_n += 1

        for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
            ix1, ix2, ix3 = stoi[ch1], stoi[ch2], stoi[ch3]
            
            logprob = torch.log(TriP[ix1, ix2, ix3])
            tri_lll += logprob
            tri_n += 1
            
    return (-bi_lll/bi_n).item(), (-tri_lll/tri_n).item()


dev_bi_nll, dev_tri_nll = eval_nll(dev_data)
print(f'on dev set: {dev_bi_nll} vs {dev_tri_nll}')
      

test_bi_nll, test_tri_nll = eval_nll(test_data)
print(f'on test set: {test_bi_nll} vs {test_tri_nll}')

# bi becomes worse, tri becomes better?

NameError: name 'g' is not defined

## E03
use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?

In [None]:
P = (TriLM+0.5).float()
P /= P.sum(2, keepdims=True)

# evaluate models
def eval_nll(data):
    lll = 0.0
    n = 0
    
    for chs in data:
        for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
            ix1, ix2, ix3 = stoi[ch1], stoi[ch2], stoi[ch3]
            
            logprob = torch.log(P[ix1, ix2, ix3])
            lll += logprob
            n += 1
            
    return (-lll/n).item()


dev_nll = eval_nll(dev_data)
print(f'on dev set: {dev_nll}')
      

test_nll = eval_nll(test_data)
print(f'on test set: {test_nll}')


In [None]:
"""
 E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. 
 Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?
"""
import random

# for w in words[:10]:
new_words = words.copy()
random.shuffle(new_words)
capacity = len(new_words)

base_idx = capacity // 10
train_set = new_words[:base_idx * 8]
dev_set = new_words[base_idx * 8:base_idx * 9]
test_set = new_words[base_idx * 9:]

print(f'train_set: {len(train_set)}, dev_set: {len(dev_set)}, test_set: {len(test_set)}')


# train language model
bigram_lm = BigramLM()
trigram_lm = TrigramLM()
for train_data in train_set:
    bigram_lm.update_counts(train_data)
    trigram_lm.update_counts(train_data)
bigram_lm.calculate_probabilities()
trigram_lm.calculate_probabilities()

# evaluate dev set
for dev_data in dev_set:
    bigram_dev_loss = bigram_lm.calculate_loss(dev_data)
    trigram_dev_loss = trigram_lm.calculate_loss(dev_data)
    print(f"Bigram/Trigram dev loss: {bigram_dev_loss:.4f} vs {trigram_dev_loss:.4f}")

# evaluate test set
for test_data in test_set:
    bigram_test_loss = bigram_lm.calculate_loss(test_data)
    trigram_test_loss = trigram_lm.calculate_loss(test_data)
    print(f"Bigram/Trigram test loss: {bigram_test_loss:.4f} vs {trigram_test_loss:.4f}")
