# L01_E02

Split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

Let's do this with the counting model.

In [None]:
import torch

In [None]:
import random

# Fix the seed so the shuffle, and therefore the split, is reproducible.
random.seed(42)

# One name per line. The context manager closes the file handle, which the
# bare open(...).read() form left open.
with open('../names.txt', 'r') as f:
    words = f.read().splitlines()

# Shuffle in place, then cut at 80% and 90% to get train / dev / test.
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

words_tr = words[:n1]
words_dev = words[n1:n2]
words_te = words[n2:]

In [None]:
# Vocabulary: every distinct character that occurs in the training names,
# in sorted order.
chars = sorted({ch for word in words_tr for ch in word})
len(chars)

26

In [None]:
# Character -> index. Real characters start at 1 so that index 0 can be used
# for the '.' token that marks the start and end of a word.
ctoi = {c: i for i, c in enumerate(chars, start=1)}
ctoi['.'] = 0

In [None]:
# Index -> character: the inverse of ctoi.
itoc = dict(zip(ctoi.values(), ctoi.keys()))

In [None]:
# Vocabulary size including the '.' boundary token.
num_chars = len(ctoi)
num_chars

27

# Bigram Model

In [None]:
def train_bigram(words):
    """Build a row-normalised bigram probability table by counting.

    Every word is wrapped in '.' boundary tokens and each pair of
    consecutive characters increments one cell of a
    (num_chars, num_chars) count table. All cells start at one, so no
    bigram ends up with zero probability.

    Args:
        words: iterable of strings whose characters are keys of `ctoi`.

    Returns:
        Float tensor of shape (num_chars, num_chars) in which row i holds
        the distribution of the next character given the character with
        index i.
    """
    counts = torch.ones((num_chars, num_chars), dtype=torch.int32)
    for w in words:
        padded = '.' + w + '.'
        for prev_ch, next_ch in zip(padded, padded[1:]):
            counts[ctoi[prev_ch], ctoi[next_ch]] += 1
    probs = counts.float()
    # Divide each row by its total so that every row sums to one.
    return probs / probs.sum(1, keepdim=True)

In [None]:
# Fit the bigram table on the training split only.
# NOTE: the name `P` is rebound to the trigram table further down; re-run
# this cell before re-evaluating the bigram model.
P = train_bigram(words_tr)

## Model evaluation

In [None]:
def evaluate_average_negloglikelihood_bigram(words, probs=None):
    """Print the bigram model's log-likelihood statistics for `words`.

    Each word is wrapped in '.' boundary tokens and the log-probability of
    every pair of consecutive characters is accumulated.

    Args:
        words: iterable of strings whose characters are keys of `ctoi`.
        probs: row-normalised bigram table of shape (num_chars, num_chars),
            e.g. the result of `train_bigram`. Defaults to the notebook-level
            `P`, so existing one-argument calls keep working.

    Raises:
        ValueError: if the table does not have the bigram shape. The name
            `P` is later rebound to the trigram table; indexing that table
            with bigram indices would not fail but would report wrong
            numbers, so the mismatch is rejected up front.

    Prints the summed log-likelihood, the number of bigrams `n` and the
    average negative log-likelihood. Returns None.
    """
    if probs is None:
        probs = P
    if tuple(probs.shape) != (num_chars, num_chars):
        raise ValueError(
            f'expected a bigram table of shape {(num_chars, num_chars)}, '
            f'got {tuple(probs.shape)}'
        )

    log_likelihood = 0.
    n = 0
    for word in words:
        chs = '.' + word + '.'
        for ch1, ch2 in zip(chs, chs[1:]):
            log_likelihood += torch.log(probs[ctoi[ch1], ctoi[ch2]])
            n += 1

    print(f'{log_likelihood=:.4f}')
    print(f'{n=}')
    nll = -log_likelihood
    # An empty `words` leaves n == 0; report nan instead of dividing by zero.
    avg_nll = nll / n if n else float('nan')
    print(f'nll/n={avg_nll:.4f}')

In [None]:
# Bigram performance on the training split (the data the table was fit on).
evaluate_average_negloglikelihood_bigram(words_tr)

log_likelihood=-448229.5938
n=182625
nll/n=2.4544


In [None]:
# Bigram performance on the held-out dev split.
evaluate_average_negloglikelihood_bigram(words_dev)

log_likelihood=-55579.4883
n=22655
nll/n=2.4533


In [None]:
# Bigram performance on the held-out test split.
evaluate_average_negloglikelihood_bigram(words_te)

log_likelihood=-56214.3711
n=22866
nll/n=2.4584


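The average negative log-likelihood on the dev and test sets (2.4533 and 2.4584) is essentially the same as on the training set (2.4544), so the bigram model shows no sign of overfitting.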

# Trigram model

In [None]:
# Map every ordered pair of characters (the trigram context) to a row index:
# row = index_of_first * num_chars + index_of_second.
stoi = {
    (c0, c1): i0 * num_chars + i1
    for i0, c0 in sorted(itoc.items())
    for i1, c1 in sorted(itoc.items())
}

In [None]:
# Spot-check two context rows (row = first_index * num_chars + second_index).
stoi[('c','a')], stoi['a','n']

(82, 41)

In [None]:
def train_trigram(words):
    """Build a row-normalised trigram probability table by counting.

    Every word is prefixed with '..' and suffixed with '.', so the first
    real character is predicted from a context of two boundary tokens.
    Each window of three consecutive characters increments one cell: the
    row is selected by the first two characters via `stoi`, the column by
    the third via `ctoi`. All cells start at one, so no trigram ends up
    with zero probability.

    Args:
        words: iterable of strings whose characters are keys of `ctoi`.

    Returns:
        Float tensor of shape (num_chars * num_chars, num_chars) in which
        each row is the distribution of the next character given a
        two-character context.
    """
    counts = torch.ones((num_chars * num_chars, num_chars), dtype=torch.int32)
    for w in words:
        padded = '..' + w + '.'
        for first, second, third in zip(padded, padded[1:], padded[2:]):
            counts[stoi[(first, second)], ctoi[third]] += 1
    probs = counts.float()
    # Divide each row by its total so that every row sums to one.
    return probs / probs.sum(1, keepdim=True)

In [None]:
# Fit the trigram table on the training split only.
# NOTE: this rebinds `P`, which previously held the bigram table.
P = train_trigram(words_tr)

## Model evaluation

In [None]:
def evaluate_average_negloglikelihood_trigram(words, probs=None):
    """Print the trigram model's log-likelihood statistics for `words`.

    Each word is prefixed with '..' and suffixed with '.', and the
    log-probability of every window of three consecutive characters is
    accumulated. The row is looked up through `stoi` (two-character
    context) and the column through `ctoi` (next character).

    Args:
        words: iterable of strings whose characters are keys of `ctoi`.
        probs: row-normalised trigram table of shape
            (num_chars * num_chars, num_chars), e.g. the result of
            `train_trigram`. Defaults to the notebook-level `P`, so
            existing one-argument calls keep working.

    Raises:
        ValueError: if the table does not have the trigram shape, for
            example when `P` still holds the bigram table.

    Prints the summed log-likelihood, the number of trigrams `n` and the
    average negative log-likelihood. Returns None.
    """
    if probs is None:
        probs = P
    expected_shape = (num_chars * num_chars, num_chars)
    if tuple(probs.shape) != expected_shape:
        raise ValueError(
            f'expected a trigram table of shape {expected_shape}, '
            f'got {tuple(probs.shape)}'
        )

    log_likelihood = 0.
    n = 0
    for word in words:
        chs = '..' + word + '.'
        for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
            log_likelihood += torch.log(probs[stoi[ch1, ch2], ctoi[ch3]])
            n += 1

    print(f'{log_likelihood=:.4f}')
    print(f'{n=}')
    nll = -log_likelihood
    # An empty `words` leaves n == 0; report nan instead of dividing by zero.
    avg_nll = nll / n if n else float('nan')
    print(f'nll/n={avg_nll:.4f}')

In [None]:
# Trigram performance on the training split (the data the table was fit on).
evaluate_average_negloglikelihood_trigram(words_tr)

log_likelihood=-404643.9062
n=182625
nll/n=2.2157


In [None]:
# Trigram performance on the held-out dev split.
evaluate_average_negloglikelihood_trigram(words_dev)

log_likelihood=-50666.9961
n=22655
nll/n=2.2365


In [None]:
# Trigram performance on the held-out test split.
evaluate_average_negloglikelihood_trigram(words_te)

log_likelihood=-51158.6094
n=22866
nll/n=2.2373


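The trigram model seems to overfit the training set slightly: the average negative log-likelihood on the dev and test sets (2.2365 and 2.2373) is worse than on the training set (2.2157). Even so, it is clearly better than the bigram model (about 2.45) on every split.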