In [1]:
# Exercises:
# E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?
# E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?
# E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?
# E04: we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?
# E05: look up and use F.cross_entropy instead. You should achieve the same result. Can you think of why we'd prefer to use F.cross_entropy instead?
# E06: meta-exercise! Think of a fun/interesting exercise and complete it.

## E01

In [3]:
# loading the data
import torch
import matplotlib.pyplot as plt
%matplotlib inline
import torch.nn.functional as F
from tqdm import tqdm


words = open("names.txt", 'r').read().splitlines()

# words des
print("length : ",len(words))
print("max word length : ",max(len(w) for w in words))
print("min word length : ",min(len(w) for w in words))
print(words[:5])

length :  32033
max word length :  15
min word length :  2
['emma', 'olivia', 'ava', 'isabella', 'sophia']


In [4]:
# preprocessing
N = torch.zeros((27,702), dtype=torch.int32)

a = '.abcdefghijklmnopqrstuvwxyz'
d_stoi = {}
count = 0
for i in a:
    for j in a[1:]:
        d_stoi[(i+j)] = count
        count += 1

s_stoi = {}
count = 0
for i in a:
    s_stoi[i] = count
    count += 1
s_stoi

# reverse stoi
d_itos = {i:s for s,i in d_stoi.items()}
s_itos = {i:s for s,i in s_stoi.items()}


for w in tqdm(words):
    w = '.' + w + '.'
    length = len(w)
    if length > 1:
        for i in range(length - 1): 
            try: 
                ix1 = d_stoi[(w[i:i+2])]
                ix2 = s_stoi[(w[i+2])]
            except:
                continue
            N[ix2, ix1] += 1

100%|██████████████████████████████████████████████████████████████████████████| 32033/32033 [00:19<00:00, 1648.18it/s]


In [5]:
out = []
P = (N+1).float()
row_sum = P.sum(1, keepdim=True) 

P /= row_sum
P[0].sum()

tensor(1.)

In [8]:
# loss
log_likelihood = 0.0
counter = 0

for w in tqdm(words):
    w = '.' + w + '.'
    length = len(w)
    if length > 1:
        for i in range(length - 1): 
            try: 
                ix1 = d_stoi[(w[i:i+2])]
                ix2 = s_stoi[(w[i+2])]
            except:
                continue
            prob = P[ix2, ix1]
            logprob = torch.log(prob)
            log_likelihood += logprob
            counter +=1



negative_log_likelihood = -log_likelihood
avg_negative_log_likelihood = negative_log_likelihood / counter

print(f'{log_likelihood=}')
print(f'{negative_log_likelihood=}')
print(f'{avg_negative_log_likelihood=}')

100%|██████████████████████████████████████████████████████████████████████████| 32033/32033 [00:13<00:00, 2382.96it/s]

log_likelihood=tensor(-919222.3750)
negative_log_likelihood=tensor(919222.3750)
avg_negative_log_likelihood=tensor(4.6872)





In [10]:
# sampling
g = None
out = []
for i in range(5):
    name =''
    is_start = True
    while True:
        if is_start:
            ix = torch.multinomial(P[0][:26], num_samples=1, replacement=True,generator = g).item()
            name +=d_itos[ix]
            ix = torch.multinomial(P[:,ix], num_samples=1, replacement=True,generator = g).item()
            name += s_itos[ix]
            is_start = False
        else:
            to_fed = name[-2:] # last two string
            ix = d_stoi[to_fed]
            ix = torch.multinomial(P[:,ix], num_samples=1, replacement=True,generator = g).item()
            name += s_itos[ix]
        if ix == 0 or len(name) > 15:  #if its find '.'
            if len(name) > 15: name += '.'
            break

    out.append(name[1:-1])
print(out)

['wijjgtodisbscoj', 'wingushtpjfvjcj', 'kaysen', 'mcla', 'jzwmkwqdcmqgkxv']


In [44]:
P.shape

torch.Size([27, 702])

In [139]:
# neural network approch
import torch.nn.functional as F


# my code
xs, ys = [], []
for w in words:   #['nishant']:
    w = '.' + w + '.'
    length = len(w)
    if length > 1:
        for i in range(length - 1): 
            try: 
                ix1 = d_stoi[(w[i:i+2])]
                ix2 = s_stoi[(w[i+2])]
            except:
                continue
            xs.append(ix1)
            ys.append(ix2)
            
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('no of examples: ', num)

# initialize the networks
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((702,27), generator=g, requires_grad=True)

no of examples:  196113


In [140]:
# gradient descent

for k in range(50):
    # forward pass
    xenc = F.one_hot(xs, num_classes=702).float() #cast to float becaue we feed float in NN
    logits = xenc @ W #log-counts
    counts = logits.exp() #eqivalent to N from above
    prob = counts / counts.sum(1, keepdim=True)
    loss = -prob[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean()
    print(loss.item())

    # backward pass
    W.grad = None #set grad to zero gradient
    loss.backward()

    # update
    W.data += -50 * W.grad

3.769960403442383
3.6858088970184326
3.609027624130249
3.538339614868164
3.473231792449951
3.413414478302002
3.358614444732666
3.3084981441497803
3.262662172317505
3.22066593170166
3.1820740699768066
3.1464831829071045
3.113539934158325
3.082942008972168
3.0544345378875732
3.027801752090454
3.002861976623535
2.9794585704803467
2.957456350326538
2.9367332458496094
2.9171831607818604
2.8987069129943848
2.8812150955200195
2.864625930786133
2.8488636016845703
2.833861827850342
2.8195576667785645
2.8058955669403076
2.792825937271118
2.780303478240967
2.7682878971099854
2.756743907928467
2.745638370513916
2.734942674636841
2.72463059425354
2.7146782875061035
2.705064535140991
2.695770263671875
2.686777353286743
2.6780686378479004
2.669631004333496
2.6614489555358887
2.653510570526123
2.645803928375244
2.638317823410034
2.631042242050171
2.6239681243896484
2.617086172103882
2.6103880405426025
2.6038661003112793


In [150]:
# sampling
g = None
out = []
for i in range(5):
    name =''
    is_start = True
    while True:
        if is_start:
            ix = torch.multinomial(P[0][:26], num_samples=1, replacement=True,generator = g).item()
            name +=d_itos[ix]
            is_start = False
        else:
            to_fed = name[-2:] # last two string
            ix = d_stoi[to_fed]
        
        xenc = F.one_hot(torch.tensor([ix]), num_classes=702).float()
        logits = xenc @ W # predict log-counts
        counts = logits.exp() # counts, equivalent to N
        p = counts / counts.sum(1, keepdims=True)
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        name += s_itos[ix]
        
        if ix == 0 or len(name) > 15:  #if its find '.'
            if len(name) > 15: name += '.'
            break

    out.append(name)
print(out)

['.rah.', '.jaicdmari.', '.xiiqydcxsobmtha.', '.faijblla.', '.raapiqton.']


In [104]:
# checking each step
nulls = torch.zeros(7)
for i in range(7):
    x = xs[i].item()
    y = ys[i].item()
    print("trigram example :: ",d_itos[x],s_itos[y])
    p = prob[i,y]
    print("probablity NN thinks ",p.item())
    logp = torch.log(p)
    
    print("log likeyhood", logp.item())
    null = -logp
    print("negetive log likeyhood", null.item())
    nulls[i] = null
    print('\n\n\n')
print("avg negetive nll is :: ", nulls.mean().item())

trigram example ::  .n i
probablity NN thinks  0.040418144315481186
log likeyhood -3.2084765434265137
negetive log likeyhood 3.2084765434265137




trigram example ::  ni s
probablity NN thinks  0.021364973857998848
log likeyhood -3.8460023403167725
negetive log likeyhood 3.8460023403167725




trigram example ::  is h
probablity NN thinks  0.0185163002461195
log likeyhood -3.9891037940979004
negetive log likeyhood 3.9891037940979004




trigram example ::  sh a
probablity NN thinks  0.10089954733848572
log likeyhood -2.2936298847198486
negetive log likeyhood 2.2936298847198486




trigram example ::  ha n
probablity NN thinks  0.10444221645593643
log likeyhood -2.2591214179992676
negetive log likeyhood 2.2591214179992676




trigram example ::  an t
probablity NN thinks  0.08236471563577652
log likeyhood -2.496598243713379
negetive log likeyhood 2.496598243713379




trigram example ::  nt .
probablity NN thinks  0.004763565491884947
log likeyhood -5.346758842468262
negetive log likey

## E02

In [12]:
from torch.utils.data import random_split

In [26]:
train_set, dev_set, test_set = torch.utils.data.random_split(words, [0.8, 0.1, 0.1])

In [27]:
train_set = list(train_set)
dev_set = list(dev_set)
test_set = list(test_set)

3203