In [314]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [315]:
# Load the raw word list, one entry per line.
# The context manager closes the file handle as soon as the read finishes
# instead of leaving it open until garbage collection.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

In [316]:
# Vocabulary: every distinct character that occurs in `words`, in sorted order.
chars = sorted(set(''.join(words)))
# Characters are numbered from 1 so that index 0 stays free for '.',
# the marker build_dataset appends to every word and uses as padding.
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
# Inverse lookup: index -> character.
itos = {index: symbol for symbol, index in stoi.items()}


In [317]:
# build dataset

def build_dataset(words, block_size=3, char_to_ix=None):
    """Turn words into (context, next-character) training pairs.

    Every word is extended with a trailing '.' and scanned left to right.
    For each character, one example is emitted: the indices of the
    `block_size` preceding characters (left-padded with index 0) as input,
    and the index of the character itself as the target.

    Args:
        words: iterable of strings.
        block_size: number of context characters per example (must be >= 1).
        char_to_ix: optional mapping from character to integer index. When
            omitted, the notebook-level `stoi` is used. Index 0 is used as the
            padding value, so the mapping is expected to assign 0 to '.'.

    Returns:
        (X, Y) where X is an int64 tensor of shape (N, block_size) and Y is an
        int64 tensor of shape (N,), with N = sum(len(w) + 1 for w in words).

    Raises:
        ValueError: if block_size is smaller than 1.
        KeyError: if a character is missing from the mapping.

    Side effects:
        Prints the shapes of X and Y.
    """
    if block_size < 1:
        raise ValueError(f"block_size must be >= 1, got {block_size}")

    # Resolve the vocabulary at call time so later changes to the global
    # `stoi` are picked up when no explicit mapping is passed.
    mapping = stoi if char_to_ix is None else char_to_ix

    contexts, targets = [], []
    for w in words:
        context = [0] * block_size  # start-of-word padding
        for ch in w + '.':
            ix = mapping[ch]  # index of the character to predict
            contexts.append(context)
            targets.append(ix)
            # Slide the window: drop the oldest index, append the newest.
            # This builds a new list, so the one stored above is not mutated.
            context = context[1:] + [ix]

    # An explicit dtype and shape keep the result well-formed for empty input,
    # where torch.tensor([]) would otherwise give a float tensor of shape (0,).
    X = torch.tensor(contexts, dtype=torch.long).view(len(contexts), block_size)
    Y = torch.tensor(targets, dtype=torch.long)
    print(X.shape, Y.shape)
    return X, Y

import random

# 80% / 10% / 10% train / dev / test split over a seeded shuffle of the words.
# A copy is shuffled rather than `words` itself. Shuffling in place made this
# cell non-idempotent: each re-run reshuffled the already-shuffled list and
# silently produced a different split. With the same seed and the same input
# order, the permutation here matches the original first run.
# NOTE(review): `words` now keeps its file order; confirm no later cell relies
# on it having been shuffled.
random.seed(42)
shuffled_words = list(words)
random.shuffle(shuffled_words)
n1 = int(0.8*len(shuffled_words))  # end of the training slice
n2 = int(0.9*len(shuffled_words))  # end of the dev slice

X_tr, Y_tr = build_dataset(shuffled_words[:n1])
X_dev, Y_dev = build_dataset(shuffled_words[n1:n2])
X_te, Y_te = build_dataset(shuffled_words[n2:])

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [318]:
# Model hyperparameters. Naming them keeps the layer shapes consistent: the
# hidden layer's input width is the context length times the embedding width,
# which was previously the bare literal 6.
VOCAB_SIZE = 27    # rows of the embedding table; presumably len(stoi) — confirm
CONTEXT_LEN = 3    # matches the default block_size used by build_dataset
EMB_DIM = 2        # embedding width per character
HIDDEN_SIZE = 100  # units in the tanh hidden layer

# Fixed-seed generator so the initial weights are reproducible. The draw order
# below (C, W1, b1, W2, b2) determines the values and must not be reordered.
g = torch.Generator().manual_seed(2147483647)
# Embedding lookup table: one EMB_DIM-dimensional vector per character index.
C = torch.randn((VOCAB_SIZE, EMB_DIM), generator=g)
# Hidden layer: takes the concatenated context embeddings.
W1 = torch.randn((CONTEXT_LEN * EMB_DIM, HIDDEN_SIZE), generator=g)
b1 = torch.randn(HIDDEN_SIZE, generator=g)
# Output layer: one logit per vocabulary entry.
W2 = torch.randn((HIDDEN_SIZE, VOCAB_SIZE), generator=g)
b2 = torch.randn(VOCAB_SIZE, generator=g)
parameters = [C, W1, b1, W2, b2]

In [319]:
# Total count of scalar weights across all parameter tensors.
sum(tensor.numel() for tensor in parameters)

3481

In [320]:
# Turn on autograd tracking for every parameter tensor so that
# loss.backward() populates their .grad fields.
for p in parameters:
    p.requires_grad_(True)

In [321]:
# Two independent, initially empty lists. Presumably logs for learning rates
# and losses; nothing in the visible cells appends to them — confirm usage.
lri, lossi = [], []

In [325]:
# Mini-batch SGD on the training split.
NUM_STEPS = 10000
BATCH_SIZE = 32
lr = 0.1  # constant learning rate, hoisted out of the loop since it never changes

for _ in range(NUM_STEPS):
    # Mini-batch: sample row indices with replacement. Drawing from the
    # notebook's seeded generator `g` makes the run reproducible after
    # Restart & Run All; it was previously unseeded.
    ix = torch.randint(0, X_tr.shape[0], (BATCH_SIZE,), generator=g)

    # Forward pass.
    emb = C[X_tr[ix]]                  # (batch, context, emb_dim)
    embv = emb.view(emb.shape[0], -1)  # flatten context embeddings; -1 avoids hard-coding 6

    h = torch.tanh(embv @ W1 + b1)     # (batch, hidden)
    logits = h @ W2 + b2               # (batch, vocab)
    loss = F.cross_entropy(logits, Y_tr[ix])

    # Backward pass: clear stale gradients, then backpropagate.
    for p in parameters:
        p.grad = None
    loss.backward()

    # Parameter update. The in-place step runs under no_grad so autograd does
    # not record it; this replaces writing through the discouraged `.data`.
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad


In [326]:
# Show the current value of `loss` as a plain Python float. After the training
# cell this is the loss of the final mini-batch only, which is a noisy estimate.
print(float(loss))

2.3437340259552


In [327]:
# Loss over the entire dev split (not a single mini-batch).
# This is evaluation only, so the forward pass runs under no_grad to avoid
# building an autograd graph over every dev example.
# NOTE: this overwrites `emb`, `embv`, `h`, `logits` and `loss` from the
# training cell; `loss` holds the dev loss afterwards.
with torch.no_grad():
    emb = C[X_dev]                     # (N_dev, context, emb_dim)
    embv = emb.view(emb.shape[0], -1)  # (N_dev, context * emb_dim)

    h = torch.tanh(embv @ W1 + b1)     # (N_dev, hidden)
    logits = h @ W2 + b2               # (N_dev, vocab)
    loss = F.cross_entropy(logits, Y_dev)
print(logits.shape)
loss.item()

torch.Size([22655, 27])


2.409420967102051