In [None]:
import torch
import matplotlib.pyplot as plt
import torch.nn.functional as F
from math import sqrt
%matplotlib inline

In [None]:
# Read the names file into a list with one entry per line. The context manager
# closes the file handle as soon as the contents are in memory, instead of
# leaving it open until garbage collection.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
words[:8], len(words)

(['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia'],
 32033)

In [None]:
# Length of the longest name in the dataset.
print(len(max(words, key=len)))

15


In [None]:
# Alphabet of the dataset: every distinct character across all names, sorted.
chars = sorted({ch for w in words for ch in w})

In [None]:
# Character <-> integer lookup tables. Characters from `chars` are numbered
# starting at 1; index 0 is reserved for '.', which build_dataset uses both as
# left padding and as the end-of-word marker.
stio = dict(zip(chars, range(1, len(chars) + 1)))
stio['.'] = 0  # inserted last, so '.' also comes last when iots is printed
iots = {idx: ch for ch, idx in stio.items()}
vocab_size = len(iots)
print(vocab_size)
print(iots)

27
{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [None]:
def build_dataset(words, block_size=3):
    """Turn a list of words into (context, next-character) training pairs.

    Every word is terminated with '.' and scanned left to right. For each
    character, the indices of the preceding ``block_size`` characters
    (left-padded with 0, the index of '.') form one row of X, and the
    character's own index is the matching entry of Y.

    Args:
        words: iterable of strings whose characters are all keys of the
            module-level ``stio`` mapping.
        block_size: number of context characters per example. Defaults to 3,
            the value that was previously hard-coded in the body.

    Returns:
        (X, Y): int64 tensors of shape (N, block_size) and (N,), where N is
        the total number of characters in ``words`` plus one '.' per word.

    Raises:
        ValueError: if ``block_size`` is smaller than 1.
        KeyError: if a character is not present in ``stio``.
    """
    if block_size < 1:
        raise ValueError(f'block_size must be >= 1, got {block_size}')

    X, Y = [], []

    for w in words:
        context = [0] * block_size  # every word starts from an all-'.' context
        for ch in w + '.':
            ix = stio[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]  # slide the window by one character

    # Explicit dtype and shape so that an empty word list still yields a
    # (0, block_size) integer tensor rather than a 1-D float tensor.
    X = torch.tensor(X, dtype=torch.long).reshape(len(X), block_size)
    Y = torch.tensor(Y, dtype=torch.long)
    print(X.shape, Y.shape)
    return X, Y
import random

# Shuffle the names with a fixed seed, then cut them 80% / 10% / 10% into
# train / dev / test. NOTE: the shuffle mutates `words` in place, so running
# this cell a second time without reloading `words` produces a different split.
random.seed(42)
random.shuffle(words)
n1, n2 = (int(frac * len(words)) for frac in (0.8, 0.9))

# build_dataset is called in train, dev, test order (it prints the shapes).
(Xtr, Ytr), (Xdev, Ydev), (Xts, Yts) = (
    build_dataset(split) for split in (words[:n1], words[n1:n2], words[n2:])
)

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [None]:
def cmp(s, dt, t):
    """Print how a hand-derived gradient compares with the one autograd stored.

    Args:
        s: label shown in the first column of the report line.
        dt: manually computed gradient, expected to match ``t`` in shape.
        t: tensor whose ``.grad`` was populated by ``backward()``. Non-leaf
            tensors need ``retain_grad()`` called before the backward pass.

    Raises:
        ValueError: if ``t.grad`` is None (no gradient was stored for ``t``).
    """
    # Tensor.grad is an attribute, not a method: calling it as t.grad() raises
    # TypeError ('Tensor' object is not callable).
    grad = t.grad
    if grad is None:
        raise ValueError(f'{s}: tensor has no .grad; call retain_grad() before backward()')
    ex = torch.all(dt == grad).item()          # bit-for-bit equality
    app = torch.allclose(dt, grad)             # equality within default tolerances
    maxdiff = (dt - grad).abs().max().item()   # largest absolute deviation
    print(f' {s:15s} | exacte: {str(ex):5s} | approximate: {str(app):5s} |maxdiff: {maxdiff}')

In [None]:
# Model hyperparameters
n_dim = 10     # dimensionality of each character embedding vector
B = 3          # block size: number of context characters fed to the network
n_hidden = 64  # number of neurons in the hidden layer
fan_in = n_dim * B  # width of the concatenated embeddings entering layer 1

# A single seeded generator feeds every random draw below, so the order of the
# torch.randn calls determines the initial weights.
g = torch.Generator().manual_seed(2147483647)

# Embedding table: one n_dim-dimensional row per vocabulary entry
C = torch.randn((vocab_size, n_dim), generator=g)

# Layer 1: weights scaled by (5/3) / sqrt(fan_in), small random biases
W1 = torch.randn((fan_in, n_hidden), generator=g) * (5/3) / (fan_in**0.5)
b1 = torch.randn(n_hidden, generator=g) * 0.01

# Layer 2: small random weights and biases
W2 = torch.randn((n_hidden, vocab_size), generator=g) * 0.01
b2 = torch.randn(vocab_size, generator=g) * 0.01

# Batchnorm parameters: gain starts at 1.01, bias starts at 0
bnrgain = torch.ones(1, n_hidden) * 0.01 + 1.0
bnrbias = torch.zeros(1, n_hidden) * 0.01

parameters = [C, W1, b1, W2, b2, bnrgain, bnrbias]
print(sum(p.nelement() for p in parameters))  # total number of parameters
for p in parameters:
    p.requires_grad = True

4137


In [None]:
# Draw one random minibatch of training examples. Row indices are sampled
# independently, so the same row may appear more than once.
batch_size = 32
n = batch_size  # shorthand used by the forward pass
ix = torch.randint(low=0, high=Xtr.shape[0], size=(batch_size,), generator=g)
xb = Xtr[ix]
yb = Ytr[ix]

In [None]:
# Forward pass for one minibatch, deliberately written as many small named
# steps so that every intermediate tensor can keep its gradient (see the
# retain_grad loop below) and be inspected after loss.backward().
emb = C[xb] # look up the embedding vector of every context character
embcat = emb.view(emb.shape[0], -1) # concatenate the embeddings of each example into one row
# Linear layer 1
hprebn = embcat @ W1 +b1 # hidden layer pre-activation, before batchnorm
# Batchnorm layer (statistics are taken over dim 0, i.e. across the batch)
bnmeani = 1/n*hprebn.sum(0,keepdim=True) # mean of each hidden unit; n is the batch size
bndiff = hprebn - bnmeani
bndiff2 = bndiff**2
bnvar = 1/(n-1)*bndiff2.sum(0, keepdim=True) # Bessel's correction (divide by n-1, not n)
bnvar_inv = (bnvar + 1e-5)**-0.5 # 1/sqrt(var + eps); eps keeps this finite when var is 0
bnraw = bndiff * bnvar_inv # normalized pre-activations
hpreact = bnrgain * bnraw +bnrbias # scale and shift with the batchnorm gain and bias
# Non-linearity
h = torch.tanh(hpreact) # hidden layer
# Linear layer 2
logits = h @ W2 + b2 # output layer
# Cross-entropy loss (same as F.cross_entropy(logits,yb)), expanded step by step
logit_maxes = logits.max(1, keepdim=True).values # largest logit in each row
norm_logits = logits - logit_maxes # subtract row maxes for numerical stability
counts = norm_logits.exp()
counts_sum = counts.sum(1,keepdim=True)
counts_sum_inverse = counts_sum**-1 # written as **-1 instead of 1.0/counts_sum; per the original note this form is preferred for backprop
probs = counts *counts_sum_inverse # each row of probs sums to 1
logprobs = probs.log()
loss = -logprobs[range(n), yb].mean() # mean negative log-probability of the target character

# PyTorch backward pass
# Clear any gradients left on the parameters from a previous run.
for p in parameters:
    p.grad = None
# Ask autograd to keep .grad on these intermediate (non-leaf) tensors so their
# gradients remain available after backward().
for t in [logprobs,probs, counts, counts_sum, counts_sum_inverse,
         norm_logits, logit_maxes, logits, h, hpreact, bnraw,
         bnvar_inv, bnvar, bndiff2, bndiff,hprebn, bnmeani,
         embcat, emb]:
    t.retain_grad()
loss.backward()
loss

tensor(3.2877, grad_fn=<NegBackward0>)