In [108]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [109]:
# read in all the words
words = open('names.txt', 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [110]:
len(words)

32033

In [111]:
# build the vocabulary of character and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [112]:
# build the dataset

block_size = 3 # context length: how many characters do we take to predict the next one?
X, Y = [], []
for w in words:
    #print(w)
    context = [0] * block_size
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        #print(''.join(itos[i] for i in context), '--->', itos[ix])
        context = context[1:] + [ix] #crop and append

X = torch.tensor(X)
Y = torch.tensor(Y)

In [113]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([228146, 3]), torch.int64, torch.Size([228146]), torch.int64)

In [114]:
# to iste co C[5] kvoli niecomu
# F.one_hot(torch.tensor(5), num_classes=27).float() @ C

In [115]:
# all these 5 lines do the same
#torch.cat([emb[:, 0 :], emb[:, 1 :], emb[:, 2 :]], 1).shape
#torch.cat(torch.unbind(emb, 1), 1).shape
#h = emb.view(32, 6) @ W1 + b1
#h = emb.view(emb.shape[0], 6) @ W1 + b1

#h = emb.view(-1, 6) @ W1 + b1 # -1 means the rest
#H = torch.tanh(h)
#h.shape, h

In [116]:
# respectable --------------------------

In [117]:
X.shape, Y.shape # dataset

(torch.Size([228146, 3]), torch.Size([228146]))

In [155]:
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, 2), generator=g)
W1 = torch.randn((6, 100), generator=g)
b1 = torch.randn(100, generator=g)
W2 = torch.randn((100, 27), generator=g)
b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2]
sum(p.nelement() for p in parameters) # number of parameters in total


3481

In [156]:
for p in parameters:
    p.requires_grad = True

In [163]:
lre = torch.linspace(-3, 0, 1000)
lrs = 10**lre
#lrs

In [159]:
for i in range (1000):
    # mini batch construct
    ix = torch.randint(0, X.shape[0], (32,))

    # forward pass
    emb = C[X[ix]] # (32, 3, 2)
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (32, 100)
    logits = h @ W2 + b2 # (32, 27)
    # >>>>>>>>>>
    #counts = logits.exp()
    #prob = counts / counts.sum(1, keepdim=True)
    #loss = -prob[torch.arange(32), Y].log().mean()
    # <<<<<<<<<
    loss = F.cross_entropy(logits, Y[ix]) # same as in >>> <<< above
    #print(loss.item())
    
    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    lr = lrs[i]
    for p in parameters:
        p.data += lr * p.grad
        
print(loss.item())

2.6447060108184814


tensor([208569,  32566,  73859,  25563, 167026, 119916, 169080,  88694, 131823,
         59006,  54476, 156819, 130764,  25019, 153666, 193100,  87966,  26282,
        176207,  57838, 136379, 127555,  88340,  35713,  60873, 212202, 149550,
        112415, 174931,  69059, 195442, 153269])

tensor(17.7697)