In [50]:
import torch
import torch.nn.functional as F

In [5]:
words = open("names.txt", "r").read().splitlines()
words[:3], len(words)

(['emma', 'olivia', 'ava'], 32033)

In [16]:
chars = sorted(list(set("".join(words))))
stoi = {s : i+1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i : s for s,i in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [30]:
block_size = 3 # context length
X, Y = [], []

for w in words[:5]:
    print(w)
    context = [0]*block_size
    w = w + '.'
    for ch in w:
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        print(''.join(itos[i] for i in context), '----->', itos[ix])
        context = context[1:] + [ix]
X = torch.tensor(X)
Y = torch.tensor(Y)

emma
... -----> e
..e -----> m
.em -----> m
emm -----> a
mma -----> .
olivia
... -----> o
..o -----> l
.ol -----> i
oli -----> v
liv -----> i
ivi -----> a
via -----> .
ava
... -----> a
..a -----> v
.av -----> a
ava -----> .
isabella
... -----> i
..i -----> s
.is -----> a
isa -----> b
sab -----> e
abe -----> l
bel -----> l
ell -----> a
lla -----> .
sophia
... -----> s
..s -----> o
.so -----> p
sop -----> h
oph -----> i
phi -----> a
hia -----> .


In [31]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [115]:
C = torch.randn((27,2))

In [116]:
C[5]

tensor([-1.3788, -0.9180])

In [117]:
F.one_hot(torch.tensor(5), num_classes=27).float() @ C

tensor([-1.3788, -0.9180])

In [118]:
C[X].shape

torch.Size([32, 3, 2])

In [119]:
embed = C[X]

In [120]:
torch.cat(torch.unbind(embed, dim=1), dim=1).shape

torch.Size([32, 6])

In [121]:
embed.view(32, 6).shape

torch.Size([32, 6])

In [122]:
torch.allclose(torch.cat(torch.unbind(embed, dim=1), dim=1), embed.view(32, -1))

True

In [123]:
W1 = torch.randn((6, 100))
b = torch.randn(100)

W1.shape, b.shape

(torch.Size([6, 100]), torch.Size([100]))

In [124]:
h = torch.tanh(embed.view(-1, 6) @ W1 + b)

In [125]:
h.max(), h.min()

(tensor(1.), tensor(-1.))

In [126]:
W2 = torch.randn((100, 27))
b2 = torch.randn(27)

W2.shape, b2.shape

(torch.Size([100, 27]), torch.Size([27]))

In [127]:
logits = h @ W2 + b2

In [128]:
logits.shape

torch.Size([32, 27])

In [129]:
counts = logits.exp()

In [130]:
prob = counts / counts.sum(1, keepdims=True)

In [131]:
prob[torch.arange(32), Y]

tensor([7.3747e-08, 1.5062e-03, 9.8346e-04, 6.3733e-12, 6.4747e-06, 8.5408e-08,
        2.1884e-05, 2.8229e-13, 5.2777e-08, 2.3752e-02, 7.6170e-10, 2.2334e-01,
        1.5659e-03, 1.7326e-09, 5.4590e-20, 8.1002e-02, 1.0532e-10, 2.2040e-10,
        1.6080e-14, 1.9203e-04, 7.9729e-12, 3.3736e-05, 2.9409e-08, 6.9668e-13,
        2.8155e-01, 3.1396e-10, 4.6177e-05, 4.6805e-10, 1.5679e-06, 2.1458e-12,
        8.3028e-07, 1.3051e-05])

In [141]:
loss = -prob[torch.arange(32), Y].log().mean()
loss

tensor(16.4618)

In [140]:
F.cross_entropy(logits, Y)

tensor(16.4618)

## Cleaned up version

In [363]:
def build_dataset(words):
    
    block_size = 3 # context length
    X, Y = [], []

    for w in words:
    #     print(w)
        context = [0]*block_size
        w = w + '.'
        for ch in w:
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
    #         print(''.join(itos[i] for i in context), '----->', itos[ix])
            context = context[1:] + [ix]
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    return X, Y

In [366]:
n1 = int(0.8*(len(words)))
n2 = int(0.9*(len(words)))

X_train, y_train = build_dataset(words[:n1])
X_dev, y_dev = build_dataset(words[n1:n2])
X_test, y_test = build_dataset(words[n2:])

In [367]:
X_train.shape, y_train.shape, X_dev.shape, y_dev.shape, X_test.shape, y_test.shape, 

(torch.Size([182778, 3]),
 torch.Size([182778]),
 torch.Size([22633, 3]),
 torch.Size([22633]),
 torch.Size([22735, 3]),
 torch.Size([22735]))

In [437]:
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27,10))
W1 = torch.randn((30, 300))
b1 = torch.randn(300)
W2 = torch.randn((300, 27))
b2 = torch.randn(27)

parameters = [C, W1, b1, W2, b2]

In [438]:
sum(p.nelement() for p in parameters)

17697

In [439]:
for p in parameters:
    p.requires_grad = True

In [440]:
lre = torch.linspace(-3, 0, steps = 1000)
lrs = 10 ** lre
lrs.shape

torch.Size([1000])

In [445]:
for i in range(50000):
    
    # Minibatches
    ix = torch.randint(0, X_train.shape[0], (32,))
    
    # Forward Pass
    embed = C[X_train[ix]]
    h = torch.tanh(embed.view(-1, 30) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, y_train[ix])
    
    # Backward Pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # Update Gradients
    lr = 0.01
    for p in parameters:
        p.data += -lr * p.grad  # 0.1 is Learning Rate
        
    # Validation
    
    idx = torch.randint(0, X_dev.shape[0], (32,))
    embed_val = C[X_dev[idx]]
    h_val = torch.tanh(embed_val.view(-1, 30) @ W1 + b1)
    logits_val = h_val @ W2 + b2
    loss_val = F.cross_entropy(logits_val, y_dev[idx])
    
    # Stats
    
    lossi.append(loss.item())
    steps.append(i)
    
    if i % 2000 == 0:
        print(f'epoch: {i+1}, training loss: {loss.item():.4f}, validation loss: {loss_val.item():.4f}')

        
# print(loss.item())

epoch: 1, training loss: 2.2223, validation loss: 2.8618
epoch: 2001, training loss: 1.9269, validation loss: 2.5002
epoch: 4001, training loss: 2.3494, validation loss: 2.0611
epoch: 6001, training loss: 1.8800, validation loss: 2.7169
epoch: 8001, training loss: 1.7848, validation loss: 2.3725
epoch: 10001, training loss: 2.1874, validation loss: 2.1815
epoch: 12001, training loss: 1.9738, validation loss: 2.1180
epoch: 14001, training loss: 2.2982, validation loss: 2.2361
epoch: 16001, training loss: 2.2282, validation loss: 2.5089
epoch: 18001, training loss: 1.8204, validation loss: 2.0717
epoch: 20001, training loss: 1.7439, validation loss: 2.4989
epoch: 22001, training loss: 2.1031, validation loss: 2.2280
epoch: 24001, training loss: 2.4291, validation loss: 2.2066
epoch: 26001, training loss: 1.9766, validation loss: 2.6143
epoch: 28001, training loss: 1.8027, validation loss: 2.4425
epoch: 30001, training loss: 2.2747, validation loss: 2.5886
epoch: 32001, training loss: 2.0

In [448]:
embed = C[X_test]
h = torch.tanh(embed.view(-1, 30) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, y_test)
print(loss)

tensor(2.4489, grad_fn=<NllLossBackward0>)


In [461]:
g = torch.Generator().manual_seed(2147483647)

for _ in range(20):
    out = []
    context = [0]*block_size
    while True:
        emb = C[torch.tensor([context])]
        h = torch.tanh(emb.view(1, -1) @ W1 + b1)
        logits = h @ W2 + b2
        probs = F.softmax(logits, dim=1)
        ix = torch.multinomial(probs, num_samples=1, generator=g).item()
        context = context[1:] + [ix]
        out.append(ix)
        if ix == 0:
            break
            
    print(''.join(itos[i] for i in out))

junide.
janasia.
pres.
amaira.
koibrito.
sabellee.
kalinaaura.
brenias.
deyah.
rwy.
bras.
jaina.
larie.
fartumeryn.
demiy.
ponneliah.
jaylinora.
yarion.
kaelandryed.
miki.
