In [1101]:
# load dataset: names.txt holds one name per line

# Use a context manager so the file handle is closed as soon as the read
# finishes instead of lingering until garbage collection.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()

# quick sanity check: corpus size and the first few entries
print(f'{len(words)=}')
print(f'{words[:5]=}')

len(words)=32033
words[:5]=['emma', 'olivia', 'ava', 'isabella', 'sophia']


In [1102]:
# Vocabulary: '.' is placed first so it gets index 0, followed by every
# distinct character that occurs in the names, in sorted order.
chars = ['.'] + sorted(set(''.join(words)))

# character -> integer id
stoi = dict(zip(chars, range(len(chars))))
# integer id -> character (the inverse of stoi)
itos = dict(zip(stoi.values(), stoi.keys()))

print(f'{stoi=}')
print(f'{itos=}')


stoi={'.': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}
itos={0: '.', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}


In [1103]:
"""
# prepare dataset

inputs/X -> Labels/Y

... -> e
..e -> m
.em -> m
emm -> a
mma -> .



# dimensions

a -> [0.001, 0.0321]
b -> [0.009, 0.0056]
...
e -> [0.003, 0.0038]
...
. -> [0.004, 0.0042]




# example forward pass

l1 = tanh([0.004, 0.0042, 0.004, 0.0042, 0.003, 0.0038] @ W1 + b1) # where [0.004, 0.0042, 0.004, 0.0042, 0.003, 0.0038] = [..e]
l2 = l1 @ W2 + b2
l3 = softmax(l2)

"""

'\n# prepare dataset\n\ninputs/X -> Labels/Y\n\n... -> e\n..e -> m\n.em -> m\nemm -> a\nmma -> .\n\n\n\n# dimensions\n\na -> [0.001, 0.0321]\nb -> [0.009, 0.0056]\n...\ne -> [0.003, 0.0038]\n...\n. -> [0.004, 0.0042]\n\n\n\n\n# example forward pass\n\nl1 = tanh([0.004, 0.0042, 0.004, 0.0042, 0.003, 0.0038] @ W1 + b1) # where [0.004, 0.0042, 0.004, 0.0042, 0.003, 0.0038] = [..e]\nl2 = l1 @ W2 + b2\nl3 = softmax(l2)\n\n'

In [1104]:
# data preparation
#
# Every name is turned into (context, next-character) training pairs.
# The context is the previous `block_size` character ids; positions before
# the start of the name are filled with 0 (the id of '.'), and a trailing
# '.' marks the end of the name.

import torch

X, Y = [], []
block_size = 3

for w in words:
    # ids of the name, left-padded with `block_size` zeros and terminated by '.'
    padded = [0] * block_size + [stoi[ch] for ch in w + '.']

    # slide a window of width `block_size` over the padded sequence;
    # the element just past the window is the label
    for k in range(len(padded) - block_size):
        X.append(padded[k:k + block_size])
        Y.append(padded[k + block_size])

X = torch.tensor(X)
Y = torch.tensor(Y)

In [1105]:
# embedding matrix

embedding_size = 2
C = torch.randn((len(chars), embedding_size))        # lookup embeddings, one row per character
emb = C[X]                                           # shape is _, 3, 2 ; here 3 is the context window, 2 is the embedding size


# Flatten the context window so each example becomes a single feature vector.
# Alternatives considered:
# emb = torch.concat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], 1) # problem: it is bound to the context size; the slices must be edited by hand whenever the context size changes.
# emb = torch.concat(torch.unbind(emb, 1), 1)  # problem: not memory efficient; concat allocates new memory for the result.
# view() reinterprets the existing storage; -1 lets torch infer the flattened
# width (block_size * embedding_size) instead of hard-coding 6.
emb = emb.view(emb.shape[0], -1)

print(emb.shape)

# First (hidden) layer: 100 units fed by the flattened embeddings.
W1 = torch.randn((emb.shape[1], 100))
# Bias must be a 100-element vector. torch.tensor(100) would instead build a
# scalar holding the value 100, which saturates every tanh unit at 1.
b1 = torch.randn(100)


torch.Size([228146, 6])


In [1106]:
# hidden layer: affine map of the flattened embeddings followed by tanh
pre_act = torch.matmul(emb, W1) + b1
h = pre_act.tanh()

h.shape

torch.Size([228146, 100])

In [1107]:
# output layer: project the 100 hidden activations onto 27 scores
W2 = torch.randn(100, 27)
h2 = torch.randn(27)          # added to every row of h @ W2, i.e. the output bias
logits = torch.matmul(h, W2) + h2
logits.shape

torch.Size([228146, 27])

In [1108]:
# Softmax over dim 1, written out by hand.
# Each row's largest logit is subtracted before exponentiating: this leaves
# the normalised probabilities unchanged (the common factor cancels in the
# division) but keeps exp() from overflowing to inf when logits are large.
counts = (logits - logits.max(dim=1, keepdim=True).values).exp()
probs = counts / counts.sum(dim=1, keepdim=True)   # every row sums to 1
probs.shape

torch.Size([228146, 27])

In [1109]:
# loss = -probs[torch.arange(Y.shape[0]), Y].log().mean()  # mean negative log-likelihood; the row index must cover every example, not a fixed 32

In [1110]:
# ------------------------ nice formatting ------------------------

In [1111]:
# Reproducible initialisation: every tensor is drawn from the same seeded
# generator, so the draw order below (C, W1, b1, W2, b2) determines the values.
g = torch.Generator().manual_seed(2147483647)

C, W1, b1, W2, b2 = (
    torch.randn(shape, generator=g)
    for shape in (
        (27, 2),     # C : one 2-d embedding per character
        (6, 100),    # W1: hidden-layer weights
        (100,),      # b1: hidden-layer bias
        (100, 27),   # W2: output-layer weights
        (27,),       # b2: output-layer bias
    )
)

# everything the optimisation loop is allowed to update
parameters = [C, W1, b1, W2, b2]

In [1112]:
# total number of scalar values across all parameter tensors
sum(map(lambda t: t.nelement(), parameters))

3481

In [1113]:
# switch on gradient tracking for every parameter tensor
for p in parameters:
    p.requires_grad_(True)

In [1122]:
import torch.nn.functional as F

learning_rate = 0.1   # step size of the plain gradient-descent update
num_steps = 1         # number of full-batch updates performed by this cell

for _ in range(num_steps):
    # forward pass
    emb = C[X]                                             # (N, block_size, embedding dim)
    # flatten each context window; -1 infers the width so it is not hard-coded
    h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)
    logits = h @ W2 + b2
    # cross_entropy takes the raw logits and the integer class targets directly
    loss = F.cross_entropy(logits, Y)

    print(loss.item())

    # backward pass: clear stale gradients, then backpropagate
    for p in parameters:
        p.grad = None

    loss.backward()

    # parameter update: done under no_grad so the in-place step is not
    # recorded by autograd (preferred over writing through `.data`)
    with torch.no_grad():
        for p in parameters:
            p -= learning_rate * p.grad

2.5517654418945312


In [1123]:
# sample from the model
g = torch.Generator().manual_seed(2147483647 + 10)

# Sampling is inference only: disable autograd so no graph is built for
# every generated character.
with torch.no_grad():
    for _ in range(20):

        out = []
        context = [0] * block_size  # initialize with all ...
        while True:
            emb = C[torch.tensor(context)]  # (block_size, d)
            # flatten the window into a single row vector before the hidden layer
            h = torch.tanh(emb.view(1, -1) @ W1 + b1)
            logits = h @ W2 + b2
            probs = F.softmax(logits, dim=1)
            # draw the next character id from the predicted distribution
            ix = torch.multinomial(probs, num_samples=1, generator=g).item()
            context = context[1:] + [ix]  # slide the window forward
            out.append(ix)
            if ix == 0:  # id 0 is '.', the end-of-name marker
                break

        print(''.join(itos[i] for i in out))


careaheate.
hlelhc.
mrr.
rehty.
halan.
keja.
huen.
dpeyyrt.
kaeei.
nerania.
ceriiv.
kalein.
hham.
poi.
desinn.
sroilea.
jad.
qirqaelo.
dearyni.
ji.
