## In this Colab we:
- 1
- 2

## Result:
- 1
- 2

In [30]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [51]:
# --- model architecture ---
EMB_SIZE: int = 2         # dimension of the embedding space
WINDOW: int = 3           # number of previous elements used as context
HIDDEN_SIZE: int = 100    # width of the hidden layer
# --- optimisation ---
LR: float = 0.1           # step size of the gradient-descent update
N_EPOCHS: int = 100
BATCH_SIZE: int = 64      # number of examples drawn per mini-batch

In [3]:
# Load the dataset: one name per line.
# The context manager guarantees the file handle is closed once the text is read
# (the bare open(...).read() form leaves it open until garbage collection).
with open('../docs/names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
print(f'{len(words)=}')
words[:7]

len(words)=32033


['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia']

In [4]:
# Vocabulary and look-up tables (character <-> integer id).
chars = sorted(set(''.join(words)))
# Characters are numbered from 1; id 0 is reserved for the '.' token that
# pads the start of a context and marks the end of a word.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
itos = {idx: ch for ch, idx in stoi.items()}

vocab_size = len(chars) + 1  # every character plus '.'

In [5]:
# BUILD DATA
# Slide a window of WINDOW token ids over every word (terminated by '.');
# each window is an input row and the token that follows it is the target.
#
# Example for 'emma' with WINDOW = 3:
#   context -> target    as token ids:
#   [...]   -> e         [ 0, 0, 0] -> 5
#   [..e]   -> m         [ 0, 0, 5] -> 13
#   [.em]   -> m         [ 0, 5,13] -> 13
#   [emm]   -> a         [ 5,13,13] -> 1
#   [mma]   -> .         [13,13, 1] -> 0

contexts = []   # one list of WINDOW token ids per example
targets = []    # id of the token that follows each context

for word in words:
    context = [0] * WINDOW              # every word starts from an all-'.' context
    for token_id in (stoi[ch] for ch in word + '.'):
        contexts.append(context)
        targets.append(token_id)
        context = [*context[1:], token_id]   # drop the oldest id, append the newest

X = torch.tensor(contexts)
Y = torch.tensor(targets)
print(f'{X.shape=} {Y.shape=}')

X.shape=torch.Size([228146, 3]) Y.shape=torch.Size([228146])


In [59]:
# Dedicated, seeded RNG. It must be passed explicitly to every sampling call:
# creating it alone has no effect on torch's global RNG, so without
# `generator=g` the initial weights would differ on every run.
g = torch.Generator().manual_seed(2147483647)

# BUILD EMBEDDINGS (aka look-up table): one EMB_SIZE-dim vector per token
C = torch.randn((vocab_size, EMB_SIZE), generator=g)  # untrained

# BUILD HIDDEN LAYER: input is the WINDOW context embeddings concatenated
hidden_W = torch.randn((WINDOW*EMB_SIZE, HIDDEN_SIZE), generator=g)
hidden_b = torch.randn(HIDDEN_SIZE, generator=g)

# BUILD OUT LAYER: one logit per vocabulary entry
out_W = torch.randn((HIDDEN_SIZE, vocab_size), generator=g)
out_b = torch.randn(vocab_size, generator=g)

# Every tensor the optimisation loop updates; enable gradient tracking on each.
parameters = [C, hidden_W, hidden_b, out_W, out_b]
for p in parameters:
    p.requires_grad = True

In [60]:
# Mini-batch SGD. Note: despite the name, each `epoch` iteration is a single
# mini-batch step (10 * N_EPOCHS steps in total), not a full pass over X.
for epoch in range(10*N_EPOCHS):

    # ---- forward pass ----

    # Draw BATCH_SIZE random example indices (with replacement) from the seeded
    # generator `g` so that the training run is reproducible.
    ixs = torch.randint(0, X.shape[0], (BATCH_SIZE,), generator=g)

    emb = C[X[ixs]]                      # (BATCH_SIZE, WINDOW, EMB_SIZE)
    emb = emb.view(-1, WINDOW*EMB_SIZE)  # concat the context embeddings -> (BATCH_SIZE, WINDOW*EMB_SIZE)

    h = torch.tanh(emb @ hidden_W + hidden_b)   # (BATCH_SIZE, HIDDEN_SIZE)
    logits = h @ out_W + out_b                  # (BATCH_SIZE, vocab_size)

    # F.cross_entropy fuses softmax + negative log-likelihood; by hand it would be:
    #   counts = logits.exp()
    #   prob = counts / counts.sum(1, keepdims=True)
    #   loss = -prob[torch.arange(len(ixs)), Y[ixs]].log().mean()
    loss = F.cross_entropy(logits, Y[ixs])
    if epoch % 50 == 0:
        print(f'{loss.data=}')

    # ---- backward pass ----
    # Clear stale gradients first; backward() accumulates into .grad.
    for p in parameters:
        p.grad = None
    loss.backward()

    # ---- SGD update ----
    # In-place update under no_grad so autograd does not record it
    # (replaces the discouraged direct write to `p.data`).
    with torch.no_grad():
        for p in parameters:
            p -= LR * p.grad



loss.data=tensor(14.3273)
loss.data=tensor(5.0338)
loss.data=tensor(3.0541)
loss.data=tensor(2.9878)
loss.data=tensor(2.8682)
loss.data=tensor(2.7189)
loss.data=tensor(2.7376)
loss.data=tensor(2.7006)
loss.data=tensor(2.9306)
loss.data=tensor(2.7773)
loss.data=tensor(2.3117)
loss.data=tensor(2.4526)
loss.data=tensor(2.6149)
loss.data=tensor(2.7410)
loss.data=tensor(2.4811)
loss.data=tensor(2.4871)
loss.data=tensor(2.8404)
loss.data=tensor(2.6644)
loss.data=tensor(2.6711)
loss.data=tensor(2.7765)


**prob** : 
∀ context of size WINDOW, prob ⇛ vocab distribution (of the next element)

The idea is to train it so that it matches Y

In [45]:
# Sanity check: one row per training example, one column per context position (WINDOW).
X.shape

torch.Size([228146, 3])

In [28]:
# Total number of scalar values across all parameter tensors of the model.
sum(tensor.numel() for tensor in (C, hidden_W, hidden_b, out_W, out_b))

3481

In [53]:
# Candidate learning rates, evenly spaced on a log10 scale:
# 10 exponents from -3 to 0, i.e. rates from 1e-3 up to 1.
lre = torch.linspace(start=-3, end=0, steps=10)
lrs = torch.pow(10, lre)
lrs

tensor([0.0010, 0.0022, 0.0046, 0.0100, 0.0215, 0.0464, 0.1000, 0.2154, 0.4642,
        1.0000])