# MLP

Following https://www.youtube.com/watch?v=TCH_1BHY58I&t=188s

# Rebuilding Dataset ( -  00:12:00)

- load dataset
- build character-index lookup table
- build (xxx) -> y dataset (as tensors)


In [1]:
from pathlib import Path
import torch

In [2]:
# Load the dataset: one name per line.
# `read_text()` opens, reads and closes the file in one call, so no file handle
# is left dangling (a bare `.open().read()` never closes it explicitly).
names = Path('../data/names.txt').read_text().splitlines()
names[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
# Vocabulary: the '.' marker at index 0, followed by every distinct character
# that occurs in the names, in sorted order.
characters = ['.', *sorted(set(''.join(names)))]

# Lookup tables in both directions: character -> index and index -> character.
stoi = {ch: idx for idx, ch in enumerate(characters)}
itos = {idx: ch for ch, idx in stoi.items()}


In [4]:
from itertools import tee

def sliding_window(iterable, size):
    """Return an iterator over all overlapping windows of `size` consecutive items.

    `size` independent copies of the input are made; copy number k is advanced
    by k items, and zipping the copies then yields one tuple per window.
    The result is lazy and is empty when the input has fewer than `size` items.
    """
    shifted = []
    for offset, stream in enumerate(tee(iterable, size)):
        skipped = 0
        while skipped < offset:
            # The default of None keeps next() from raising on short inputs.
            next(stream, None)
            skipped += 1
        shifted.append(stream)
    return zip(*shifted)


In [5]:
# Quick check of sliding_window: 5 items with a window of 3 give 3 overlapping tuples.
numbers = [*range(5)]
for window in sliding_window(numbers, size=3):
    print(window)

(0, 1, 2)
(1, 2, 3)
(2, 3, 4)


In [6]:
context_size = 3  # how many preceding characters form one input
xs, ys = [], []

for name in names[:5]:
    # Pad with `context_size` leading '.' so the first real character already
    # has a full context, and append a single '.' as the end-of-name target.
    name = ['.'] * context_size + [*name, '.']
    for chars in sliding_window(name, context_size + 1):
        print(f'{chars[:context_size]} ---> {chars[-1]}')
        indices = [stoi[ch] for ch in chars]
        # The first `context_size` indices are the input, the last one is the label.
        xs.append(indices[:context_size])
        ys.append(indices[-1])

xs, ys = torch.tensor(xs), torch.tensor(ys)

('.', '.', '.') ---> e
('.', '.', 'e') ---> m
('.', 'e', 'm') ---> m
('e', 'm', 'm') ---> a
('m', 'm', 'a') ---> .
('.', '.', '.') ---> o
('.', '.', 'o') ---> l
('.', 'o', 'l') ---> i
('o', 'l', 'i') ---> v
('l', 'i', 'v') ---> i
('i', 'v', 'i') ---> a
('v', 'i', 'a') ---> .
('.', '.', '.') ---> a
('.', '.', 'a') ---> v
('.', 'a', 'v') ---> a
('a', 'v', 'a') ---> .
('.', '.', '.') ---> i
('.', '.', 'i') ---> s
('.', 'i', 's') ---> a
('i', 's', 'a') ---> b
('s', 'a', 'b') ---> e
('a', 'b', 'e') ---> l
('b', 'e', 'l') ---> l
('e', 'l', 'l') ---> a
('l', 'l', 'a') ---> .
('.', '.', '.') ---> s
('.', '.', 's') ---> o
('.', 's', 'o') ---> p
('s', 'o', 'p') ---> h
('o', 'p', 'h') ---> i
('p', 'h', 'i') ---> a
('h', 'i', 'a') ---> .


In [7]:
# One row per training example, one column per context position.
xs.shape

torch.Size([32, 3])

# Embedding Matrix ( - 00:18:00)

- Create Embedding Matrix with embedding size 2 (for now)

In [8]:
# Embedding lookup table: 27 rows of 2 learnable values each.
# NOTE(review): 27 presumably equals len(characters) (26 letters + '.') — confirm.
C = torch.randn((27,2), requires_grad=True)

# Hidden Layer ( - 00:29:00)

- Hidden layer of dimension (6, 100)
    - weights and biases
- Transform input from (batch_size, 3, embedding_size) to (batch_size, 3*embedding_size) to enable multiplication
- activation function: tanh


In [9]:
# Indexing C with the integer tensor xs replaces every index by its embedding row,
# which adds a trailing dimension of the embedding size.
X = C[xs]
X.shape

torch.Size([32, 3, 2])

In [10]:
# Hidden layer parameters: 6 inputs (3 context characters x 2 embedding values) -> 100 units.
W = torch.randn((6,100), requires_grad=True)
# Shape (1, 100) so the bias broadcasts across all rows of the batch.
b = torch.randn((1, 100), requires_grad=True)

In [11]:
batch_size = X.shape[0]
# Flatten each example's (3, 2) block of embeddings into a single row of 6 values
# so it can be matrix-multiplied with W; -1 lets view() infer that size.
reshaped = X.view(batch_size, -1)
reshaped.shape


torch.Size([32, 6])

In [12]:
# Shape check of the pre-activation: (32, 6) @ (6, 100), with b broadcast over the rows.
((reshaped @ W) + b).shape

torch.Size([32, 100])

In [13]:
# Hidden layer output: affine transform followed by the tanh non-linearity.
activation = torch.tanh((reshaped @ W) + b)

# Output layer ( - 00:33:00)

- from hidden_size (100) to 27 (the number of characters)
- negative log-likelihood loss, as in the last notebook

## Last layer

In [14]:
# Input to the output layer: one row of hidden activations per example.
activation.shape

torch.Size([32, 100])

In [15]:
# Output layer parameters: 100 hidden units -> 27 scores (one per character).
W2 = torch.randn((100,27), requires_grad=True)
b2 = torch.randn(27, requires_grad=True)


In [16]:
# Logits: one unnormalised score per character for every example.
output = (activation @ W2) + b2
output.shape

torch.Size([32, 27])

## Loss

In [17]:
import torch.nn.functional as F

From the documentation of [CrossEntropyLoss](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html#torch.nn.CrossEntropyLoss):

- input has to be a Tensor of size $(C)$ for unbatched input, $(minibatch, C)$ for batched input
- The target should be class indices in the range $[0, C)$ where $C$ is the number of classes
- Reduction is by default 'mean'

In [18]:
# Check the arguments against the CrossEntropyLoss requirements:
# input of shape (minibatch, C) and integer class-index targets in [0, C).
print('Input shape: ', output.shape)
print('Target shape: ', ys.shape)
print('Target values: ', ys)

Input shape:  torch.Size([32, 27])
Target shape:  torch.Size([32])
Target values:  tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])


In [19]:
# Reference value from PyTorch's built-in loss (default reduction: mean over the batch).
F.cross_entropy(output, ys)

tensor(23.7773, grad_fn=<NllLossBackward0>)

Doing this by hand:

In [20]:
# softmax
counts = output.exp()
probs = counts / counts.sum(dim=1, keepdim=True)

# neg log likelihood
logs = - probs.log()
loss = logs[torch.arange(len(ys)), ys]
loss.mean()

tensor(23.7773, grad_fn=<MeanBackward0>)

# Refactoring ( - 00:38:00)

- Define all matrices in one place
- collect parameters in list
    - count parameters, should be 3481
- Use seed generator
- define forward pass with F.cross_entropy()

In [25]:
# Hyperparameters shared by the dataset and network cells below.
context_size = 3  # number of preceding characters used as input
embedding_size = 2  # number of columns of the embedding matrix C
hidden_layer_size = 100  # number of units in the tanh hidden layer

#### Dataset

In [37]:
xs, ys = [], []

for name in names[:5]:
    # `context_size` leading '.' give the first character a full context;
    # the trailing '.' is the end-of-name target.
    name = ['.'] * context_size + [*name, '.']
    for chars in sliding_window(name, context_size + 1):
        indices = [stoi[ch] for ch in chars]
        xs.append(indices[:context_size])  # input: the context characters
        ys.append(indices[-1])             # label: the character that follows

xs, ys = torch.tensor(xs), torch.tensor(ys)

xs.shape

torch.Size([32, 3])

#### NN Setup

In [49]:
# Fixed seed so that every run draws the same initial parameters.
g = torch.Generator().manual_seed(2147483647)

# NOTE: the tensors are drawn from `g` in exactly this order; reordering the
# lines would change the initial values.
C = torch.randn(27, embedding_size, generator=g, requires_grad=True)
W1 = torch.randn(embedding_size * context_size, hidden_layer_size, generator=g, requires_grad=True)
b1 = torch.randn((hidden_layer_size,), generator=g, requires_grad=True)
W2 = torch.randn(hidden_layer_size, 27, generator=g, requires_grad=True)
b2 = torch.randn((27,), generator=g, requires_grad=True)

# All trainable tensors in one place.
parameters = [C, W1, b1, W2, b2]

print('Number of parameters: ', sum(p.numel() for p in parameters))

Number of parameters:  3481


#### Forward pass

In [50]:
# Look up the embedding of every context character and lay the `context_size`
# embeddings of one example side by side in a single row.
embeddings = C[xs].view(-1, context_size * embedding_size)

# Hidden layer (tanh) followed by the linear output layer.
inner = (embeddings @ W1 + b1).tanh()
logits = inner @ W2 + b2

# Mean cross-entropy between the logits and the target indices.
loss = F.cross_entropy(logits, ys)
loss.item()

17.769710540771484

# Training loop ( - )

- forward pass
- backward pass
    - zero grads
    - update by learning rate
- repeat n times for the one batch (intentional overfitting)
    - loss should go to ~.25 (1000 iterations)
- compare the index of the largest logit in each row against ys (manually)
    - should be close to actual indices

- Train on full dataset
    - slow
- instead use minibatches of size 32
    - randomly drawn in each iteration