<a href="https://colab.research.google.com/github/osamja/zero-to-hero/blob/main/lecture_3/MLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [108]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

- Cheatsheet: https://github.com/karpathy/nn-zero-to-hero/blob/master/lectures/makemore/makemore_part2_mlp.ipynb


In [109]:
# !wget https://raw.githubusercontent.com/karpathy/makemore/master/names.txt    # Uncomment to download names.txt
# Read one name per line. The context manager closes the file handle as soon as
# the read finishes instead of leaving it open for the life of the kernel.
with open('names.txt', 'r') as names_file:
    names = names_file.read().splitlines()

In [110]:
# Create a dictionary of characters and their indices.
# Index 0 is reserved for the '.' token, so the characters found in `names`
# are numbered starting from 1.
chars = sorted(set(''.join(names)))
char2idx = {c: i + 1 for i, c in enumerate(chars)}
char2idx['.'] = 0  # does not replace any character: the characters above start at index 1

# Build the reverse lookup by inverting char2idx so the two maps always agree.
# (Enumerating `chars` from 0 here would shift every character down by one index
# relative to char2idx and leave the highest index without an entry.)
idx2char = {i: c for c, i in char2idx.items()}

In [111]:
# Build the dataset: each example pairs `block_size` previous character indices (a row of X)
# with the index of the character that follows them (the matching entry of Y).
block_size = 3  # context length: how many characters to consider before predicting the next character
X, Y = [], []

for name in names[:1]:  # only the first name is used here
    name += '.'  # append the '.' token so the end of the name is also predicted
    context = [0] * block_size  # start from an all-zero context, which acts as padding
    for c in name:
        target = char2idx[c]
        X.append(context)
        Y.append(target)  # label: the character that comes right after the current context
        # print(''.join([idx2char[i] for i in context]), '->', c)
        context = [*context[1:], target]  # slide the window: drop the oldest index, append the newest

# torch.int64 is the same dtype as torch.long
X = torch.tensor(X, dtype=torch.int64)
Y = torch.tensor(Y, dtype=torch.int64)

num_examples = X.size(0)
print('Number of examples:', num_examples)

Number of examples: 5


In [112]:
# Embed the characters
# One-hot encode every context index. 27 is the assumed vocabulary size
# (characters in the names plus '.') -- TODO confirm it equals len(char2idx).
x_enc = F.one_hot(X, num_classes=27).float()

C = torch.randn((27, 2))    # Embedding table: one two-dimensional row per character index

# Sanity check: selecting row 5 of C gives the same vector as multiplying the
# one-hot encoding of index 5 by C.
C[5] == F.one_hot(torch.tensor(5), num_classes=27).float() @ C


tensor([True, True])

In [113]:
# Index the embedding table with the whole integer tensor X at once: every
# entry of X is replaced by its 2-dimensional row of C.
emb = C[X]
emb.size()

torch.Size([5, 3, 2])

In [114]:
# Embedding vector for context position 2 of example 2
emb[2, 2]

tensor([-1.2201,  1.4294])

In [115]:
# Hidden layer parameters: 6 inputs (3 context positions x 2 embedding dims) -> 100 hidden units
W1 = torch.randn((6, 100))
b1 = torch.randn((100,))

In [116]:

# Flatten each example's 3x2 block of embeddings into a single row of 6 values.
# The leading 5 is the number of examples in the current dataset.
res = emb.reshape((5, 6))
res

tensor([[-1.0347, -0.7114, -1.0347, -0.7114, -1.0347, -0.7114],
        [-1.0347, -0.7114, -1.0347, -0.7114, -0.1496,  0.6886],
        [-1.0347, -0.7114, -0.1496,  0.6886, -1.2201,  1.4294],
        [-0.1496,  0.6886, -1.2201,  1.4294, -1.2201,  1.4294],
        [-1.2201,  1.4294, -1.2201,  1.4294, -1.0414,  1.0597]])

In [117]:
# Same flattening as above, this time with view instead of reshape
emb.view((5, 6))

tensor([[-1.0347, -0.7114, -1.0347, -0.7114, -1.0347, -0.7114],
        [-1.0347, -0.7114, -1.0347, -0.7114, -0.1496,  0.6886],
        [-1.0347, -0.7114, -0.1496,  0.6886, -1.2201,  1.4294],
        [-0.1496,  0.6886, -1.2201,  1.4294, -1.2201,  1.4294],
        [-1.2201,  1.4294, -1.2201,  1.4294, -1.0414,  1.0597]])

In [118]:
# A -1 dimension asks PyTorch to infer that size from the remaining ones, so here
# it works out to num_examples; the elementwise comparison shows both forms agree.
torch.eq(emb.view(-1, 6), emb.reshape(num_examples, 6))

tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True]])

In [119]:
# calculate the hidden layer
# Flatten each example's context embeddings to match W1's input width. The row
# count is inferred with -1 (rather than a hard-coded 5) so this cell keeps
# working when the dataset contains a different number of examples.
h = torch.tanh(emb.view(-1, W1.shape[0]) @ W1 + b1)
h.min(), h.max()

(tensor(-1.0000), tensor(1.))

In [120]:
# Output layer parameters: 100 hidden units -> 27 scores, one per character index
W2 = torch.randn((100, 27))
b2 = torch.randn((27,))

# One row of unnormalized scores (logits) per example
logits = torch.matmul(h, W2) + b2

In [121]:
# Expect one row per example and one column per character index
logits.size()

torch.Size([5, 27])

In [125]:
# Let's get our fake counts and then normalize them into a probability distribution
# Subtracting each row's max logit before exponentiating leaves the normalized
# probabilities mathematically unchanged, but keeps exp() from overflowing to inf
# (which would turn the division below into nan) when logits are large.
# As a result, `counts` is scaled so that the largest entry in every row is 1.
counts = (logits - logits.max(1, keepdim=True).values).exp()
prob = counts / counts.sum(1, keepdim=True)
prob.shape

torch.Size([5, 27])

In [127]:
# For every example i, pick out the probability the model assigned to its true next character Y[i]
example_idx = torch.arange(num_examples)
prob[example_idx, Y]

tensor([8.5821e-13, 1.9780e-10, 5.6864e-15, 3.1944e-02, 2.0523e-03])