In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

### Loading data

In [2]:
with open('../names.txt', 'r') as file:
    names = file.read().split()

print('Total names:', len(names))

Total names: 32033


In [3]:
# Form stoi and itos
vocab = sorted(list(set(''.join(names))))
stoi = {s: i+1 for i, s in enumerate(vocab)}
stoi['.'] = 0
itos = {stoi[s]: s for s in stoi}

In [24]:
block_size = 3
X = []
Y = []

for word in names[:5]:
    word = ['.']*block_size + list(word) + ['.']
    for ind in range(3, len(word)):
        X.append([stoi[x] for x in word[ind-3:ind]])
        Y.append(stoi[word[ind]])
        # print(''.join(word[ind-3:ind]), '--->', word[ind])

X = torch.tensor(X)
Y = torch.tensor(Y)

In [21]:
X.shape, Y.shape, X.dtype

(torch.Size([32, 3]), torch.Size([32]), torch.float32)

### Data Indexing
Indexing 27 characters in 2D space. Therefore, the dimension of `C` is `27 x 2`

In [71]:
C = torch.randn(27, 2)

In [72]:
C[0], C.dtype

(tensor([-1.8310, -2.0545]), torch.float32)

In [77]:
emb = C[X]
print(emb.shape) # All the integers in X are embedded in 2D

torch.Size([32, 3, 2])


### Hidden layer

- `W1` weights are initialized randomly
- Number of inputs to hidden layer is `block_size * 2` or 3*2. We have encoded every character in the X to 2D space using `C`
- Transform the input embedding shape from `num x 3 x 2` to `num x 6`. This enables matrix multiplication `emb @ W1`

In [74]:
W1 = torch.randn(6, 100)
b1 = torch.randn(100)

In [75]:
W1.shape, b1.shape

(torch.Size([6, 100]), torch.Size([100]))

In [78]:
# output of the hidden layer
h = emb.view(emb.shape[0], 6) @ W1 + b1

# We can also do -1 instead of shape, pytorch will infer the value.

In [79]:
h = torch.tanh(h) # numbers between -1 and 1

In [80]:
# h.shape[0]: number of entries in dataset
# h.shape[1]: number of neurons we chose
h.shape

torch.Size([32, 100])

### Output layer

In [81]:
# Now we need to get probability distribution over 27 characters. 
W2 = torch.randn(100, 27)
b2 = torch.rand(27)

logits = (h @ W2) + b2

In [82]:
counts = logits.exp()

In [83]:
probs = counts / counts.sum(1, keepdims=True)

In [84]:
probs.shape

torch.Size([32, 27])

In [85]:
probs[0].sum()

tensor(1.0000)

In [86]:
probs[torch.arange(X.shape[0]), Y]

tensor([1.6911e-05, 1.2131e-07, 1.1433e-06, 2.8225e-10, 4.1201e-08, 2.6069e-11,
        2.3683e-10, 4.7285e-05, 3.7934e-11, 9.4034e-07, 2.2148e-07, 9.7094e-07,
        1.3954e-09, 2.4389e-07, 8.8684e-03, 3.1274e-10, 9.7286e-06, 5.3712e-12,
        3.0116e-03, 1.6982e-04, 1.3049e-06, 8.7927e-11, 3.6239e-13, 4.6360e-12,
        1.5429e-09, 1.2455e-09, 2.1575e-11, 6.5571e-05, 1.9210e-16, 1.9229e-07,
        9.0600e-06, 9.1933e-05])

In [89]:
loss = -probs[torch.arange(32), Y].log().mean()
loss

tensor(17.3849)