In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read every name from the dataset file, one name per line.
# The context manager closes the file handle as soon as the contents are read,
# instead of leaving an open handle around for the garbage collector.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
len(words)

32033

In [8]:
# Vocabulary: every distinct character that appears in the names, sorted.
chars = sorted(set(''.join(words)))

# Character -> integer id. Letters start at 1 so that id 0 stays free for '.',
# the special token used to pad the start of a name and to mark its end.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0

# Integer id -> character (the inverse lookup, used for decoding).
itos = {idx: ch for ch, idx in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [13]:
# Build the dataset: each row of X is a window of `block_size` character ids,
# and the matching entry of Y is the id of the character that follows it.

block_size = 3  # context length: how many characters are used to predict the next one
X, Y = [], []
for w in words[:5]:  # only the first five names, so the printout stays readable
    print(w)
    context = [0] * block_size  # start with a window made entirely of '.' (id 0)
    for ch in w + '.':  # the trailing '.' is the end-of-name target
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        window_text = ''.join(itos[i] for i in context)
        print(window_text, '----->', itos[ix])
        context = [*context[1:], ix]  # slide the window: drop the oldest id, append the newest

X, Y = torch.tensor(X), torch.tensor(Y)

emma
... -----> e
..e -----> m
.em -----> m
emm -----> a
mma -----> .
olivia
... -----> o
..o -----> l
.ol -----> i
oli -----> v
liv -----> i
ivi -----> a
via -----> .
ava
... -----> a
..a -----> v
.av -----> a
ava -----> .
isabella
... -----> i
..i -----> s
.is -----> a
isa -----> b
sab -----> e
abe -----> l
bel -----> l
ell -----> a
lla -----> .
sophia
... -----> s
..s -----> o
.so -----> p
sop -----> h
oph -----> i
phi -----> a
hia -----> .


In [14]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [15]:
X

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1],
        [ 0,  0,  0],
        [ 0,  0, 15],
        [ 0, 15, 12],
        [15, 12,  9],
        [12,  9, 22],
        [ 9, 22,  9],
        [22,  9,  1],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1, 22],
        [ 1, 22,  1],
        [ 0,  0,  0],
        [ 0,  0,  9],
        [ 0,  9, 19],
        [ 9, 19,  1],
        [19,  1,  2],
        [ 1,  2,  5],
        [ 2,  5, 12],
        [ 5, 12, 12],
        [12, 12,  1],
        [ 0,  0,  0],
        [ 0,  0, 19],
        [ 0, 19, 15],
        [19, 15, 16],
        [15, 16,  8],
        [16,  8,  9],
        [ 8,  9,  1]])

In [16]:
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

In [17]:
# Embedding lookup table: one row per character (27 rows), each row a
# randomly initialised 2-dimensional vector.

C = torch.randn(27, 2)

#### Before embedding all the integers in "X", let's try embedding a single integer, say 5

In [18]:
# One way: index directly into row 5 of C
C[5]

tensor([ 1.1143, -0.9153])

In [22]:
# Another way: build the one-hot encoding of 5 (over 27 classes), cast it to
# float, and matrix-multiply it with C. Only the entry at position 5 is
# non-zero, so the product picks out row 5 of C.

F.one_hot(torch.tensor(5), num_classes=27).float() @ C

# This gives exactly the same vector as indexing C[5] directly

tensor([ 1.1143, -0.9153])

### Above - there are 2 things to be learnt
1. When direct indexing into matrix C
    - It can be seen as the embedding matrix of characters
    - Those embeddings of corresponding chars can be fed into the first layer of the network
2. Instead, we can take **One-Hot** repr of characters:
    - Matrix-multiply them with C, which can then be thought of as the weights of the first layer (also called the embedding layer)
    - The result of this would be identical to the approach 1.

### We're going to just use the 1st approach, index into the C to get embeddings

In [23]:
# Looking up one integer at a time does not scale to the whole (32, 3) matrix X,
# so we rely on PyTorch's indexing, which accepts whole collections of indices.

# We can index with a Python list:

C[[5,6,7]]

tensor([[ 1.1143, -0.9153],
        [ 0.4073, -0.3208],
        [ 0.6581,  0.0968]])

In [24]:
# PyTorch indexing also works with a tensor of indices

C[torch.tensor([5,6,7])]

tensor([[ 1.1143, -0.9153],
        [ 0.4073, -0.3208],
        [ 0.6581,  0.0968]])

In [25]:
# We can also select the same row multiple times

C[torch.tensor([5,6,7,7,7])]

tensor([[ 1.1143, -0.9153],
        [ 0.4073, -0.3208],
        [ 0.6581,  0.0968],
        [ 0.6581,  0.0968],
        [ 0.6581,  0.0968]])

In [26]:
# Importantly, we can also index with multi-dimensional tensor

C[X]

tensor([[[ 0.0462,  0.5760],
         [ 0.0462,  0.5760],
         [ 0.0462,  0.5760]],

        [[ 0.0462,  0.5760],
         [ 0.0462,  0.5760],
         [ 1.1143, -0.9153]],

        [[ 0.0462,  0.5760],
         [ 1.1143, -0.9153],
         [-1.8946, -0.4674]],

        [[ 1.1143, -0.9153],
         [-1.8946, -0.4674],
         [-1.8946, -0.4674]],

        [[-1.8946, -0.4674],
         [-1.8946, -0.4674],
         [ 1.1002, -0.1904]],

        [[ 0.0462,  0.5760],
         [ 0.0462,  0.5760],
         [ 0.0462,  0.5760]],

        [[ 0.0462,  0.5760],
         [ 0.0462,  0.5760],
         [-0.3499, -1.1790]],

        [[ 0.0462,  0.5760],
         [-0.3499, -1.1790],
         [-1.4511,  0.0666]],

        [[-0.3499, -1.1790],
         [-1.4511,  0.0666],
         [-0.5311, -0.4391]],

        [[-1.4511,  0.0666],
         [-0.5311, -0.4391],
         [ 1.4045, -0.2465]],

        [[-0.5311, -0.4391],
         [ 1.4045, -0.2465],
         [-0.5311, -0.4391]],

        [[ 1.4045, -0

In [28]:
C[X].shape

# This way we can embed all integers in X into 2 dimensional embeddings

torch.Size([32, 3, 2])

In [33]:
X[13,2]

tensor(1)

In [30]:
C[X][13,2]

tensor([ 1.1002, -0.1904])

In [31]:
# above is same as
C[1]

tensor([ 1.1002, -0.1904])

In [34]:
# Look up the 2-d embedding of every context position in X: (32, 3) -> (32, 3, 2)
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [35]:
# First hidden layer: 6 inputs (3 context characters x 2 embedding dims each)
# feeding 100 neurons, plus one bias per neuron.
W1 = torch.randn(6, 100)
b1 = torch.randn(100)

In [40]:
# Now we would ideally want to do "Wx + b"

# emb @ W1 + b1

# But this does not work directly: emb has shape (32, 3, 2), while W1 (6, 100) expects 6 input features per example

- We need to transform the tensor **emb** into a form
    - such that **matrix multiplication** can work with **W1**

In [42]:
# Two ways to flatten emb from (32, 3, 2) to (32, 6), compared element-wise:
#   - emb.view(32, 6) reshapes the tensor directly
#   - torch.unbind(emb, 1) splits it along dim 1 into three (32, 2) tensors,
#     which torch.cat(..., 1) joins side by side

emb.view(32,6) == torch.cat(torch.unbind(emb, 1), 1)

tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, T

In [44]:
emb.view(32,6).shape, torch.cat(torch.unbind(emb, 1), 1).shape

(torch.Size([32, 6]), torch.Size([32, 6]))

In [46]:
# Hence, we can do Wx + b as this

# emb.view(32,6) @ W1 + b1
# The problem: this hard-codes the first dimension (32), so it only works for exactly 32 examples

In [48]:
# Ways to do the same flattening without hard-coding the number of examples:

print(emb.view(emb.shape[0], 6).shape)
# or
print(emb.view(-1, 6).shape) # -1 tells PyTorch to infer the first dimension from the second

torch.Size([32, 6])
torch.Size([32, 6])


In [51]:
# Hidden-layer activations: flatten each example's three 2-d embeddings into
# one 6-vector, apply the affine map W1/b1, then squash with tanh.

flat_emb = emb.view(-1, 6)
h = (flat_emb @ W1 + b1).tanh()
h.shape

torch.Size([32, 100])

In [50]:
h

tensor([[-0.7979,  0.9663, -0.6629,  ..., -0.4421,  0.4280, -0.2430],
        [-0.9910, -0.9892, -0.8089,  ...,  0.9648,  0.0535, -0.6373],
        [ 0.9282,  0.4420, -0.1531,  ..., -0.7087,  0.0041,  0.9991],
        ...,
        [ 0.7473, -0.4259, -0.0105,  ...,  0.9512, -0.9927,  0.9660],
        [-0.9771, -0.9630, -0.8933,  ...,  0.9997, -0.9995,  0.9859],
        [-0.9997, -0.9730, -0.9361,  ...,  0.9996,  0.9196, -0.8704]])

In [52]:
# Output layer: maps the 100 hidden activations to 27 scores, one per character.
W2 = torch.randn(100, 27)
b2 = torch.randn(27)

In [54]:
logits = h @ W2 + b2
logits.shape

torch.Size([32, 27])

In [55]:
# Exponentiate the logits so every entry is positive; these act as unnormalised "counts"
counts = logits.exp()
counts.shape

torch.Size([32, 27])

In [56]:
(counts.sum(1, keepdim=True)).shape

torch.Size([32, 1])

In [57]:
# Divide each row by its own sum (keepdim=True keeps the (32, 1) shape so it
# broadcasts across the 27 columns); every row of probs then sums to 1.
probs = counts / counts.sum(1, keepdim=True)
probs.shape

torch.Size([32, 27])

In [58]:
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

In [59]:
Y.shape

torch.Size([32])

### Now we:
    - index into the rows of probs
        - and pluck out the probability assigned to the correct character in each row

In [62]:
# For each example, pick the probability the model assigned to the correct next
# character. The row indices are derived from Y's length rather than a literal
# 32, so the cell keeps working when the number of examples changes.
probs[torch.arange(Y.shape[0]), Y]

tensor([3.4117e-09, 1.0071e-05, 1.7430e-04, 8.7363e-08, 9.7624e-12, 1.2563e-06,
        9.2550e-03, 5.5457e-03, 8.3474e-01, 2.1267e-08, 3.8001e-04, 5.1306e-07,
        4.3669e-06, 3.7449e-14, 4.5443e-05, 1.5580e-02, 1.6101e-09, 3.4126e-08,
        8.9654e-04, 4.5376e-13, 6.2242e-10, 1.5743e-01, 2.6351e-11, 3.0559e-07,
        7.2201e-11, 1.4816e-06, 3.9673e-07, 4.1669e-07, 4.8723e-07, 6.3202e-01,
        4.9001e-05, 4.5692e-08])

In [63]:
# Negative log-likelihood: average the log-probability assigned to each correct
# character, then negate it. Log-probabilities are <= 0, so without the minus
# sign the "loss" is negative and minimising it would make the model worse.
# Row indices come from Y.shape[0] instead of a hard-coded 32.
loss = -probs[torch.arange(Y.shape[0]), Y].log().mean()
loss

tensor(13.9268)

In [64]:
# --------------------- Doing all of above: TOGETHER --------------------------------

In [65]:
X.shape, Y.shape

(torch.Size([32, 3]), torch.Size([32]))

In [66]:
# Re-create every parameter from a fixed seed so the run is reproducible.
# The draws must stay in this order: each randn call advances the shared generator.
g = torch.Generator().manual_seed(2147483647)
vocab_size, emb_dim, n_hidden = 27, 2, 100
C = torch.randn(vocab_size, emb_dim, generator=g)      # embedding table
W1 = torch.randn(3 * emb_dim, n_hidden, generator=g)   # 3 context chars x emb_dim inputs -> hidden
b1 = torch.randn(n_hidden, generator=g)
W2 = torch.randn(n_hidden, vocab_size, generator=g)    # hidden -> one score per character
b2 = torch.randn(vocab_size, generator=g)
parameters = [C, W1, b1, W2, b2]

In [67]:
# Total parameters in model
sum(p.nelement() for p in parameters)

3481

In [68]:
# Full forward pass over the dataset, ending in the negative log-likelihood loss.
emb = C[X] # (32, 3, 2) embedding of every context position
h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (32, 100) hidden activations
logits = h @ W2 + b2 # (32, 27) one score per character
counts = logits.exp() # strictly positive, unnormalised
probs = counts / counts.sum(1, keepdim=True) # each row sums to 1
# Negate the mean log-probability of the correct characters: log-probabilities
# are <= 0, so the minus sign is what makes this a loss to minimise.
# Row indices come from Y.shape[0] instead of a hard-coded 32.
loss = -probs[torch.arange(Y.shape[0]), Y].log().mean()
loss

tensor(17.7697)