In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the dataset: one entry per line of names.txt.
# The context manager closes the file handle as soon as the read is done,
# instead of leaving it open until the garbage collector gets to it.
with open('./names.txt') as f:
    words = f.read().splitlines()

In [3]:
# number of entries (lines) read from the dataset file
len(words)

32033

In [4]:
# Vocabulary: every distinct character that occurs in the words, in sorted order.
chars = sorted(set(''.join(words)))
# Characters are numbered from 1 upwards; index 0 is assigned to the '.' token.
stoi = {s: i for i, s in enumerate(chars, start=1)}
stoi['.'] = 0
# Reverse mapping: integer index -> character.
itos = {i: s for s, i in stoi.items()}

In [5]:
# Build the dataset: every input is a window of `block_size` character indices,
# and its label is the index of the character that comes right after the window.

block_size = 3  # context length: how many previous characters predict the next one
X, Y = [], []

for w in words[:5]:
    # each word starts from an all-zero context (index 0 is the '.' token)
    context = block_size * [0]
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)  # input for the network
        Y.append(ix)       # label
        print(''.join([itos[i] for i in context]), '----->', itos[ix])
        # slide the window: drop the oldest index and append the newest
        context = [*context[1:], ix]

X, Y = torch.tensor(X), torch.tensor(Y)

... -----> e
..e -----> m
.em -----> m
emm -----> a
mma -----> .
... -----> o
..o -----> l
.ol -----> i
oli -----> v
liv -----> i
ivi -----> a
via -----> .
... -----> a
..a -----> v
.av -----> a
ava -----> .
... -----> i
..i -----> s
.is -----> a
isa -----> b
sab -----> e
abe -----> l
bel -----> l
ell -----> a
lla -----> .
... -----> s
..s -----> o
.so -----> p
sop -----> h
oph -----> i
phi -----> a
hia -----> .


In [6]:
# sanity check: shape and dtype of the input tensor X and the label tensor Y
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

#### Building the Embedding table

Each of the 27 possible characters is embedded as a point in a 2-dimensional space, so the embedding table has shape (27, 2).

In [7]:
# Embedding lookup table: 27 rows (one per character), each a random 2-d vector.
C = torch.randn(27, 2)
C

tensor([[-0.5307,  0.7660],
        [ 1.7500,  1.0730],
        [-1.0957, -1.7143],
        [-0.7296, -1.2371],
        [ 1.4910,  0.2870],
        [-1.1985, -0.0122],
        [-0.3911,  0.1643],
        [ 1.8135, -0.6466],
        [ 1.3510,  1.4806],
        [-0.4878,  0.2521],
        [-2.3165, -1.1246],
        [ 0.7617,  0.3252],
        [ 0.2554,  0.3870],
        [ 0.4926, -0.4814],
        [ 1.6519, -1.7160],
        [ 0.2935, -0.1889],
        [ 0.1316,  1.9305],
        [ 0.3195,  1.1404],
        [-0.2963, -0.0370],
        [-0.4838, -0.3150],
        [ 0.6331, -0.7384],
        [-1.3462, -1.6900],
        [-1.6964,  0.7554],
        [-0.0104,  0.2560],
        [ 0.2085,  0.8774],
        [-0.6445,  1.0307],
        [ 0.0355, -0.7504]])

In [8]:
# embedding of a single character index: plain row indexing into the table
C[5]

tensor([-1.1985, -0.0122])

In [9]:
# Method from the previous lecture: one-hot encode the index and multiply by C.
# The one-hot row vector selects the same row of C that C[5] returns.
F.one_hot(torch.tensor(5), num_classes=27).float() @ C

tensor([-1.1985, -0.0122])

In [10]:
# Embedding one integer is easy; indexing C with the whole integer tensor X
# looks up the embedding of every context character in a single operation.
C[X]

tensor([[[-0.5307,  0.7660],
         [-0.5307,  0.7660],
         [-0.5307,  0.7660]],

        [[-0.5307,  0.7660],
         [-0.5307,  0.7660],
         [-1.1985, -0.0122]],

        [[-0.5307,  0.7660],
         [-1.1985, -0.0122],
         [ 0.4926, -0.4814]],

        [[-1.1985, -0.0122],
         [ 0.4926, -0.4814],
         [ 0.4926, -0.4814]],

        [[ 0.4926, -0.4814],
         [ 0.4926, -0.4814],
         [ 1.7500,  1.0730]],

        [[-0.5307,  0.7660],
         [-0.5307,  0.7660],
         [-0.5307,  0.7660]],

        [[-0.5307,  0.7660],
         [-0.5307,  0.7660],
         [ 0.2935, -0.1889]],

        [[-0.5307,  0.7660],
         [ 0.2935, -0.1889],
         [ 0.2554,  0.3870]],

        [[ 0.2935, -0.1889],
         [ 0.2554,  0.3870],
         [-0.4878,  0.2521]],

        [[ 0.2554,  0.3870],
         [-0.4878,  0.2521],
         [-1.6964,  0.7554]],

        [[-0.4878,  0.2521],
         [-1.6964,  0.7554],
         [-0.4878,  0.2521]],

        [[-1.6964,  0

In [11]:
# indexing with X keeps X's shape and appends the embedding dimension
C[X].shape

torch.Size([32, 3, 2])

In [12]:
# embedding of the character at context position 2 of example 13
C[X][13,2]

tensor([1.7500, 1.0730])

In [13]:
# keep the embedded inputs; dimensions are (examples, context length, embedding size)
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [14]:
# embeddings of the context characters of the first example
emb[0]

tensor([[-0.5307,  0.7660],
        [-0.5307,  0.7660],
        [-0.5307,  0.7660]])

#### Building the hidden layer

In [15]:
# Hidden layer parameters: 6 inputs (3 context characters x 2 embedding dims) -> 100 units.
W1, b1 = torch.randn(6, 100), torch.randn(100)

In [16]:
# Flatten each example's block of embeddings into one row, then apply the hidden layer.
# The first dimension is pinned to the number of examples and -1 lets PyTorch infer the
# row width. A width that does not match W1 then fails loudly in the matmul, whereas a
# hard-coded view(-1, 6) could silently regroup values across examples.
h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)

In [17]:
# inspect the hidden activations; tanh squashes each one into the range [-1, 1]
h # values between -1 and 1

tensor([[ 0.8879,  0.9848,  0.9971,  ..., -0.9713, -0.8866,  0.9075],
        [ 0.5855,  0.9477,  0.9746,  ..., -0.9842, -0.7439,  0.9727],
        [-0.2427, -0.7997,  0.1473,  ..., -0.2374, -0.5185,  0.9241],
        ...,
        [ 0.9998,  0.7438,  0.9969,  ..., -0.8692, -0.9468,  0.6833],
        [ 0.9931,  0.9991, -0.6982,  ...,  0.9699, -0.9749,  0.9876],
        [ 0.9994,  0.3470, -0.1610,  ...,  0.9627, -0.9550, -0.4018]])

In [18]:
# one row of hidden activations per example
h.shape

torch.Size([32, 100])

#### Creating the last layer

In [19]:
# Output layer parameters.
# Input size 100 matches the hidden layer width (a hyperparameter);
# output size 27 gives one score per possible character.
W2, b2 = torch.randn(100, 27), torch.randn(27)

In [20]:
# output layer: a raw score (logit) for every possible next character, per example
logits = h @ W2 + b2

In [21]:
# one row of logits per example, one column per character
logits.shape

torch.Size([32, 27])

In [23]:
# exponentiate the logits to turn them into positive, unnormalized "counts"
counts = logits.exp()

In [24]:
# normalize every row of counts so it sums to 1: one probability distribution per example
probs = counts / counts.sum(dim=1, keepdim=True)

In [25]:
# normalization keeps the shape of the logits
probs.shape

torch.Size([32, 27])

In [26]:
# sanity check: the probabilities of the first example should sum to 1
probs[0].sum()

tensor(1.0000)

In [29]:
# integer sequence 0..31, usable as a row index into a 32-row tensor
torch.arange(32)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])

#### Implementing the negative log-likelihood loss

In [31]:
# Negative log-likelihood: for every row of probs, pluck out the probability assigned
# to the correct character (given by Y), then average the negative logs.
# The row index is sized from Y instead of a hard-coded 32, so the cell keeps working
# when the dataset holds a different number of examples.
loss = -probs[torch.arange(Y.shape[0]), Y].log().mean()
loss

tensor(20.3209)

--------------------------- SUMMARY ----------------------------------------------

In [32]:
# dataset recap: shape of the inputs X and of the labels Y
X.shape, Y.shape

(torch.Size([32, 3]), torch.Size([32]))

In [39]:
# Fixed-seed generator so every run draws the same initial parameters.
g = torch.Generator()
g.manual_seed(2147483647)

# Draw order matters for reproducibility: C, W1, b1, W2, b2 consume the generator
# in exactly this sequence.
parameters = [
    torch.randn(shape, generator=g)
    for shape in [(27, 2), (6, 100), (100,), (100, 27), (27,)]
]
# embedding table, hidden layer weights/bias, output layer weights/bias
C, W1, b1, W2, b2 = parameters

In [40]:
# add up the element counts of all parameter tensors
sum(p.nelement() for p in parameters) # number of parameters in total

3481

In [42]:
# Full forward pass, one step per line (N = number of examples in X).
# Sizes are derived from the tensors instead of hard-coding 32 examples and a row width
# of 6, so the cell still runs when the dataset or the context length changes.
emb = C[X]                                            # (N, block_size, 2) embedded inputs
h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)  # (N, 100) hidden activations
logits = h @ W2 + b2                                  # (N, 27) raw scores
# Softmax written out by hand. NOTE: exp() overflows for large logits;
# F.cross_entropy(logits, Y) computes the same loss in a numerically stable way.
counts = logits.exp()
prob = counts / counts.sum(1, keepdims=True)
# mean negative log-probability of the correct character
loss = -prob[torch.arange(X.shape[0]), Y].log().mean()
loss


tensor(17.7697)