In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

Matplotlib is building the font cache; this may take a moment.


In [2]:
# read in all the words
words = open('names.txt', 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
len(words)

32033

In [4]:
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [5]:
print(stoi)

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0}


In [46]:
block_size = 3
X, Y = [], []
for w in words:
    context = [0] * block_size
    for ch in w + ".":
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        #print(''.join(itos[i] for i in context), '--->', itos[ix])
        context = context[1:] + [ix]

X = torch.tensor(X)
Y = torch.tensor(Y)

In [47]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([228146, 3]), torch.int64, torch.Size([228146]), torch.int64)

In [48]:
# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words):  
  X, Y = [], []
  for w in words:

    #print(w)
    context = [0] * block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      #print(''.join(itos[i] for i in context), '--->', itos[ix])
      context = context[1:] + [ix] # crop and append

  X = torch.tensor(X)
  Y = torch.tensor(Y)
  print(X.shape, Y.shape)
  return X, Y

import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))
print("n1 : " + str(n1))
print("n2 : " + str(n2))
Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

n1 : 25626
n2 : 28829
torch.Size([182441, 3]) torch.Size([182441])
torch.Size([22902, 3]) torch.Size([22902])
torch.Size([22803, 3]) torch.Size([22803])


In [49]:
C = torch.randn((27, 2))

In [50]:
C

tensor([[-1.2778, -0.3333],
        [ 0.0580, -0.8089],
        [ 0.8278, -0.1835],
        [ 0.3911,  0.1388],
        [ 0.5126, -0.6226],
        [ 0.9749,  0.4200],
        [-0.1155,  2.5391],
        [-0.1014,  0.2406],
        [ 0.8564, -0.2857],
        [ 1.2239,  0.1581],
        [ 0.4094,  1.6831],
        [-0.3934, -0.6766],
        [-0.7361,  1.0588],
        [-1.2080, -0.5804],
        [-1.7171, -0.6500],
        [-2.1489, -1.1161],
        [-0.2902,  0.8320],
        [-0.2129,  0.0617],
        [-1.0421,  0.4991],
        [ 0.7769,  0.2109],
        [-0.5799,  0.4404],
        [ 1.3532,  0.2123],
        [ 0.0944, -0.9769],
        [-0.3562,  2.0757],
        [ 0.3837,  0.3586],
        [-0.3641, -0.5538],
        [-0.7294,  0.0200]])

In [51]:
emb = C[X]
emb.shape

torch.Size([228146, 3, 2])

In [52]:
W1 = torch.randn((6, 100))
b1 = torch.randn(100)

In [53]:
emb.view(-1, 6)

tensor([[-1.2778, -0.3333, -1.2778, -0.3333, -1.2778, -0.3333],
        [-1.2778, -0.3333, -1.2778, -0.3333, -0.5799,  0.4404],
        [-1.2778, -0.3333, -0.5799,  0.4404,  0.0580, -0.8089],
        ...,
        [-1.2778, -0.3333,  0.4094,  1.6831,  0.9749,  0.4200],
        [ 0.4094,  1.6831,  0.9749,  0.4200,  0.0580, -0.8089],
        [ 0.9749,  0.4200,  0.0580, -0.8089, -1.7171, -0.6500]])

In [54]:
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
h

tensor([[ 0.1168, -0.9998, -0.9923,  ..., -0.9996, -0.4220, -0.9972],
        [-0.3949, -0.9993, -0.7747,  ..., -0.9942, -0.9922, -0.9880],
        [ 0.7264, -0.7635,  0.1664,  ..., -0.9190,  0.7994, -0.9993],
        ...,
        [ 0.8960,  0.9698,  0.9980,  ...,  0.7756, -0.3750, -0.9967],
        [ 0.8483,  0.9999,  0.9908,  ..., -0.9979,  0.9997, -0.4529],
        [ 0.9675, -0.9778, -0.9970,  ..., -0.9999,  0.9986,  0.9500]])

In [55]:
h.shape

torch.Size([228146, 100])

In [56]:
W2 = torch.randn((100, 27))
b2 = torch.randn(27)

In [57]:
logits = h @ W2 + b2

In [58]:
logits.shape

torch.Size([228146, 27])

In [59]:
counts = logits.exp()

In [60]:
counts

tensor([[1.2370e-03, 5.3402e-03, 3.3949e-03,  ..., 3.4754e+01, 2.0085e+02,
         7.4983e+02],
        [1.0270e-04, 6.8171e-02, 2.0134e-04,  ..., 8.8091e-03, 9.6381e+04,
         1.5064e+00],
        [3.9348e-07, 9.8621e-01, 5.8085e+00,  ..., 7.4794e-02, 5.2662e+04,
         4.9218e+02],
        ...,
        [5.5017e-04, 7.0928e+01, 3.2329e+02,  ..., 3.9356e-01, 3.0178e+02,
         3.3260e-01],
        [5.3967e-01, 6.5809e-03, 2.7226e-01,  ..., 1.2237e+02, 1.1513e+05,
         1.9771e-02],
        [3.3487e+04, 2.1629e+00, 1.6759e-05,  ..., 2.4229e+03, 7.3088e-01,
         2.1135e+03]])

In [61]:
prob = counts / counts.sum(1, keepdims=True)

In [62]:
prob.shape

torch.Size([228146, 27])

In [63]:
loss = -prob[torch.arange(32), Y].log().mean()
loss

IndexError: shape mismatch: indexing tensors could not be broadcast together with shapes [32], [228146]

In [64]:
Xtr.shape, Ytr.shape 

(torch.Size([182441, 3]), torch.Size([182441]))

In [90]:
g = torch.Generator().manual_seed(2147483647) # for reproducibility
C = torch.randn((27, 2), generator=g)
W1 = torch.randn((6, 100), generator=g)
b1 = torch.randn(100, generator=g)
W2 = torch.randn((100, 27), generator=g)
b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2]

In [91]:
sum(p.nelement() for p in parameters) # number of parameters in total

3481

In [92]:
for p in parameters:
    p.requires_grad = True

In [95]:
for _ in range(100):
    emb = C[X]
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)
    print(loss.item())
    for p in parameters:
        p.grad = None
    loss.backward()
    for p in parameters:
        p.data += -0.1 * p.grad


3.575550079345703
3.5621652603149414
3.5490872859954834
3.5363073348999023
3.5238168239593506
3.5116055011749268
3.499666929244995
3.4879918098449707
3.4765725135803223
3.4654011726379395
3.45447039604187
3.4437735080718994
3.433302879333496
3.4230518341064453
3.4130139350891113
3.4031834602355957
3.3935537338256836
3.3841187953948975
3.374873161315918
3.365811824798584
3.356928586959839
3.3482189178466797
3.3396778106689453
3.3313004970550537
3.32308292388916
3.3150203227996826
3.307107925415039
3.299342632293701
3.291719436645508
3.2842354774475098
3.276886463165283
3.2696690559387207
3.262579917907715
3.255614995956421
3.248771905899048
3.24204683303833
3.23543643951416
3.228938102722168
3.2225492000579834
3.2162656784057617
3.210085868835449
3.2040061950683594
3.1980247497558594
3.192138671875
3.186345338821411
3.180642604827881
3.175027370452881
3.1694984436035156
3.1640524864196777
3.1586878299713135
3.15340256690979
3.1481945514678955
3.1430611610412598
3.1380014419555664
3.1330

In [84]:
loss

tensor(10.4076, grad_fn=<NllLossBackward0>)