In [1]:
# heavely inspired by Andrej Karpathy serie Zero to Hero
# paper: https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf

In [2]:
import numpy as np
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
!python --version

Python 3.10.10


### building dataset

In [93]:
words = open('names.txt', 'r').read().splitlines()

In [94]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0 # add 27th character, which represent Start and End tokens
itos = {i: s for s, i in stoi.items()}
[(i, j) for i, j in zip(stoi.items(), itos.items())]

[(('a', 1), (1, 'a')),
 (('b', 2), (2, 'b')),
 (('c', 3), (3, 'c')),
 (('d', 4), (4, 'd')),
 (('e', 5), (5, 'e')),
 (('f', 6), (6, 'f')),
 (('g', 7), (7, 'g')),
 (('h', 8), (8, 'h')),
 (('i', 9), (9, 'i')),
 (('j', 10), (10, 'j')),
 (('k', 11), (11, 'k')),
 (('l', 12), (12, 'l')),
 (('m', 13), (13, 'm')),
 (('n', 14), (14, 'n')),
 (('o', 15), (15, 'o')),
 (('p', 16), (16, 'p')),
 (('q', 17), (17, 'q')),
 (('r', 18), (18, 'r')),
 (('s', 19), (19, 's')),
 (('t', 20), (20, 't')),
 (('u', 21), (21, 'u')),
 (('v', 22), (22, 'v')),
 (('w', 23), (23, 'w')),
 (('x', 24), (24, 'x')),
 (('y', 25), (25, 'y')),
 (('z', 26), (26, 'z')),
 (('.', 0), (0, '.'))]

In [102]:
# size of 'window' of chars from word
# every iteration the 'window' moves by one char
block_size = 3
X = [] # sequence (ex: '...' or 'emm')
Y = [] # chars from word
for w in words[:4]:
    print(w)
    context = [0] * block_size # create first empty block of chars (0 'cause it is '.' char)
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        print(''.join(itos[i] for i in context), '--->', ch) # 'ch' or itos[ix]
        context = context[1:] + [ix]
    print("="*10)
# new dataset
X = torch.LongTensor(X)
Y = torch.Tensor(Y)

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .


In [103]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([25, 3]), torch.int64, torch.Size([25]), torch.float32)

In [104]:
X[:4], [([itos[int(j)] for j in i]) for i in X[:5]]

(tensor([[ 0,  0,  0],
         [ 0,  0,  5],
         [ 0,  5, 13],
         [ 5, 13, 13]]),
 [['.', '.', '.'],
  ['.', '.', 'e'],
  ['.', 'e', 'm'],
  ['e', 'm', 'm'],
  ['m', 'm', 'a']])

In [105]:
Y[:5], [itos[int(i)] for i in Y[:5]]

(tensor([ 5., 13., 13.,  1.,  0.]), ['e', 'm', 'm', 'a', '.'])

In [106]:
C = torch.randn((27, 2))

In [107]:
C[:4], C[5], C[[1, 2, 6, 6]] # multi dim indexing

(tensor([[-1.3848, -1.0270],
         [-0.5578, -1.8801],
         [-2.3347, -0.5463],
         [-0.5198, -1.1513]]),
 tensor([0.5509, 0.1807]),
 tensor([[-0.5578, -1.8801],
         [-2.3347, -0.5463],
         [ 1.2513,  0.4893],
         [ 1.2513,  0.4893]]))

In [109]:
emb = C[X] # embedding
emb

tensor([[[-1.3848, -1.0270],
         [-1.3848, -1.0270],
         [-1.3848, -1.0270]],

        [[-1.3848, -1.0270],
         [-1.3848, -1.0270],
         [ 0.5509,  0.1807]],

        [[-1.3848, -1.0270],
         [ 0.5509,  0.1807],
         [ 1.1020, -1.6985]],

        [[ 0.5509,  0.1807],
         [ 1.1020, -1.6985],
         [ 1.1020, -1.6985]],

        [[ 1.1020, -1.6985],
         [ 1.1020, -1.6985],
         [-0.5578, -1.8801]],

        [[-1.3848, -1.0270],
         [-1.3848, -1.0270],
         [-1.3848, -1.0270]],

        [[-1.3848, -1.0270],
         [-1.3848, -1.0270],
         [ 1.0739,  0.3573]],

        [[-1.3848, -1.0270],
         [ 1.0739,  0.3573],
         [-0.7477,  1.1902]],

        [[ 1.0739,  0.3573],
         [-0.7477,  1.1902],
         [-0.0855, -1.3304]],

        [[-0.7477,  1.1902],
         [-0.0855, -1.3304],
         [ 0.4757,  0.5514]],

        [[-0.0855, -1.3304],
         [ 0.4757,  0.5514],
         [-0.0855, -1.3304]],

        [[ 0.4757,  0