In [2]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the names, one per line. The context manager guarantees the file handle
# is closed as soon as the contents are read, instead of leaking until GC.
with open('names-shuffled.txt', 'r') as f:
    words = f.read().splitlines()

In [4]:
# Peek at the first eight entries to sanity-check the load.
words[:8]

['pushp',
 'aishwary',
 'prahalan',
 'adrija',
 'ghanika',
 'vibulan',
 'kavinthran',
 'priyaalen']

In [37]:
# Vocabulary: every distinct character that occurs in the words, in sorted order.
chars = sorted(set(''.join(words)))

# Character -> integer id. Characters from the data are numbered from 1;
# id 0 is then assigned to '.', the padding / end-of-word marker.
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0

# Integer id -> character (inverse of stoi).
itos = dict((idx, ch) for ch, idx in stoi.items())

In [51]:
# Build the (context -> next character) training pairs.

block_size = 3  # context length: how many preceding characters predict the next one
inputs, labels = [], []  # collected as Python lists, converted to tensors at the end

for word in words[:3]:  # only the first three words, so the printed trace stays short
    print(word)
    context = [0] * block_size  # start from all-'.' padding (id 0)
    for ch in word + '.':  # the appended '.' marks the end of the word
        shown = ''.join(itos[item] for item in context)
        print(f"{shown} --> {ch}")

        inputs.append(context)
        ix = stoi[ch]
        labels.append(ix)
        # Slide the window one character to the right. This builds a new list,
        # so the context already stored in `inputs` is not modified.
        context = context[1:] + [ix]
    print('-----------')

X = torch.tensor(inputs)
Y = torch.tensor(labels)

pushp
... --> p
..p --> u
.pu --> s
pus --> h
ush --> p
shp --> .
-----------
aishwary
... --> a
..a --> i
.ai --> s
ais --> h
ish --> w
shw --> a
hwa --> r
war --> y
ary --> .
-----------
prahalan
... --> p
..p --> r
.pr --> a
pra --> h
rah --> a
aha --> l
hal --> a
ala --> n
lan --> .
-----------


In [52]:
# Inputs are (num_examples, block_size); labels are (num_examples,).
X.shape, Y.shape

(torch.Size([24, 3]), torch.Size([24]))

In [70]:
# Dedicated, explicitly seeded generator so the random initialisation below
# is reproducible and independent of the global RNG state.
g = torch.Generator()
g.manual_seed(1337101)

# Embedding lookup table: 27 rows (one per character id), 2 values per row.
C = torch.randn(27, 2, generator=g)

C is the lookup table that will contain the embeddings for all 27 characters. It is a matrix of size 27x2,
meaning it contains an embedding of length 2 for each character.

How to get embedding of a character from C?

There are 2 ways:
- We can directly do C[i], where i is the integer corresponding to a character, e.g. 1 for a and 2 for b.
- We can create a one-hot encoding for a character and then multiply that 1x27 one-hot encoding by the matrix C.

Both of these operations will result in an embedding of size 1x2 for a character because all embeddings are of size 2.

In [71]:
# Method 1: index the lookup table directly with the character's integer id.
print(C[5])

# Method 2: multiply a one-hot vector (1.0 at position 5, 0.0 elsewhere) by C,
# which selects the same row of C.
t = F.one_hot(torch.tensor(5), num_classes=27).to(torch.float32)
print(t @ C)

# Both methods produce the same output. We'll use the first method (indexing).

tensor([0.2890, 0.7356])
tensor([0.2890, 0.7356])


Now we want to encode the complete X tensor which is of size [m, block_size].

For explanation, let's take m=24 and block_size=3. So X is of size [24, 3].

PyTorch can do this directly by indexing C with X, i.e. C[X]. This will result in an output of size [24, 3, 2].

X contains 3 characters in each of the 24 rows, and every character has an embedding of size 2. Hence the size of the output
is [24, 3, 2].

In [72]:
# Indexing C with the integer tensor X looks up one row of C per entry of X.
C[X].shape

torch.Size([24, 3, 2])

In [73]:
# Show the first training example's context ids, then the embeddings looked up for it.
print(X[0])
C[X][0]

# All 3 embedding rows are identical because all 3 context characters are the same.

tensor([0, 0, 0])


tensor([[ 0.6996, -0.1849],
        [ 0.6996, -0.1849],
        [ 0.6996, -0.1849]])

In [74]:
# Embed every context character of every example in one indexing operation.
emb = C[X]
emb.shape

torch.Size([24, 3, 2])