In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
words = open('names.txt', 'r').read().splitlines()
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [3]:
len(words)

32033

In [4]:
#build vocab
chars = sorted(list(set(''.join(words))))
stoi = {s:i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0 
itos = {i:s for s, i in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [5]:
 #build dataset
block_size = 3 #context length: how many characters do we take to predict the next one
X, Y = [], [] #Inputs, labels 

for w in words[:5]:
    context = [0] * block_size
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        conxtext = context[1:] + [ix]

X = torch.tensor(X)
Y = torch.tensor(Y)

In [6]:
C = torch.randn((27, 2)) #27 characters, 2 embedding dimensions

In [7]:
C[5]

tensor([ 0.0225, -0.7338])

In [None]:
#embed all integers in X simultaneously:
emb = C[X] # (N, block_size, d)
emb.shape#

torch.Size([32, 3, 2])

In [None]:
#construct hidden layer:
W1 = torch.randn((6, 100)) #(no of inputs: 3 of 2 dimensional embeddings, no of neurons: 100)
b1 = torch.randn((100)) #bias for hidden layer

#transform embedding matrix to [32, 6] matrix - concatenate all the embeddings of the 3 words in the context
torch.cat([emb[:,0,:], emb[:,1,:], emb[:,2,:]], dim=1).shape # (N, 6)

torch.Size([32, 6])

In [14]:
torch.cat(torch.unbind(emb, 1), 1).shape

torch.Size([32, 6])

In [15]:
a = torch.arange(18)
a

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])

In [16]:
a.shape

torch.Size([18])

In [17]:
a.view(3,3,2)

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]],

        [[12, 13],
         [14, 15],
         [16, 17]]])

We can chage the shape of the tensor with .view(*as long as the multiplication equals to 18 - original tensor)


In [18]:
a.storage()

  a.storage()


 0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 18]

In pytorch, tensors has an underlying storage that is just a 1 dimensional array - how its represented in the computer memnory.

When we call .view, no underlying memory is changed, but internal attributes of the view of the tensor - storage offset, stride, shapes -> seen as n-dimensional array.

In [19]:
emb.shape

torch.Size([32, 3, 2])

In [None]:
emb.view(32,6) #flattened view of the embedding matrix

tensor([[-0.9782,  1.0282, -0.9782,  1.0282, -0.9782,  1.0282],
        [-0.9782,  1.0282, -0.9782,  1.0282, -0.9782,  1.0282],
        [-0.9782,  1.0282, -0.9782,  1.0282, -0.9782,  1.0282],
        [-0.9782,  1.0282, -0.9782,  1.0282, -0.9782,  1.0282],
        [-0.9782,  1.0282, -0.9782,  1.0282, -0.9782,  1.0282],
        [-0.9782,  1.0282, -0.9782,  1.0282, -0.9782,  1.0282],
        [-0.9782,  1.0282, -0.9782,  1.0282, -0.9782,  1.0282],
        [-0.9782,  1.0282, -0.9782,  1.0282, -0.9782,  1.0282],
        [-0.9782,  1.0282, -0.9782,  1.0282, -0.9782,  1.0282],
        [-0.9782,  1.0282, -0.9782,  1.0282, -0.9782,  1.0282],
        [-0.9782,  1.0282, -0.9782,  1.0282, -0.9782,  1.0282],
        [-0.9782,  1.0282, -0.9782,  1.0282, -0.9782,  1.0282],
        [-0.9782,  1.0282, -0.9782,  1.0282, -0.9782,  1.0282],
        [-0.9782,  1.0282, -0.9782,  1.0282, -0.9782,  1.0282],
        [-0.9782,  1.0282, -0.9782,  1.0282, -0.9782,  1.0282],
        [-0.9782,  1.0282, -0.9782,  1.0

In [21]:
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)


#(when input for view is -1, it infers the dimension automatically)

In [22]:
h.shape

torch.Size([32, 100])

In [23]:
(emb.view(-1, 6) @ W1 + b1).shape 

torch.Size([32, 100])

In [24]:
b1.shape

torch.Size([100])

In [None]:
#32, 100
#1, 100 -> broadcasted to 32, 100

In [25]:
W2 = torch.rand((100, 27)) #27 possible characters
b2 = torch.rand((27)) 


In [26]:
logits = h @ W2 + b2 # (N, 27)
logits.shape


torch.Size([32, 27])

In [27]:
counts = logits.exp()
prob = counts / counts.sum(1, keepdim=True) # (N, 27)
prob.shape

torch.Size([32, 27])