In [1]:
# in our previous model, we only took 1 character of context (bigram) so we got not so good results
# if we take 2 characters as context , we will grow into 27 * 27 context matrix 
# if we take 3 characters as context , we will grow into 27 * 27 * 27 context matrix 
# it will be extremely difficult to keep track of these counts
# we are following https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf

In [75]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [76]:
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
print(chars, '\n')

stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
print(stoi , '\n')

itos = {i:s for s,i in stoi.items()}
print(itos , '\n')

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] 

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0} 

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'} 



In [77]:
def buildDataset(words , block_size):
    X , Y = [] , [] 
    for word in words:
        context = [0] * block_size
        word =  word + '.' # append a end character to word, so the generator is trained to stop
        for ch in word:
            ix = stoi[ch]
            context = context[1:] + [ix]
            X.append(context)
            Y.append(ix)
    return X, Y

In [78]:
import random

def trainDevAndTestSplit(words , block_size):
    random.seed(42)
    random.shuffle(words)
    n1 = int(0.8 * len(words))
    n2 = int(0.9 * len(words))
    xtr, ytr = buildDataset(words[:n1], block_size)
    xdev, ydev = buildDataset(words[n1:n2], block_size)
    xtest, ytest = buildDataset(words[n2:], block_size)
    return torch.tensor(xtr), torch.tensor(ytr), torch.tensor(xdev), torch.tensor(ydev), torch.tensor(xtest), torch.tensor(ytest)

In [79]:
block_size = 3
xtr, ytr, xdev, ydev, xtest, ytest = trainDevAndTestSplit(words, block_size)

print(xtr.shape, ytr.shape)
print(xdev.shape, ydev.shape)
print(xtest.shape, ytest.shape)

torch.Size([182580, 3]) torch.Size([182580])
torch.Size([22767, 3]) torch.Size([22767])
torch.Size([22799, 3]) torch.Size([22799])


In [80]:
g = torch.Generator().manual_seed(2147483647) # for reproducibility

character_embedding_size = 5
layer1_size = 150

C = torch.randn((27 , character_embedding_size) , generator = g) # character embedding
W1 = torch.randn((character_embedding_size * block_size , layer1_size) , generator = g)
b1 = torch.randn(layer1_size , generator = g)
W2 = torch.randn((layer1_size , 27) , generator = g)
b2 = torch.randn(27 , generator = g)

In [81]:
network_parameters = [C , W1 , b1 , W2, b2]
print('Total no of parameters ' , sum(p.nelement() for p in network_parameters))

Total no of parameters  6612


In [82]:
for p in network_parameters:
    p.requires_grad = True

In [85]:
def forwardAndBackwardPass(x , y , batch_size , lr):
    ix_batch = torch.randint(0 , x.shape[0], (batch_size, ))
    x_batch = x[ix_batch]
    embed_batch = C[x_batch].flatten(1)
    
    y_batch = y[ix_batch]
    
    h0 = embed_batch @ W1 + b1
    
    h1 = torch.tanh(h0)
    
    logits = h1 @ W2 + b2
    
    loss = F.cross_entropy(logits , y_batch)
    
    # backward pass 
    for p in network_parameters:
        p.grad = None
    
    loss.backward()
    
    # update grad
    for p in network_parameters:
        p.data += -lr * p.grad
        
    return loss.item()

def trainNetwork(x, y, batch_size , epochs , lr):
    loss_i = []
    iter_i = []
    for i in range(0,epochs):
        loss = forwardAndBackwardPass(x, y, batch_size , lr)
        loss_i.append(loss)
        iter_i.append(i)
    return loss_i , iter_i

In [86]:
losses , iters = trainNetwork(xtr , ytr , 64 , 1000 , 0.1)

In [87]:
losses[-10:]

[0.14254187047481537,
 0.20875024795532227,
 0.25030431151390076,
 0.2795918881893158,
 0.09074337780475616,
 0.03833494335412979,
 0.20134767889976501,
 0.014675336889922619,
 0.05529174581170082,
 0.44112691283226013]