In [1]:
import torch
import torch.nn.functional as F

In [2]:
#setup global variables
#read one name per line; the context manager closes the file handle as soon as
#the contents are read instead of leaving it open until garbage collection
with open('names.txt','r') as names_file:
    words = names_file.read().splitlines()
chars = sorted(set(''.join(words))) #sorted list of the distinct chars across all names
stoi = {s:i+1 for i,s in enumerate(chars)} #mapping of chars to integers, starting at 1
stoi['.'] = 0 #index 0 is reserved for the '.' marker
itos = {i:s for s,i in stoi.items()} #inverse mapping: integer -> char

In [3]:
#build the dataset
block_size = 3 #how many preceding characters are used to predict the next one
X, Y = [], []
#each row of X holds the integer ids of the block_size context characters
#the matching entry of Y is the id of the character that follows that context

for word in words:
    #every word starts from a context made only of id 0 (the '.' marker)
    context = [0 for _ in range(block_size)]
    #walk the word plus a trailing '.' so the end of the word is a target too
    for ix in (stoi[ch] for ch in word + '.'):
        X.append(context)
        Y.append(ix)
        #slide the window: drop the oldest id, add the newest (builds a new list,
        #so rows already stored in X are not modified)
        context = [*context[1:], ix]
X, Y = torch.tensor(X), torch.tensor(Y)

## Build a neural net to predict the next letter from the context

In [4]:
#seeded generator so the initial parameters are the same on every run
#NOTE: the draw order below (C, W1, b1, W2, b2) decides which random numbers
#each tensor receives, so do not reorder these lines
g = torch.Generator().manual_seed(2147483647)

#embedding table: 27 rows (one per symbol), each a 2D vector
C = torch.randn(27, 2, generator=g)

#2 layer neural net to predict Ys from Xs
#layer 1: 6 inputs -> 100 hidden units
W1 = torch.randn(6, 100, generator=g) #weights
b1 = torch.randn(100, generator=g) #biases

#layer 2: 100 hidden units -> 27 outputs
W2 = torch.randn(100, 27, generator=g) #weights
b2 = torch.randn(27, generator=g) #biases

#collect every trainable tensor and switch on gradient tracking for each
parameters = [C, W1, b1, W2, b2]
for p in parameters:
    p.requires_grad_(True)

In [20]:
#train with minibatch gradient descent: 10000 steps, 32 examples per step
for _ in range(10000):
    #pick 32 random example indices for this step
    batch = torch.randint(0, X.shape[0], (32,))

    #forward pass: look up embeddings, flatten them to 6 values per example,
    #then apply the tanh hidden layer followed by the output layer
    flat_emb = C[X[batch]].view(-1, 6)
    h = torch.tanh(flat_emb @ W1 + b1)
    logits = h @ W2 + b2
    #F.cross_entropy computes the loss from the logits and the target ids
    loss = F.cross_entropy(logits, Y[batch])

    #backward pass: clear the old gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    #update: step each parameter against its gradient with learning rate 0.1
    for p in parameters:
        p.data -= 0.1 * p.grad

#loss of the final minibatch only (not the loss over the whole data set)
print(loss.item())

1.9306527376174927


In [12]:
#finding optimal learning rate (we found it was 0.1)
#sweep 100 candidate rates, log-spaced from 1e-3 to 1e0, taking 10 minibatch
#steps at each candidate and logging the loss of every step
#NOTE: the sweep updates the shared `parameters` in place, so re-run the
#parameter initialisation cell before training for real
rates = torch.logspace(-3, 0, 100)
losses = []
step_rates = [] #learning rate used for each entry of `losses` (same length)
for rate in rates:
    for _ in range(10):
        #minibatch
        ix = torch.randint(0, X.shape[0], (32,))

        #forward pass
        emb = C[X[ix]]
        h = torch.tanh(emb.view(-1, 6)@W1 + b1)
        logits = h@W2 + b2
        #calculate loss
        loss = F.cross_entropy(logits, Y[ix]) #F.cross_entropy computes loss

        #backward pass
        for p in parameters:
            p.grad = None # zero out params
        loss.backward()
        #update with the candidate rate being evaluated; a hard-coded 0.1 here
        #would make every candidate train identically and the sweep meaningless
        for p in parameters:
            p.data += -rate*p.grad
        losses.append(loss.item())
        step_rates.append(rate.item())

In [None]:
#ideally you want to shuffle the data set multiple times and split it so that
#80% is in the training set, 10% is used to search for hyperparameters, and 10% is held out for validation

In [29]:
#sample from model
#fresh seeded generator so the sampled names are the same on every run
g = torch.Generator().manual_seed(2147483647)
names = []
for _ in range(20):
    out = []
    context = [0]*block_size #start from a context of all 0 ids ('.')
    ix = None
    #keep drawing characters until id 0 ('.') is sampled; that id is still
    #appended to `out`, so every generated name ends with '.'
    while ix != 0:
        emb = C[torch.tensor([context])] #embeddings for the current context
        h = torch.tanh(emb.view(1, -1) @ W1 + b1)
        logits = h @ W2 + b2
        probs = F.softmax(logits, dim=1) #probability distribution from logits
        drawn = torch.multinomial(probs, num_samples=1, generator=g) #sample
        ix = drawn.item()
        context = [*context[1:], ix] #slide the context window forward
        out.append(ix)
    names.append(''.join(itos[i] for i in out))
names

['dex.',
 'maidalluraile.',
 'kayda.',
 'kazimi.',
 'tain.',
 'lunan.',
 'kaida.',
 'jamivaulla.',
 'srigot.',
 'jeig.',
 'jellavo.',
 'jaiteda.',
 'kalemka.',
 'sade.',
 'ankavirny.',
 'fols.',
 'mhina.',
 'lavtahlas.',
 'kasd.',
 'del.']

In [23]:
#show the list of integer character ids held in `out`
#NOTE(review): `out` is only assigned inside the sampling loop, so this cell
#relies on that cell having been run first; it then shows the last sampled name
out

[4, 5, 12, 0]