In [20]:
#Load up names.txt
words = open('names.txt', 'r').read().split('\n')

In [21]:
import torch
import torch.nn.functional as F
import numpy as np
import plotly.graph_objs as go

In [22]:
#Get a list of all the chars
chars = ['.'] + sorted(list(set(''.join(words))))

#Tokenize the text- This will be used to convert the text to numbers
char_to_idx = {ch:i for i,ch in enumerate(chars)}
idx_to_char = {i:ch for i,ch in enumerate(chars)}


def text_to_tensor(text):
    #Converts a string to a tensor
    tensor = torch.zeros(len(text)).long()
    for c in range(len(text)):
        tensor[c] = char_to_idx[text[c]]
    return tensor

def tensor_to_text(tensor):
    #Converts a tensor to a string
    text = ''
    for t in tensor:
        text += idx_to_char[t.item()]
    return text

In [65]:
#Build data set
block_size = 3

def build_data_set(words):
    #Get the inputs and targets
    X, Y = [], []

    for word in words[:]:
        word = "." + word + "."
        context = [0] * block_size
        for i in range(len(word)-1):
            chr = char_to_idx[word[i]]
            context.append(chr)
            if len(context) > block_size:
                context = context[1:]
            X.append(context[:])
            Y.append(char_to_idx[word[i+1]])

            # print("For context", context, "predict", word[i+1])

    #Convert to tensor
    X = torch.tensor(X)
    Y = torch.tensor(Y)

    return X, Y

import random
random.seed(42)

#Shuffle the data
random.shuffle(words)

n1 = int(len(words)*0.8)
n2 = int(len(words)*0.9)

X_train, Y_train = build_data_set(words[:n1])
X, Y = build_data_set(words[:n1]) #Just for prev code
X_val, Y_val = build_data_set(words[n1:n2])
X_test, Y_test = build_data_set(words[n2:])
print("Train:", len(X_train), "Val:", len(X_val), "Test:", len(X_test))



Train: 182580 Val: 22767 Test: 22799


In [24]:
C = torch.randn(len(chars), 2)
C_5_A = C[5]
C_5_B = F.one_hot(torch.tensor(5), len(chars)).float() @ C

#These are the same
print("C_5_A", C_5_A) #This is faster so we will use this
print("C_5_B", C_5_B)

#Essentially we are assigning two outputs for the same input 



C_5_A tensor([-1.1439,  1.0959])
C_5_B tensor([-1.1439,  1.0959])


In [25]:
emb = C[X]
emb.shape #len(X) x block_size x 2 

torch.Size([182625, 5, 2])

In [26]:
#Hidden layer
W1 = torch.randn(2*block_size, 100) #2*block_size x hidden_size (up to us)
b1 = torch.randn(100) #Bias for hidden layer

#Goal: emb @ W1 + b1

#Concatenate the embeddings
#This will combine the 3rd dimension of emb into the 2nd dimension
emb = emb.view(len(X), -1) #len(X) x (block_size*2) 
#The -1 means "figure out the size of this dimension for me"
print("Emb Shape: ", emb.shape)

#Now we can do the hidden layer
h = emb @ W1 + b1 #len(X) x hidden_size

Emb Shape:  torch.Size([182625, 10])


In [27]:
#Now we can do the output layer
W2 = torch.randn(100, len(chars)) #hidden_size x len(chars)
b2 = torch.randn(len(chars)) #Bias for output layer

#Goal: h @ W2 + b2

#Now we can do the output layer
logits = h @ W2 + b2 #len(X) x len(chars)

In [28]:
#Calculate prob
counts = torch.exp(logits)
probs = counts / counts.sum(dim=1, keepdim=True)

In [29]:
#Get prob of correct characters
correct_probs = probs[range(len(X)), Y]

#Calculate loss
loss = -torch.log(correct_probs).mean()
print("Loss: ", loss)


Loss:  tensor(nan)


In [68]:
#Put it all together
emb_dim = 10
hidden_size = 200
batch_size = 32 
n = 10000

g = torch.Generator().manual_seed(2147483647) # for reproducibility
C = torch.randn(len(chars), emb_dim, generator=g)
W1 = torch.randn(emb_dim*block_size, hidden_size, generator=g)
b1 = torch.randn(hidden_size, generator=g)
W2 = torch.randn(hidden_size, len(chars), generator=g)
b2 = torch.randn(len(chars), generator=g)

parameters = [C, W1, b1, W2, b2]

for p in parameters:
    p.requires_grad_()

#Print number of parameters
num_params = 0
for p in parameters:
    num_params += p.numel()
print("Total Parameters:", num_params)


Total Parameters: 11897


In [76]:
n = 10000
batch_size = 32

#Learning Rate
lre = torch.linspace(-3,0,n)
lrs = 10**lre

lre_i, loss_i = [], []

In [83]:
bestC = None
bestLoss = float('inf')

for i in range(n):
    #Mini-batch
    ix = torch.randint(len(X_train), (batch_size,))

    emb = C[X_train[ix]]
    emb = emb.view(len(X_train[ix]), -1) #len(X_train) x (block_size*2)
    h = emb @ W1 + b1 #len(X_train) x hidden_size
    h = torch.tanh(h) #Apply activation function
    logits = h @ W2 + b2 #len(X_train) x len(chars)
    # counts = torch.exp(logits)
    # probs = counts / counts.sum(dim=1, keepdim=True)
    # correct_probs = probs[range(len(X_train)), Y]
    # loss = -torch.log(correct_probs).mean()
    loss = F.cross_entropy(logits, Y[ix]) #This is the same as the above 4 lines 
    # Forward / Backward Pass is much more efficient than the above 4 lines
    # Can account for high possitve values that would cause overflow

    #Backward Pass
    for p in parameters:
        p.grad = None

    loss.backward()

    #Update parameters
    # lr = lrs[i]
    #Disable gradient tracking (for efficiency)
    with torch.no_grad():
        lr = .0003 #This was the best learning rate without sacrificing speed
        for p in parameters:
            p.data -= lr * p.grad

    #Track Stats 
    lre_i.append(lre[i])
    loss_i.append(loss.item())

    #Save best parameters
    if loss < bestLoss:
        bestLoss = loss
        # bestC = C.clone()
        # print("New Best Loss:", bestLoss.item())


    

    # if i % 100 == 0:
    #     print("Loss: ", loss.item())

In [34]:
#Plot learning rate vs loss
fig = go.Figure()
fig.add_trace(go.Scatter(x=lre_i, y=loss_i))
fig.update_layout(
    title="Learning Rate Exponent vs Loss",
    xaxis_title="Learning Rate Exponent",
    yaxis_title="Loss",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="#7f7f7f"
    )
)
fig.show()  

In [84]:
#Calculate loss of validation set
emb = C[X_val]
emb = emb.view(len(X_val), -1) #len(X_val) x (block_size*2)
h = emb @ W1 + b1 #len(X_val) x hidden_size
h = torch.tanh(h) #Apply activation function
logits = h @ W2 + b2 #len(X_val) x len(chars)
loss = F.cross_entropy(logits, Y_val) #This is the same as the above 4 lines
print("Validation Loss: ", loss.item())

#Calculate loss of Training set
emb = C[X_train]
emb = emb.view(len(X_train), -1) #len(X_train) x (block_size*2)
h = emb @ W1 + b1 #len(X_train) x hidden_size
h = torch.tanh(h) #Apply activation function
logits = h @ W2 + b2 #len(X_train) x len(chars)
loss = F.cross_entropy(logits, Y_train) #This is the same as the above 4 lines
print("Training Loss: ", loss.item()) 

#Training Loss is close to Validation Loss so we are not overfitting. 

Validation Loss:  2.1527202129364014
Training Loss:  2.115222692489624


In [85]:
g = torch.Generator().manual_seed(2147483647)

for i in range(20):
    context = [0] * block_size
    name = "."
    while True:
        #Get the context
        context_tensor = torch.tensor(context).unsqueeze(0)
        emb = C[context_tensor]
        emb = emb.view(1, -1)
        h = emb @ W1 + b1
        h = torch.tanh(h)
        logits = h @ W2 + b2
        probs = F.softmax(logits, dim=1).squeeze(0)

        #Get the next character
        next_char = torch.multinomial(probs, 1, generator=g).item()

        #Add the next character to the name
        name += idx_to_char[next_char]

        #Update the context
        context.append(next_char)
        if len(context) > block_size:
            context = context[1:]

        if next_char == 0:
            break

    print(name[1:-1])
        


cex
maleah
makilah
tyha
kalimitta
noluwan
katar
samiyah
javer
got
shid
jence
kinzited
jena
mathside
enkaviyah
foble
huniven
tahlas
kaspr
