In [1]:
#Load up names.txt (one name per line)
# A context manager closes the file handle deterministically, and
# splitlines() avoids producing an empty "name" when the file ends with a
# newline (split('\n') would return a trailing '' in that case).
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

In [2]:
import random

import numpy as np
import plotly.graph_objs as go
import torch
import torch.nn.functional as F

In [3]:
# Vocabulary: '.' comes first (index 0), followed by every distinct character
# that occurs in the words, in sorted order.
chars = ['.'] + sorted(set(''.join(words)))

# Tokenizer lookup tables: integer id -> character and character -> integer id.
idx_to_char = dict(enumerate(chars))
char_to_idx = {ch: i for i, ch in idx_to_char.items()}

In [4]:
#Build data set
block_size = 3  # number of preceding character ids fed to the model as context

def build_data_set(words):
    """Turn a list of words into (context, next-character) training pairs.

    Every word is wrapped in '.' markers. For each character of the wrapped
    word except the last, the context is the most recent `block_size`
    character ids seen so far (initially all 0) and the target is the id of
    the character that follows.

    Args:
        words: list of strings; every character must be a key of the
            module-level `char_to_idx` mapping.

    Returns:
        (X, Y): a tensor holding one context row per example and a tensor
        holding the matching target ids.
    """
    contexts, targets = [], []

    for name in words:
        padded = "." + name + "."
        window = [0] * block_size
        for current, following in zip(padded, padded[1:]):
            # Slide the window: push the current character id, drop the oldest.
            # A fresh list is built each step, so stored rows never alias.
            window = (window + [char_to_idx[current]])[1:]
            contexts.append(window)
            targets.append(char_to_idx[following])

    #Convert to tensor
    return torch.tensor(contexts), torch.tensor(targets)

# Shuffle a *copy* of the word list. An in-place shuffle is not idempotent:
# re-running this cell would reshuffle the already-shuffled list and silently
# change which words land in train / val / test. With a copy, the same seed
# always produces the same split and `words` keeps its original order.
random.seed(42)
shuffled_words = list(words)
random.shuffle(shuffled_words)

# Word-level split: first 80% train, next 10% validation, last 10% test
n1 = int(len(shuffled_words)*0.8)
n2 = int(len(shuffled_words)*0.9)

X_train, Y_train = build_data_set(shuffled_words[:n1])
X, Y = X_train, Y_train #Just for prev code: aliases of the training tensors (no second build)
X_val, Y_val = build_data_set(shuffled_words[n1:n2])
X_test, Y_test = build_data_set(shuffled_words[n2:])
print("Train:", len(X_train), "Val:", len(X_val), "Test:", len(X_test))


Train: 182625 Val: 22655 Test: 22866


In [262]:
# Hyperparameters
emb_dim = 10       # length of each character embedding vector
hidden_size = 200  # width of the hidden layer
batch_size = 32    # examples drawn per mini-batch
n = 10000          # number of training iterations

# Seeded generator so the initial parameter values are reproducible.
# The draws below must stay in this order to keep the same initial values.
g = torch.Generator().manual_seed(2147483647)

# Embedding table: one row of emb_dim values per character
C = torch.randn(len(chars), emb_dim, generator=g)

# Hidden layer: weights and bias drawn from a normal, then scaled down
W1 = 0.1 * torch.randn(emb_dim*block_size, hidden_size, generator=g)
b1 = 0.01 * torch.randn(hidden_size, generator=g)

# Output layer: small weights; the bias is multiplied by 0 so it starts at zero
W2 = 0.01 * torch.randn(hidden_size, len(chars), generator=g)
b2 = 0 * torch.randn(len(chars), generator=g)

parameters = [C, W1, b1, W2, b2]

# Track gradients for every trainable tensor
for p in parameters:
    p.requires_grad = True



In [228]:
# Settings read by the training loop
lr = 1e-3        # step size of the gradient update
batch_size = 32  # examples per mini-batch
n = 10_000       # number of training iterations

In [229]:
for i in range(n):
    #Mini-batch: draw batch_size random example indices from the training set.
    #The seeded generator g makes the sequence of batches reproducible.
    ix = torch.randint(0, len(X_train), (batch_size,), generator=g)
    #Index the training tensors once and take the targets from Y_train
    #(not the legacy alias Y) so inputs and targets always come from the same split.
    X_batch, Y_batch = X_train[ix], Y_train[ix]

    #Forward pass
    emb = C[X_batch] #batch_size x block_size x emb_dim
    emb = emb.view(emb.shape[0], -1) #batch_size x (block_size*emb_dim)
    h = torch.tanh(emb @ W1 + b1) #batch_size x hidden_size, after the tanh activation
    logits = h @ W2 + b2 #batch_size x len(chars)
    loss = F.cross_entropy(logits, Y_batch)

    #Backward Pass
    #Clear the gradients of the previous step before computing new ones
    for p in parameters:
        p.grad = None

    loss.backward()

    #Update parameters
    #Gradient tracking is disabled so the in-place step is not recorded by
    #autograd; updating p directly (instead of p.data) keeps autograd's
    #in-place safety checks active.
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad





In [236]:
@torch.no_grad() #Evaluation only: do not build an autograd graph over the whole split
def split_loss(X_split, Y_split):
    """Return the cross-entropy loss of the current parameters on one data split.

    Runs the same forward pass as the training loop (embedding lookup, tanh
    hidden layer, output layer) on every example of the split at once.

    Args:
        X_split: tensor of context rows, as produced by `build_data_set`.
        Y_split: tensor of target character ids matching `X_split`.

    Returns:
        The loss as a Python float (default mean reduction of `F.cross_entropy`).
    """
    emb = C[X_split].view(len(X_split), -1) #N x (block_size*emb_dim)
    h = torch.tanh(emb @ W1 + b1) #N x hidden_size
    logits = h @ W2 + b2 #N x len(chars)
    return F.cross_entropy(logits, Y_split).item()

#Calculate loss of validation set
print("Validation Loss: ", split_loss(X_val, Y_val))

#Calculate loss of Training set
print("Training Loss: ", split_loss(X_train, Y_train))

#Training loss close to validation loss indicates the model is not overfitting.
#Caveat: a loss near ln(len(chars)) is what uniform predictions over all
#characters would give, so values at that level mean the model has learned
#almost nothing yet rather than that it generalizes well.

Validation Loss:  3.295836925506592
Training Loss:  3.29583740234375


In [263]:
#Mini-batch: sample batch_size random training examples (seeded generator g
#keeps the sampled batch, and therefore the plot, reproducible)
ix = torch.randint(0, len(X_train), (batch_size,), generator=g)

emb = C[X_train[ix]] #batch_size x block_size x emb_dim
emb = emb.view(emb.shape[0], -1) #batch_size x (block_size*emb_dim)
h = emb @ W1 + b1 #batch_size x hidden_size
h = torch.tanh(h) #Apply activation function

#Plot histogram of h values
#Keep h as a tensor and put the flattened numpy copy under its own name, so
#the same variable does not switch between tensor and array.
h_values = h.detach().numpy().flatten()

fig = go.Figure(data=[go.Histogram(x=h_values)])
fig.update_layout(
    title_text="Hidden-layer tanh activations (one mini-batch)",
    xaxis_title_text="activation value",
    yaxis_title_text="count",
)
fig.show()

In [267]:
#Demo: dividing the weights by sqrt(fan_in) keeps the spread of y = x @ w
#close to the spread of x.
demo_gen = torch.Generator().manual_seed(2147483647) #seeded so the printed numbers are reproducible
fan_in = 10 #inner dimension shared by x and w; the scale below is derived from it
x = torch.randn(1000, fan_in, generator=demo_gen)
w = torch.randn(fan_in, 200, generator=demo_gen) / fan_in**0.5
y = x @ w

#Print mean and standard deviation of x compared to y (2 decimal places)
print("x mean:", round(x.mean().item(),2), "std:", round(x.std().item(),2))
print("y mean:", round(y.mean().item(),2), "std:", round(y.std().item(),2))

x mean: -0.01 std: 1.0
y mean: 0.0 std: 1.0
