In [9]:
import torch

In [10]:
# Read the whole names file. The context manager closes the file handle as soon
# as the read finishes instead of leaving it open for the life of the kernel.
with open("data/names.txt") as names_file:
  raw = names_file.read()

# One name per line. splitlines() does not produce a trailing empty entry when
# the file ends with a newline, and blank lines are dropped so that no empty
# "name" reaches the dataset-building cell.
names = [line for line in raw.splitlines() if line]

In [11]:
# Vocabulary: every distinct character that occurs in the names, plus ".",
# which the dataset cell uses as the padding / end-of-name marker.
chars = sorted(set("".join(names)) | {"."})

# Two-way lookup between a character and its position in the sorted vocabulary.
char_to_num = dict(zip(chars, range(len(chars))))
num_to_char = dict(enumerate(chars))

In [12]:
# Build the training set: at every position of every name, the previous
# `context_window` characters are the input and the character at that position
# is the target.
context_window = 4
eye = torch.eye(len(chars))  # row i is the one-hot vector for character index i

context_idx = []  # one list of `context_window` character indices per example
y_idx = []        # index of the target character for each example

for name in names:
  # Left-pad with "." so the first real character already has a full context,
  # and append "." so the end of the name is itself a prediction target.
  string = "." * context_window + name + "."
  string_nums = [char_to_num[char] for char in string]

  for idx in range(context_window, len(string)):
    context_idx.append(string_nums[idx - context_window:idx])
    y_idx.append(string_nums[idx])

# One-hot encode everything with a single indexing operation rather than
# indexing `eye` once per example and stacking a long list of small tensors.
# The reshape keeps the index tensor 2-D even when there are no examples.
context_idx = torch.tensor(context_idx, dtype=torch.long).reshape(-1, context_window)
y_idx = torch.tensor(y_idx, dtype=torch.long)

X = eye[context_idx]  # (num_examples, context_window, len(chars))
Y = eye[y_idx]        # (num_examples, len(chars))

X.shape, Y.shape

(torch.Size([228146, 4, 27]), torch.Size([228146, 27]))

In [24]:
# Initialize the trainable parameters. Every draw comes from one seeded
# generator, in the order written below, so re-running this cell reproduces the
# same starting values.
generator = torch.Generator().manual_seed(14)


def _new_parameter(*shape):
  """Draw a standard-normal tensor of `shape` from `generator`, tracked by autograd."""
  return torch.randn(shape, generator=generator, requires_grad=True)


# layer sizes
embedding_vector_dimensionality = 2
hidden_layer_num_neurons = 100
output_layer_num_neurons = len(chars)

# one embedding row per vocabulary character
embedding_matrix = _new_parameter(len(chars), embedding_vector_dimensionality)

# hidden layer: takes the embeddings of one whole context, flattened into a row
hidden_layer_weights = _new_parameter(context_window * embedding_vector_dimensionality, hidden_layer_num_neurons)
hidden_layer_biases = _new_parameter(hidden_layer_num_neurons)

# output layer: one logit per vocabulary character
output_layer_weights = _new_parameter(hidden_layer_num_neurons, output_layer_num_neurons)
output_layer_biases = _new_parameter(output_layer_num_neurons)

parameters = [
  embedding_matrix,
  hidden_layer_weights,
  hidden_layer_biases,
  output_layer_weights,
  output_layer_biases,
]

In [25]:
# forward pass over the whole dataset, printing the shape after each stage

# X is one-hot, so this matmul picks out one embedding row per context character
embeddings = torch.matmul(X, embedding_matrix)
print(embeddings.shape)

# view() keeps memory order: each row is one context's embeddings laid end to end
embeddings_flattened = embeddings.view(-1, context_window * embedding_vector_dimensionality)
print(embeddings_flattened.shape)

hidden_layer_preactivations = torch.matmul(embeddings_flattened, hidden_layer_weights) + hidden_layer_biases
# ReLU alternative: torch.maximum(hidden_layer_preactivations, torch.tensor(0.0))
hidden_layer_activations = torch.tanh(hidden_layer_preactivations)
print(hidden_layer_activations.shape)

# the output layer has no nonlinearity, so its pre-activations are the logits
output_layer_preactivations = torch.matmul(hidden_layer_activations, output_layer_weights) + output_layer_biases
logits = output_layer_activations = output_layer_preactivations
print(logits.shape)

# softmax written out by hand; subtracting each row's max first keeps exp() from overflowing
logits_sub_max = logits - torch.max(logits, dim=1, keepdim=True).values
counts = torch.exp(logits_sub_max)
prob_distributions = counts / torch.sum(counts, dim=1, keepdim=True)
print(prob_distributions.shape, torch.isclose(prob_distributions.sum(dim=1), torch.tensor(1.0)).all())

# probability the model assigned to the correct next character of each example
target_probs = prob_distributions[torch.arange(len(X)), y_idx]
target_logprobs = torch.log(target_probs)
print(target_logprobs.shape)

# loss = average negative log-likelihood of the targets
loss = negative_average_log_likelihood = -torch.mean(target_logprobs)
print(loss)

# every tensor produced above, collected so the backward-pass cell can reset their .grad
intermediates = [
  embeddings, embeddings_flattened,
  hidden_layer_preactivations, hidden_layer_activations,
  output_layer_preactivations, output_layer_activations,
  logits, logits_sub_max, counts, prob_distributions,
  target_probs, target_logprobs,
]
params_and_intermediates = parameters + intermediates

# loss recorded from an earlier run (differs from the value printed by the current run): tensor(15.7477, grad_fn=<NegBackward0>)

torch.Size([228146, 4, 2])
torch.Size([228146, 8])
torch.Size([228146, 100])
torch.Size([228146, 27])
torch.Size([228146, 27]) tensor(True)
torch.Size([228146])
tensor(16.2682, grad_fn=<NegBackward0>)


In [26]:
# backward pass: a single gradient-descent step on the loss computed by the
# forward-pass cell
learning_rate = 0.01

# Clear gradients left over from any previous step so backward() does not
# accumulate into them. Only the parameters need this: the intermediates are
# rebuilt by every forward pass and are not leaf tensors.
for parameter in parameters:
  parameter.grad = None

loss.backward()

# Update every parameter in place. no_grad() keeps the update itself out of the
# autograd graph (the supported replacement for assigning to `.data`), and
# looping over `parameters` means a newly added parameter cannot be missed.
with torch.no_grad():
  for parameter in parameters:
    parameter -= learning_rate * parameter.grad

In [27]:
# The computation graph is freed once backward() has been called, so there is no need to worry about duplicate nodes piling up across forward passes.

In [28]:
# Loss history. The training-loop cell appends one entry per iteration; because
# this lives in a separate cell, re-running only the training cell keeps
# appending to the same list.
losses = list()

In [29]:
# Full-batch gradient descent: 300 steps over the entire dataset.
learning_rate = 0.1  # constant across iterations, so it is set once outside the loop

# NOTE(review): `iter` shadows the Python builtin of the same name for the rest
# of the notebook. The name is kept because cells outside this view may read it.
for iter in range(300):

  # forward
  embeddings = X @ embedding_matrix  # X is one-hot, so this selects embedding rows
  # view() keeps memory order: each row is one context's embeddings laid end to end
  embeddings_flattened = embeddings.view(-1, context_window * embedding_vector_dimensionality)

  hidden_layer_preactivations = embeddings_flattened @ hidden_layer_weights + hidden_layer_biases
  hidden_layer_activations = hidden_layer_preactivations.tanh()

  # the output layer has no nonlinearity, so its pre-activations are the logits
  output_layer_preactivations = hidden_layer_activations @ output_layer_weights + output_layer_biases
  output_layer_activations = output_layer_preactivations
  logits = output_layer_activations

  # softmax by hand; subtracting each row's max first keeps exp() from overflowing
  logits_sub_max = logits - logits.max(dim=1, keepdim=True).values
  counts = logits_sub_max.exp()
  prob_distributions = counts / counts.sum(dim=1, keepdim=True)

  # average negative log-likelihood of the correct next character
  target_probs = prob_distributions[torch.arange(X.shape[0]), y_idx]
  target_logprobs = target_probs.log()
  negative_average_log_likelihood = -target_logprobs.mean()
  loss = negative_average_log_likelihood

  # each stored entry is the loss tensor itself (it still carries its grad_fn)
  losses.append(loss); print(iter, loss)

  # The forward pass creates new tensor objects every iteration, so these lists
  # are rebuilt each time to refer to the current ones.
  intermediates = [embeddings, embeddings_flattened, hidden_layer_preactivations, hidden_layer_activations, output_layer_preactivations, output_layer_activations, logits, logits_sub_max, counts, prob_distributions, target_probs, target_logprobs]
  params_and_intermediates = parameters + intermediates

  # backward
  # Only the parameters carry gradients from one iteration to the next; the
  # intermediates above are brand new and have none to clear.
  for parameter in parameters:
    parameter.grad = None

  loss.backward()

  # In-place update under no_grad(): the step stays out of the autograd graph
  # (the supported replacement for assigning to `.data`), and every tensor in
  # `parameters` is updated by the same line.
  with torch.no_grad():
    for parameter in parameters:
      parameter -= learning_rate * parameter.grad

0 tensor(16.2055, grad_fn=<NegBackward0>)
1 tensor(15.5977, grad_fn=<NegBackward0>)
2 tensor(15.0310, grad_fn=<NegBackward0>)
3 tensor(14.4924, grad_fn=<NegBackward0>)
4 tensor(13.9861, grad_fn=<NegBackward0>)
5 tensor(13.4927, grad_fn=<NegBackward0>)
6 tensor(12.9937, grad_fn=<NegBackward0>)
7 tensor(12.4881, grad_fn=<NegBackward0>)
8 tensor(11.9912, grad_fn=<NegBackward0>)
9 tensor(11.5278, grad_fn=<NegBackward0>)
10 tensor(11.1313, grad_fn=<NegBackward0>)
11 tensor(10.8109, grad_fn=<NegBackward0>)
12 tensor(10.5232, grad_fn=<NegBackward0>)
13 tensor(10.2513, grad_fn=<NegBackward0>)
14 tensor(9.9924, grad_fn=<NegBackward0>)
15 tensor(9.7453, grad_fn=<NegBackward0>)
16 tensor(9.5090, grad_fn=<NegBackward0>)
17 tensor(9.2829, grad_fn=<NegBackward0>)
18 tensor(9.0667, grad_fn=<NegBackward0>)
19 tensor(8.8600, grad_fn=<NegBackward0>)
20 tensor(8.6631, grad_fn=<NegBackward0>)
21 tensor(8.4759, grad_fn=<NegBackward0>)
22 tensor(8.2997, grad_fn=<NegBackward0>)
23 tensor(8.1341, grad_fn=<Neg