In [4]:
import torch

In [5]:
# Load the raw name list (one name per line).
# The context manager closes the file handle as soon as the read finishes.
with open("data/names.txt") as names_file:
  raw = names_file.read()

# splitlines() does not emit an empty trailing entry when the file ends with a
# newline (split("\n") would), and it also strips "\r" from CRLF files.
names = raw.splitlines()

In [6]:
# Vocabulary: every distinct character that occurs in the names, plus the "."
# boundary marker, in sorted order so that id assignment is deterministic.
chars = sorted(set("".join(names)) | {"."})

# Two-way lookup between characters and their integer ids (position in `chars`).
char_to_num = dict(zip(chars, range(len(chars))))
num_to_char = dict(enumerate(chars))

In [25]:
# Build the training set: each example is `context_window` consecutive
# characters (one-hot encoded) and the character that follows them.
context_window = 2
eye = torch.eye(len(chars))  # row i is the one-hot vector for character id i

# Only integer ids are collected inside the loop; the one-hot encoding is done
# once afterwards with a single indexing operation instead of creating and
# stacking one small tensor per example.
context_ids = []  # one list of `context_window` character ids per example
target_ids = []   # id of the character that follows each context

for name in names:
  # Leading "." padding gives the first real character a full context;
  # the trailing "." is the end-of-name target.
  padded_nums = [char_to_num[char] for char in "." * context_window + name + "."]

  for idx in range(context_window, len(padded_nums)):
    context_ids.append(padded_nums[idx - context_window:idx])
    target_ids.append(padded_nums[idx])

y_idx = torch.tensor(target_ids, dtype=torch.long)   # (num_examples,)
X = eye[torch.tensor(context_ids, dtype=torch.long)] # (num_examples, context_window, len(chars))
Y = eye[y_idx]                                       # (num_examples, len(chars))

X.shape, Y.shape, y_idx.shape

(torch.Size([228146, 2, 27]), torch.Size([228146, 27]), torch.Size([228146]))

In [18]:
# Parameter initialisation.
# Every tensor is drawn from a standard normal using one seeded generator, so a
# re-run reproduces the same values. The order of the draws below therefore
# matters: reordering them would change which random numbers each tensor gets.
generator = torch.Generator().manual_seed(14)


def _randn_param(*shape):
  """Return a standard-normal tensor of `shape` drawn from `generator`, with autograd enabled."""
  return torch.randn(shape, generator=generator, requires_grad=True)


# Layer sizes.
embedding_vector_dimensionality = 2
hidden_layer_num_neurons = 100
output_layer_num_neurons = len(chars)  # one output per vocabulary entry

# Embedding table: one row per character.
embedding_matrix = _randn_param(len(chars), embedding_vector_dimensionality)

# Hidden layer: its input is the concatenation of `context_window` embeddings.
hidden_layer_weights = _randn_param(
  context_window * embedding_vector_dimensionality,
  hidden_layer_num_neurons,
)
hidden_layer_biases = _randn_param(hidden_layer_num_neurons)

# Output layer: maps hidden activations to one logit per character.
output_layer_weights = _randn_param(hidden_layer_num_neurons, output_layer_num_neurons)
output_layer_biases = _randn_param(output_layer_num_neurons)

In [40]:
# forward pass
# X is one-hot, so the matmul selects one embedding row per context character.
embeddings = X @ embedding_matrix
print(embeddings.shape)

# Concatenate the embeddings of each context into a single row per example.
embeddings_flattened = embeddings.view(-1, context_window * embedding_vector_dimensionality) # think about traversal order
print(embeddings_flattened.shape)

# Hidden layer: affine transform followed by tanh.
hidden_layer_preactivations = embeddings_flattened @ hidden_layer_weights + hidden_layer_biases
hidden_layer_activations = hidden_layer_preactivations.tanh()
print(hidden_layer_activations.shape)

# Output layer: affine transform only; its outputs are used directly as logits.
output_layer_preactivations = hidden_layer_activations @ output_layer_weights + output_layer_biases
output_layer_activations = output_layer_preactivations
logits = output_layer_activations
print(logits.shape)

# Softmax. Subtracting the row maximum first keeps exp() from overflowing and
# does not change the resulting distribution.
logits_sub_max = logits - logits.max(dim=1, keepdim=True).values
counts = logits_sub_max.exp()
counts_total = counts.sum(dim=1, keepdim=True)  # >= 1: the max entry contributes exp(0) = 1
prob_distributions = counts / counts_total
print(prob_distributions.shape, prob_distributions.sum(dim=1).isclose(torch.tensor(1.0)).all())

# Log-probabilities via log-softmax: log p = (logit - max) - log(sum(exp(logit - max))).
# Taking .log() of the probabilities instead would produce -inf (and then an
# inf loss / NaN gradients) whenever a target probability underflows to 0.
log_prob_distributions = logits_sub_max - counts_total.log()

example_rows = torch.arange(X.shape[0])
target_probs = prob_distributions[example_rows, y_idx]
target_logprobs = log_prob_distributions[example_rows, y_idx]
print(target_logprobs.shape)

# Loss: mean negative log-likelihood of the correct next character.
negative_average_log_likelihood = -target_logprobs.mean()
loss = negative_average_log_likelihood

print(loss)

torch.Size([228146, 2, 2])
torch.Size([228146, 4])
torch.Size([228146, 100])
torch.Size([228146, 27])
torch.Size([228146, 27]) tensor(True)
torch.Size([228146])
tensor(15.7477, grad_fn=<NegBackward0>)
