In [None]:
import random
import os
from tqdm import trange, tqdm
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def read_words(file_path):
    """Read a word list from a text file, one entry per line.

    Every line is reduced to its alphabetic characters (spaces, digits and
    punctuation are dropped) and lower-cased. A line with no alphabetic
    characters yields an empty string, so the result always has exactly one
    entry per line of the file.

    Args:
        file_path: path of the text file to read.

    Returns:
        list[str]: the cleaned words, in file order.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    def clean(word):
        # keep letters only (str.isalpha also accepts non-ASCII letters)
        return ''.join(filter(str.isalpha, word)).lower()

    # Explicit encoding: the default depends on the platform/locale, which makes
    # the vocabulary differ between machines. The `with` block closes the file,
    # so no manual close() is needed.
    with open(file_path, 'r', encoding='utf-8') as file:
        return [clean(line) for line in file.read().splitlines()]

In [None]:
### load the raw word list
# S holds one cleaned, lower-cased string per line of the source file
S = read_words('../geowords.txt')
print('total words:', len(S))

In [None]:
### character vocabulary
# Letters get ids 1..N in sorted order; id 0 is reserved for '.', the marker
# used both as context padding and as the end-of-word token.
chars = sorted(set(''.join(S)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
# inverse mapping: id -> character
itos = { idx: ch for ch, idx in stoi.items() }
vocab_size = len(stoi) # number of distinct tokens, including '.'
len(itos), len(stoi)

In [None]:
# Shuffle the word list in place with a fixed seed, so the train/dev/test
# slices taken from S further down are reproducible on a fresh run.
# NOTE(review): random.shuffle mutates S, so re-running this cell without
# re-running the loading cell shuffles an already-shuffled list and gives a
# different order.
random.seed(42069)
random.shuffle(S)
len(S)

In [None]:
### dataset construction
block_size = 3 # number of preceding characters used to predict the next one

def build_dataset(words):
    """Turn words into (context, next-character) training pairs.

    Each word is extended with the '.' end marker and scanned left to right.
    The context starts as `block_size` zeros (the id of '.') and slides by one
    character per step, so a word of length n contributes n + 1 pairs.

    Args:
        words: iterable of strings whose characters are all keys of `stoi`.

    Returns:
        (X, Y): tensors built from the collected contexts and target ids.
    """
    inputs, targets = [], []
    for word in words:
        window = [0] * block_size
        for ch in word + '.':
            target = stoi[ch]
            inputs.append(window)
            targets.append(target)
            # drop the oldest id and append the newest (creates a new list)
            window = [*window[1:], target]
    return torch.tensor(inputs), torch.tensor(targets)

# Split by word, not by example: first 80% train, next 10% dev, last 10% test.
n1 = int(0.8 * len(S))
n2 = int(0.9 * len(S))
X_train, y_train = build_dataset(S[:n1])
X_dev, y_dev = build_dataset(S[n1:n2])
X_test, y_test = build_dataset(S[n2:])

# shape report, one print call per split
print(
    '---- training set ----',
    f'input shape: {X_train.shape}',
    f'label shape: {y_train.shape}',
    f'total samples: {X_train.shape[0]}',
    sep='\n',
)
print(
    '---- validation set ----',
    f'input shape: {X_dev.shape}',
    f'label shape: {y_dev.shape}',
    f'total samples: {X_dev.shape[0]}',
    sep='\n',
)
print(
    '---- testing set ----',
    f'input shape: {X_test.shape}',
    f'label shape: {y_test.shape}',
    f'total samples: {X_test.shape[0]}',
    sep='\n',
)

In [None]:
### neural network parameters
# Model: embedding lookup -> linear (no bias) -> batch norm -> tanh -> linear.
nneurons = 300 # width of the hidden layer
emb_size = 10 # dimensionality of each character embedding

# Dedicated generator so initialisation (and the minibatch sampling that reuses
# it later) is reproducible. The order of the randn calls below determines the
# values drawn, so do not reorder them.
g = torch.Generator().manual_seed(42069)
# embedding lookup table: one row of emb_size values per vocabulary entry
C = torch.randn((vocab_size, emb_size),             generator=g)
# hidden weights scaled by (5/3) / sqrt(fan_in), with fan_in = block_size * emb_size;
# 5/3 is the gain PyTorch's calculate_gain lists for tanh
W1 = torch.randn((block_size * emb_size, nneurons), generator=g) * (5/3) / (block_size * emb_size)**0.5
# hidden bias intentionally disabled (it is also left out of `params` below)
# b1 = torch.randn(nneurons,                          generator=g) * 0.01
# small output weights so the initial logits are close to zero
W2 = torch.randn((nneurons, vocab_size),            generator=g) * 0.01
# multiplied by 0.0: the bias starts at exactly zero, but the draw still advances g
b2 = torch.randn(vocab_size,                        generator=g) * 0.0

# batch normalization parameters
# learnable scale and shift applied after normalisation
bngain = torch.ones((1, nneurons))
bnbias = torch.zeros((1, nneurons))
# running statistics; not in `params`, so they receive no gradient updates
bnmean_running = torch.zeros((1, nneurons))
bnstd_running = torch.ones((1, nneurons))

# every tensor updated by gradient descent
params = [C, W1, W2, b2, bngain, bnbias,] # b1,]
for p in params:
    p.requires_grad = True
nparams = sum(p.nelement() for p in params)

print(f'---- neural network ----')
print(f'number of parameters: {nparams}')

In [None]:
### training loop (minibatch SGD)
batch_size = 32    # examples per minibatch
max_steps = 2_000  # number of SGD steps
# Progress is printed ten times per run. The interval is derived from
# max_steps so that short runs still report progress (a fixed 10_000-step
# interval never fires when max_steps is 2_000).
log_every = max(1, max_steps // 10)
lossi = []         # training loss recorded at every step

for i in range(1, max_steps + 1):

    # minibatch construct: sample row indices with replacement
    ix = torch.randint(0, X_train.shape[0], (batch_size, ), generator=g)
    X_batch, y_batch = X_train[ix], y_train[ix]

    # forward pass
    # look up the embeddings and flatten each context into one row
    emb = C[X_batch]
    embcat = emb.view(emb.shape[0], -1)
    # ---------- Linear Layer --------------
    # Hidden pre-activations. No bias here: the batch-mean subtraction below
    # would cancel it, and bnbias provides the learnable shift instead.
    hpreact = embcat @ W1
    # ----------- BatchNorm ----------------
    # normalise each hidden unit over the batch, then scale and shift
    bnmeani = hpreact.mean(0, keepdim=True)
    bnstdi = hpreact.std(0, keepdim=True)
    hpreact = bngain * (hpreact - bnmeani) / bnstdi + bnbias
    # exponential moving average of the batch statistics, kept outside the
    # autograd graph; intended for use at evaluation time
    with torch.no_grad():
        bnmean_running = 0.999 * bnmean_running + 0.001 * bnmeani
        bnstd_running = 0.999 * bnstd_running + 0.001 * bnstdi
    # ----------- Non-Linear Layer --------
    h = torch.tanh(hpreact)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, y_batch)

    # --------- Backpropagation -----------
    for p in params:
        p.grad = None
    loss.backward()

    # update
    # Learning rate decay: drops to 0.01 from step 100_000 on, so it only
    # takes effect when max_steps is raised beyond that.
    lr = 0.1 if i < 100_000 else 0.01
    with torch.no_grad():
        for p in params:
            p += -lr * p.grad

    # track stats
    if i % log_every == 0:
        print(f'{i:6d}/{max_steps:6d}: {loss.item():.4f}')
    lossi.append(loss.item())

# loss of the final minibatch, shown as the cell output
loss.item()

In [None]:
@torch.no_grad()
def split_loss(split):
    """Print the mean cross-entropy loss of the current model on one split.

    The forward pass mirrors the training loop, except that batch
    normalisation uses the running mean/std accumulated during training
    instead of the statistics of the evaluated data. Runs without gradient
    tracking.

    Args:
        split: one of 'train', 'val' or 'test'.

    Raises:
        KeyError: if `split` is not one of the three names above.
    """
    x, y = {
        'train': (X_train, y_train),
        'val': (X_dev, y_dev),
        'test': (X_test, y_test),
    }[split]
    emb = C[x]
    embcat = emb.view(emb.shape[0], -1)
    # The hidden layer has no bias (b1 is never created); bnbias is the shift.
    hpreact = embcat @ W1
    # normalise with the running statistics maintained by the training loop
    hpreact = bngain * (hpreact - bnmean_running) / bnstd_running + bnbias
    h = torch.tanh(hpreact)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, y)
    print(split, loss.item())

In [None]:
# loss on the validation split (X_dev, y_dev)
split_loss('val')

In [None]:
# loss on the test split (X_test, y_test)
split_loss('test')