In [5]:
import os
import random
import torch
from torch import Generator
from torch.nn import functional as F
from tqdm import trange, tqdm
import matplotlib.pyplot as plt
%matplotlib inline

In [8]:
def read_words(file_path):
    with open(file_path, 'r') as file:
        words = set([ word.lower() for word in file.read().splitlines() if word.isalpha() ])
        file.close()
    return list(words)

In [15]:
# read words
words = read_words('../geowords.txt')
print(f'total words: {len(words)}')

total words: 314532


In [16]:
# prepare vocabulary
chars = sorted(list(set(''.join(words))))
stoi = { s:i+1 for i, s in enumerate(chars) }
stoi['.'] = 0
itos = { i:s for s, i in stoi.items() }
vocab_size = len(stoi)
print(f'vocab size: {vocab_size}')
print(f'vocab: {[s for s, _ in stoi.items()]}')

vocab size: 34
vocab: ['ა', 'ბ', 'გ', 'დ', 'ე', 'ვ', 'ზ', 'თ', 'ი', 'კ', 'ლ', 'მ', 'ნ', 'ო', 'პ', 'ჟ', 'რ', 'ს', 'ტ', 'უ', 'ფ', 'ქ', 'ღ', 'ყ', 'შ', 'ჩ', 'ც', 'ძ', 'წ', 'ჭ', 'ხ', 'ჯ', 'ჰ', '.']


In [17]:
random.seed(42069)
random.shuffle(words)

In [19]:
# build dataset of words
block_size = 3
def build_dataset(words_set):
    X, Y = [], []
    for word in words_set:
        context = [0] * block_size
        for ch in word + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    return X, Y

n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))
X_train, y_train = build_dataset(words[:n1])
X_dev, y_dev = build_dataset(words[n1:n2])
X_test, y_test = build_dataset(words[n2:])

print(f'---- training set ----')
print(f'input shape: {X_train.shape}')
print(f'label shape: {y_train.shape}')
print(f'total samples: {X_train.shape[0]}')
print(f'---- validation set ----')
print(f'input shape: {X_dev.shape}')
print(f'label shape: {y_dev.shape}')
print(f'total samples: {X_dev.shape[0]}')
print(f'---- testing set ----')
print(f'input shape: {X_test.shape}')
print(f'label shape: {y_test.shape}')
print(f'total samples: {X_test.shape[0]}')

---- training set ----
input shape: torch.Size([2556170, 3])
label shape: torch.Size([2556170])
total samples: 2556170
---- validation set ----
input shape: torch.Size([319614, 3])
label shape: torch.Size([319614])
total samples: 319614
---- testing set ----
input shape: torch.Size([319435, 3])
label shape: torch.Size([319435])
total samples: 319435


In [36]:
class Module:
    def __call__(self, x):
        return self.forward(x)
    def forward(self, x):
        raise NotImplementedError
    def params(self):
        raise NotImplementedError
    
class Linear(Module):
    def __init__(self, fan_in, fan_out, bias=True):
        self.weights = torch.randn((fan_in, fan_out), generator=g) / fan_in**0.5
        self.bias = torch.zeros(fan_out) if bias else None
    def forward(self, x):
        self.out = x @ self.weights
        if self.bias is not None: self.out += self.bias
        return self.out
    def params(self):
        return [self.weights] + ([self.bias] if self.bias is not None else [])

class BatchNorm1d(Module):
    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # this are the parameters to adjust, tweak
        # and shift normalized data. gamma is a
        # scalar value and beta is a shifter.
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        # buffers (trained with a running 'momentum update')
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)
    def forward(self, x):
        # if nn is in training mode we calculate batch
        # mean and variance. else if testing or evaluating
        # we need running mean and variance
        if self.training:
            xmean = x.mean(0, keepdim=True)
            xvar = x.var(0, keepdim=True, ubiased=True)
        else:
            xmean = self.running_mean
            xvar = self.running_var
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma * xhat + self.beta
        # in training mode, we have to update buffers
        if self.training:
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
        return self.out
    def params(self):
        return [self.gamma, self.beta]

class Tanh(Module):
    def forward(self, x):
        self.out = torch.tanh(x)
        return self.out
    def params(self):
        return []

In [49]:
# preparing neural network
n_embd = 10
n_hidden = 100
g = Generator().manual_seed(1337)

C = torch.randn((vocab_size, n_embd))
layers = [
    Linear(n_embd * block_size, n_hidden), Tanh(),
    Linear(n_hidden, n_hidden), Tanh(),
    Linear(n_hidden, n_hidden), Tanh(),
    Linear(n_hidden, n_hidden), Tanh(),
    Linear(n_hidden, vocab_size)
]

with torch.no_grad():
    layers[-1].weights *= 0.1
    # we should apply gain for other 
    for layer in layers[:-1]:
        if isinstance(layer, Linear):
            layer.weights *= 5/3

params = [C] + [p for layer in layers for p in layer.params()]
print(f"number of parameters: {sum([p.nelement() for p in params])}")
for p in params:
    p.requires_grad = True

number of parameters: 37174


In [50]:
# training phase
max_steps = 200_000
batch_size = 64
lossi = []

for i in range(1, max_steps + 1):
    # batch construct
    ix = torch.randint(0, X_train.shape[0], (batch_size, ), generator=g)
    X_batch, y_batch = X_train[ix], y_train[ix]

    # forward pass
    emb = C[X_batch]
    # flatten embedded matrix
    x = emb.view(emb.shape[0], -1)
    for layer in layers:
        x = layer(x)
    loss = F.cross_entropy(x, y_batch)

    # backward pass
    for layer in layers:
        layer.out.retain_grad()
    for param in params:
        param.grad = None
    loss.backward()

    # update params
    lr = 0.1 if i < 100_000 else 0.01
    for param in params:
        param.data += -lr * param.grad

    # tracking
    if i % 5_000 == 0:
        print(f'step: {i}; loss: {loss.item():.4f}')
    lossi.append(loss.log10().item())

step: 5000; loss: 2.1081
step: 10000; loss: 2.2333
step: 15000; loss: 2.4388
step: 20000; loss: 2.1796
step: 25000; loss: 2.4549
step: 30000; loss: 2.4319
step: 35000; loss: 2.3857
step: 40000; loss: 2.2976
step: 45000; loss: 2.3065
step: 50000; loss: 2.0398
step: 55000; loss: 2.5996
step: 60000; loss: 2.2533
step: 65000; loss: 2.3223
step: 70000; loss: 2.3406
step: 75000; loss: 1.8335
step: 80000; loss: 2.1736
step: 85000; loss: 2.3374
step: 90000; loss: 2.2884
step: 95000; loss: 2.2115
step: 100000; loss: 2.4658
step: 105000; loss: 2.0801
step: 110000; loss: 2.2860
step: 115000; loss: 2.0587
step: 120000; loss: 2.2696
step: 125000; loss: 2.3074
step: 130000; loss: 2.2627
step: 135000; loss: 2.2788
step: 140000; loss: 1.8905
step: 145000; loss: 2.3200
step: 150000; loss: 2.2631
step: 155000; loss: 2.0632
step: 160000; loss: 2.4449
step: 165000; loss: 2.2556
step: 170000; loss: 2.5258
step: 175000; loss: 2.1653
step: 180000; loss: 2.1036
step: 185000; loss: 2.4070
step: 190000; loss: 2