In [88]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [89]:
with open('../names.txt', 'r') as file:
    names = file.read().split()

print('Total names:', len(names))
print(names[:5])

Total names: 32033
['emma', 'olivia', 'ava', 'isabella', 'sophia']


In [90]:
# Form stoi and itos
vocab = sorted(list(set(''.join(names))))
stoi = {s: i+1 for i, s in enumerate(vocab)}
stoi['.'] = 0
itos = {stoi[s]: s for s in stoi}

vocab_size = len(stoi)

In [91]:
block_size = 3
def form_dataset(words):
    X = []
    Y = []

    for word in words:
        word = ['.'] * block_size + list(word) + ['.']
        for ind in range(3, len(word)):
            X.append([stoi[x] for x in word[ind-3:ind]])
            Y.append(stoi[word[ind]])
            # print(''.join(word[ind-3:ind]), '--->', word[ind])

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    return X, Y

In [92]:
import random
random.seed(42)
random.shuffle(names)

X, Y = form_dataset(names)

n1 = int(0.8 * X.shape[0])
n2 = int(0.9 * X.shape[0])
Xtr, Xdev, Xts = X.tensor_split((n1, n2), dim=0) # input is split into X[:n1], X[n1:n2] and X[n2:]
Ytr, Ydev, Yts = Y.tensor_split((n1, n2), dim=0)

g = torch.Generator().manual_seed(2147483647)

In [104]:
class Layer:
    def __init__(self, in_features, out_features, bias=True):
        self.W = torch.randn((in_features, out_features), generator=g) / (in_features ** 2)
        if bias:
            self.b = torch.randn(out_features) if bias else None
    
    def __call__(self, x):
        # forward pass
        x_new = x.view(-1, self.W.shape[0])
        self.out = x.view(-1, self.W.shape[0]) @ self.W
        if self.b is not None:
            self.out = self.out + self.b
        
        return self.out

    def parameters(self):
        return [self.W] + ([] if self.b is None else [self.b])

In [102]:
layer = Layer(4, 6)
layer(torch.randn(100, 4))
layer.parameters()

[tensor([[-0.0015,  0.0852, -0.0688,  0.0198,  0.1233,  0.0219],
         [-0.0681,  0.1082, -0.0864,  0.0247,  0.0639,  0.0195],
         [ 0.0523,  0.0265, -0.0279, -0.0180, -0.0002, -0.0081],
         [-0.0108,  0.0295, -0.0216,  0.0384, -0.0032, -0.0268]]),
 tensor([-0.3411, -0.1157, -1.0170,  0.6771,  0.6893, -0.7846])]

In [95]:
class Tanh:
    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out
    def parameters(self):
        return []

In [96]:
th = Tanh()
th(torch.randn(10))

tensor([-0.3420,  0.3962,  0.0993, -0.4178,  0.7253, -0.8575, -0.9274, -0.9860,
        -0.4772, -0.8924])

In [97]:
class BatchNorm1d:
    def __init__(self, dim, eps=1e-05, momentum=0.1):
        """
        dim: Dimensionality or num_features
        """
        self.dim = dim
        self.eps = eps
        self.momentum = momentum
        self.training = True # By default for training

        self.gamma = torch.ones(dim) # initialized to 1 for multiplication
        self.beta = torch.zeros(dim) # initialized to 0 for addition

        # track running stats is True by default
        self.running_mean = torch.zeros(dim)
        self.running_variance = torch.ones(dim)

    def __call__(self, x):
        if self.training:
            xmean = x.mean(0, keepdim=True) # batch mean
            xstd = x.std(0, keepdim=True) # batch variance
        else:
            xmean = self.running_mean
            xstd = self.running_variance

        xhat = (x - xmean) / torch.sqrt(xstd + self.eps)
        self.out =  self.gamma * xhat + self.beta
        if self.training:
            with torch.no_grad():
                self.running_mean = self.momentum * xmean + (1-self.momentum) * self.running_mean
                self.running_variance = self.momentum * xstd + (1-self.momentum) * self.running_variance
        
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]

In [98]:
Xtr[0]

tensor([0, 0, 0])

In [103]:
n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 100 # the number of neurons in the hidden layer of the MLP

C = torch.randn(vocab_size, n_embd) # used to encode the dataset into vector space of 10

layers = [
    Layer(n_embd * block_size, n_hidden), Tanh(),
    Layer(n_hidden, n_hidden), Tanh(),
    Layer(n_hidden, n_hidden), Tanh(),
    Layer(n_hidden, n_hidden), Tanh(),
    Layer(n_hidden, n_hidden), Tanh(),
    Layer(n_hidden, vocab_size)
]

with torch.no_grad():
    layers[-1].W = layers[-1].W * 0.1 # So that it is closer to 0 at the last layer and less confident in predictions

parameters = [p for layer in layers for p in layer.parameters()]
count = sum(p.nelement() for p in parameters)
print('Total parameters:', count)
for p in parameters:
  p.requires_grad = True

Total parameters: 46227


In [123]:
embs = C[Xtr]
for layer in layers:
    embs = layer(embs)

print(embs.shape)

loss = F.cross_entropy(embs, Ytr)
print(loss.item())

for layer in layers:
    for p in layer.parameters():
        p.grad = None

loss.backward()

for layer in layers:
    for p in layer.parameters():
        p.data += -0.1 * p.grad

torch.Size([182516, 27])
2.8959527015686035
