In [4]:
import torch as t
import math as m
import torch.nn.functional as F
import matplotlib.pyplot as plt
import random
import matplotlib.gridspec as gridspec

from matplotlib.font_manager import FontProperties

In [5]:
# get data
words = open("names.txt", 'r').read().splitlines()

In [6]:
# lookup tables for s --> i || i --> s
chars = sorted(list(set(''.join(words))))
stoi = {s: i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i: s for s,i in stoi.items()}
vocab_size = len(itos)

In [7]:
# build the dataset
block_size = 3

def build_dataset(data,
                  stoi: dict,
                  itos: dict,
                  block_size: int):
    """
    creates X tensor which contains input tensors, and Y tensor, which contains the associated output
    :param block_size: number of necessary characters to predict the next tone
    :param itos: lookup table from string to int
    :param stoi: lookup table from int to string
    :param data: list of names, nouns, etc.
    :return: tuple of X and Y
    """
    X, Y = [], []
    for w in data:

        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            # print(''.join(itos[i] for i in context), '--->', itos[ix])
            context = context[1:] + [ix]    # crop and append

    X = t.tensor(X)
    Y = t.tensor(Y)
    # print(X.shape, Y.shape)

    return X, Y


random.seed(1789)
random.shuffle(words)

n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

# create splits
Xtr,  Ytr  = build_dataset(words[:n1], stoi=stoi, itos=itos, block_size=block_size)      # 80%
Xdev, Ydev = build_dataset(words[n1:n2], stoi=stoi, itos=itos, block_size=block_size)    # 10%
Xte,  Yte  = build_dataset(words[n2:], stoi=stoi, itos=itos, block_size=block_size)      # 10%

In [8]:
# MLP revisited
n_emb = 10       # dimensionality of the character embedding vectors
n_hidden = 100    # number of neurons in the hidden layer of the MLP

g  = t.Generator().manual_seed(1789)
C  = t.randn((vocab_size, n_emb),            generator=g)

# Layer 1
w1 = t.randn((n_emb * block_size, n_hidden), generator=g) * (5/3)/((n_emb * block_size)**0.5)
b1 = t.randn(n_hidden,                       generator=g) * 0.1

# Layer 2
w2 = t.randn((n_hidden, vocab_size),         generator=g) * 0.1
b2 = t.randn(vocab_size,                     generator=g) * 0.1

# BatchNorm parameters
bngain = t.randn((1, n_hidden)) * 0.1 + 1.0
bnbias = t.randn((1, n_hidden)) * 0.1

parameters = [C, w1, b1, w2, b2, bngain, bnbias]
print(f"number of parameters in total: {sum(p.nelement() for p in parameters)}")

for p in parameters:
    p.requires_grad = True

number of parameters in total: 6297


In [None]:
max_steps = 200000
batch_size = 32
n = batch_size
lossi = []

with t.no_grad():
    for i in range(max_steps):

        # minibatch construct
        ix = t.randint(0, Xtr.shape[0], (batch_size,), generator=g)
        Xb, Yb = Xtr[ix], Ytr[ix]
    
        # forward pass
        emb = C[Xb]
        embcat = emb.view(emb.shape[0], -1)
        
        # Linear Layer 1
        hprebn = embcat @ w1 + b1

        # BatchNorm layer
        bnmean = hprebn.mean(0, keepdim=True)
        bnvar = hprebn.var(0, keepdim=True, unbiased=True)
        bnvar_inv = (bnvar + 1e-5)**-0.5
        bnraw = (hprebn - bnmean) * bnvar_inv
        hpreact = bngain * bnraw + bnbias

        # Non-linearity
        h = t.tanh(hpreact)

        # Linear Layer 2
        logits = h @ w2 + b2

        # Loss Function
        loss = F.cross_entropy(logits, Yb)

        # backward pass
        for p in parameters:
            p.grad = None

        # Manual Backpropagation
        
        # Cross Entropy Loss
        dlogits = F.softmax(logits, 1)
        dlogits[range(n), Yb] -= 1
        dlogits /= n

        # Linear Layer 2
        dh = dlogits @ w2.T
        dW2 = h.T @ dlogits
        db2 = dlogits.sum(0)
        
        # Non-Linearity
        dhpreact = (1.0 - h**2) * dh
        
        # BatchNorm Layer
        dbngain = (bnraw * dhpreact).sum(0, keepdim=True)
        dbnbias = dhpreact.sum(0, keepdim=True)
        dhprebn = bngain*bnvar_inv/n * (n*dhpreact - dhpreact.sum(0) - n/(n-1)*bnraw*(dhpreact*bnraw).sum(0))
        
        # Linear Layer 1
        dembcat = dhprebn @ w1.T
        dw1 = embcat.T @ dhprebn
        db1 = dhprebn.sum(0)
        
        # Initial Embedding
        demb = dembcat.view(emb.shape)
        dC = t.zeros_like(C)
        dC = dC.index_add_(0, Xb.view(-1),  demb.view(-1, 10))

        grads = [dC, dW1, db1, dW2, db2, dbngain, dbnbias]

        # Update
        lr = 0.1 if i < 100000 else 0.01
        for p, grad in zip(parameters, grads):
              p.data += -lr * grad

         # track stats
        if i % 10000 == 0:
            print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
        lossi.append(loss.log10().item())



In [14]:
with t.no_grad():
    
    # pass the training set through
    emb = C[Xtr]
    embcat = emb.view(emb.shape[0], -1)
    hpreact = embcat @ w1 + b1
    
    # measure the mean/std over the entire training set
    bnmean = hpreact.mean(0, keepdim=True)
    bnvar = hpreact.var(0, keepdim=True, unbiased=True)

In [15]:
# evaluate train and val loss

@t.no_grad() # this decorator disables gradient tracking
def split_loss(split):
    x,y = {
    'train': (Xtr, Ytr),
    'val': (Xdev, Ydev),
    'test': (Xte, Yte),
    }[split]
    emb = C[x]
    embcat = emb.view(emb.shape[0], -1)
    hpreact = embcat @ w1 + b1
    hpreact = bngain * (hpreact - bnmean) * (bnvar + 1e-5)**-0.5 + bnbias
    h = t.tanh(hpreact)
    logits = h @ w2 + b2
    loss = F.cross_entropy(logits, y)
    print(split, loss.item())

split_loss('train')
split_loss('val')

train 3.563689947128296
val 3.5631814002990723


In [17]:
# sample from the model
g = t.Generator()

for _ in range(20):
    
    out = []
    context = [0] * block_size # initialize with all ...
    while True:

        # forward pass:
        emb = C[t.tensor([context])]
        embcat = emb.view(emb.shape[0], -1)
        hpreact = embcat @ w1 + b1
        hpreact = bngain * (hpreact - bnmean) * (bnvar + 1e-5)**-0.5 + bnbias
        h = t.tanh(hpreact)
        logits = h @ w2 + b2
        
        # Sample
        probs = F.softmax(logits, dim=1)
        ix = t.multinomial(probs, num_samples=1, generator=g).item()
        context = context[1:] + [ix]
        out.append(ix)
        if ix == 0:
            break
        
    print(''.join(itos[i] for i in out))

cybiafwhfwgghyppydaituuwaaieptdikrrkirzuwveczzscazjrtaflmcvuu.
blrflialntgtqc.
zcsanibprg.
uxvibccqrevvaamzccfmogufaxifsvp.
ivg.
xcxafjgargpfniyvvrntdukncguvzhojonqkxziwvqnxtxfosufgh.
lmhsbomcfiafefsggecptgcaereksgavrokciakjzjijo.
ovckiykugohmfnihywvpumhjknwfgnfcvvhjvdfwvgmhmpzjnjwoczldsmuyyqcatvhj.
pcliwzhusirarrphvwfyqlgtjwoclujlnb.
dxwsbpkubktcfibwlfhpgzoarobgribirih.
ywpqssomlnmhnwhamzgkaqohwyhoyiiiptulftmtnsjytntyhwwvam.
dxozxlsfebjezclmerjlcoueklrcvvtffmugdjqqmybqpwpklbkn.
mdcugmpheznyezwacjdrzrttcxdvujjvintqirzohcfjstdaphqmmcborlmznejvtclouxujenj.
xogcrgasghvgtftjljziwtflsmgogxcqyurbrygnhgweoicv.
ywvvhjqvrq.
xynvkhz.
etridcfbgpzakepgajoeorzxqipjvvcciokubbcqcwekgqygjlzcrgvwagcshycnacskfshdolewdnklnxzlmflgawzasvpaaqwroldxo.
pkcbkponcqivssheznixwffwqavkwqokrbymizkevqezitnkbzusivijvcviejzpbgifa.
fzgclkgrhofsixev.
azvnvvhpntpjx.
