Language Modeling Part 2:

We have trained a character-level MLP (Multi-Layer Perceptron) on a dataset containing ~32k names. After training, the model generates new names by sampling one character at a time.

Technical Details:
- The architecture has three layers -
    - First is the embedding layer which creates an embedding for the input tokens
    - Second is the hidden layer which learns patterns in the data
    - Third is the output layer which outputs logits (unnormalized log-probabilities, i.e. logarithmic counts)
- Activation of the hidden layer is tanh
- The loss used is the classification cross-entropy loss, a.k.a. the average negative log-likelihood loss

Hyperparameters such as the number of neurons in the hidden layer, the length of the context used for prediction (the last n characters), the size of the embedding, the learning rate, the number of training iterations, the batch size, etc. can be experimented with.

In [None]:
# import

import torch.nn.functional as F
import requests
import random
import torch

In [None]:
# dataset download

# Fetch the newline-separated list of names used as the training corpus.
NAMES_URL = "https://raw.githubusercontent.com/karpathy/makemore/master/names.txt"
response = requests.get(NAMES_URL, timeout=30)  # never hang forever on a stalled connection
response.raise_for_status()  # fail loudly instead of treating an HTTP error page as data
names = response.text.splitlines()

In [22]:
len(names)

32033

In [None]:
# string to token conversion

# Vocabulary tables.
#   itos: token id -> character; id 0 is the "." marker, followed by every
#         distinct character that occurs in `names`, in sorted order.
#   stoi: the inverse lookup, character -> token id.
itos = [".", *sorted({ch for name in names for ch in name})]
stoi = dict(zip(itos, range(len(itos))))

In [None]:
# create dataset

def create_data(words, block_size=3, char_to_idx=None):
    """Build (context, target) pairs for next-character prediction.

    Every word is terminated with "." and scanned left to right. For each
    character, the ``block_size`` token ids that precede it (left-padded
    with 0) form one input row, and the character's own id is the target.

    Args:
        words: Iterable of strings to convert.
        block_size: Number of preceding token ids in each context row.
        char_to_idx: Mapping from character to integer token id. When
            omitted, the module-level ``stoi`` table is used.

    Returns:
        A tuple ``(xs, ys)`` of int64 tensors with shapes
        ``(N, block_size)`` and ``(N,)``, where N is the total number of
        characters including one terminator per word. For empty input the
        shapes are ``(0, block_size)`` and ``(0,)``.

    Raises:
        KeyError: If a character is missing from the mapping.
    """
    if char_to_idx is None:
        char_to_idx = stoi

    xs = []
    ys = []
    for word in words:
        window = [0] * block_size
        for char in word + ".":
            idx = char_to_idx[char]
            xs.append(window)
            ys.append(idx)
            # Build a new list each step so rows already stored in `xs`
            # are never mutated.
            window = window[1:] + [idx]

    # Explicit dtype and shape keep the result well-formed when `words` is
    # empty (a bare torch.tensor([]) would be float32 with shape (0,)).
    inputs = torch.tensor(xs, dtype=torch.long).reshape(len(xs), block_size)
    targets = torch.tensor(ys, dtype=torch.long)
    return inputs, targets

In [None]:
# initialize hyperparameters

# Tunable knobs.
block_size = 4          # how many preceding characters form the model input
emb_dim = 10            # width of each character's embedding vector
hidden_layer_dim = 100  # number of units in the tanh hidden layer

# Derived value (do not edit by hand): length of one flattened input row,
# i.e. block_size embeddings laid side by side.
emb_size = block_size * emb_dim

In [None]:
# initialize train, val, test datasets

# Shuffle the names in place with a fixed seed, then cut the list 80/10/10
# into train / validation / test portions.
random.seed(17)
random.shuffle(names)
n1, n2 = int(0.8 * len(names)), int(0.9 * len(names))

x_train, y_train = create_data(names[:n1], block_size)
x_val, y_val = create_data(names[n1:n2], block_size)
x_test, y_test = create_data(names[n2:], block_size)
# The full dataset is built as well so its size can be reported below.
xs, ys = create_data(names, block_size)

# Number of (context, target) examples in each portion.
print("train -", len(x_train))
print("val   -", len(x_val))
print("test  -", len(x_test))
print("total -", len(xs))

train - 182705
val   - 22652
test  - 22789
total - 228146


In [None]:
# initialize model weights

# All parameters are drawn from one seeded generator so initialisation is
# reproducible.
g = torch.Generator().manual_seed(2147483647)

# Derive the vocabulary size from the token table instead of hard-coding 27,
# so the model still lines up if the dataset's character set changes.
vocab_size = len(itos)

emb = torch.randn((vocab_size, emb_dim), requires_grad=True, generator=g)       # embedding table
w1 = torch.randn((emb_size, hidden_layer_dim), requires_grad=True, generator=g)  # hidden layer weights
b1 = torch.randn(hidden_layer_dim, requires_grad=True, generator=g)              # hidden layer bias
w2 = torch.randn((hidden_layer_dim, vocab_size), requires_grad=True, generator=g)  # output layer weights
b2 = torch.randn(vocab_size, requires_grad=True, generator=g)                    # output layer bias
# NOTE(review): the weights are unscaled standard-normal draws; the first
# loss printed by the training cell is well above ln(vocab_size), so a
# scaled initialisation may be worth trying.
params = [emb, w1, b1, w2, b2]
params_count = sum(p.nelement() for p in params)
print(f"{params_count=}")

params_count=7097


In [None]:
# training loop

# Mini-batch SGD: 200k steps plus one extra so the loss at step 200000 is
# printed too.
batch_size = 32
for i in range(200001):

    # creating batch: draw row indices from the seeded generator `g` so the
    # whole training run is reproducible, not just the initialisation
    ix = torch.randint(0, x_train.shape[0], (batch_size,), generator=g)

    # forward pass
    embs = emb[x_train[ix]]                     # look up an embedding per context token
    preact = embs.view(-1, emb_size) @ w1 + b1  # flatten each context into one row
    act = torch.tanh(preact)
    logits = act @ w2 + b2
    loss = F.cross_entropy(logits, y_train[ix])  # softmax (final activation) + avg neg log likelihood loss

    # print loss (this is the loss of the current mini-batch only, so it is noisy)
    if i % 10000 == 0:
        print(i, "-", round(loss.item(), 4))

    # backward pass
    for param in params:
        param.grad = None
    loss.backward()

    # update parameters: step-decay schedule, 10x smaller after 100k steps
    lr = 0.01 if i < 100000 else 0.001
    # Update in place under no_grad rather than through `.data`, which
    # bypasses autograd's bookkeeping and is discouraged.
    with torch.no_grad():
        for param in params:
            param -= lr * param.grad

0 - 17.584
10000 - 2.58
20000 - 2.7229
30000 - 2.1249
40000 - 2.3352
50000 - 1.8454
60000 - 2.3872
70000 - 2.6873
80000 - 2.3014
90000 - 2.5933
100000 - 2.3176
110000 - 2.0669
120000 - 2.5923
130000 - 2.0771
140000 - 2.5835
150000 - 2.1807
160000 - 2.2941
170000 - 1.9813
180000 - 2.3605
190000 - 2.1533
200000 - 2.3758


In [None]:
# evaluate on validation data

# Mean cross-entropy over the whole validation split. Gradients are not
# needed for evaluation, so skip building the autograd graph.
with torch.no_grad():
    embs = emb[x_val]
    act = torch.tanh(embs.view(-1, emb_size) @ w1 + b1)
    logits = act @ w2 + b2
    loss = F.cross_entropy(logits, y_val)
print(loss.item())

2.300997018814087


In [20]:
# inference

# Sample 100 names. Each name starts from an all-zero context and grows one
# character at a time until token 0 (".") is drawn.
with torch.no_grad():  # sampling needs no gradients
    for _ in range(100):
        xt = [0] * block_size
        while True:
            embs = emb[xt]
            preact = embs.view(-1, emb_size) @ w1 + b1
            act = torch.tanh(preact)
            logits = act @ w2 + b2
            # F.softmax is the numerically stable form of exp(logits) / sum(exp(logits))
            probs = F.softmax(logits, dim=1)
            # draw from the seeded generator `g` so the samples are reproducible
            next_idx = torch.multinomial(probs, num_samples=1, replacement=True, generator=g)[0].item()
            xt = xt[1:] + [next_idx]  # slide the context window forward
            print(itos[next_idx], end="")
            if next_idx == 0:
                print("")
                break

alely.
chem.
amir.
khaide.
imyr.
sa.
ginva.
sona.
jano.
ramlea.
dile.
jomer.
emryah.
gexsen.
aban.
leer.
rumitha.
laan.
gylin.
mary.
jhro.
jan.
dareot.
kes.
aahanah.
pumton.
lusur.
riokma.
hamya.
vena.
mandah.
isya.
hahatan.
paene.
alyan.
avdi.
oazob.
lontalde.
kodze.
jaidei.
canru.
balshamlie.
azelde.
shaislan.
xakarnahi.
lovenn.
kenniel.
waial.
sysen.
imiah.
mahanis.
ryza.
lasie.
asla.
evesoa.
kolen.
aryan.
dloc.
adcisse.
jailen.
eylet.
melae.
xuej.
malilane.
myar.
esnaniy.
tiel.
merie.
amrit.
emad.
kdodettor.
weska.
renesharlel.
alysa.
brac.
amalyn.
adde.
zeyva.
dalioa.
brusetton.
jinieso.
ewmespazimina.
marel.
mangee.
anlad.
alyerkerer.
rishan.
jaresiy.
weri.
anna.
nazyn.
nefana.
ronva.
noza.
amaran.
raah.
aliahce.
lonssaisrdon.
kaikl.
iamarvicta.


In [21]:
# evaluate on test data

# Mean cross-entropy over the whole test split. Gradients are not needed
# for evaluation, so skip building the autograd graph.
with torch.no_grad():
    embs = emb[x_test]
    act = torch.tanh(embs.view(-1, emb_size) @ w1 + b1)
    logits = act @ w2 + b2
    loss = F.cross_entropy(logits, y_test)
print(loss.item())

2.279118299484253
