In [2]:
import random 
import string

chars = sorted(set(string.ascii_lowercase))
stoi = {ch: i+1 for i, ch in enumerate(chars)}
stoi['.'] = 0
itos = {i: ch for ch, i in stoi.items()}

with open('names.txt', 'r') as f:
    names = f.read().splitlines()

random.shuffle(names)
n_train = int(0.8*len(names))
n_dev = int(0.9*len(names))
trainset = names[:n_train]
devset = names[n_train:n_dev]
testset = names[n_dev:]
len(trainset), len(devset), len(devset)

(25626, 3203, 3203)

In [57]:
import torch

def prepare_data(names, cl):
    xs = []
    ys = []
    for name in names:
        name = name + '.'
        context = [stoi['.']]*cl
        for ch in name:
            ix = stoi[ch]
            xs.append(context)
            ys.append(ix)
            context = context[1:] + [ix]
    return torch.tensor(xs), torch.tensor(ys)

def print_context(X, y):
    for context, label in zip(X,y):
        con_str = ''.join(itos[ix.item()] for ix in context)
        print(f'{con_str} --> {itos[label.item()]}')

context_length = 4
X_train, y_train = prepare_data(trainset, context_length)
X_dev, y_dev = prepare_data(devset, context_length)
X_test, y_test = prepare_data(testset, context_length)
print_context(X_train[:10], y_train[:10])

.... --> k
...k --> h
..kh --> y
.khy --> r
khyr --> i
hyri --> e
yrie --> .
.... --> t
...t --> h
..th --> o


In [58]:
import torch.nn.functional as F
from itertools import product
from tqdm import tqdm

n_neuron = 200
emb_size = 10
g = torch.Generator().manual_seed(42)

vocab_size = len(stoi)
C = torch.randn(vocab_size, emb_size, generator=g, requires_grad=True)

# define neural net
W1 = torch.randn(emb_size*context_length, n_neuron, generator=g, requires_grad=True)
b1 = torch.randn(n_neuron, generator=g, requires_grad=True)
W2 = torch.randn(n_neuron, vocab_size, generator=g, requires_grad=True)
b2 = torch.randn(vocab_size, generator=g, requires_grad=True)
parameters = [C, W1, b1, W2]

def train_nn(X, y, batch_size=64, lr=0.1, train_steps=100_000):
    for step in tqdm(range(train_steps)):
        # ---forward pass---
        ixs = torch.randint(low=0, high=X.shape[0], size=(batch_size,)) 
        mini_batch, labels = X[ixs], y[ixs]
        embs = C[mini_batch].view(batch_size, emb_size*context_length)
        h = torch.tanh(embs @ W1 + b1)
        logits = h @ W2 + b2
        loss = F.cross_entropy(logits, labels)

        # ---backward pass---
        loss.backward()
        for param in parameters:
            param.data += -lr * param.grad
            param.grad = None
    
def eval_nn(X, y):
    with torch.no_grad():
        embs = C[X].view(X.shape[0], emb_size*context_length)
        h = torch.tanh(embs @ W1 + b1)
        logits = h @ W2 + b2
        loss = F.cross_entropy(logits, y)
    return loss.item()

In [59]:
train_nn(X_train, y_train, train_steps=300_000)
train_nn(X_train, y_train, lr=0.01)

100%|██████████| 300000/300000 [01:08<00:00, 4408.33it/s]
100%|██████████| 100000/100000 [00:22<00:00, 4451.81it/s]


In [60]:
train_loss = eval_nn(X_train, y_train)
dev_loss = eval_nn(X_dev, y_dev)
print(f"Train Loss: {train_loss}")
print(f"Dev Loss: {dev_loss}");

Train Loss: 2.061028242111206
Dev Loss: 2.1197381019592285


This outperforms the score of 2.2 on the dev set. Lets now sample some names.

In [61]:
n_samples = 10
for _ in range(n_samples):
    context = [0]*context_length
    while True:
        with torch.no_grad():
            embs = C[torch.tensor(context)].view(1, -1)
            h = torch.tanh(embs @ W1 + b1)
            logits = h @ W2 + b2
            probs = F.softmax(logits, dim=1)
            ix = torch.multinomial(probs, num_samples=1, generator=g).item()
        context = context[1:] + [ix]
        if ix == 0:
            break
        print(itos[ix], end='')
    print()

savan
ikai
yilie
kyeorosoruarlanion
zadi
lindo
izdishia
khae
dria
kahdees


I implemented the Mixture Model Idea from the Bengio et al. paper. I.e I use the Trigram model from the last exercise and combine it with the MLP. It achieved slightly better Devset performance.

In [65]:
trigram_counts = torch.zeros((27, 27, 27), dtype=torch.int32)
for name in trainset:
    chs = ['.', '.'] + list(name) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        trigram_counts[stoi[ch1], stoi[ch2], stoi[ch3]] += 1

trigram_probs = (trigram_counts + 0.32).float()
trigram_probs /= trigram_probs.sum(dim=-1, keepdim=True)
trigram_probs[0, 0]

tensor([1.2483e-05, 1.3756e-01, 3.9685e-02, 4.7487e-02, 5.3144e-02, 4.9321e-02,
        1.2613e-02, 2.0376e-02, 2.7670e-02, 1.7684e-02, 7.4911e-02, 9.2817e-02,
        4.9555e-02, 7.9475e-02, 3.4965e-02, 1.2105e-02, 1.6475e-02, 2.8602e-03,
        5.2520e-02, 6.4300e-02, 4.1441e-02, 2.5871e-03, 1.1130e-02, 9.3748e-03,
        3.9135e-03, 1.6670e-02, 2.9348e-02])

In [110]:
def eval_mixture(X, y, trigram_weight=0.5):
    tri_prob = trigram_probs[X[:, -2], X[:, -1]]
    with torch.no_grad():
        embs = C[X].view(X.shape[0], emb_size*context_length)
        h = torch.tanh(embs @ W1 + b1)
        logits = h @ W2 + b2
        mlp_probs = F.softmax(logits, dim=1)

    probs = trigram_weight * tri_prob + (1 - trigram_weight) * mlp_probs 
    nll= -probs[torch.arange(len(X)), y].log().mean()
    return nll.item()

eval_mixture(X_dev, y_dev)

2.1162450313568115