In [1]:
import random

import numpy as np

import torch
import torch.nn.functional as F

import matplotlib.pyplot as plt
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the corpus: one name per line of ./names.txt.
# The encoding is pinned so decoding does not depend on the platform's locale default.
with open('./names.txt', encoding='utf-8') as file:
    names = file.read().splitlines()

In [3]:
names[0:3]

['emma', 'olivia', 'ava']

In [4]:
# Vocabulary: every distinct character that occurs in any name, in sorted order.
letters = sorted({letter for name in names for letter in name})
# Letters take indices 1..len(letters); index 0 is reserved for the '.' token.
i_to_s = dict(enumerate(letters, start=1))
i_to_s[0] = '.'
# Inverse lookup: character -> index.
s_to_i = {symbol: index for index, symbol in i_to_s.items()}

In [324]:
class Model:
    """Character-level MLP next-character model: embedding -> tanh hidden layer -> logits.

    The constructor reads hyper-parameters from ``config`` (a dict), initialises all
    parameters from a ``torch.Generator`` with a fixed seed, shuffles a copy of the
    names with a fixed seed, and builds the train/val/test datasets.

    Recognised ``config`` keys (defaults in parentheses):
        names            list of strings to model (required)
        unique_items     number of rows of the embedding matrix / logits width (27)
        embedding_depth  width of each character embedding (2)
        n_hidden_layer   number of hidden units (100)
        block_size       number of context characters per example (3)
        batch_size       examples per minibatch in ``train_minibatches`` (32)
        train_size       fraction of names used for training (0.8)
        val_size         fraction of names used for validation (0.1)
        s_to_i           character -> index mapping; when omitted, the module-level
                         ``s_to_i`` table is used
    """

    def __init__(self, config):
        self.unique_items = config.get('unique_items', 27)
        self.embedding_depth = config.get('embedding_depth', 2)
        self.n_hidden_layer = config.get('n_hidden_layer', 100)
        self.block_size = config.get('block_size', 3)
        self.batch_size = config.get('batch_size', 32)

        self.train_size = config.get('train_size', 0.8)
        # Read the 'val_size' key (this used to read 'train_size', which made the
        # validation fraction equal to the training fraction whenever one was given).
        self.val_size = config.get('val_size', 0.1)
        self.test_size = 1.0 - self.train_size - self.val_size

        names = config.get('names')
        if names is None:
            raise ValueError("config must contain a 'names' list")
        # Work on a copy so that shuffling never reorders the caller's list.
        self.names = list(names)

        # Resolve the character table lazily so the module-level fallback is only
        # looked up when the caller did not supply one.
        char_table = config.get('s_to_i')
        self.s_to_i = s_to_i if char_table is None else char_table

        # Build the model here
        self.g = torch.Generator().manual_seed(2147483647)

        # Embedding matrix C, size: (27, 2) by default
        self.C = torch.randn((self.unique_items, self.embedding_depth), requires_grad=True, generator=self.g)

        # Weight matrix of the hidden layer, size: (3 x 2, 100) by default
        self.W1 = torch.randn([self.block_size * self.embedding_depth, self.n_hidden_layer], requires_grad=True, generator=self.g)
        self.b1 = torch.randn([1, self.n_hidden_layer], requires_grad=True, generator=self.g)

        # Weight matrix of the logits layer, size: (100, 27) by default
        self.W2 = torch.randn([self.n_hidden_layer, self.unique_items], requires_grad=True, generator=self.g)
        self.b2 = torch.randn([1, self.unique_items], requires_grad=True, generator=self.g)

        self.parameters = [self.C, self.W1, self.b1, self.W2, self.b2]
        self.n_elements = sum(p.nelement() for p in self.parameters)

        # Building the datasets
        split1 = int(self.train_size * len(self.names))
        split2 = int((self.train_size + self.val_size) * len(self.names))

        # A private, seeded RNG yields the same permutation as seeding the global
        # `random` module with 42, without resetting the global state for everyone else.
        random.Random(42).shuffle(self.names)

        self.names_train = self.names[:split1]
        self.names_val = self.names[split1:split2]
        self.names_test = self.names[split2:]

        self.X, self.Y = self._build_dataset(self.names)

        self.X_train, self.Y_train = self._build_dataset(self.names_train)
        self.X_val, self.Y_val = self._build_dataset(self.names_val)
        self.X_test, self.Y_test = self._build_dataset(self.names_test)

    def _build_dataset(self, names):
        """Turn names into (contexts, targets) index tensors.

        Every name is terminated with '.', and each character becomes one example
        whose context is the ``block_size`` preceding indices (padded with 0 at the
        start of a name). Returns ``X`` of shape (n_examples, block_size) and ``Y``
        of shape (n_examples,), both int64.
        """
        contexts, targets = [], []
        for name in names:
            context = self.block_size * [0]
            for letter in name + ".":
                ix = self.s_to_i[letter]
                contexts.append(context)
                targets.append(ix)
                context = context[1:] + [ix]
        # Explicit dtype and shape keep an empty split usable: shape (0, block_size), int64.
        X = torch.tensor(contexts, dtype=torch.long).reshape(len(contexts), self.block_size)
        Y = torch.tensor(targets, dtype=torch.long)
        return X, Y

    def forward(self, X, Y):
        """Return the mean cross-entropy loss of the model on contexts ``X`` and targets ``Y``."""
        # Compute embeddings for the Xs
        embs = self.C[X]
        # Stretch out the embeddings, and do linear transformation
        z1 = embs.view(-1, self.block_size * self.embedding_depth) @ self.W1 + self.b1
        # Tanh activation
        a1 = z1.tanh()
        # Second linear transformation (==> computes the logits)
        z2 = a1 @ self.W2 + self.b2
        # Compute the loss
        return F.cross_entropy(z2, Y)

    def get_loss(self, data_set='train'):
        """Return the loss (a Python float) on the 'train', 'val' or 'test' split.

        Raises:
            ValueError: if ``data_set`` is not one of the three split names.
        """
        splits = {
            'train': (self.X_train, self.Y_train),
            'val': (self.X_val, self.Y_val),
            # The test split is evaluated on the test data (it used to reuse the val data).
            'test': (self.X_test, self.Y_test),
        }
        if data_set not in splits:
            raise ValueError("data_set should be either train, val or test")
        X, Y = splits[data_set]
        with torch.no_grad():
            return self.forward(X, Y).item()

    def backward(self, loss):
        """Clear all parameter gradients, then back-propagate ``loss``."""
        for p in self.parameters:
            p.grad = None
        loss.backward()

    def update(self, learning_rate=1e-1):
        """Apply one plain gradient-descent step to every parameter."""
        with torch.no_grad():
            for p in self.parameters:
                p -= learning_rate * p.grad

    @staticmethod
    def _should_print(step, print_every):
        """Return True when the loss should be printed at ``step``.

        Printing is enabled only for a non-zero int ``print_every`` (bools excluded);
        the zero check avoids a ZeroDivisionError.
        """
        if isinstance(print_every, bool) or not isinstance(print_every, int):
            return False
        return print_every != 0 and step % print_every == 0

    def _step(self, X, Y, learning_rate):
        """Run forward, backward and update once; return the loss tensor."""
        loss = self.forward(X, Y)
        self.backward(loss)
        self.update(learning_rate)
        return loss

    def train(self, epochs, learning_rate=1e-1, print_every=None):
        """Run ``epochs`` full-batch gradient steps on the training split.

        Only the training split is used, so the validation and test losses stay
        meaningful. An int ``print_every`` N prints the loss on every step whose
        index is a multiple of N; ``None`` or 0 disables printing.
        """
        for step in range(epochs):
            loss = self._step(self.X_train, self.Y_train, learning_rate)
            if self._should_print(step, print_every):
                print(loss.item())

    def train_minibatches(self, epochs, learning_rate=1e-1, print_every=None):
        """Run ``epochs`` gradient steps, each on a random minibatch of the training split.

        Each minibatch is ``batch_size`` indices drawn with replacement using the
        model's own generator. ``print_every`` behaves as in ``train``.
        """
        for step in range(epochs):
            batch_ixs = torch.randint(0, len(self.X_train), (self.batch_size,), generator=self.g)
            loss = self._step(self.X_train[batch_ixs], self.Y_train[batch_ixs], learning_rate)
            if self._should_print(step, print_every):
                print(loss.item())
        

### This is the final model from Andrej

In [325]:
# Baseline run: 10-dimensional embeddings, 200 hidden units,
# a 3-character context and minibatches of 32 examples.
m = Model({
    'names': names,
    'embedding_depth': 10,
    'n_hidden_layer': 200,
    'block_size': 3,
    'batch_size': 32
})

In [326]:
m.train_minibatches(100000, 0.1)
m.get_loss()

2.3885087966918945

In [327]:
m.train_minibatches(100000, 0.01)
m.get_loss()

2.124298334121704

In [328]:
m.get_loss('val')

2.165433645248413

### I think the embedding depth is too high with 10D, let's try 5 and see if it performs similarly

In [329]:
m = Model({
    'names': names,
    'embedding_depth': 5,
    'n_hidden_layer': 200,
    'block_size': 3,
    'batch_size': 32
})
m.n_elements

8762

In [330]:
m.train_minibatches(100000, 0.1)
m.get_loss()

2.3803389072418213

In [331]:
m.train_minibatches(100000, 0.01)
m.get_loss()

2.1736955642700195

In [332]:
m.get_loss('val')

2.195449113845825

### Reducing the embedding depth to 5 does seem to hurt performance (validation loss 2.195 vs. 2.165 with depth 10)

### Let's try with higher embedding_depth

In [334]:
m = Model({
    'names': names,
    'embedding_depth': 15,
    'n_hidden_layer': 200,
    'block_size': 3,
    'batch_size': 32
})
m.n_elements

15032

In [335]:
m.train_minibatches(100000, 0.1)
m.train_minibatches(100000, 0.01)
m.get_loss()

2.0927231311798096

In [336]:
m.get_loss('val')

2.149803400039673

### Even larger

In [337]:
m = Model({
    'names': names,
    'embedding_depth': 20,
    'n_hidden_layer': 200,
    'block_size': 3,
    'batch_size': 32
})
m.n_elements

18167

In [338]:
m.train_minibatches(100000, 0.1)
m.train_minibatches(100000, 0.01)
m.get_loss()

2.068345308303833

In [339]:
m.get_loss('val')

2.1444251537323

### Even larger

In [340]:
m = Model({
    'names': names,
    'embedding_depth': 25,
    'n_hidden_layer': 200,
    'block_size': 3,
    'batch_size': 32
})
m.n_elements

21302

In [341]:
m.train_minibatches(100000, 0.1)
m.train_minibatches(100000, 0.01)
m.get_loss()

2.0630736351013184

In [342]:
m.get_loss('val')

2.14540958404541

### 20 seems to be enough, let's see if we can reduce n_hidden_layer size

In [343]:
m = Model({
    'names': names,
    'embedding_depth': 20,
    'n_hidden_layer': 150,
    'block_size': 3,
    'batch_size': 32
})
m.n_elements

13767

In [344]:
m.train_minibatches(100000, 0.1)
m.train_minibatches(100000, 0.01)
m.get_loss()

2.0878281593322754

In [345]:
m.get_loss('val')

2.1424944400787354

### Let's reduce even more

In [346]:
m = Model({
    'names': names,
    'embedding_depth': 20,
    'n_hidden_layer': 100,
    'block_size': 3,
    'batch_size': 32
})
m.n_elements

9367

In [347]:
m.train_minibatches(100000, 0.1)
m.train_minibatches(100000, 0.01)
m.get_loss()

2.12813401222229

In [348]:
m.get_loss('val')

2.160646677017212

### Let's keep it at 150, and increase block_size

In [349]:
m = Model({
    'names': names,
    'embedding_depth': 20,
    'n_hidden_layer': 150,
    'block_size': 5,
    'batch_size': 32
})
m.n_elements

19767

In [350]:
m.train_minibatches(100000, 0.1)
m.train_minibatches(100000, 0.01)
m.get_loss()

2.169311046600342

In [351]:
m.get_loss('val')

2.2097270488739014

### Longer training

In [352]:
m = Model({
    'names': names,
    'embedding_depth': 20,
    'n_hidden_layer': 150,
    'block_size': 5,
    'batch_size': 32
})
m.train_minibatches(200000, 0.1)
m.train_minibatches(200000, 0.01)
m.get_loss()

2.118191957473755

In [353]:
m.get_loss('val')

2.178915023803711

### Even longer training

In [441]:
m = Model({
    'names': names,
    'embedding_depth': 20,
    'n_hidden_layer': 150,
    'block_size': 5,
    'batch_size': 32
})
m.train_minibatches(200000, 0.1)
m.train_minibatches(400000, 0.01)

In [442]:
m.get_loss(), m.get_loss('val')

(2.111159086227417, 2.174161911010742)

### Only a small improvement.

In [444]:
m = Model({
    'names': names,
    'embedding_depth': 15,
    'n_hidden_layer': 250,
    'block_size': 5,
    'batch_size': 32
})
m.n_elements

26182

In [445]:
m.train_minibatches(100000, 0.1)
m.get_loss(), m.get_loss('val')

(2.423304319381714, 2.456299066543579)

### Let's get back to the best model so far:

In [358]:
# Best configuration so far: 20-dimensional embeddings, 150 hidden units, 3-character context.
m = Model({
    'names': names,
    'embedding_depth': 20,
    'n_hidden_layer': 150,
    'block_size': 3,
    'batch_size': 32
})
# NOTE(review): this is not the last expression of the cell, so its value is not displayed.
m.n_elements

# Two phases: 50,000 minibatch steps at learning rate 0.1, then 50,000 at 0.01.
m.train_minibatches(50000, 0.1)
m.train_minibatches(50000, 0.01)

In [359]:
m.get_loss(), m.get_loss('val')

(2.128970146179199, 2.1740517616271973)

In [360]:
# Fresh model with 20-dimensional embeddings, 150 hidden units and a 3-character context.
m = Model({
    'names': names,
    'embedding_depth': 20,
    'n_hidden_layer': 150,
    'block_size': 3,
    'batch_size': 32
})
# NOTE(review): this is not the last expression of the cell, so its value is not displayed.
m.n_elements

# Two phases: 150,000 minibatch steps at learning rate 0.1, then 150,000 at 0.01.
m.train_minibatches(150000, 0.1)
m.train_minibatches(150000, 0.01)

In [361]:
m.get_loss(), m.get_loss('val')

(2.070401191711426, 2.1311147212982178)

In [362]:
m.train_minibatches(100000, 0.001)

In [363]:
m.get_loss(), m.get_loss('val')

(2.061901807785034, 2.125061273574829)

In [364]:
m.train_minibatches(100000, 0.001)

In [365]:
m.get_loss(), m.get_loss('val')

(2.0613131523132324, 2.1244380474090576)

### Larger n_hidden_layer

In [366]:
# Wider hidden layer: 200 hidden units with 20-dimensional embeddings and a 3-character context.
m = Model({
    'names': names,
    'embedding_depth': 20,
    'n_hidden_layer': 200,
    'block_size': 3,
    'batch_size': 32
})
# NOTE(review): this is not the last expression of the cell, so its value is not displayed.
m.n_elements

# Two phases: 100,000 minibatch steps at learning rate 0.1, then 100,000 at 0.01.
m.train_minibatches(100000, 0.1)
m.train_minibatches(100000, 0.01)

In [367]:
m.get_loss(), m.get_loss('val')

(2.068345308303833, 2.1444251537323)

In [400]:
# Fresh model with 20-dimensional embeddings, 150 hidden units and a 3-character context.
m = Model({
    'names': names,
    'embedding_depth': 20,
    'n_hidden_layer': 150,
    'block_size': 3,
    'batch_size': 32
})
# NOTE(review): this is not the last expression of the cell, so its value is not displayed.
m.n_elements

# Two phases: 100,000 minibatch steps at learning rate 0.1, then 100,000 at 0.01.
m.train_minibatches(100000, 0.1)
m.train_minibatches(100000, 0.01)

In [401]:
m.get_loss(), m.get_loss('val')

(2.0878281593322754, 2.1424944400787354)

In [402]:
m.train_minibatches(100000, 0.001)

In [403]:
m.get_loss(), m.get_loss('val')

(2.079813241958618, 2.1356570720672607)

In [404]:
m.train_minibatches(100000, 0.001)

In [405]:
m.get_loss(), m.get_loss('val')

(2.079188108444214, 2.1362316608428955)

In [417]:
m = Model({
    'names': names,
    'embedding_depth': 20,
    'n_hidden_layer': 150,
    'block_size': 3,
    'batch_size': 32
})
m.get_loss()

23.754575729370117

In [436]:
m = Model({
    'names': names,
    'embedding_depth': 20,
    'n_hidden_layer': 150,
    'block_size': 3,
    'batch_size': 32
})

In [437]:
m.get_loss(), m.get_loss('val')

(23.754575729370117, 23.730976104736328)

In [438]:
m.get_loss(), m.get_loss('val')

(23.754575729370117, 23.730976104736328)

In [439]:
m.train_minibatches(100000, 0.1)
m.train_minibatches(100000, 0.01)

In [440]:
m.get_loss(), m.get_loss('val')

(2.0878281593322754, 2.1424944400787354)