In [1]:
import random

In [2]:
import torch
import torch.nn.functional as F

# Preparing the data

In [3]:
file_path = '../names.txt'

In [4]:
# reading the names from the file
names = open(file_path, 'r').read().split('\n')

In [5]:
# Creating vocabulary of all possible characters
vocabulary = sorted(list(set([c for word in names for c in word])))
vocabulary = ['.'] + vocabulary  # adding start/end character
len(vocabulary)

27

In [6]:
# Creating dictionaries to convert from character to numerical index, and viceversa
stoi = {v:i for i,v in enumerate(vocabulary)}
itos = {i:v for i,v in enumerate(vocabulary)}

In [7]:
# Splitting data in train - val - test
random.shuffle(names)

end_train = int(len(names) * 0.8)
end_val = int(len(names) * 0.9)
names_train = names[:end_train]
names_val = names[end_train:end_val]
names_test = names[end_val:]
len(names_train), len(names_val), len(names_test)

(25626, 3203, 3204)

In [8]:
# Creating dataset

def make_data(names, context_size = 3):

    X, y  = [], []
    
    for name in names[:]:
    
        # initializing context (empty) to predict the beginning of the name
        x_i = [stoi['.']] * context_size
    
        # going through the characters in the name
        for c in name + '.':
    
            # we try to predict the next character given the context
            y_i = stoi[c]
            X.append(x_i)
            y.append(y_i)
    
            # sliding the context 
            x_i = x_i[1:] + [y_i] 
    
    # converting to tensors
    X = torch.tensor(X)
    y = torch.tensor(y)

    return X, y


In [9]:
context_size = 5
X_train, y_train = make_data(names_train, context_size)
X_val, y_val = make_data(names_val, context_size)
X_test, y_test = make_data(names_test, context_size)

print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)

torch.Size([182484, 5]) torch.Size([182484])
torch.Size([22850, 5]) torch.Size([22850])
torch.Size([22812, 5]) torch.Size([22812])


# Creating  the models



## Improving the weights initialization of the model

In [10]:


class Model_v1():

    def __init__(self, emb_size=2, vocabulary_size=27, context_size = 3, layer1_size=100,
                 W1_std=1, B1_std=1, W2_std=1, B2_std=1):
        # Input layer
        # the index of the character is used to retrieve an  embedding vector of size @
        self.emb_size = emb_size
        self.C  = torch.randn((vocabulary_size, emb_size), requires_grad=True)
        # First fully connected layer with 100 neurons
        self.context_size = context_size
        self.input_size = context_size * emb_size
        self.layer1_size = layer1_size
        self.W1 = (torch.randn((self.input_size, layer1_size)) * W1_std).clone().detach().requires_grad_(True)
        self.B1 = (torch.randn(layer1_size) * B1_std ).clone().detach().requires_grad_(True)
        # Last layer with 27 neurons (the size of the vocabulary)
        self.W2 = (torch.randn((layer1_size, vocabulary_size))* W2_std).clone().detach().requires_grad_(True)
        self.B2 = (torch.randn(vocabulary_size) * B2_std).clone().detach().requires_grad_(True)
        # Creating list with all  trainable params
        self.params = [self.C, self.W1, self.B1, self.W2, self.B2]

        # initializing loss
        self.loss = None
        

    def __call__(self, X):
        # Getting the input embedding
        X_emb = self.C[X]
        # Flattening X_emb so that each context is represented as one dimensional  vector
        X_flat = X_emb.view(-1, self.input_size)
        # Applying first layer
        X1 = (X_flat @ self.W1 + self.B1).tanh()
        # Applying last layer
        X2 = (X1 @ self.W2 + self.B2)
        # Normalizing output (softmax)
        #counts = X2.exp()
        #probs = counts / counts.sum(axis=1, keepdims=True)

        return X2

    def forward_pass(self, X, y):
        # computing model output
        #probs =  self.__call__(X)
        logits = self.__call__(X)
        
        # Normalizing output (softmax)
        #counts = X2.exp()
        #probs = counts / counts.sum(axis=1, keepdims=True)

        # getting, for each row of probs, the value corresponding to the actual next character in the training set
        #preds = probs[range(len(y)), y]
        
        # computing the average, of the negative logs
        #self.loss = -preds.log().mean()

        # computing the loss, efficient way
        self.loss = F.cross_entropy(logits, y)

    def get_loss(self):
        return self.loss.item()

    def backward_pass(self):

        # skipping if loss has never been computed
        if self.loss is None:
            return

        # Resetting the gradients
        for p in self.params:
            p.grad = None
        
        # performing the backpropagation of the loss to compute the gradients
        self.loss.backward()

        # Updating the params
        alpha = 0.1
        for p in self.params:
            p.data += - alpha * p.grad
        
    def predict_probs(self, X):

        # getting logits
        logits = self.__call__(X)

        # Normalizing output (softmax)
        counts = logits.exp()
        probs = counts / counts.sum(axis=1, keepdims=True)
        
        # Getting the indexes with highest probabilities
        #preds = probs.argmax(axis=1)
        
        return probs


In [11]:
# Training the model

m = Model_v1(emb_size=8, context_size=context_size)

batch_size = 64

for i in range(80000):

    # generating the indexes of the elements to be used in the batch
    batch_idx = torch.randint(0, X_train.shape[0], (batch_size,))
    
    m.forward_pass(X_train[batch_idx], y_train[batch_idx])

    if i % 10000 == 0:
        print(f"i: {i}, loss:{m.get_loss()}")
    #print(m.get_loss())
    
    m.backward_pass()

# Computing the loss on the whole validation dataset
m.forward_pass(X_val, y_val)
print()
print(f"validation dataset loss:{m.get_loss()}")

i: 0, loss:16.517805099487305
i: 10000, loss:2.459197521209717
i: 20000, loss:2.457740306854248
i: 30000, loss:2.487136125564575
i: 40000, loss:2.3575973510742188
i: 50000, loss:2.1241016387939453
i: 60000, loss:2.01373028755188
i: 70000, loss:2.295576810836792

validation dataset loss:2.2350573539733887


In [12]:
# Training the model with a better initialization of the last layer
# We modify the std of W2 and B2 so that we start with a flat output distribution
# to diminish the risk of the model being overly confident on some wrong predictions

m = Model_v1(emb_size=8, context_size=context_size, W2_std=0.2, B2_std=0)

batch_size = 64

for i in range(80000):

    # generating the indexes of the elements to be used in the batch
    batch_idx = torch.randint(0, X_train.shape[0], (batch_size,))
    
    m.forward_pass(X_train[batch_idx], y_train[batch_idx])

    if i % 10000 == 0:
        print(f"i: {i}, loss:{m.get_loss()}")
    #print(m.get_loss())
    
    m.backward_pass()

# Computing the loss on the whole validation dataset
m.forward_pass(X_val, y_val)
print()
print(f"validation dataset loss:{m.get_loss()}")

i: 0, loss:4.434738636016846
i: 10000, loss:2.542330026626587
i: 20000, loss:2.2687408924102783
i: 30000, loss:2.300877094268799
i: 40000, loss:2.0275070667266846
i: 50000, loss:2.502208948135376
i: 60000, loss:2.4851906299591064
i: 70000, loss:2.295337200164795

validation dataset loss:2.1837849617004395


In [13]:
# Training the model with a better initialization of the first layer
# We modify the std of W1 and B1 so that the distribution of the output of the layer
# stays around 0:
#   - around 0, the gradient of the tanh function is linear and can backpropagate gradient effectively
#   - for values far from 0 the gradient of tanh will be almost so, it will prevent information from
#     the later layer to be backpropagated effectively (g * 0 = 0)


# We use the kaiming rule to initialize W1
# a = gain / sqrt(fan_in)
# gain constant that depends on the activation (2/3 for tanh)
# fan_in is the the number of input to the layer (emb_size * context_size in our case)
emb_size=8
fan_in = emb_size * context_size
W1_std = (3/2) / fan_in**0.5


m = Model_v1(emb_size=emb_size, context_size=context_size, W1_std=W1_std, B1_std=0.1, W2_std=0.2, B2_std=0)

batch_size = 64

for i in range(80000):

    # generating the indexes of the elements to be used in the batch
    batch_idx = torch.randint(0, X_train.shape[0], (batch_size,))
    
    m.forward_pass(X_train[batch_idx], y_train[batch_idx])

    if i % 10000 == 0:
        print(f"i: {i}, loss:{m.get_loss()}")
    #print(m.get_loss())
    
    m.backward_pass()

# Computing the loss on the whole validation dataset
m.forward_pass(X_val, y_val)
print()
print(f"validation dataset loss:{m.get_loss()}")

i: 0, loss:4.204208850860596
i: 10000, loss:2.41817307472229
i: 20000, loss:2.145571231842041
i: 30000, loss:2.2393765449523926
i: 40000, loss:2.1258485317230225
i: 50000, loss:2.3216745853424072
i: 60000, loss:2.2246437072753906
i: 70000, loss:1.9209933280944824

validation dataset loss:2.1187126636505127


## Adding batch normalization to the model

In [25]:


class Model_v2():

    def __init__(self, emb_size=2, vocabulary_size=27, context_size = 3, layer1_size=100,
                 W1_std=1, B1_std=1, W2_std=1, B2_std=1):
        # Input layer
        # the index of the character is used to retrieve an  embedding vector of size @
        self.emb_size = emb_size
        self.C  = torch.randn((vocabulary_size, emb_size), requires_grad=True)
        # First fully connected layer with 100 neurons
        self.context_size = context_size
        self.input_size = context_size * emb_size
        self.layer1_size = layer1_size
        self.W1 = (torch.randn((self.input_size, layer1_size)) * W1_std).clone().detach().requires_grad_(True)
        self.B1 = (torch.randn(layer1_size) * B1_std ).clone().detach().requires_grad_(True)
        # Batch normalization layer, to apply before the non linearity
        self.bn_gain = torch.randn(layer1_size, requires_grad=True)
        self.bn_bias = torch.randn(layer1_size, requires_grad=True)
        self.bn_mean = torch.randn(layer1_size,)
        self.bn_std = torch.randn(layer1_size,)
        # Last layer with 27 neurons (the size of the vocabulary)
        self.W2 = (torch.randn((layer1_size, vocabulary_size))* W2_std).clone().detach().requires_grad_(True)
        self.B2 = (torch.randn(vocabulary_size) * B2_std).clone().detach().requires_grad_(True)
        # Creating list with all  trainable params
        self.params = [self.C, self.W1, self.B1, self.bn_gain, self.bn_bias, self.W2, self.B2]

        # initializing loss
        self.loss = None
        

    def __call__(self, X):
        # Getting the input embedding
        X_emb = self.C[X]
        # Flattening X_emb so that each context is represented as one dimensional  vector
        X_flat = X_emb.view(-1, self.input_size)
        # Applying first layer
        X1 = (X_flat @ self.W1 + self.B1) # linear
        with torch.no_grad():
            # bn bias are updated manually, not through backpropagation
            self.bn_mean  = 0.99 * self.bn_mean +  0.01 * X1.mean(axis=0, keepdims=True)
            self.bn_std  = 0.99 * self.bn_std +  0.01 * X1.std(axis=0, keepdims=True, correction=0)
        # batch normalization , adding small  epsilon to avoid divide by zero when the batch variance is 0
        X1 = (X1 - self.bn_mean) / (self.bn_std + 0.001) 
        X1 = (X1 * self.bn_gain) + self.bn_bias  # batch normalization gain and bias
        X1 = X1.tanh() #non linearity
        # Applying last layer
        X2 = (X1 @ self.W2 + self.B2)
        # Normalizing output (softmax)
        #counts = X2.exp()
        #probs = counts / counts.sum(axis=1, keepdims=True)

        return X2

    def forward_pass(self, X, y):
        # computing model output
        #probs =  self.__call__(X)
        logits = self.__call__(X)
        
        # Normalizing output (softmax)
        #counts = X2.exp()
        #probs = counts / counts.sum(axis=1, keepdims=True)

        # getting, for each row of probs, the value corresponding to the actual next character in the training set
        #preds = probs[range(len(y)), y]
        
        # computing the average, of the negative logs
        #self.loss = -preds.log().mean()

        # computing the loss, efficient way
        self.loss = F.cross_entropy(logits, y)

    def get_loss(self):
        return self.loss.item()

    def backward_pass(self):

        # skipping if loss has never been computed
        if self.loss is None:
            return

        # Resetting the gradients
        for p in self.params:
            p.grad = None
        
        # performing the backpropagation of the loss to compute the gradients
        self.loss.backward()

        # Updating the params
        alpha = 0.1
        for p in self.params:
            p.data += - alpha * p.grad
        
    def predict_probs(self, X):

        # getting logits
        logits = self.__call__(X)

        # Normalizing output (softmax)
        counts = logits.exp()
        probs = counts / counts.sum(axis=1, keepdims=True)
        
        # Getting the indexes with highest probabilities
        #preds = probs.argmax(axis=1)
        
        return probs


In [26]:
# Trainig the model with batch normalization
m = Model_v2(emb_size=emb_size, context_size=context_size, W1_std=W1_std, B1_std=0.1, W2_std=0.2, B2_std=0)

batch_size = 64

for i in range(80000):

    # generating the indexes of the elements to be used in the batch
    batch_idx = torch.randint(0, X_train.shape[0], (batch_size,))
    
    m.forward_pass(X_train[batch_idx], y_train[batch_idx])

    if i % 10000 == 0:
        print(f"i: {i}, loss:{m.get_loss()}")
    #print(m.get_loss())
    
    m.backward_pass()

# Computing the loss on the whole validation dataset
m.forward_pass(X_val, y_val)
print()
print(f"validation dataset loss:{m.get_loss()}")

i: 0, loss:4.718467712402344
i: 10000, loss:2.348910331726074
i: 20000, loss:2.0643563270568848
i: 30000, loss:2.02232027053833
i: 40000, loss:1.967730164527893
i: 50000, loss:2.201854944229126
i: 60000, loss:2.1376073360443115
i: 70000, loss:1.9560177326202393

validation dataset loss:2.111684799194336


# Generating names from the model

In [27]:

def generate_name(model):
    
    # Initializing output sequence
    result = []
    
    # Creating starting empty sequence compatible with context size of the model
    context = torch.zeros([1, model.context_size]).int()
    
    # Generating until the end value (0) is returned
    pred_value = -1
    while pred_value != 0:
    
        # predicting probability distribution for next character
        probs = model.predict_probs(context)
        # getting a sample from the probability distribution
        pred = torch.multinomial(probs, 1, replacement=True)
        pred_value = pred.item()
    
        # updating the context with the new character
        context = torch.concat((context[0, 1:], pred[0])).unsqueeze(0)
    
        # adding predicted character to the output sequence
        result.append(itos[pred_value])
    
    return ''.join(result[:-1])


In [28]:
for i in range(10):
    print(generate_name(m))

jaley
millee
magymin
thayloughan
trayieh
mylah
yahj
reyvon
showyn
jovingell
