In [1]:
import random

In [2]:
import torch
import torch.nn.functional as F

# Preparing the data

In [3]:
file_path = '../names.txt'

In [4]:
# reading the names from the file
names = open(file_path, 'r').read().split('\n')

In [5]:
# Creating vocabulary of all possible characters
vocabulary = sorted(list(set([c for word in names for c in word])))
vocabulary = ['.'] + vocabulary  # adding start/end character
len(vocabulary)

27

In [6]:
# Creating dictionaries to convert from character to numerical index, and viceversa
stoi = {v:i for i,v in enumerate(vocabulary)}
itos = {i:v for i,v in enumerate(vocabulary)}

In [7]:
# Splitting data in train - val - test
random.shuffle(names)

end_train = int(len(names) * 0.8)
end_val = int(len(names) * 0.9)
names_train = names[:end_train]
names_val = names[end_train:end_val]
names_test = names[end_val:]
len(names_train), len(names_val), len(names_test)

(25626, 3203, 3204)

In [8]:
# Creating dataset

def make_data(names, context_size = 3):

    X, y  = [], []
    
    for name in names[:]:
    
        # initializing context (empty) to predict the beginning of the name
        x_i = [stoi['.']] * context_size
    
        # going through the characters in the name
        for c in name + '.':
    
            # we try to predict the next character given the context
            y_i = stoi[c]
            X.append(x_i)
            y.append(y_i)
    
            # sliding the context 
            x_i = x_i[1:] + [y_i] 
    
    # converting to tensors
    X = torch.tensor(X)
    y = torch.tensor(y)

    return X, y

X_train, y_train = make_data(names_train)
print(X_train.shape, y_train.shape)
X_val, y_val = make_data(names_val)
print(X_val.shape, y_val.shape)
X_test, y_test = make_data(names_test)
print(X_test.shape, y_test.shape)

torch.Size([182467, 3]) torch.Size([182467])
torch.Size([22837, 3]) torch.Size([22837])
torch.Size([22842, 3]) torch.Size([22842])


# Creating  a model



## Prototyping a model

In [9]:
# Creating a list to store all the trainable parameters
params = []

In [10]:
# Input layer
# the index of the character is used to retrieve an  embedding vector of size @
emb_size = 2
C  = torch.randn((len(vocabulary), emb_size), requires_grad=True)
params.append(C)
C.shape

torch.Size([27, 2])

In [11]:
# Getting the input embedding
X_emb = C[X_train]
X_emb.shape

torch.Size([182467, 3, 2])

In [12]:
# Flattening X_emb so that each context is represented as one dimensional  vector
input_size = X_emb.shape[-1] *  X_emb.shape[-2] 
X_flat = X_emb.view(-1, input_size)
X_flat.shape

torch.Size([182467, 6])

In [13]:
# First fully connected layer with 100 neurons
W1 = torch.randn((input_size, 100), requires_grad=True)
B1 = torch.randn((100), requires_grad=True)
params.extend([W1, B1])
W1.shape, B1.shape

(torch.Size([6, 100]), torch.Size([100]))

In [14]:
# Applying first layer
X1 = (X_flat @ W1 + B1).tanh()
X1.shape

torch.Size([182467, 100])

In [15]:
# Last layer with 27 neurons (the size of the vocabulary)
W2 = torch.randn((100, 27), requires_grad=True)
B2 = torch.randn((27), requires_grad=True)
params.extend([W2, B2])
W2.shape, B2.shape

(torch.Size([100, 27]), torch.Size([27]))

In [16]:
# Applying last layer
X2 = (X1 @ W2 + B2).tanh()
X2.shape

torch.Size([182467, 27])

In [17]:
# Normalizing output (softmax)
counts = X2.exp()
probs = counts / counts.sum(axis=1, keepdims=True)
#probs.sum(axis=1)
probs.shape

torch.Size([182467, 27])

In [18]:
# computing the loss

# getting, for each row of probs, the value corresponding to the actual next character in the training set
preds = probs[range(len(y_train)), y_train]

# computing the average, of the negative logs
loss = -preds.log().mean()
loss

tensor(3.7313, grad_fn=<NegBackward0>)

In [19]:
# performing the backpropagation of the loss to compute the gradients
loss.backward()

In [20]:
# Updating the params
alpha = 0.01
for p in params:
    p = p - alpha * p.grad

## Defining functions to create the model

In [21]:


class Model():

    def __init__(self, emb_size=2, vocabulary_size=27, context_size = 3, layer1_size=100):
        # Input layer
        # the index of the character is used to retrieve an  embedding vector of size @
        self.emb_size = emb_size
        self.C  = torch.randn((vocabulary_size, emb_size), requires_grad=True)
        # First fully connected layer with 100 neurons
        self.context_size = context_size
        self.input_size = self.context_size * self.emb_size
        self.layer1_size = layer1_size
        self.W1 = torch.randn((self.input_size, self.layer1_size), requires_grad=True)
        self.B1 = torch.randn((self.layer1_size), requires_grad=True)
        # Last layer with 27 neurons (the size of the vocabulary)
        self.W2 = torch.randn((self.layer1_size, vocabulary_size), requires_grad=True)
        self.B2 = torch.randn((vocabulary_size), requires_grad=True)
        # Creating list with all  trainable params
        self.params = [self.C, self.W1, self.B1, self.W2, self.B2]

        # initializing loss
        self.loss = None
        

    def __call__(self, X):
        # Getting the input embedding
        X_emb = self.C[X]
        # Flattening X_emb so that each context is represented as one dimensional  vector
        X_flat = X_emb.view(-1, self.input_size)
        # Applying first layer
        X1 = (X_flat @ self.W1 + self.B1).tanh()
        # Applying last layer
        X2 = (X1 @ self.W2 + self.B2)
        # Normalizing output (softmax)
        #counts = X2.exp()
        #probs = counts / counts.sum(axis=1, keepdims=True)

        return X2

    def forward_pass(self, X, y):
        # computing model output
        #probs =  self.__call__(X)
        logits = self.__call__(X)
        
        # Normalizing output (softmax)
        #counts = X2.exp()
        #probs = counts / counts.sum(axis=1, keepdims=True)

        # getting, for each row of probs, the value corresponding to the actual next character in the training set
        #preds = probs[range(len(y)), y]
        
        # computing the average, of the negative logs
        #self.loss = -preds.log().mean()

        # computing the loss, efficient way
        self.loss = F.cross_entropy(logits, y)

    def get_loss(self):
        return self.loss.item()

    def backward_pass(self):

        # skipping if loss has never been computed
        if self.loss is None:
            print('yo')
            return

        # Resetting the gradients
        for p in self.params:
            p.grad = None
        
        # performing the backpropagation of the loss to compute the gradients
        self.loss.backward()

        # Updating the params
        alpha = 0.1
        for p in self.params:
            p.data += - alpha * p.grad
        
    def predict_probs(self, X):

        # getting logits
        logits = self.__call__(X)

        # Normalizing output (softmax)
        counts = logits.exp()
        probs = counts / counts.sum(axis=1, keepdims=True)
        
        # Getting the indexes with highest probabilities
        #preds = probs.argmax(axis=1)
        
        return probs


In [22]:
# Training the model

m = Model()

for i in range(100):
    m.forward_pass(X_train, y_train)

    if i % 10 == 0:
        print(f"i: {i}, loss:{m.get_loss()}")
    #print(m.get_loss())
    
    m.backward_pass()

# Computing the loss on the validation dataset
m.forward_pass(X_val, y_val)
print()
print(f"validation dataset loss:{m.get_loss()}")

i: 0, loss:14.496805191040039
i: 10, loss:9.772998809814453
i: 20, loss:7.365072727203369
i: 30, loss:5.94071102142334
i: 40, loss:5.035120487213135
i: 50, loss:4.48888635635376
i: 60, loss:4.1285881996154785
i: 70, loss:3.867497682571411
i: 80, loss:3.6675920486450195
i: 90, loss:3.508657693862915

validation dataset loss:3.3483662605285645


In [23]:
# Training the model, with batches, instead of whole dataset

m = Model()
batch_size = 32

for i in range(60000):

    # generating the indexes of the elements to be used in the batch
    batch_idx = torch.randint(0, X_train.shape[0], (batch_size,))
    
    m.forward_pass(X_train[batch_idx], y_train[batch_idx])

    if i % 10000 == 0:
        print(f"i: {i}, loss:{m.get_loss()}")
    #print(m.get_loss())
    
    m.backward_pass()

# Computing the loss on the validation dataset
m.forward_pass(X_val, y_val)
print()
print(f"validation dataset loss:{m.get_loss()}")


i: 0, loss:16.090801239013672
i: 10000, loss:2.8609697818756104
i: 20000, loss:2.5561227798461914
i: 30000, loss:2.1848437786102295
i: 40000, loss:2.2716543674468994
i: 50000, loss:2.133680820465088

validation dataset loss:2.3511996269226074


## Improving the architecture of the Model

In [24]:
# Training the model, with batches, instead of whole dataset

context_size = 5
X_train, y_train = make_data(names_train, context_size)
X_val, y_val = make_data(names_val, context_size)
X_test, y_test = make_data(names_test, context_size)


m = Model(emb_size=8, context_size=context_size)

batch_size = 64

for i in range(80000):

    # generating the indexes of the elements to be used in the batch
    batch_idx = torch.randint(0, X_train.shape[0], (batch_size,))
    
    m.forward_pass(X_train[batch_idx], y_train[batch_idx])

    if i % 10000 == 0:
        print(f"i: {i}, loss:{m.get_loss()}")
    #print(m.get_loss())
    
    m.backward_pass()

# Computing the loss on the whole validation dataset
m.forward_pass(X_val, y_val)
print()
print(f"validation dataset loss:{m.get_loss()}")

i: 0, loss:17.27199363708496
i: 10000, loss:2.8270864486694336
i: 20000, loss:2.48972225189209
i: 30000, loss:2.117682695388794
i: 40000, loss:2.306112766265869
i: 50000, loss:2.3946666717529297
i: 60000, loss:2.2828826904296875
i: 70000, loss:2.2651262283325195

validation dataset loss:2.2664685249328613


# Generating names from the model

In [25]:

def generate_name(model):
    
    # Initializing output sequence
    result = []
    
    # Creating starting empty sequence compatible with context size of the model
    context = torch.zeros([1, model.context_size]).int()
    
    # Generating until the end value (0) is returned
    pred_value = -1
    while pred_value != 0:
    
        # predicting probability distribution for next character
        probs = model.predict_probs(context)
        # getting a sample from the probability distribution
        pred = torch.multinomial(probs, 1, replacement=True)
        pred_value = pred.item()
    
        # updating the context with the new character
        context = torch.concat((context[0, 1:], pred[0])).unsqueeze(0)
    
        # adding predicted character to the output sequence
        result.append(itos[pred_value])
    
    return ''.join(result[:-1])


In [26]:
for i in range(10):
    print(generate_name(m))

atlengya
priclete
brostynn
sira
pranc
ary
hinia
maryana
janee
konnoy
