# Character Level Prediction

This Notebook tries to use an LSTM to learn the style of Tolstoj's Anna Karenina.
If we have a string like 'abcd' the training is performed using as label at step 0 the letter 'b', while as prediction it is used the output of the letter 'a' through the neural network.
Similiarly, at step 1, we use as a label 'c' and as prediction 'b'.

In [52]:
# Importing section

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np

## Device Selection

In [53]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

## Loading Data

In [54]:
with open('data/anna.txt', 'r') as f:
    text = f.read()

## Tokenization

In [55]:
chars = tuple(set(text)) # Creates a set of characters from the text and transforms it into tuples
dict_integer_char = dict(enumerate(chars))
dict_char_integer = {char: integer for integer, char in dict_integer_char.items()}

## Encoding the Text

In [56]:
encoded = np.array([dict_char_integer[char] for char in text])

## One hot encoding of the data

In [57]:
def one_hot_encode(arr, n_labels):
    #print(arr.shape)
    one_hot = np.zeros((np.multiply(*arr.shape), n_labels), dtype = np.float32)
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot

## Creating batches of characters

In [62]:
def get_batches(arr, batch_size, seq_lenght):
    
    char_in_batch = batch_size*seq_lenght
    num_batches = len(arr)//char_in_batch
    
    # Discharging charachters that unpair the batches
    final_arr = arr[:num_batches*char_in_batch]
    final_arr = final_arr.reshape((batch_size, -1))
    
    # Crating Labels (which in our case are the next characters)
    
    for i in range(0, final_arr.shape[1], seq_lenght):
        x = final_arr[:, i : i + seq_lenght]
        y = np.zeros_like(x)
        try:
            y[:, : -1], y[:, -1] = x[:, 1:], final_arr[:, n +seq_lenght]
        except :
            y[:, : -1], y[:, -1] = x[:, 1:], final_arr[:, 0]
        yield x, y   

## Definition of the network

In [63]:
class RNN(nn.Module):
    
    def __init__(self, tokens, n_hidden = 256, n_layers = 2, drop = 0.5, lr = 0.001):
        super().__init__()
        self.drop = drop
        self.n_hidden = n_hidden
        self.n_layers = n_layers
        self.lr = lr
        
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {char: integer for integer, char in self.int2char.items()}
        
        
        self.lstm = nn.LSTM(len(self.chars), self.n_hidden, self.n_layers, dropout = self.drop, batch_first = True)
        self.dropout = nn.Dropout(self.drop)
        self.linear = nn.Linear(self.n_hidden, len(self.chars)) # Notice the input is equal to the hidden_dim. The short term memory is equal to the output in LSTMs
        
    def forward(self, char, hidden):
        output, hidden = self.lstm(char, hidden)
        output = self.dropout(output)
        
        output = output.contiguous().view(-1, self.n_hidden)
        
        output = self.linear(output)
        
        return output, hidden
    
    def init_hidden(self, batch_size, device):
        weight = next(self.parameters()).data ## Why next?
        
        hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device), 
                  weight.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))
        
        return hidden

## Training function

In [85]:
def train(net, data, device, epochs = 10, batch_size = 10, seq_length = 50, lr = 0.001, clip = 5, val_frac = 0.1, print_every = 10):
    
    net.train()
    
    optimizer = optim.Adam(net.parameters(), lr = lr)
    criterion = nn.CrossEntropyLoss()
    
    validation_index = int(len(data)*(1.0 - val_frac))
    data, val_data = data[:validation_index], data[validation_index:]
    
    counter = 0
    n_chars = len(net.chars)
    
    for epoch in range(epochs):
        
        # Initizialization of the firs hidden states
        h = net.init_hidden(batch_size, device)
        
        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1

            x = one_hot_encode(x, n_chars)
            # Make Tensors
            inputs = torch.from_numpy(x).to(device)
            labels = torch.from_numpy(y).to(device)

            # Deatching h from computation

            h = tuple([hidden.data for hidden in h])

            # zero accumulated gradient
            #optimizer.zero_grad()
            net.zero_grad()

            output, h = net(inputs, h)
            #print(output.size())
            #print(labels.view(batch_size*seq_length).size())
            loss = criterion(output, labels.view(batch_size*seq_length))
            loss.backward()

            nn.utils.clip_grad_norm(net.parameters(), clip)
            optimizer.step()

            if counter % print_every == 0:
                val_h = net.init_hidden(batch_size, device)
                val_losses = []
                net.eval()

                for x,y in get_batches(val_data, batch_size, seq_length):
                    x = one_hot_encode(x, n_chars)
                    x, y = torch.from_numpy(x).to(device), torch.from_numpy(y).to(device)

                    val_h = tuple([hidden.data for hidden in val_h])

                    output, val_h = net(inputs, val_h)
                    val_loss = criterion(output, labels.view(batch_size*seq_length))

                    val_losses.append(val_loss.item())

                net.train()

                print('Epoch {}/{}'.format(epoch+1, epochs),
                     'Step {}'.format(counter),
                     'Loss {:.4f}'.format(loss.item()),
                     'Val Loss {:.4f}'.format(np.mean(val_losses)))


## Hyper-parameters/ Instatiating the net

In [86]:
n_hidden = 512
n_layers = 2
batch_size = 128
seq_length = 100
n_epochs = 20

net = RNN(chars, n_hidden, n_layers).to(device)

## Training

In [87]:
train(net, encoded, device, epochs=n_epochs, batch_size=batch_size, seq_length=seq_length, lr=0.001, print_every=10)

  nn.utils.clip_grad_norm(net.parameters(), clip)


Epoch 1/20 Step 10 Loss 3.2409 Val Loss 3.2064
Epoch 1/20 Step 20 Loss 3.1413 Val Loss 3.1055
Epoch 1/20 Step 30 Loss 3.1421 Val Loss 3.1119
Epoch 1/20 Step 40 Loss 3.1187 Val Loss 3.0910
Epoch 1/20 Step 50 Loss 3.1444 Val Loss 3.1240
Epoch 1/20 Step 60 Loss 3.1237 Val Loss 3.1021
Epoch 1/20 Step 70 Loss 3.1069 Val Loss 3.0928
Epoch 1/20 Step 80 Loss 3.1224 Val Loss 3.1082
Epoch 1/20 Step 90 Loss 3.1179 Val Loss 3.1028
Epoch 1/20 Step 100 Loss 3.0857 Val Loss 3.0712
Epoch 1/20 Step 110 Loss 3.0549 Val Loss 3.0285
Epoch 1/20 Step 120 Loss 2.9515 Val Loss 2.9178
Epoch 1/20 Step 130 Loss 2.8645 Val Loss 2.8085
Epoch 2/20 Step 140 Loss 2.7360 Val Loss 2.6635
Epoch 2/20 Step 150 Loss 2.6413 Val Loss 2.5955
Epoch 2/20 Step 160 Loss 2.5616 Val Loss 2.5219
Epoch 2/20 Step 170 Loss 2.4910 Val Loss 2.4551
Epoch 2/20 Step 180 Loss 2.4616 Val Loss 2.4212
Epoch 2/20 Step 190 Loss 2.4296 Val Loss 2.3898
Epoch 2/20 Step 200 Loss 2.4069 Val Loss 2.3631
Epoch 2/20 Step 210 Loss 2.3724 Val Loss 2.3257
E

Epoch 13/20 Step 1700 Loss 1.3917 Val Loss 1.3502
Epoch 13/20 Step 1710 Loss 1.3752 Val Loss 1.3322
Epoch 13/20 Step 1720 Loss 1.3769 Val Loss 1.3306
Epoch 13/20 Step 1730 Loss 1.4131 Val Loss 1.3723
Epoch 13/20 Step 1740 Loss 1.3904 Val Loss 1.3547
Epoch 13/20 Step 1750 Loss 1.3505 Val Loss 1.3289
Epoch 13/20 Step 1760 Loss 1.3826 Val Loss 1.3481
Epoch 13/20 Step 1770 Loss 1.4036 Val Loss 1.3578
Epoch 13/20 Step 1780 Loss 1.3638 Val Loss 1.3402
Epoch 13/20 Step 1790 Loss 1.3649 Val Loss 1.3290
Epoch 13/20 Step 1800 Loss 1.3872 Val Loss 1.3508
Epoch 14/20 Step 1810 Loss 1.4018 Val Loss 1.3588
Epoch 14/20 Step 1820 Loss 1.3742 Val Loss 1.3442
Epoch 14/20 Step 1830 Loss 1.3821 Val Loss 1.3544
Epoch 14/20 Step 1840 Loss 1.3420 Val Loss 1.3069
Epoch 14/20 Step 1850 Loss 1.3313 Val Loss 1.2948
Epoch 14/20 Step 1860 Loss 1.3827 Val Loss 1.3498
Epoch 14/20 Step 1870 Loss 1.3887 Val Loss 1.3488
Epoch 14/20 Step 1880 Loss 1.3715 Val Loss 1.3349
Epoch 14/20 Step 1890 Loss 1.4070 Val Loss 1.3575


## Sampling function. It is used to create an initial hidden state for the LSTM and to create our novel

In [104]:
def sample(net, device, size, first = 'The', top_k = None ):
    
    net.eval()
    chars = [ch for ch in first]
    h = net.init_hidden(1, device)
    
    for ch in first:
        char, h = predict(net, ch, h, top_k = top_k)
        
    chars.append(char)
    
    for i in range(size):
        char, h = predict(net, chars[-1], h, top_k = top_k)
        chars.append(char)
    
    return ''.join(chars)

In [111]:
def predict(net, char, h = None, top_k = None):
    
    x = np.array([[net.char2int[char]]]) # Double [[]] to emulate batch_size = 1
    x = one_hot_encode(x, len(net.chars))
    
    inputs = torch.from_numpy(x).to(device)
    h = tuple([hidden.data for hidden in h])
    
    output, h = net(inputs, h)
    
    # output has shape (batch*seq_length, num_chars in dictionary)
    
    p = F.softmax(output, dim = 1).data
    
    # getting predictions
    if top_k is None:
        top_ch = np.arange(len(net.chars))
    else:
        p, top_ch = p.topk(top_k)
        top_ch = top_ch.cpu().numpy().squeeze() # De-tensorize and removing batch
        
    p = p.cpu().numpy().squeeze()
    char = np.random.choice(top_ch, p = p/p.sum())
    
    return net.int2char[char], h

In [112]:
print(sample(net, device, 1000, first='Anna', top_k=5))

Anna was
daying about his.

"I'll been to her. And I should go in to that," said Anna, trying to see
them, and he
does not talking to his subject of her
face to strack his wife, was satisficulting and settled thas evening to all happened, and this thoughts
of the
pluthing, at this part were for in a tray of seren exact. Besides he had been there to say that it always decided to him.

"Well, thank you're always satisfied. I'm stopping on all of the sacrial thing."

She conceined how at those were still she felt in the regarted coarse.

"And I did not come to him," said Anna, and his feeling said a sign, which stood stringly
in the carriage she heard the
room, solity at his fore hands.

Stepan Arkadyevitch walked and still sore from surprise in which he felt that he had
come for an understand were so than she had
taken the country. He called to him. She flung town the stream, and thisteled with
horror, and he went out of
the
clear house and daighter.

"A struck over on that table," said 