***

### Text Generation Example

In [28]:
import torch
from torch import nn
import numpy as np

In [35]:
text = ['hey how are you', 'good i am fine', 'have a nice day']

# Join all the sentences together and extract the unique characters
chars = set(''.join(text))

# Creating a dictionary that maps integers to the characters.
# The characters are sorted first: iterating a set of strings directly can
# yield a different order on every interpreter run (string hashing is
# randomized), which would silently change every index from run to run.
int2char = dict(enumerate(sorted(chars)))

# Creating another dictionary that maps characters to integers
# (the exact inverse of int2char)
char2int = {char: ind for ind, char in int2char.items()}

Next, we'll pad our input sentences so that they all have the same length. While RNNs are typically able to take in variably sized inputs, we usually want to feed training data in batches to speed up the training process. In order to use batches to train on our data, we need to ensure that each sequence within the input data is of equal size.

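 - Note: PyTorch has a dedicated data structure for batches of sequences with different lengths (`torch.nn.utils.rnn.PackedSequence`, created with `pack_padded_sequence` / `pack_sequence`); this example uses plain padding instead.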

In [40]:
# Length of the longest sentence; every shorter sentence is padded up to it
maxlen = max(map(len, text))

# Padding

# Right-pad every sentence (in place, inside the `text` list) with ' '
# so that all sentences end up exactly `maxlen` characters long
for idx, sentence in enumerate(text):
    text[idx] = sentence.ljust(maxlen)

As we're going to predict the next character in the sequence at each time step, we'll have to divide each sentence into:

- Input data: The last input character should be excluded as it does not need to be fed into the model
- Target: One time-step ahead of the input data as this will be the "correct answer" for the model at each time step corresponding to the input data

In [41]:
# Input: every sentence without its last character.
# Target: the same sentence without its first character, i.e. shifted one
# step ahead, so position t of the target is the character that follows
# position t of the input.
input_seq = [sentence[:-1] for sentence in text]
target_seq = [sentence[1:] for sentence in text]

# Show each input/target pair side by side
for source, shifted in zip(input_seq, target_seq):
    print("Input sequence: {}\nTarget sequence: {}".format(source, shifted))

Input sequence: hey how are yo
Target sequence: ey how are you
Input sequence: good i am fine
Target sequence: ood i am fine 
Input sequence: have a nice da
Target sequence: ave a nice day


Now we can convert our input and target sequences to sequences of integers instead of a sequence of characters by mapping them using the dictionaries we created above. This will allow us to one-hot-encode our input sequence subsequently:

In [42]:
# Replace every character by its integer id, overwriting the string
# sequences stored in both lists
for sequences in (input_seq, target_seq):
    for idx, sequence in enumerate(sequences):
        sequences[idx] = [char2int[character] for character in sequence]

In [44]:
# Dimensions of the one-hot encoded input
batch_size = len(text)      # number of sentences
seq_len = maxlen - 1        # one character was dropped from every sentence
dict_size = len(char2int)   # number of distinct characters

def one_hot_encode(sequence, dict_size, seq_len, batch_size):
    """One-hot encode a batch of integer sequences.

    Args:
        sequence: indexable as ``sequence[i][u]``, giving the integer class
            of time step ``u`` in batch element ``i``.
        dict_size: size of the one-hot vector (number of classes).
        seq_len: number of time steps read from every sequence.
        batch_size: number of sequences read from ``sequence``.

    Returns:
        A ``float32`` NumPy array of shape ``(batch_size, seq_len, dict_size)``
        that is 1 at ``[i, u, sequence[i][u]]`` and 0 everywhere else.
    """
    # Start from all zeros in the desired output shape
    features = np.zeros((batch_size, seq_len, dict_size), dtype=np.float32)

    # Visit every (batch element, time step) pair and switch on the slot
    # that corresponds to the character found there
    for row, step in np.ndindex(batch_size, seq_len):
        features[row, step, sequence[row][step]] = 1
    return features

In [45]:
# One-hot encode the integer input sequences.
# Resulting shape --> (Batch Size, Sequence Length, One-Hot Encoding Size)
input_seq = one_hot_encode(
    sequence=input_seq,
    dict_size=dict_size,
    seq_len=seq_len,
    batch_size=batch_size,
)

In [47]:
# Wrap the one-hot encoded inputs (a float32 NumPy array) in a tensor
input_seq = torch.from_numpy(input_seq)

# The targets are class indices, so build an integer (int64) tensor directly.
# The legacy torch.Tensor(...) constructor would store them as float32.
target_seq = torch.tensor(target_seq, dtype=torch.long)

In [51]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")
print("GPU is available" if is_cuda else "GPU not available, CPU used")

GPU not available, CPU used


In [81]:
class RNN(nn.Module):
    """Recurrent network: an ``nn.RNN`` stack followed by a linear layer.

    Args:
        input_size: number of features of the input at each time step.
        output_size: number of features produced by the linear layer for
            each time step.
        hidden_dim: size of the hidden state of the recurrent layer(s).
        n_layers: number of stacked recurrent layers.
    """

    def __init__(self, input_size, output_size, hidden_dim, n_layers):
        super().__init__()

        # Kept for init_hidden() and for the reshape in forward()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        # RNN layer; batch_first=True means inputs are (batch, seq, feature)
        self.rnn = nn.RNN(input_size, hidden_dim, n_layers, batch_first=True)
        # Fully connected layer applied to every time step's hidden output
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x):
        """Run the network on a batch.

        Args:
            x: tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            ``(out, hidden)`` where ``out`` has shape
            ``(batch * seq_len, output_size)`` (all time steps of all
            sequences stacked along the first axis) and ``hidden`` is the
            final hidden state returned by ``nn.RNN``.
        """
        batch_size = x.size(0)

        # Every forward pass starts from an all-zero hidden state
        hidden = self.init_hidden(batch_size)

        # Passing in the input and hidden state into the model and obtaining outputs
        out, hidden = self.rnn(x, hidden)

        # Flatten (batch, seq_len, hidden_dim) -> (batch * seq_len, hidden_dim)
        # so the fully connected layer is applied to every time step
        out = out.contiguous().view(-1, self.hidden_dim)

        out = self.fc(out)

        return out, hidden

    def init_hidden(self, batch_size):
        """Return the initial hidden state: zeros of shape
        ``(n_layers, batch_size, hidden_dim)``.

        The tensor is allocated with the same device and dtype as the
        module's own parameters, so it stays compatible with the RNN layer
        after ``model.to(device)`` or a dtype cast such as ``model.double()``.
        """
        return self.fc.weight.new_zeros(self.n_layers, batch_size, self.hidden_dim)
    

In [83]:
# Seed PyTorch's global random number generator before the weights are
# initialised, so that re-running the notebook starts from the same weights
torch.manual_seed(0)

# Instantiate the model with hyperparameters
model = RNN(input_size=dict_size, output_size=dict_size, hidden_dim=12, n_layers=1)
# Module.to() moves the parameters in place
model.to(device)

# Define hyperparameters
n_epochs = 100
lr = 0.01

# Cross-entropy over the characters; Adam updates all model parameters
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [84]:
# Training Run

# Make sure the model is in training mode even if it was switched to
# eval mode earlier in the session
model.train()

# Tensor.to() returns a new tensor rather than moving the original in place,
# so its result has to be kept. The batch never changes, so it is transferred
# once, outside the loop. The targets are flattened to line up with the
# (batch * seq_len) rows of the model output and cast to int64 class indices.
inputs = input_seq.to(device)
targets = target_seq.view(-1).long().to(device)

for epoch in range(1, n_epochs + 1):
    optimizer.zero_grad()  # clear gradients left over from the previous epoch
    output, hidden = model(inputs)
    loss = criterion(output, targets)
    loss.backward()
    optimizer.step()

    # Report the loss every 10 epochs
    if epoch % 10 == 0:
        print('Epoch: {}/{}.............'.format(epoch, n_epochs), end=' ')
        print("Loss: {:.4f}".format(loss.item()))

Epoch: 10/100............. Loss: 2.3903
Epoch: 20/100............. Loss: 2.1226
Epoch: 30/100............. Loss: 1.7271
Epoch: 40/100............. Loss: 1.2456
Epoch: 50/100............. Loss: 0.8436
Epoch: 60/100............. Loss: 0.5605
Epoch: 70/100............. Loss: 0.3740
Epoch: 80/100............. Loss: 0.2567
Epoch: 90/100............. Loss: 0.1874
Epoch: 100/100............. Loss: 0.1451


Testing the model

In [85]:
def predict(model, character):
    """Predict the character that follows a sequence of characters.

    Args:
        model: network called as ``model(x)`` and returning ``(out, hidden)``,
            where the last row of ``out`` scores the next character.
        character: iterable of characters (a string or a list of
            single-character strings); each one must be a key of ``char2int``.

    Returns:
        ``(next_char, hidden)``: the highest-scoring next character and the
        hidden state returned by the model.
    """
    # One-hot encoding our input to fit the model: shape (1, len, dict_size)
    character = np.array([[char2int[c] for c in character]])
    character = one_hot_encode(character, dict_size, character.shape[1], 1)
    # Tensor.to() is not in-place: keep the tensor it returns
    character = torch.from_numpy(character).to(device)

    # Inference only, so no gradient bookkeeping is needed
    with torch.no_grad():
        out, hidden = model(character)
        # The last row of `out` belongs to the last input character
        prob = nn.functional.softmax(out[-1], dim=0)

    # Taking the class with the highest probability score from the output
    char_ind = torch.max(prob, dim=0)[1].item()

    return int2char[char_ind], hidden

In [86]:
def sample(model, out_len, start = 'hey'):
    """Generate text by repeatedly predicting the next character.

    Args:
        model: network handed to ``predict``; it is put in eval mode first.
        out_len: total length of the returned string, prompt included.
        start: prompt; it is lower-cased before use.

    Returns:
        The lower-cased prompt followed by the generated characters. When
        ``out_len`` does not exceed the prompt length, nothing is generated
        and the lower-cased prompt is returned as is.
    """
    model.eval()  # eval mode

    # The prompt, one character per list entry
    generated = list(start.lower())
    remaining = out_len - len(generated)

    # Feed everything produced so far back in and append the prediction
    for _ in range(remaining):
        next_char, _hidden = predict(model, generated)
        generated.append(next_char)

    return ''.join(generated)

In [87]:
# Ask for 30 characters in total, starting from the prompt 'good'
sample(model, out_len=30, start='good')

'good i am fine a nice day fine'

#### Experiments

In [68]:
input_seq.shape
# Layout: [batch_size, seq_len, num_features]
# batch_size   : number of different sequences in the batch
# seq_len      : number of timesteps in each sequence
# num_features : number of features at each timestep
#                (here the size of the one-hot vector, dict_size)

torch.Size([3, 14, 17])

In [78]:
# NOTE: this rebinds `model` to a freshly initialised (untrained) network
# that is not moved to `device`.
model = RNN(input_size = dict_size, output_size = dict_size, 
            hidden_dim = 12, n_layers = 1)
# input_size  : number of features of each input (timestep)
# output_size : number of features at the final output (in this case, the linear layer);
#               the output is a vector of unnormalized scores (logits), one per
#               character - softmax / cross-entropy turn them into probabilities
# hidden_dim  : size of the hidden state (the memory of the recurrent layer)
# n_layers    : number of recurrent layers stacked together


In [79]:
output, hidden = model(input_seq)
# Testing with an input of size [3, 14, 17]
# After passing the input through the RNN layer, the data has size [3, 14, 12]:
# at each timestep the recurrent module produces an output of length "hidden_dim",
# so the RNN output is a sequence of the same length as the input sequence.
# Before sending the data to the linear layer it is reshaped, stacking all the
# sequences of the batch together.
# The result is a tensor of size [42, 12]; each of the 42 rows is one timestep
# of one sequence.
# The linear layer converts each of these rows into 17 values: unnormalized
# scores (logits) over the characters, scoring the character that comes after
# the given input character.
# For predictions, we take the last row of this tensor.

Input shape
torch.Size([3, 14, 17])
RNN output shape
torch.Size([3, 14, 12])
Reshaped output
torch.Size([42, 12])
Linear output shape
torch.Size([42, 17])
