In [1]:
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter

# Check whether GPU is available and can be used
# if CUDA is found then device is set accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if not torch.cuda.is_available():
    print("Consider changing your run-time to GPU or training will be slow.")

In [2]:
with open('data/sonnets.txt', 'r') as f:
    text = f.read()
    
# print an excerpt of the text 
print(text[:200])

From fairest creatures we desire increase,
That thereby beauty's rose might never die,
But as the riper should by time decease,
His tender heir might bear his memory:
But thou contracted to thine own 


In [3]:
# We create two dictionaries:
# 1. int2char, which maps integers to characters
# 2. char2int, which maps characters to integers
chars = sorted(set(text))
int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}

print(chars)
print(char2int)

# Encode the text
encoded = np.array([char2int[ch] for ch in text])

print(f"vocab length: {len(char2int)}")
# Again showing the excerpt, but this time as integers 
encoded[:200]

['\n', ' ', '!', "'", '(', ')', ',', '-', '.', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
{'\n': 0, ' ': 1, '!': 2, "'": 3, '(': 4, ')': 5, ',': 6, '-': 7, '.': 8, ':': 9, ';': 10, '?': 11, 'A': 12, 'B': 13, 'C': 14, 'D': 15, 'E': 16, 'F': 17, 'G': 18, 'H': 19, 'I': 20, 'J': 21, 'K': 22, 'L': 23, 'M': 24, 'N': 25, 'O': 26, 'P': 27, 'R': 28, 'S': 29, 'T': 30, 'U': 31, 'V': 32, 'W': 33, 'Y': 34, 'a': 35, 'b': 36, 'c': 37, 'd': 38, 'e': 39, 'f': 40, 'g': 41, 'h': 42, 'i': 43, 'j': 44, 'k': 45, 'l': 46, 'm': 47, 'n': 48, 'o': 49, 'p': 50, 'q': 51, 'r': 52, 's': 53, 't': 54, 'u': 55, 'v': 56, 'w': 57, 'x': 58, 'y': 59, 'z': 60}
vocab length: 61


array([17, 52, 49, 47,  1, 40, 35, 43, 52, 39, 53, 54,  1, 37, 52, 39, 35,
       54, 55, 52, 39, 53,  1, 57, 39,  1, 38, 39, 53, 43, 52, 39,  1, 43,
       48, 37, 52, 39, 35, 53, 39,  6,  0, 30, 42, 35, 54,  1, 54, 42, 39,
       52, 39, 36, 59,  1, 36, 39, 35, 55, 54, 59,  3, 53,  1, 52, 49, 53,
       39,  1, 47, 43, 41, 42, 54,  1, 48, 39, 56, 39, 52,  1, 38, 43, 39,
        6,  0, 13, 55, 54,  1, 35, 53,  1, 54, 42, 39,  1, 52, 43, 50, 39,
       52,  1, 53, 42, 49, 55, 46, 38,  1, 36, 59,  1, 54, 43, 47, 39,  1,
       38, 39, 37, 39, 35, 53, 39,  6,  0, 19, 43, 53,  1, 54, 39, 48, 38,
       39, 52,  1, 42, 39, 43, 52,  1, 47, 43, 41, 42, 54,  1, 36, 39, 35,
       52,  1, 42, 43, 53,  1, 47, 39, 47, 49, 52, 59,  9,  0, 13, 55, 54,
        1, 54, 42, 49, 55,  1, 37, 49, 48, 54, 52, 35, 37, 54, 39, 38,  1,
       54, 49,  1, 54, 42, 43, 48, 39,  1, 49, 57, 48,  1])

In [4]:
# Defining method to make mini-batches for training
def get_batches(arr, batch_size, seq_length):
    # determine the flattened batch size, i.e. sequence length times batch size
    batch_size_total = batch_size * seq_length
    # total number of batches we can make
    n_batches = len(arr)//batch_size_total
    
    # Keep only enough characters to make full batches
    arr = arr[:n_batches * batch_size_total]
    # Reshape into batch_size rows
    arr = arr.reshape((batch_size, -1))
    
    # iterate through the array, one sequence at a time
    for n in range(0, arr.shape[1], seq_length):
        # The features
        x = arr[:, n:n+seq_length]
        # The targets
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y

In [5]:
foox, fooy = next(get_batches(encoded, 10, 50))
print(foox.shape, fooy.shape)
print(foox[:10, :10])
print(fooy[:10, :10])

(10, 50) (10, 50)
[[17 52 49 47  1 40 35 43 52 39]
 [52 39  1 36 46 39 53 53 39 38]
 [53 54 49 46  3 48  1 40 52 49]
 [52 54 10  0 12 48 38  1 36 59]
 [46 46  1 47 59  1 39 56 39 52]
 [39 48  1 48 55 52 53 39 38  6]
 [49 45 53  1 57 43 54 42  1 47]
 [56 39 52 59  1 53 35 47 39 10]
 [ 1 53 37 59 54 42 39  1 35 48]
 [39 46 53 39 57 42 39 52 39 10]]
[[52 49 47  1 40 35 43 52 39 53]
 [39  1 36 46 39 53 53 39 38  1]
 [54 49 46  3 48  1 40 52 49 47]
 [54 10  0 12 48 38  1 36 59  1]
 [46  1 47 59  1 39 56 39 52 59]
 [48  1 48 55 52 53 39 38  6  1]
 [45 53  1 57 43 54 42  1 47 39]
 [39 52 59  1 53 35 47 39 10  0]
 [53 37 59 54 42 39  1 35 48 38]
 [46 53 39 57 42 39 52 39 10  1]]


In [11]:
def one_hot_encode(arr, n_labels):
    
    # Initialize the the encoded array
    one_hot = np.zeros((np.multiply(*arr.shape), n_labels), dtype=np.float32)
    
    # Fill the appropriate elements with ones
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    
    # Finally reshape it to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot

In [16]:
one_hot_encode(np.array([[1, 2, 3]]), 8)

SyntaxError: invalid syntax (3987770715.py, line 1)

In [7]:
class RNN(nn.Module):
    def __init__(self, chars, device, hidden_sz, drop_prob=0.5):
        super().__init__()
        
        self.device = device
        
        # creating character dictionaries
        # we already have this code on the top, but giving it to our model 
        # will be convenient for doing predictions later
        # i.e. doing conversions from text to integers to one-hot & vice-versa
        self.n_chars = len(chars)
        self.int2char = dict(enumerate(chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        
        self.hidden_sz = hidden_sz
        
        # Note that this class inherits from the torch neural network class
        # Instead of using a pre-built function we will write the math ourselves
        # For this reason we will first need to define "Parameters()", that 
        # the PyTorch graph keeps track of and can optimize. In other words,
        # let's give our class the weights & the bias that the RNN will need. 
        self.weight_ih = Parameter(torch.Tensor(self.n_chars, self.hidden_sz))
        self.weight_hh = Parameter(torch.Tensor(self.hidden_sz, self.hidden_sz))
        self.bias_hh = Parameter(torch.Tensor(self.hidden_sz))
        
        # Now that we have defined the RNN cell, let us define the output layer
        # We will use a dropout layer to prevent overfitting and then 
        # follow with a conventional linear layer (matrix multiplication) that 
        # maps the RNN cell's output (the hidden state of the network) to the 
        # class output. Remembert that the class output corresponds to a 
        # vector of length of unique characters. 
        
        # define a dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        # define the final, fully-connected output layer. We can use a 
        # PyTorch nn function here (or you could add the corresponding math
        # below and assign an additional weight & bias at the top). 
        # We can see that we can create very custom models this way
        self.fc = nn.Linear(self.hidden_sz, self.n_chars)
        
        # We have assigned the Parameters above, but we will need to also 
        # initialize them. Let's write a function for that and initialize
        # our weights and bias. 
        self.init_weights()

    def init_weights(self):
        nn.init.xavier_uniform_(self.weight_ih)
        nn.init.xavier_uniform_(self.weight_hh)
        nn.init.zeros_(self.bias_hh)
    
    def forward(self, x, h_t):
        """Assumes x is of shape (batch, sequence, feature)"""
        bs, seq_sz, _ = x.size()
        hidden_seq = []
        
        # Given an input and an initial hidden state, calculate the next hidden
        # state for each sequence element.
        # We append all the hidden states to a list (similar to a batch size)
        # so that we can concatenate them in the batch and feed them to our
        # last linear layer all in parallel to avoid looping through the final
        # output layer as there is no more dependence on other time steps. 
        for t in range(seq_sz):
            x_t = x[:, t, :]
            h_t = torch.tanh(x_t @ self.weight_ih + h_t @ self.weight_hh + self.bias_hh)
            hidden_seq.append(h_t.unsqueeze(0))
            
        # Do the concatenation and reshaping for convenience
        hidden_seq = torch.cat(hidden_seq, dim=0)
        # reshape from shape (sequence, batch, feature) to (batch, sequence, feature)
        hidden_seq = hidden_seq.transpose(0, 1).contiguous()
        
        # Stack up the RNN outputs using view so that we can process the last 
        # layer in parallel
        r_output = hidden_seq.contiguous().view(-1, self.hidden_sz)
        
        # pass through a dropout layer
        out = self.dropout(r_output)
        
        # Calculate fully connected layer output that yields our class vector
        out = self.fc(out)
        
        return out, h_t
    
    def init_hidden(self, batch_size=1):
        ''' Initializes hidden state '''
        # This is a convenience function so that we can initialize a hidden
        # state to zero when we start prediction on a sequence. Every further
        # step will then depend on the previous hidden state. 
        
        # Create two new tensors with sizes batch_size x n_hidden,
        # initialized to zero for hidden the RNN's hidden state.
        weight = next(self.parameters()).data
        h_t = weight.new(batch_size, self.hidden_sz).zero_().to(device)
        
        return h_t


In [9]:
# Declaring the train method
def train(model, data, device, optimizer, criterion, epochs=10, batch_size=10,
          seq_length=50, clip=5):
    model.train()
    
    for epoch in range(epochs):
        # initialize first hidden states with zeros
        h = model.init_hidden(batch_size)
        
        for x, y in get_batches(data, batch_size, seq_length):

            # One-hot encode our data, make them torch tensors & cast to device
            x = one_hot_encode(x, model.n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            inputs, targets = inputs.to(device), targets.to(device)

            # zero accumulated gradients
            model.zero_grad()
            
            # get the output and hidden state from the model
            output, h = model(inputs, h)
            
            # calculate the loss and perform backprop
            # because we have flattened our batch and sequence in the model to 
            # be able to speed up the connection of the last fully-connected 
            # layer we now also need to view/flatten our target here
            loss = criterion(output, targets.view(batch_size*seq_length).long())
            loss.backward(retain_graph=True)
            
            # we use an additional trick of clipping gradients to avoid 
            # exploding gradients, which is a prominent problem in RNNs, just
            # as the opposite problem of vanishing gradients.
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
            
            
        print("Epoch: {}/{}:".format(epoch + 1, epochs),
              "Loss: {:.4f}:".format(loss.item()))

In [12]:
# Define the model
n_hidden=512
model = RNN(chars, device, n_hidden).to(device)

# Hyperparameters
batch_size = 128
seq_length = 100
epochs = 100 # start with 50 or similar if you are debugging 
# train much longer if you want good results

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# train the model
torch.autograd.set_detect_anomaly(True)
train(model, encoded, device, optimizer, criterion, epochs=epochs,
      batch_size=batch_size, seq_length=seq_length)

Epoch: 1/100: Loss: 3.4647:
Epoch: 2/100: Loss: 3.1898:
Epoch: 3/100: Loss: 6.9307:
Epoch: 4/100: Loss: 3.4230:
Epoch: 5/100: Loss: 3.2569:
Epoch: 6/100: Loss: 3.2210:
Epoch: 7/100: Loss: 3.1619:
Epoch: 8/100: Loss: 3.1369:
Epoch: 9/100: Loss: 3.1115:
Epoch: 10/100: Loss: 3.0961:
Epoch: 11/100: Loss: 3.0699:
Epoch: 12/100: Loss: 3.0454:
Epoch: 13/100: Loss: 3.0142:
Epoch: 14/100: Loss: 2.9752:
Epoch: 15/100: Loss: 2.9368:
Epoch: 16/100: Loss: 2.8977:
Epoch: 17/100: Loss: 2.8545:
Epoch: 18/100: Loss: 2.8478:
Epoch: 19/100: Loss: 2.7960:
Epoch: 20/100: Loss: 2.7517:
Epoch: 21/100: Loss: 2.7314:
Epoch: 22/100: Loss: 2.7080:
Epoch: 23/100: Loss: 2.6567:
Epoch: 24/100: Loss: 2.6560:
Epoch: 25/100: Loss: 2.5973:
Epoch: 26/100: Loss: 2.5711:
Epoch: 27/100: Loss: 2.5369:
Epoch: 28/100: Loss: 2.5350:
Epoch: 29/100: Loss: 2.4932:
Epoch: 30/100: Loss: 2.4671:
Epoch: 31/100: Loss: 2.4558:
Epoch: 32/100: Loss: 2.4390:
Epoch: 33/100: Loss: 2.4167:
Epoch: 34/100: Loss: 2.3915:
Epoch: 35/100: Loss: 2.

In [13]:
def predict(model, char, device, h=None, top_k=5):
        ''' Given a character & hidden state, predict the next character.
            Returns the predicted character and the hidden state.
        '''
        
        # tensor inputs
        x = np.array([[model.char2int[char]]])
        x = one_hot_encode(x, model.n_chars)
        inputs = torch.from_numpy(x).to(device)
        
        with torch.no_grad():
            # get the output of the model
            out, h = model(inputs, h)

            # get the character probabilities
            # move to cpu for further processing with numpy etc. 
            p = F.softmax(out, dim=1).data.cpu()

            # get the top characters with highest likelihood
            p, top_ch = p.topk(top_k)
            top_ch = top_ch.numpy().squeeze()

            # select the likely next character with some element of randomness
            # for more variability
            p = p.numpy().squeeze()
            char = np.random.choice(top_ch, p=p/p.sum())
        
        # return the encoded value of the predicted char and the hidden state
        return model.int2char[char], h

In [14]:
def sample(model, size, device, prime='A', top_k=None):
    # method to generate new text based on a "prime"/initial sequence. 
    # Basically, the outer loop convenience function that calls the above
    # defined predict method. 
    model.eval() # eval mode
    
    # Calculate model for the initial prime characters
    chars = [ch for ch in prime]
    with torch.no_grad():
        # initialize hidden with 0 in the beginning. Set our batch size to 1 
        # as we wish to generate one sequence only. 
        h = model.init_hidden(batch_size=1)
        for ch in prime:
            char, h = predict(model, ch, device, h=h, top_k=top_k)

        # append the characters to the sequence
        chars.append(char)

        # Now pass in the previous/last character and get a new one
        # Repeat this process for the desired length of the sequence to be 
        # generated
        for ii in range(size):
            char, h = predict(model, chars[-1], device, h=h, top_k=top_k)
            chars.append(char)

    return ''.join(chars)

In [15]:
print(sample(model, 1000, device, prime='A', top_k=5))

Asw fare thou mere,
Then thou mist lfat, tere ingust wered frrat thing wald this tay the wall tin willd sathe for neas non the portered se fantis dost hos then se facr ant, bes beath st ald meses atenser, in the round this this peen bred woth,
Ands ald wing that thee thou hos arthen tom hitg tont ferm stel ghoun see seen,
Thet il his felthiss ow the thit, th wisl worl weet be tay sull,
As the best love sor thal,
I do geat y aldesus test beat why shis thin, ald thes with whit thes ithor sored, wath ton sans,
Whin to mat ains besury'd hand with worch tr and.
Or houn thes that the tore be that beast oun sen toul oving inded iras nout sheald and wred
The sthat stoun whotrs if that tee thor with nost beat youg tenet, stor hat theree fees tore wall wore,
Whan thou tort thas en whed,
Whoch bete wort watl will th me teren ald be the singes buse fist wirls to me tind leveny
Whet,
Whechire bus speesse fot dene seerid,
And sull seare thing teer my than to lim hich so fure thit te thie pingur stam