# Building an RNN in PyTorch

In this notebook, I'll construct a character-level RNN with PyTorch. If you are unfamiliar with character-level RNNs, check out [this great article](http://karpathy.github.io/2015/05/21/rnn-effectiveness/) by Andrej Karpathy. The network will train character by character on some text, then generate new text character by character. As an example, I will train on Anna Karenina, one of my favorite novels. I call this project Anna KaRNNa.

In [1]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable

In [2]:
# Read the whole training corpus into memory as a single string.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale. NOTE(review): assumes anna.txt is UTF-8
# (plain ASCII decodes identically) -- confirm against the data file.
with open('anna.txt', 'r', encoding='utf-8') as f:
    text = f.read()

Now that we have the text, we encode it as integers.

In [3]:
# Vocabulary: every distinct character of the text, in sorted order.
# Sorting makes the char <-> int mapping identical on every run; iterating
# a bare set of strings can give a different order in each interpreter
# session, which would make a saved model's indices meaningless later.
chars = tuple(sorted(set(text)))
int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}
# The whole text as an array of integer character ids
encoded = np.array([char2int[ch] for ch in text])

Now I'll create the batches. We'll take the encoded characters and split them into multiple sequences, given by `n_seqs` (also referred to as "batch size" in other places). Each of those sequences will be `n_steps` long.

In [4]:
def get_batches(arr, n_seqs, n_steps):
    '''Create a generator that returns batches of size
       n_seqs x n_steps from arr.

       Arguments
       ---------
       arr: 1-D NumPy array of encoded characters
       n_seqs: number of sequences (rows) in each batch
       n_steps: number of time steps (columns) in each batch

       Yields
       ------
       (x, y) pairs of torch tensors, each n_seqs x n_steps. y[i, j] is
       the character that follows x[i, j] in arr. Nothing is yielded
       when arr is too short to fill a single batch.
    '''

    batch_size = n_seqs * n_steps
    n_batches = len(arr)//batch_size

    # Not enough data for even one full batch
    if n_batches == 0:
        return

    # Keep only enough characters to make full batches
    n_chars = n_batches * batch_size
    inputs = arr[:n_chars]

    # The targets are the inputs shifted by one position over the whole
    # text, so the last target of every window is the true next character
    # rather than a wrap-around to the start of the same window.
    targets = np.roll(inputs, -1)
    if len(arr) > n_chars:
        # A character survives past the truncation point: use it as the
        # final target. Otherwise the final target wraps to arr[0].
        targets[-1] = arr[n_chars]

    # Reshape into n_seqs rows
    inputs = inputs.reshape((n_seqs, -1))
    targets = targets.reshape((n_seqs, -1))

    for n in range(0, inputs.shape[1], n_steps):
        # The features
        x = inputs[:, n:n+n_steps]
        # The targets; copied so the tensor is contiguous and can be
        # flattened with .view() by the caller
        y = targets[:, n:n+n_steps].copy()
        yield torch.from_numpy(x), torch.from_numpy(y)

Now I'll build the network.

In [59]:
class CharRNN(nn.Module):
    '''Character-level language model: embedding -> dropout -> stacked
       LSTM -> dropout -> fully connected layer over the vocabulary.

       The module also owns its optimizer (`opt`, Adam) and loss
       (`criterion`, cross entropy).

       Arguments
       ---------
       tokens: collection of the distinct characters in the vocabulary.
               Characters are numbered in iteration order, so the data fed
               to the network must be encoded with the same ordering.
       embed_dim: size of the character embedding
       n_steps: not used by the model; kept for backward compatibility
       n_hidden: number of hidden units in each LSTM layer
       n_layers: number of stacked LSTM layers
       drop_prob: dropout probability
       lr: learning rate for the Adam optimizer
    '''
    def __init__(self, tokens, embed_dim=50, n_steps=100,
                               n_hidden=256, n_layers=2,
                               drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        # Character <-> integer lookup tables
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}

        self.embed = nn.Embedding(len(self.chars), embed_dim)
        self.dropout = nn.Dropout(drop_prob)
        # nn.LSTM only applies dropout between stacked layers, so it is
        # switched off for a single layer (this also avoids a warning).
        self.lstm = nn.LSTM(embed_dim, n_hidden, n_layers,
                            dropout=drop_prob if n_layers > 1 else 0,
                            batch_first=True)
        self.fc = nn.Linear(n_hidden, len(self.chars))

        self.init_weights()

        self.opt = torch.optim.Adam(self.parameters(), lr=lr)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x, hc):
        ''' Run a batch of character ids through the network.

            x is a (n_seqs, n_steps) tensor of character ids and hc the
            (hidden, cell) state pair. Returns the scores with shape
            (n_seqs*n_steps, n_chars) and the new (hidden, cell) pair.
        '''
        x = self.embed(x)
        x = self.dropout(x)
        x, (h, c) = self.lstm(x, hc)
        x = self.dropout(x)

        # Stack up LSTM outputs. reshape() is used instead of view()
        # because the batch-first LSTM output is not guaranteed to be
        # contiguous.
        x = x.reshape(-1, self.n_hidden)

        x = self.fc(x)

        return x, (h, c)

    def predict(self, char, h=None, cuda=False, top_k=None):
        ''' Given a character, predict the next character.

            Returns the predicted character and the hidden state.

            The module is moved to the GPU when cuda is True and to the
            CPU otherwise. When top_k is given, the next character is
            sampled from the top_k most likely characters only. The
            train/eval mode is left as the caller set it.
        '''
        if cuda:
            self.cuda()
        else:
            self.cpu()

        if h is None:
            h = self.init_hidden(1)

        inputs = torch.tensor([[self.char2int[char]]], dtype=torch.long)
        if cuda:
            inputs = inputs.cuda()

        # Inference only: no graph is built and the returned hidden state
        # carries no history.
        with torch.no_grad():
            h = tuple(each.detach() for each in h)
            out, h = self(inputs, h)
            # out is (1, n_chars); normalise over the character axis
            p = F.softmax(out, dim=1)

        p = p.cpu().numpy().squeeze()

        if top_k is not None:
            # Zero all but the top_k largest probabilities, renormalise
            p[np.argsort(p)[:-top_k]] = 0
            p = p/p.sum()

        idx = np.random.choice(np.arange(len(self.chars)), p=p)

        return self.int2char[int(idx)], h

    def init_weights(self):
        ''' Initialise the embedding and output-layer weights. '''
        initrange = 0.1
        # Embedding weights as random uniform
        self.embed.weight.data.uniform_(-initrange, initrange)
        # Set bias tensor to all zeros
        self.fc.bias.data.fill_(0)
        # FC weights as random uniform
        self.fc.weight.data.uniform_(-initrange, initrange)

    def init_hidden(self, n_seqs):
        ''' Return zeroed (hidden, cell) states for n_seqs sequences. '''
        # Create two new tensors with sizes n_layers x n_seqs x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM.
        # They are built from a parameter so that they share its dtype
        # and device.
        weight = next(self.parameters()).data
        return (weight.new_zeros(self.n_layers, n_seqs, self.n_hidden),
                weight.new_zeros(self.n_layers, n_seqs, self.n_hidden))
        

In [60]:
def train(net, data, epochs=10, n_seqs=10, n_steps=50, clip=5, cuda=False, print_every=10):
    ''' Train a network on encoded character data.

        Arguments
        ---------
        net: network to train; must provide init_hidden(), criterion
             and opt (as CharRNN does)
        data: 1-D array of encoded characters
        epochs: number of passes over the data
        n_seqs: number of sequences per mini-batch
        n_steps: number of time steps per mini-batch
        clip: maximum gradient norm; pass None to disable clipping
        cuda: train on the GPU when True
        print_every: print the loss every this many steps
    '''
    net.train()
    if cuda:
        net.cuda()
    counter = 0
    for e in range(epochs):
        h = net.init_hidden(n_seqs)
        for x, y in get_batches(data, n_seqs, n_steps):
            counter += 1

            # Embedding lookups and the loss both expect int64 ids
            inputs, targets = x.long(), y.long()
            if cuda:
                inputs, targets = inputs.cuda(), targets.cuda()

            # Detach the hidden state from its history, otherwise
            # we'd backprop through the entire training history
            h = tuple(each.detach() for each in h)

            net.zero_grad()

            output, h = net(inputs, h)
            loss = net.criterion(output, targets.reshape(-1))

            loss.backward()

            # Clipping the gradient norm helps prevent the exploding
            # gradient problem in RNNs / LSTMs.
            if clip is not None:
                nn.utils.clip_grad_norm_(net.parameters(), clip)

            net.opt.step()

            if counter % print_every == 0:
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}".format(loss.item()))

In [61]:
# Drop any model left over from an earlier run of this notebook so the
# next cell starts from a fresh network.
try:
    del net
except NameError:
    pass

In [62]:
# Build the model. The embedding is as wide as the vocabulary.
net = CharRNN(
    chars,
    embed_dim=len(chars),
    n_hidden=512,
    n_layers=2,
    lr=0.001,
)

In [63]:
# Mini-batch geometry: 128 sequences of 100 characters each
n_seqs, n_steps = 128, 100
# Train on the GPU when one is present and fall back to the CPU otherwise,
# so the cell also runs on machines without CUDA.
train(net, encoded, 20, n_seqs=n_seqs, n_steps=n_steps,
      cuda=torch.cuda.is_available(), print_every=10)

Epoch: 1/20... Step: 10... Loss: 3.4410
Epoch: 1/20... Step: 20... Loss: 3.2046
Epoch: 1/20... Step: 30... Loss: 3.1687
Epoch: 1/20... Step: 40... Loss: 3.1429
Epoch: 1/20... Step: 50... Loss: 3.1292
Epoch: 1/20... Step: 60... Loss: 3.0953
Epoch: 1/20... Step: 70... Loss: 3.0578
Epoch: 1/20... Step: 80... Loss: 3.0022
Epoch: 1/20... Step: 90... Loss: 2.9278
Epoch: 1/20... Step: 100... Loss: 2.8086
Epoch: 1/20... Step: 110... Loss: 2.7241
Epoch: 1/20... Step: 120... Loss: 2.6444
Epoch: 1/20... Step: 130... Loss: 2.5750
Epoch: 1/20... Step: 140... Loss: 2.5374
Epoch: 1/20... Step: 150... Loss: 2.4750
Epoch: 2/20... Step: 160... Loss: 2.4355
Epoch: 2/20... Step: 170... Loss: 2.4025
Epoch: 2/20... Step: 180... Loss: 2.3655
Epoch: 2/20... Step: 190... Loss: 2.3453
Epoch: 2/20... Step: 200... Loss: 2.3079
Epoch: 2/20... Step: 210... Loss: 2.2869
Epoch: 2/20... Step: 220... Loss: 2.2533
Epoch: 2/20... Step: 230... Loss: 2.2363
Epoch: 2/20... Step: 240... Loss: 2.2087
Epoch: 2/20... Step: 250.

Epoch: 13/20... Step: 1980... Loss: 1.3633
Epoch: 13/20... Step: 1990... Loss: 1.3426
Epoch: 13/20... Step: 2000... Loss: 1.3631
Epoch: 13/20... Step: 2010... Loss: 1.3447
Epoch: 14/20... Step: 2020... Loss: 1.3139
Epoch: 14/20... Step: 2030... Loss: 1.3140
Epoch: 14/20... Step: 2040... Loss: 1.3429
Epoch: 14/20... Step: 2050... Loss: 1.3424
Epoch: 14/20... Step: 2060... Loss: 1.3335
Epoch: 14/20... Step: 2070... Loss: 1.3300
Epoch: 14/20... Step: 2080... Loss: 1.3588
Epoch: 14/20... Step: 2090... Loss: 1.3585
Epoch: 14/20... Step: 2100... Loss: 1.3404
Epoch: 14/20... Step: 2110... Loss: 1.3409
Epoch: 14/20... Step: 2120... Loss: 1.3209
Epoch: 14/20... Step: 2130... Loss: 1.3047
Epoch: 14/20... Step: 2140... Loss: 1.3133
Epoch: 14/20... Step: 2150... Loss: 1.3360
Epoch: 14/20... Step: 2160... Loss: 1.3614
Epoch: 14/20... Step: 2170... Loss: 1.3010
Epoch: 15/20... Step: 2180... Loss: 1.3334
Epoch: 15/20... Step: 2190... Loss: 1.3381
Epoch: 15/20... Step: 2200... Loss: 1.3409
Epoch: 15/2

In [54]:
# Quick check: sample one character to follow 'g' from the five most
# likely candidates; the returned hidden state is discarded.
ch, _ = net.predict(char='g', top_k=5)

In [19]:
# Manual walk-through of a single prediction step, starting from 'A'
char = 'A'
h = net.init_hidden(1)
# One sequence holding one character id
x = np.array([[char2int[char]]])
inputs = torch.from_numpy(x)

# Inference only. torch.no_grad() replaces the `volatile=True` flag,
# which was removed in PyTorch 0.4.
with torch.no_grad():
    h = tuple(each.detach() for each in h)
    out, h = net(inputs, h)

In [36]:
# Turn the scores into a probability vector over characters. `out` is
# 2-D (rows x characters), so normalise along dim=1; leaving the
# dimension implicit is deprecated.
p = F.softmax(out, dim=1).detach().numpy().squeeze()

In [39]:
# Zero every probability except the five largest (argsort is ascending,
# so all indices but the last five are the ones to clear).
np.put(p, np.argsort(p)[:-5], 0)

In [44]:

# Rescale the surviving probabilities so they sum to one again
p = p / np.sum(p)

In [46]:
# Draw one character index according to the probabilities in p
np.random.choice(np.arange(len(chars)), p=p)

81

In [39]:
def repackage_hidden(h):
    """Detach hidden states from their history.

    `h` is either a single tensor or an arbitrarily nested iterable of
    tensors (such as the (hidden, cell) pair of an LSTM). Tensors are
    returned detached from the autograd graph; iterables are returned as
    tuples with the same nesting.
    """
    # isinstance() rather than an exact type comparison: since PyTorch 0.4
    # tensors are torch.Tensor objects, so `type(h) == Variable` no longer
    # matches them.
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)

In [56]:
def sample(net, size, prime='The', top_k=None, cuda=False):
    ''' Generate text from a trained network.

        Arguments
        ---------
        net: network providing init_hidden() and predict()
        size: number of prediction steps to run after the prime text
        prime: non-empty string used to warm up the hidden state
        top_k: if given, sample only among the top_k most likely characters
        cuda: run on the GPU when True, otherwise on the CPU

        Returns
        -------
        The prime text followed by size + 1 generated characters: one
        predicted from the end of the prime plus one per step.

        Raises
        ------
        ValueError if prime is empty, since there would be no character
        to start predicting from.

        The network is left in evaluation mode.
    '''
    if not prime:
        raise ValueError("prime must contain at least one character")

    if cuda:
        net.cuda()
    else:
        net.cpu()

    net.eval()
    chars = list(prime)
    h = net.init_hidden(1)

    # Feed the prime text through the network to build up the hidden
    # state; only the prediction after its last character is kept.
    for ch in prime:
        char, h = net.predict(ch, h, cuda=cuda, top_k=top_k)

    chars.append(char)

    # Feed each generated character back in to get the next one
    for _ in range(size):
        char, h = net.predict(chars[-1], h, cuda=cuda, top_k=top_k)
        chars.append(char)

    return ''.join(chars)

In [58]:
# Generate text primed with 'Anna', sampling each character from the five
# most likely candidates, and show the result.
print(sample(net, size=200, prime='Anna', top_k=5))

Annag thos tar and wind tor onther whe sartes hit sat ho wore he tasit ant hhate
nhans.
"od har ant ter orotang se wort oud wonte har, ard wit he an serisde
site te teasd he hate tiste whe terans he thes h
