# Building an RNN in PyTorch

In this notebook, I'll construct a character-level RNN with PyTorch. If you are unfamiliar with character-level RNNs, check out [this great article](http://karpathy.github.io/2015/05/21/rnn-effectiveness/) by Andrej Karpathy. The network will train character by character on some text, then generate new text character by character. As an example, I will train on Anna Karenina, one of my favorite novels. I call this project Anna KaRNNa.

In [2]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable

In [3]:
# Read the whole novel into memory as one string. The encoding is stated
# explicitly so the decoded text does not depend on the platform default.
# NOTE(review): assumes anna.txt is UTF-8 (plain ASCII qualifies) -- confirm.
with open('anna.txt', 'r', encoding='utf-8') as f:
    text = f.read()

Now that we have the text, we encode each character as an integer.

In [4]:
# Vocabulary of distinct characters. It is sorted (and frozen in a tuple) so
# that the character <-> integer mapping is the same on every run; iterating a
# plain set of strings can give a different order in each interpreter session.
chars = tuple(sorted(set(text)))
# Index -> character and character -> index lookup tables.
int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}
# The whole text as an array of integer character indices.
encoded = np.array([char2int[ch] for ch in text])

Now I'll create the batches. We'll take the encoded characters and split them into multiple sequences, given by `n_seqs` (also referred to as "batch size" in other places). Each of those sequences will be `n_steps` long.

In [5]:
def get_batches(arr, n_seqs, n_steps):
    '''Create a generator that returns batches of size
       n_seqs x n_steps from arr.

       Arguments
       ---------
       arr: 1-D numpy array of encoded characters
       n_seqs: number of sequences (rows) in each batch
       n_steps: number of time steps (columns) in each batch

       Yields
       ------
       (x, y) pairs of tensors, each n_seqs x n_steps. y holds, for every
       position of x, the character that follows it in arr. Trailing
       characters that do not fill a complete batch are dropped.
    '''

    batch_size = n_seqs * n_steps
    n_batches = len(arr)//batch_size
    n_kept = n_batches * batch_size

    # Keep only enough characters to make full batches
    inputs = arr[:n_kept]

    # The targets are the inputs shifted by one over the *whole* text, so the
    # last column of every batch is the real next character rather than the
    # first character of the same window. np.roll returns a new array, so
    # arr itself is never modified.
    targets = np.roll(inputs, -1)
    if n_kept and n_kept < len(arr):
        # The character after the last kept one exists in arr: use it.
        targets[-1] = arr[n_kept]
    # Otherwise the very last target wraps around to the first character.

    # Reshape into n_seqs rows
    inputs = inputs.reshape((n_seqs, -1))
    targets = targets.reshape((n_seqs, -1))

    for n in range(0, inputs.shape[1], n_steps):
        # The features
        x = inputs[:, n:n+n_steps]
        # The targets; copied so the resulting tensor is contiguous and can
        # be flattened with .view() by the caller
        y = targets[:, n:n+n_steps].copy()
        yield torch.from_numpy(x), torch.from_numpy(y)

Now I'll build the network.

In [40]:
class CharRNN(nn.Module):
    ''' Character-level language model: embedding -> dropout -> LSTM ->
        dropout -> fully-connected layer over the vocabulary.

        The module also owns its Adam optimizer (`opt`) and its loss
        (`criterion`).

        Arguments
        ---------
        labels: collection of the distinct characters; the position of each
                character in iteration order becomes its integer index
        embed_dim: size of the character embedding
        n_steps: accepted for interface compatibility; not used by the model
        n_hidden: LSTM hidden size
        n_layers: number of stacked LSTM layers
        drop_prob: dropout probability (after the embedding, between LSTM
                   layers and after the LSTM)
        lr: learning rate of the Adam optimizer
    '''
    def __init__(self, labels, embed_dim=50, n_steps=100, 
                               n_hidden=256, n_layers=2,
                               drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        # Character <-> index lookup tables owned by the model
        self.chars = labels
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        
        self.embed = nn.Embedding(len(self.chars), embed_dim)
        self.dropout = nn.Dropout(drop_prob)
        self.lstm = nn.LSTM(embed_dim, n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        self.fc = nn.Linear(n_hidden, len(self.chars))
        
        self.init_weights()
        
        # Created after all layers exist so every parameter is registered
        self.opt = torch.optim.Adam(self.parameters(), lr=lr)
        self.criterion = nn.CrossEntropyLoss()
        
    def forward(self, x, hc):
        ''' Run a batch of character indices through the network.
        
            x is a batch-first tensor of character indices and hc the
            (hidden, cell) state tuple. Returns the scores with one row per
            (sequence, step) pair and one column per character, plus the new
            (hidden, cell) state.
        '''
        x = self.embed(x)
        x = self.dropout(x)
        x, (h, c) = self.lstm(x, hc)
        x = self.dropout(x)
        
        # Stack up LSTM outputs. The batch-first LSTM output is not
        # guaranteed to be contiguous, which .view() requires.
        x = x.contiguous().view(-1, self.n_hidden)
        
        x = self.fc(x)
        
        return x, (h, c)
    
    def predict(self, char, h=None, cuda=False):
        ''' Given a character, predict the next character.
        
            The next character is drawn at random from the predicted
            distribution. The model is moved to the GPU when `cuda` is true
            and to the CPU otherwise.
        
            Returns the predicted character and the hidden state.
        '''
        if cuda:
            self.cuda()
        else:
            self.cpu()
        
        if h is None:
            h = self.init_hidden(1)
        
        # Use the model's own lookup table, not a module-level global
        x = np.array([[self.char2int[char]]])
        inputs = Variable(torch.from_numpy(x), volatile=True)
        if cuda:
            inputs = inputs.cuda()
        
        h = tuple([Variable(each.data, volatile=True) for each in h])
        
        # Dropout must be off while predicting; the previous train/eval
        # mode is restored afterwards.
        was_training = self.training
        self.eval()
        try:
            out, h = self.forward(inputs, h)
        finally:
            self.train(was_training)

        p = F.softmax(out).data
        if cuda:
            p = p.cpu()
        p = p.numpy().squeeze()
        
        char = np.random.choice(np.arange(len(self.chars)), p=p)
        
        return self.int2char[char], h
        
    
    def sample(self, size, prime='The', cuda=False):
        ''' Generate text that starts with `prime`.
        
            The prime is fed through the network to warm up the hidden
            state, one character is predicted from it, and then `size` more
            characters are generated, each from the previous one. The
            returned string therefore has len(prime) + 1 + size characters.
            Leaves the model in eval mode.
        
            Raises ValueError if `prime` is empty.
        '''
        if not prime:
            raise ValueError("prime must contain at least one character")
        
        if cuda:
            self.cuda()
        else:
            self.cpu()
        
        self.eval()
        chars = [ch for ch in prime]
        h = self.init_hidden(1)
        for ch in prime:
            char, h = self.predict(ch, h, cuda=cuda)

        # Only the prediction made after the last prime character is kept
        chars.append(char)

        for ii in range(size):
            char, h = self.predict(chars[-1], h, cuda=cuda)
            chars.append(char)

        return ''.join(chars)
    
    def init_weights(self):
        ''' Initialize the embedding and output-layer parameters. '''
        
        initrange = 0.1
        # Embedding weights as random uniform
        self.embed.weight.data.uniform_(-initrange, initrange)
        # Set bias tensor to all zeros
        self.fc.bias.data.fill_(0)
        # FC weights as random uniform
        self.fc.weight.data.uniform_(-initrange, initrange)
        
    def init_hidden(self, n_seqs):
        ''' Return a zeroed (hidden, cell) state for `n_seqs` sequences. '''
        # Create two new tensors with sizes n_layers x n_seqs x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM.
        # They are built from an existing parameter so they share its
        # tensor type.
        weight = next(self.parameters()).data
        return (Variable(weight.new(self.n_layers, n_seqs, self.n_hidden).zero_()),
                Variable(weight.new(self.n_layers, n_seqs, self.n_hidden).zero_()))
        
        

In [41]:
def train(net, epochs, n_seqs, n_steps, clip=5, cuda=False, print_every=10,
          data=None):
    ''' Train a network on encoded text.
    
        Arguments
        ---------
        net: network to train; must provide init_hidden(), `criterion`
             and `opt`
        epochs: number of passes over the data
        n_seqs: number of sequences per mini-batch
        n_steps: number of character steps per mini-batch
        clip: maximum gradient norm used for gradient clipping
        cuda: train on the GPU when true
        print_every: print the loss every this many steps
        data: array of encoded characters; defaults to the module-level
              `encoded` array
        
        Returns the trained network.
    '''
    if data is None:
        data = encoded
    
    # Newer PyTorch names the in-place clipper clip_grad_norm_; older
    # releases only provide clip_grad_norm.
    clip_grad = getattr(nn.utils, 'clip_grad_norm_', None) or nn.utils.clip_grad_norm
    
    net.train()
    if cuda:
        net.cuda()
    counter = 0
    for e in range(epochs):
        h = net.init_hidden(n_seqs)
        for x, y in get_batches(data, n_seqs, n_steps):
            counter += 1
            
            inputs, targets = Variable(x), Variable(y)
            if cuda:
                inputs, targets = inputs.cuda(), targets.cuda()

            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([Variable(each.data) for each in h])

            net.zero_grad()
            
            output, h = net(inputs, h)
            loss = net.criterion(output, targets.view(n_seqs*n_steps))

            loss.backward()
            
            # Clipping the gradient norm helps prevent the exploding
            # gradient problem in RNNs / LSTMs.
            clip_grad(net.parameters(), clip)

            net.opt.step()
            
            if counter % print_every == 0:
                # Scalar losses are 0-dim tensors on newer PyTorch (read
                # with .item()) and 1-element tensors on older releases.
                loss_value = loss.item() if hasattr(loss, 'item') else loss.data[0]
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}".format(loss_value))
    return net

In [42]:
# Drop any model left over from an earlier run so the next cell starts fresh.
try:
    del net
except NameError:
    # No previous model was defined; nothing to remove.
    pass

In [43]:
# Build a fresh model over the corpus vocabulary; the embedding width is set
# equal to the number of distinct characters.
net = CharRNN(
    chars,
    embed_dim=len(chars),
    n_hidden=512,
    n_layers=2,
    lr=0.001,
)

In [44]:
# Mini-batch shape: 128 sequences of 100 characters each.
n_seqs, n_steps = 128, 100
# Train for one epoch, on the GPU when one is available and on the CPU
# otherwise, so the cell also runs on machines without CUDA.
train(net, 1, n_seqs, n_steps, cuda=torch.cuda.is_available(), clip=5, print_every=10)

Epoch: 1/1... Step: 10... Loss: 3.4131
Epoch: 1/1... Step: 20... Loss: 3.2153
Epoch: 1/1... Step: 30... Loss: 3.1651
Epoch: 1/1... Step: 40... Loss: 3.1379
Epoch: 1/1... Step: 50... Loss: 3.1234
Epoch: 1/1... Step: 60... Loss: 3.0821
Epoch: 1/1... Step: 70... Loss: 3.0256
Epoch: 1/1... Step: 80... Loss: 2.9561
Epoch: 1/1... Step: 90... Loss: 2.8768
Epoch: 1/1... Step: 100... Loss: 2.7406
Epoch: 1/1... Step: 110... Loss: 2.6551
Epoch: 1/1... Step: 120... Loss: 2.5931
Epoch: 1/1... Step: 130... Loss: 2.5311
Epoch: 1/1... Step: 140... Loss: 2.4882
Epoch: 1/1... Step: 150... Loss: 2.4336


CharRNN (
  (embed): Embedding(83, 83)
  (dropout): Dropout (p = 0.5)
  (lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear (512 -> 83)
  (criterion): CrossEntropyLoss (
  )
)

In [45]:
# Draw one next character after 'g'. With the default cuda=False, predict()
# first moves the model to the CPU. As the last expression of the cell, the
# returned (character, hidden state) pair is displayed.
net.predict('g')

('t', (Variable containing:
  ( 0 ,.,.) = 
   -3.2075e-02  1.6754e-02  4.0479e-04  ...   3.3758e-03  1.1412e-01 -2.1982e-03
  
  ( 1 ,.,.) = 
    7.3218e-02  5.8807e-02  3.3345e-02  ...  -8.4269e-03  4.2334e-03  1.5514e-02
  [torch.FloatTensor of size 2x1x512], Variable containing:
  ( 0 ,.,.) = 
   -5.5502e-02  2.7901e-02  8.4663e-04  ...   6.0325e-03  2.3942e-01 -4.3903e-03
  
  ( 1 ,.,.) = 
    1.4547e-01  9.7894e-02  6.5760e-02  ...  -1.9031e-02  8.6778e-03  2.8723e-02
  [torch.FloatTensor of size 2x1x512]))

In [46]:
# Generate text starting from the default prime 'The' with 1000 sampling
# steps, and print it.
print(net.sample(1000))

Thew of bod'f he anstevyahds
sreaptp
ofh nat hingerting peet hised mas eld benly inpnamged fisw hearry thirs bered taur Dalules sisew nec -ong winr thet and, He firhin an them of ashedy mant At cate" out the sosule Thheth teidh wo his he Mupaten." Be alkezind tha lalsey he marue oxe samen thirg sos kamw fide ol
lewer, Leas
atet andilger; the hody wlam no elungsalciln breaf`, thse
tas am teey at vhfanh ta
lrar, 
ot."," "Te os the she paryspreting sorll waets as aed wot Acrile
eCilecsed at lemsan, bit. wold
idec'sionn, wo wAs che prathion it ave 3rtoiss yaxy, _it pishh ogt tiwiiss, at tor."
Bte inton oud
toan foultet of bo cmathing, wpon the ofZemiog han the ult aftary he lis art f rokle. 
reruekg fo-s whe
nereore, Thoe tont, shis cisl; andrerigey hte diol harl, and woud so
ece rassewh the pocseldunx
the i tram,, go thot yI wis on wilntiny" tomed'n,, was
Vussart Yat ta terr an to
te, I dals :w coacy
dad the
waidoinker be, namh funt tha or iflpesdet?. an! to he poaf at asansd. Anver wo pi

In [39]:
def repackage_hidden(h):
    """Wraps hidden states in new Variables, to detach them from their history.

    `h` is either a single Variable or an arbitrarily nested iterable of
    them; the result has the same nesting, expressed with tuples.
    """
    # isinstance (rather than an exact type comparison) also recognises
    # plain tensors on PyTorch versions where Variable and Tensor are merged;
    # otherwise the function would recurse into the rows of the tensor.
    if isinstance(h, Variable):
        return Variable(h.data)
    return tuple(repackage_hidden(v) for v in h)

In [102]:
def sample(net, size, prime='The'):
    """Generate text from `net`, starting with `prime`.

    The prime is fed through the network to warm up the hidden state. The
    first generated character is the most likely one after the prime; the
    following `size` characters are drawn at random from the predicted
    distribution. The result has len(prime) + 1 + size characters.

    The character lookup tables are taken from `net` itself, so the function
    always uses the mapping that belongs to the network. Leaves `net` in
    eval mode.

    Raises ValueError if `prime` is empty.
    """
    if not prime:
        raise ValueError("prime must contain at least one character")

    net.eval()
    # Inputs have to live on the same device as the model's parameters.
    on_gpu = next(net.parameters()).is_cuda

    def step(ch, h):
        # Feed one character; return next-character probabilities and state.
        h = tuple([Variable(each.data) for each in h])
        x = np.array([[net.char2int[ch]]])
        inputs = Variable(torch.from_numpy(x))
        if on_gpu:
            inputs = inputs.cuda()
        out, h = net(inputs, h)
        p = F.softmax(out).data
        if on_gpu:
            p = p.cpu()
        return p, h

    chars = [ch for ch in prime]
    h = net.init_hidden(1)
    for ch in prime:
        p, h = step(ch, h)

    # Most likely character after the prime. Flattening the index tensor
    # first makes the lookup independent of whether max() keeps the reduced
    # dimension.
    chars.append(net.int2char[int(p.max(1)[1].view(-1)[0])])

    for ii in range(size):
        p, h = step(chars[-1], h)
        char = np.random.choice(np.arange(len(net.chars)), p=p.numpy().squeeze())
        chars.append(net.int2char[char])

    return ''.join(chars)

In [103]:
# Generate text with the module-level sample() function, primed with 'Anna'
# and followed by 200 randomly drawn characters, and print it.
print(sample(net, 200, prime='Anna'))

Annall3)&uK6kZaKUfsg0o9rGyf%ORE7E7DnCPTV&(a%49yfDPhRQ0:BU.44)d?&G7BnM':68kaW*%Z:oTezUFxKFJV2TsS4JZ
`u-MxlzFpVXzk?
;`_Y8gSv@hrDe,BInJa(`68alj-VRkx"Dkn;4`)4567a/5ZXT aOXgk?7K@"wD*4o6?F*%:nH(.IqBosI8z!N
NSD J
