In [1]:
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn

In [2]:
# open text
with open('../../deep-learning-v2-pytorch/recurrent-neural-networks/char-rnn/data/anna.txt', 'r') as f:
    text = f.read()

In [3]:
# first 100
text[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

In [4]:
# Tokenization
chars = tuple(set(text))
int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}

# encode
encoded = np.array([char2int[ch] for ch in text])

In [5]:
encoded[:100]

array([77, 68, 70, 25,  5, 55, 81, 60, 39, 82, 82, 82, 53, 70, 25, 25, 26,
       60, 79, 70, 23, 27,  8, 27, 55,  7, 60, 70, 81, 55, 60, 70,  8,  8,
       60, 70,  8, 27, 64, 55,  6, 60, 55,  9, 55, 81, 26, 60, 40, 76, 68,
       70, 25, 25, 26, 60, 79, 70, 23, 27,  8, 26, 60, 27,  7, 60, 40, 76,
       68, 70, 25, 25, 26, 60, 27, 76, 60, 27,  5,  7, 60, 42, 24, 76, 82,
       24, 70, 26,  2, 82, 82, 71,  9, 55, 81, 26,  5, 68, 27, 76])

In [6]:
# one hot encode
def one_hot_encode(arr, labels):
    one_hot = np.zeros((np.multiply(*arr.shape),labels), dtype=np.float32 )
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1
    one_hot = one_hot.reshape((*arr.shape, labels))
    return one_hot

In [8]:
test_seq = np.array([[1,2,5]])
one_hot = one_hot_encode(test_seq, 8)
one_hot

array([[[0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0.]]], dtype=float32)

In [9]:
def get_batches(arr, batch_size, seq_length):
    # get the number of full batches
    batch_size_total = batch_size * seq_length
    n_batches = len(arr)//batch_size_total
    # keep only enough chars to make full batches
    arr = arr[:n_batches * batch_size_total]
    # reshape into {batch_size} rows
    arr = arr.reshape((batch_size, -1))
    # loop through the batches using a seq length of 3
    for n in range(0, arr.shape[1], seq_length):
        # features
        x = arr[:, n:n+seq_length]
        # targets
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y

In [10]:
batches = get_batches(encoded, 8, 50)
x, y = next(batches)

In [11]:
# printing out the first 10 items in a sequence
print('x\n', x[:10, :10])
print('\ny\n', y[:10, :10])

x
 [[77 68 70 25  5 55 81 60 39 82]
 [ 7 42 76 60  5 68 70  5 60 70]
 [55 76 11 60 42 81 60 70 60 79]
 [ 7 60  5 68 55 60 57 68 27 55]
 [60  7 70 24 60 68 55 81 60  5]
 [57 40  7  7 27 42 76 60 70 76]
 [60 78 76 76 70 60 68 70 11 60]
 [63 32  8 42 76  7 64 26  2 60]]

y
 [[68 70 25  5 55 81 60 39 82 82]
 [42 76 60  5 68 70  5 60 70  5]
 [76 11 60 42 81 60 70 60 79 42]
 [60  5 68 55 60 57 68 27 55 79]
 [ 7 70 24 60 68 55 81 60  5 55]
 [40  7  7 27 42 76 60 70 76 11]
 [78 76 76 70 60 68 70 11 60  7]
 [32  8 42 76  7 64 26  2 60 28]]


## Implementation

In [12]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [13]:
class CharRNN(nn.Module):
    def __init__(self, tokens, n_hidden=256, n_layers=2, drop_prob=0.3, lr=0.001):
            super().__init__()
            self.drop_prob = drop_prob
            self.n_layers = n_layers
            self.n_hidden = n_hidden
            self.lr = lr
            
            #create char dictionaries
            self.chars = tokens
            self.int2char = dict(enumerate(self.chars))
            self.char2int = {ch: ii for ii, ch in self.int2char.items()}
            
            # model layers
            self.lstm = nn.LSTM(len(self.chars),n_hidden, n_layers, dropout=drop_prob, batch_first=True)
            self.dropout = nn.Dropout(drop_prob)
            self.fc = nn.Linear(n_hidden, len(self.chars))
            
    def forward(self, x, hidden):
        r_output, hidden = self.lstm(x, hidden)
        out = self.dropout(r_output)
        out = out.reshape(-1, self.n_hidden)
        out = self.fc(out)
        return out, hidden
    
    def init_hidden(self, batch_size):
        '''initializes hidden state'''
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
                  weight.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))

        return hidden
                

In [14]:
def train(net, data, epochs=10, batch_size=10, seq_length=50, lr=0.001, clip=5, val_frac=0.1, print_every=10):
    ''' Training a network 
    
        Arguments
        ---------
        
        net: CharRNN network
        data: text data to train the network
        epochs: Number of epochs to train
        batch_size: Number of mini-sequences per mini-batch, aka batch size
        seq_length: Number of character steps per mini-batch
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        print_every: Number of steps for printing training and validation loss
    
    '''
    net.train()
    
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    # create training and validation data
    val_idx = int(len(data)*(1-val_frac))
    data, val_data = data[:val_idx], data[val_idx:]
    net.to(device)
    counter = 0
    n_chars = len(net.chars)
    for e in range(epochs):
        # initialize hidden state
        h = net.init_hidden(batch_size)
        
        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1
            
            # One-hot encode our data and make them Torch tensors
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            inputs, targets = inputs.to(device), targets.to(device)
            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            net.zero_grad()
            
            # get the output from the model
            output, h = net(inputs, h)
            
            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(batch_size*seq_length).long())
            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()
            
            # loss stats
            if counter % print_every == 0:
                # Get validation loss
                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()
                for x, y in get_batches(val_data, batch_size, seq_length):
                    # One-hot encode our data and make them Torch tensors
                    x = one_hot_encode(x, n_chars)
                    x, y = torch.from_numpy(x), torch.from_numpy(y)
                    
                    # Creating new variables for the hidden state, otherwise
                    # we'd backprop through the entire training history
                    val_h = tuple([each.data for each in val_h])
                    
                    inputs, targets = x, y
                    inputs, targets = inputs.to(device), targets.to(device)

                    output, val_h = net(inputs, val_h)
                    val_loss = criterion(output, targets.view(batch_size*seq_length).long())
                
                    val_losses.append(val_loss.item())
                
                net.train() # reset to train mode after iterationg through validation data
                
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))

## Model Instantiation

In [15]:
n_hidden=512
n_layers=2

net = CharRNN(chars, n_hidden, n_layers)
print(net)

CharRNN(
  (lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)


## Training

In [17]:
batch_size = 128
seq_length = 100
n_epochs = 20
# train the model
train(net, encoded, epochs=n_epochs, batch_size=batch_size, seq_length=seq_length, lr=0.001, print_every=10)

Epoch: 1/20... Step: 10... Loss: 2.1378... Val Loss: 2.1357
Epoch: 1/20... Step: 20... Loss: 2.0948... Val Loss: 2.1000
Epoch: 1/20... Step: 30... Loss: 2.0862... Val Loss: 2.0705
Epoch: 1/20... Step: 40... Loss: 2.0376... Val Loss: 2.0529
Epoch: 1/20... Step: 50... Loss: 2.0578... Val Loss: 2.0363
Epoch: 1/20... Step: 60... Loss: 1.9831... Val Loss: 2.0155
Epoch: 1/20... Step: 70... Loss: 1.9877... Val Loss: 2.0023
Epoch: 1/20... Step: 80... Loss: 1.9637... Val Loss: 1.9889
Epoch: 1/20... Step: 90... Loss: 1.9764... Val Loss: 1.9679
Epoch: 1/20... Step: 100... Loss: 1.9306... Val Loss: 1.9502
Epoch: 1/20... Step: 110... Loss: 1.9089... Val Loss: 1.9346
Epoch: 1/20... Step: 120... Loss: 1.8645... Val Loss: 1.9242
Epoch: 1/20... Step: 130... Loss: 1.9011... Val Loss: 1.9029
Epoch: 2/20... Step: 140... Loss: 1.9013... Val Loss: 1.8843
Epoch: 2/20... Step: 150... Loss: 1.8655... Val Loss: 1.8697
Epoch: 2/20... Step: 160... Loss: 1.8662... Val Loss: 1.8559
Epoch: 2/20... Step: 170... Loss:

Epoch: 10/20... Step: 1350... Loss: 1.2161... Val Loss: 1.3071
Epoch: 10/20... Step: 1360... Loss: 1.2106... Val Loss: 1.3045
Epoch: 10/20... Step: 1370... Loss: 1.1954... Val Loss: 1.3045
Epoch: 10/20... Step: 1380... Loss: 1.2395... Val Loss: 1.3005
Epoch: 10/20... Step: 1390... Loss: 1.2575... Val Loss: 1.2956
Epoch: 11/20... Step: 1400... Loss: 1.2445... Val Loss: 1.2968
Epoch: 11/20... Step: 1410... Loss: 1.2469... Val Loss: 1.2949
Epoch: 11/20... Step: 1420... Loss: 1.2536... Val Loss: 1.2914
Epoch: 11/20... Step: 1430... Loss: 1.2207... Val Loss: 1.2987
Epoch: 11/20... Step: 1440... Loss: 1.2292... Val Loss: 1.2982
Epoch: 11/20... Step: 1450... Loss: 1.1646... Val Loss: 1.2968
Epoch: 11/20... Step: 1460... Loss: 1.2024... Val Loss: 1.2952
Epoch: 11/20... Step: 1470... Loss: 1.1803... Val Loss: 1.2972
Epoch: 11/20... Step: 1480... Loss: 1.2149... Val Loss: 1.2960
Epoch: 11/20... Step: 1490... Loss: 1.1952... Val Loss: 1.2925
Epoch: 11/20... Step: 1500... Loss: 1.1908... Val Loss:

Epoch: 20/20... Step: 2660... Loss: 1.1016... Val Loss: 1.2417
Epoch: 20/20... Step: 2670... Loss: 1.1067... Val Loss: 1.2495
Epoch: 20/20... Step: 2680... Loss: 1.0970... Val Loss: 1.2443
Epoch: 20/20... Step: 2690... Loss: 1.0812... Val Loss: 1.2498
Epoch: 20/20... Step: 2700... Loss: 1.0917... Val Loss: 1.2462
Epoch: 20/20... Step: 2710... Loss: 1.0669... Val Loss: 1.2487
Epoch: 20/20... Step: 2720... Loss: 1.0664... Val Loss: 1.2461
Epoch: 20/20... Step: 2730... Loss: 1.0637... Val Loss: 1.2464
Epoch: 20/20... Step: 2740... Loss: 1.0658... Val Loss: 1.2521
Epoch: 20/20... Step: 2750... Loss: 1.0790... Val Loss: 1.2525
Epoch: 20/20... Step: 2760... Loss: 1.0606... Val Loss: 1.2535
Epoch: 20/20... Step: 2770... Loss: 1.1015... Val Loss: 1.2489
Epoch: 20/20... Step: 2780... Loss: 1.1262... Val Loss: 1.2480


## Save the checkpoint

In [24]:
checkpoint = {'n_hidden':net.n_hidden,
             'n_layers':net.n_layers,
             'state_dict':net.state_dict(),
             'tokens':net.chars}
with open('rnn_checkpoint', 'wb')as f:
    torch.save(checkpoint, f)

## Making predictions 

In [30]:
def predict(net, char, h=None, top_k=None):
        ''' Given a character, predict the next character.
            Returns the predicted character and the hidden state.
        '''
        
        # tensor inputs
        x = np.array([[net.char2int[char]]])
        x = one_hot_encode(x, len(net.chars))
        inputs = torch.from_numpy(x)
        inputs = inputs.to(device)
        # detach hidden state from history
        h = tuple([each.data for each in h])
        # get the output of the model
        out, h = net(inputs, h)

        # get the character probabilities
        p = F.softmax(out, dim=1).data
        p = p.cpu() # move to cpu
        
        # get top characters
        if top_k is None:
            top_ch = np.arange(len(net.chars))
        else:
            p, top_ch = p.topk(top_k)
            top_ch = top_ch.numpy().squeeze()
        
        # select the likely next character with some element of randomness
        p = p.numpy().squeeze()
        char = np.random.choice(top_ch, p=p/p.sum())
        
        # return the encoded value of the predicted char and the hidden state
        return net.int2char[char], h

## Priming and generating text

In [32]:
def sample(net, size, prime='The', top_k=None):
    net.to(device)
    
    net.eval() # eval mode
    
    # First off, run through the prime characters
    chars = [ch for ch in prime]
    h = net.init_hidden(1)
    for ch in prime:
        char, h = predict(net, ch, h, top_k=top_k)

    chars.append(char)
    for ii in range(size):
        char, h = predict(net, chars[-1], h, top_k=top_k)
        chars.append(char)

    return ''.join(chars)

In [35]:
print(sample(net, 1000, prime='Anna was', top_k=3))

Anna was
to blame, the sorrow, and she would have thought of her. She
had been so sorry for the strange sound in his heart, but he was not to
see him, but he had to do with the soft of the satisfaction.

"Why, how is it, I am not so much to have the means to me in how
sid I do it. I doesn't begin to be so much to me. But I wanted to say
that you are no more and me for the most feeling."

"I don't understand it," she said to the letter, "and the old prince is
so sorry for her," and with a smile without and struck her and
had no difference of some desire to spare a difficult expression
of her face at the thick other and her feeling.

The significance of the stream with his father had been all that he could
never be done to be as a bit and his father, had, with the most
part in his soul where they were satisfied with them, but she
had been standing.

Alexey Alexandrovitch stopped, she came to the soft of the party as to
his face that she was always seen in the same son.

"What is it you a

## Load checkpoint

In [36]:
# Here we have loaded in a model that trained over 20 epochs
with open('rnn_checkpoint', 'rb') as f:
    checkpoint = torch.load(f)
    
loaded = CharRNN(checkpoint['tokens'], n_hidden=checkpoint['n_hidden'], n_layers=checkpoint['n_layers'])
loaded.load_state_dict(checkpoint['state_dict'])

<All keys matched successfully>

In [38]:
print(sample(loaded, 2000, top_k=5, prime="And she said to levin .."))

And she said to levin ...
and was entering that time he heard her the charce. He was
now and there was not the members of how to be living in the
finances of her figure and the manner. The princess and
the princess saw that she was always to get her thoughts. He
was so good to stopped to her with a sort of study. He was
awful, the sick man sat down at home, and was silent. Because
he showed his head.

"Who has stoubed?"

"No, it's all to go away, and I can't understand that it's not a
possibility of concealing the crop, to be passing me if they're so till
then to see you," said Vronsky, and her lows, were at the
steps of the political expression, and so high todouhed her.

Alexey Alexandrovitch had seen it any once made from the same
peasants, who, she cried not to say and then at the same.

"It's a little garden."

Anna had to get in sincere. But that it was all the contempt of
pains. All that to start here and would not be asked to
tell them.

"If it's the tries to thinks of my death