# Character-wise RNNs

In [1]:
# prepare packages
import time
from collections import namedtuple

import numpy as np
import tensorflow as tf

In [2]:
# open text
anna_text_fn = '../data/anna.txt'
def open_text_file(fn):
    with open(anna_text_fn, 'r') as f:
        t = f.read()
    return t
text = open_text_file(anna_text_fn)
text[:50]

'Chapter 1\n\n\nHappy families are all alike; every un'

## Preprocessings

In [3]:
# preprocessing
# get unique characters
vocab = set(text)
print('vocabulary size: ', len(vocab))

vocabulary size:  83


In [4]:
# mark ids to characters
vocab_to_int = {c: i for i, c in enumerate(vocab)}
int_to_vocab = {i: c for c, i in vocab_to_int.items()}

In [5]:
# encode anna text in np.array
encoded = np.array( [vocab_to_int[c] for c in text], dtype=np.int32 )
encoded[:50]

array([33, 51, 68, 46, 77, 41, 13, 45, 73, 50, 50, 50, 69, 68, 46, 46, 61,
       45,  6, 68, 71, 40,  8, 40, 41, 42, 45, 68, 13, 41, 45, 68,  8,  8,
       45, 68,  8, 40, 25, 41, 62, 45, 41, 11, 41, 13, 61, 45, 23, 15])

### Making batches

In [6]:
def get_batches(arr, n_seqs, n_steps):
    '''Create a generator that returns batches of size
       n_seqs x n_steps from arr.
       
       Arguments
       ---------
       arr: Array you want to make batches from
       n_seqs: the number of sequences per batch(sequence length)
       n_steps: Number of sequence steps per batch(how many sequence needed for single batch?)
    '''
    #print('Input arr length: ', len(arr))
    
    # get number of chracters per batch and number of batches we can make
    characters_per_batch = n_seqs * n_steps
    n_batches = len(arr) // characters_per_batch
    
    # keep only enough chracters to make full batches
    arr = arr[:n_batches * characters_per_batch]
    #print('cliped arr length: ', len(arr))
    #print('characters_per_batch: ', characters_per_batch)
    #print('batch size: ', n_batches)
    
    # reshape
    arr = arr.reshape( (n_seqs, -1) )
    #print('reshaped arr: ', arr.shape)
    
    for n in range(0, arr.shape[1], n_steps):
        # the features
        x = arr[:, n:n + n_steps]
        # targets, x shifted by 1
        y = np.zeros_like(x)
        y[:, :-1], y[:, -1] = x[:, 1:], x[:, 0]
        yield x, y

In [7]:
batches = get_batches(encoded, 10, 50)

In [8]:
x, y = next(batches)
print('x\n', x[:10, :10])
print('\ny\n', y[:10, :10])

x
 [[33 51 68 46 77 41 13 45 73 50]
 [45 68 71 45 15 24 77 45 79 24]
 [11 40 15 70 50 50 39 37 41 42]
 [15 45 30 23 13 40 15 79 45 51]
 [45 40 77 45 40 42 36 45 42 40]
 [45  2 77 45 16 68 42 50 24 15]
 [51 41 15 45 28 24 71 41 45  6]
 [62 45 55 23 77 45 15 24 16 45]
 [77 45 40 42 15 63 77 70 45 29]
 [45 42 68 40 30 45 77 24 45 51]]

y
 [[51 68 46 77 41 13 45 73 50 50]
 [68 71 45 15 24 77 45 79 24 40]
 [40 15 70 50 50 39 37 41 42 36]
 [45 30 23 13 40 15 79 45 51 40]
 [40 77 45 40 42 36 45 42 40 13]
 [ 2 77 45 16 68 42 50 24 15  8]
 [41 15 45 28 24 71 41 45  6 24]
 [45 55 23 77 45 15 24 16 45 42]
 [45 40 42 15 63 77 70 45 29 51]
 [42 68 40 30 45 77 24 45 51 41]]


## Building the model

### Inputs

In [9]:
def build_inputs(batch_size, num_steps):
    ''' Define placeholders for inputs, targets, and dropout 
    
        Arguments
        ---------
        batch_size: Batch size, number of sequences per batch
        num_steps: Number of sequence steps in a batch
    '''
    inputs = tf.placeholder( tf.int32, [batch_size, num_steps], name='inputs' )
    targets = tf.placeholder( tf.int32, [batch_size, num_steps], name='targets' )
    
    # keep probability placeholder for drop out layers
    keep_prob = tf.placeholder( tf.float32, name='keep_prob' )
    
    return inputs, targets, keep_prob

### LSTM Cell

In [10]:
def build_lstm(lstm_size, num_layers, batch_size, keep_prob):
    ''' Build LSTM cell.
    
        Arguments
        ---------
        keep_prob: Scalar tensor (tf.placeholder) for the dropout keep probability
        lstm_size: Size of the hidden layers in the LSTM cells
        num_layers: Number of LSTM layers
        batch_size: Batch size
    '''
    # build lstm cell with dropout
    def build_cell(lstm_size, keep_prob):
        lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
        drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
        return drop
    
    # stack up multiple LSTM layers
    cell = tf.contrib.rnn.MultiRNNCell( [build_cell(lstm_size, keep_prob) for _ in range(num_layers)] )
    initial_state = cell.zero_state(batch_size, tf.float32)
    
    return cell, initial_state

### RNN output

In [11]:
def build_output(lstm_output, in_size, out_size):
    ''' Build a softmax layer, return the softmax output and logits.
    
        Arguments
        ---------
        
        x: Input tensor
        in_size: Size of the input tensor, for example, size of the LSTM cells
        out_size: Size of this softmax layer
    '''
    # reshape output so it's a bunch of rows, one row for each step for each sequence.
    seq_output = tf.concat(lstm_output, axis=1)
    x = tf.reshape(seq_output, [-1, in_size])
    
    # connect the RNN outputs to a softmax layer
    with tf.variable_scope('softmax'):
        softmax_w = tf.Variable( tf.truncated_normal([in_size, out_size], stddev=0.1) )
        softmax_b = tf.Variable(  tf.zeros(out_size))
    
    # logits
    logits = tf.matmul(x, softmax_w) + softmax_b
    
    # use softmax to get the probabilities for predicted characters
    out = tf.nn.softmax(logits=logits, name='predictions')
    
    return out, logits

### Training loss

In [12]:
def build_loss(logits, targets, lstm_size, num_classes):
    ''' Calculate the loss from the logits and the targets.
    
        Arguments
        ---------
        logits: Logits from final fully connected layer
        targets: Targets for supervised learning
        lstm_size: Number of LSTM hidden units
        num_classes: Number of classes in targets        
    '''
    # one-hot encoded targets and reshape to match logitsm one row per batch_size per step
    y_one_hot = tf.one_hot(targets, num_classes)
    y_reshaped = tf.reshape(y_one_hot, logits.get_shape())
    
    # softmax cross entrophy loss
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_reshaped)
    loss = tf.reduce_mean(loss)
    
    return loss

### Optimizer

In [13]:
def build_optimizer(loss, learnin_rate, grad_clip):
    ''' Build optmizer for training, using gradient clipping.
    
        Arguments:
        loss: Network loss
        learning_rate: Learning rate for optimizer    
    '''
    # optimizer for training, using gradient clipping to control exploding gradients
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), grad_clip)
    train_op = tf.train.AdamOptimizer(learning_rate=learnin_rate)
    optimizer = train_op.apply_gradients(zip(grads, tvars))
    
    return optimizer

### Build the network

In [14]:
class CharRNN:    
    def __init__(self, num_classes, batch_size=64, num_steps=50,
                 lstm_size=128, num_layers=2, learning_rate=0.001, 
                 grad_clip=5, sampling=False):
        # When we're using this network for sampling later, we'll be passing in
        # one character at a time, so providing an option for that
        if sampling == True:
            batch_size, num_steps = 1, 1
        else:
            batch_size, num_steps = batch_size, num_steps

        tf.reset_default_graph()
        
        # Build the input placeholder tensors
        self.inputs, self.targets, self.keep_prob = build_inputs(batch_size, num_steps)

        # Build the LSTM cell
        cell, self.initial_state = build_lstm(lstm_size, num_layers, batch_size, self.keep_prob)

        ### Run the data through the RNN layers
        # First, one-hot encode the input tokens
        x_one_hot = tf.one_hot(self.inputs, num_classes)
        
        # Run each sequence step through the RNN and collect the outputs
        outputs, state = tf.nn.dynamic_rnn(cell, x_one_hot, initial_state=self.initial_state)
        self.final_state = state
        
        # Get softmax predictions and logits
        self.prediction, self.logits = build_output(outputs, lstm_size, num_classes)
        
        # Loss and optimizer (with gradient clipping)
        self.loss = build_loss(self.logits, self.targets, lstm_size, num_classes)
        self.optimizer = build_optimizer(self.loss, learning_rate, grad_clip)

### Hyperparameters
Here I'm defining the hyperparameters for the network.
* batch_size - Number of sequences running through the network in one pass.
* num_steps - Number of characters in the sequence the network is trained on. Larger is better typically, the network will learn * more long range dependencies. But it takes longer to train. 100 is typically a good number here.
* lstm_size - The number of units in the hidden layers.
* num_layers - Number of hidden LSTM layers to use
* learning_rate - Learning rate for training
* keep_prob - The dropout keep probability when training. If you're network is overfitting, try decreasing this.

In [15]:
batch_size = 100        # Sequences per batch
num_steps = 100         # Number of sequence steps per batch
lstm_size = 512         # Size of hidden layers in LSTMs
num_layers = 2          # Number of LSTM layers
learning_rate = 0.001   # Learning rate
keep_prob = 0.5         # Dropout keep probability

## Training

In [16]:
epochs = 20
# Save every N iterations
save_every_n = 200

model = CharRNN(len(vocab), batch_size=batch_size, num_steps=num_steps,
                lstm_size=lstm_size, num_layers=num_layers, 
                learning_rate=learning_rate)

saver = tf.train.Saver(max_to_keep=100)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    # Use the line below to load a checkpoint and resume training
    #saver.restore(sess, 'checkpoints/______.ckpt')
    counter = 0
    for e in range(epochs):
        # Train network
        new_state = sess.run(model.initial_state)
        loss = 0
        for x, y in get_batches(encoded, batch_size, num_steps):
            counter += 1
            start = time.time()
            feed = {model.inputs: x,
                    model.targets: y,
                    model.keep_prob: keep_prob,
                    model.initial_state: new_state}
            batch_loss, new_state, _ = sess.run([model.loss, 
                                                 model.final_state, 
                                                 model.optimizer], 
                                                 feed_dict=feed)
            
            end = time.time()
            print('Epoch: {}/{}... '.format(e+1, epochs),
                  'Training Step: {}... '.format(counter),
                  'Training loss: {:.4f}... '.format(batch_loss),
                  '{:.4f} sec/batch'.format((end-start)))
        
            if (counter % save_every_n == 0):
                saver.save(sess, "checkpoints/i{}_l{}.ckpt".format(counter, lstm_size))
    
    saver.save(sess, "checkpoints/i{}_l{}.ckpt".format(counter, lstm_size))

Epoch: 1/20...  Training Step: 1...  Training loss: 4.4209...  0.3951 sec/batch
Epoch: 1/20...  Training Step: 2...  Training loss: 4.3325...  0.1530 sec/batch
Epoch: 1/20...  Training Step: 3...  Training loss: 3.8526...  0.1530 sec/batch
Epoch: 1/20...  Training Step: 4...  Training loss: 5.2568...  0.1490 sec/batch
Epoch: 1/20...  Training Step: 5...  Training loss: 4.3578...  0.1510 sec/batch
Epoch: 1/20...  Training Step: 6...  Training loss: 3.9851...  0.1500 sec/batch
Epoch: 1/20...  Training Step: 7...  Training loss: 3.8226...  0.1500 sec/batch
Epoch: 1/20...  Training Step: 8...  Training loss: 3.6760...  0.1640 sec/batch
Epoch: 1/20...  Training Step: 9...  Training loss: 3.5467...  0.1510 sec/batch
Epoch: 1/20...  Training Step: 10...  Training loss: 3.4759...  0.1490 sec/batch
Epoch: 1/20...  Training Step: 11...  Training loss: 3.4065...  0.1500 sec/batch
Epoch: 1/20...  Training Step: 12...  Training loss: 3.3933...  0.1480 sec/batch
Epoch: 1/20...  Training Step: 13... 

Epoch: 1/20...  Training Step: 103...  Training loss: 3.0997...  0.1500 sec/batch
Epoch: 1/20...  Training Step: 104...  Training loss: 3.0879...  0.1490 sec/batch
Epoch: 1/20...  Training Step: 105...  Training loss: 3.0928...  0.1500 sec/batch
Epoch: 1/20...  Training Step: 106...  Training loss: 3.0929...  0.1500 sec/batch
Epoch: 1/20...  Training Step: 107...  Training loss: 3.0609...  0.1510 sec/batch
Epoch: 1/20...  Training Step: 108...  Training loss: 3.0717...  0.1510 sec/batch
Epoch: 1/20...  Training Step: 109...  Training loss: 3.0749...  0.1490 sec/batch
Epoch: 1/20...  Training Step: 110...  Training loss: 3.0536...  0.1560 sec/batch
Epoch: 1/20...  Training Step: 111...  Training loss: 3.0615...  0.1500 sec/batch
Epoch: 1/20...  Training Step: 112...  Training loss: 3.0634...  0.1490 sec/batch
Epoch: 1/20...  Training Step: 113...  Training loss: 3.0507...  0.1520 sec/batch
Epoch: 1/20...  Training Step: 114...  Training loss: 3.0491...  0.1510 sec/batch
Epoch: 1/20...  

Epoch: 2/20...  Training Step: 203...  Training loss: 2.5040...  0.1520 sec/batch
Epoch: 2/20...  Training Step: 204...  Training loss: 2.5013...  0.1500 sec/batch
Epoch: 2/20...  Training Step: 205...  Training loss: 2.5007...  0.1690 sec/batch
Epoch: 2/20...  Training Step: 206...  Training loss: 2.5042...  0.1510 sec/batch
Epoch: 2/20...  Training Step: 207...  Training loss: 2.5241...  0.1510 sec/batch
Epoch: 2/20...  Training Step: 208...  Training loss: 2.4802...  0.1500 sec/batch
Epoch: 2/20...  Training Step: 209...  Training loss: 2.4751...  0.1500 sec/batch
Epoch: 2/20...  Training Step: 210...  Training loss: 2.4849...  0.1490 sec/batch
Epoch: 2/20...  Training Step: 211...  Training loss: 2.4822...  0.1520 sec/batch
Epoch: 2/20...  Training Step: 212...  Training loss: 2.5042...  0.1510 sec/batch
Epoch: 2/20...  Training Step: 213...  Training loss: 2.4770...  0.1501 sec/batch
Epoch: 2/20...  Training Step: 214...  Training loss: 2.4754...  0.1570 sec/batch
Epoch: 2/20...  

Epoch: 2/20...  Training Step: 303...  Training loss: 2.2680...  0.1590 sec/batch
Epoch: 2/20...  Training Step: 304...  Training loss: 2.2755...  0.1530 sec/batch
Epoch: 2/20...  Training Step: 305...  Training loss: 2.2770...  0.1520 sec/batch
Epoch: 2/20...  Training Step: 306...  Training loss: 2.2965...  0.1540 sec/batch
Epoch: 2/20...  Training Step: 307...  Training loss: 2.2876...  0.1500 sec/batch
Epoch: 2/20...  Training Step: 308...  Training loss: 2.2520...  0.1570 sec/batch
Epoch: 2/20...  Training Step: 309...  Training loss: 2.2772...  0.1500 sec/batch
Epoch: 2/20...  Training Step: 310...  Training loss: 2.2881...  0.1500 sec/batch
Epoch: 2/20...  Training Step: 311...  Training loss: 2.2550...  0.1540 sec/batch
Epoch: 2/20...  Training Step: 312...  Training loss: 2.2474...  0.1500 sec/batch
Epoch: 2/20...  Training Step: 313...  Training loss: 2.2503...  0.1500 sec/batch
Epoch: 2/20...  Training Step: 314...  Training loss: 2.2175...  0.1510 sec/batch
Epoch: 2/20...  

Epoch: 3/20...  Training Step: 403...  Training loss: 2.1251...  0.1544 sec/batch
Epoch: 3/20...  Training Step: 404...  Training loss: 2.1314...  0.1555 sec/batch
Epoch: 3/20...  Training Step: 405...  Training loss: 2.1561...  0.1572 sec/batch
Epoch: 3/20...  Training Step: 406...  Training loss: 2.1309...  0.1542 sec/batch
Epoch: 3/20...  Training Step: 407...  Training loss: 2.1149...  0.1555 sec/batch
Epoch: 3/20...  Training Step: 408...  Training loss: 2.1030...  0.1485 sec/batch
Epoch: 3/20...  Training Step: 409...  Training loss: 2.1248...  0.1520 sec/batch
Epoch: 3/20...  Training Step: 410...  Training loss: 2.1595...  0.1500 sec/batch
Epoch: 3/20...  Training Step: 411...  Training loss: 2.1119...  0.1530 sec/batch
Epoch: 3/20...  Training Step: 412...  Training loss: 2.0987...  0.1500 sec/batch
Epoch: 3/20...  Training Step: 413...  Training loss: 2.1140...  0.1520 sec/batch
Epoch: 3/20...  Training Step: 414...  Training loss: 2.1510...  0.1540 sec/batch
Epoch: 3/20...  

Epoch: 3/20...  Training Step: 503...  Training loss: 1.9959...  0.1500 sec/batch
Epoch: 3/20...  Training Step: 504...  Training loss: 2.0101...  0.1519 sec/batch
Epoch: 3/20...  Training Step: 505...  Training loss: 2.0088...  0.1545 sec/batch
Epoch: 3/20...  Training Step: 506...  Training loss: 2.0068...  0.1565 sec/batch
Epoch: 3/20...  Training Step: 507...  Training loss: 1.9975...  0.1670 sec/batch
Epoch: 3/20...  Training Step: 508...  Training loss: 1.9894...  0.1490 sec/batch
Epoch: 3/20...  Training Step: 509...  Training loss: 1.9883...  0.1500 sec/batch
Epoch: 3/20...  Training Step: 510...  Training loss: 1.9804...  0.1580 sec/batch
Epoch: 3/20...  Training Step: 511...  Training loss: 1.9757...  0.1500 sec/batch
Epoch: 3/20...  Training Step: 512...  Training loss: 1.9441...  0.1480 sec/batch
Epoch: 3/20...  Training Step: 513...  Training loss: 1.9909...  0.1550 sec/batch
Epoch: 3/20...  Training Step: 514...  Training loss: 1.9712...  0.1490 sec/batch
Epoch: 3/20...  

Epoch: 4/20...  Training Step: 603...  Training loss: 1.9436...  0.1555 sec/batch
Epoch: 4/20...  Training Step: 604...  Training loss: 1.9001...  0.1534 sec/batch
Epoch: 4/20...  Training Step: 605...  Training loss: 1.8955...  0.1520 sec/batch
Epoch: 4/20...  Training Step: 606...  Training loss: 1.8879...  0.1525 sec/batch
Epoch: 4/20...  Training Step: 607...  Training loss: 1.9029...  0.1515 sec/batch
Epoch: 4/20...  Training Step: 608...  Training loss: 1.9493...  0.1629 sec/batch
Epoch: 4/20...  Training Step: 609...  Training loss: 1.8956...  0.1579 sec/batch
Epoch: 4/20...  Training Step: 610...  Training loss: 1.8735...  0.1490 sec/batch
Epoch: 4/20...  Training Step: 611...  Training loss: 1.9024...  0.1620 sec/batch
Epoch: 4/20...  Training Step: 612...  Training loss: 1.9443...  0.1580 sec/batch
Epoch: 4/20...  Training Step: 613...  Training loss: 1.9068...  0.1630 sec/batch
Epoch: 4/20...  Training Step: 614...  Training loss: 1.9032...  0.1510 sec/batch
Epoch: 4/20...  

Epoch: 4/20...  Training Step: 703...  Training loss: 1.8364...  0.1500 sec/batch
Epoch: 4/20...  Training Step: 704...  Training loss: 1.8325...  0.1520 sec/batch
Epoch: 4/20...  Training Step: 705...  Training loss: 1.8178...  0.1500 sec/batch
Epoch: 4/20...  Training Step: 706...  Training loss: 1.8078...  0.1520 sec/batch
Epoch: 4/20...  Training Step: 707...  Training loss: 1.8126...  0.1500 sec/batch
Epoch: 4/20...  Training Step: 708...  Training loss: 1.8045...  0.1510 sec/batch
Epoch: 4/20...  Training Step: 709...  Training loss: 1.7930...  0.1490 sec/batch
Epoch: 4/20...  Training Step: 710...  Training loss: 1.7809...  0.1540 sec/batch
Epoch: 4/20...  Training Step: 711...  Training loss: 1.8204...  0.1560 sec/batch
Epoch: 4/20...  Training Step: 712...  Training loss: 1.8082...  0.1511 sec/batch
Epoch: 4/20...  Training Step: 713...  Training loss: 1.8188...  0.1610 sec/batch
Epoch: 4/20...  Training Step: 714...  Training loss: 1.8086...  0.1490 sec/batch
Epoch: 4/20...  

Epoch: 5/20...  Training Step: 803...  Training loss: 1.7433...  0.1540 sec/batch
Epoch: 5/20...  Training Step: 804...  Training loss: 1.7409...  0.1490 sec/batch
Epoch: 5/20...  Training Step: 805...  Training loss: 1.7610...  0.1520 sec/batch
Epoch: 5/20...  Training Step: 806...  Training loss: 1.7988...  0.1500 sec/batch
Epoch: 5/20...  Training Step: 807...  Training loss: 1.7528...  0.1500 sec/batch
Epoch: 5/20...  Training Step: 808...  Training loss: 1.7333...  0.1520 sec/batch
Epoch: 5/20...  Training Step: 809...  Training loss: 1.7570...  0.1510 sec/batch
Epoch: 5/20...  Training Step: 810...  Training loss: 1.7927...  0.1500 sec/batch
Epoch: 5/20...  Training Step: 811...  Training loss: 1.7625...  0.1510 sec/batch
Epoch: 5/20...  Training Step: 812...  Training loss: 1.7587...  0.1600 sec/batch
Epoch: 5/20...  Training Step: 813...  Training loss: 1.7427...  0.1520 sec/batch
Epoch: 5/20...  Training Step: 814...  Training loss: 1.7836...  0.1490 sec/batch
Epoch: 5/20...  

Epoch: 5/20...  Training Step: 903...  Training loss: 1.6983...  0.1500 sec/batch
Epoch: 5/20...  Training Step: 904...  Training loss: 1.6989...  0.1560 sec/batch
Epoch: 5/20...  Training Step: 905...  Training loss: 1.6984...  0.1530 sec/batch
Epoch: 5/20...  Training Step: 906...  Training loss: 1.6915...  0.1490 sec/batch
Epoch: 5/20...  Training Step: 907...  Training loss: 1.6766...  0.1540 sec/batch
Epoch: 5/20...  Training Step: 908...  Training loss: 1.6563...  0.1500 sec/batch
Epoch: 5/20...  Training Step: 909...  Training loss: 1.7027...  0.1500 sec/batch
Epoch: 5/20...  Training Step: 910...  Training loss: 1.6896...  0.1500 sec/batch
Epoch: 5/20...  Training Step: 911...  Training loss: 1.6930...  0.1510 sec/batch
Epoch: 5/20...  Training Step: 912...  Training loss: 1.6912...  0.1530 sec/batch
Epoch: 5/20...  Training Step: 913...  Training loss: 1.7035...  0.1520 sec/batch
Epoch: 5/20...  Training Step: 914...  Training loss: 1.6743...  0.1590 sec/batch
Epoch: 5/20...  

Epoch: 6/20...  Training Step: 1003...  Training loss: 1.6645...  0.1530 sec/batch
Epoch: 6/20...  Training Step: 1004...  Training loss: 1.6985...  0.1540 sec/batch
Epoch: 6/20...  Training Step: 1005...  Training loss: 1.6454...  0.1570 sec/batch
Epoch: 6/20...  Training Step: 1006...  Training loss: 1.6328...  0.1500 sec/batch
Epoch: 6/20...  Training Step: 1007...  Training loss: 1.6602...  0.1530 sec/batch
Epoch: 6/20...  Training Step: 1008...  Training loss: 1.6824...  0.1530 sec/batch
Epoch: 6/20...  Training Step: 1009...  Training loss: 1.6560...  0.1540 sec/batch
Epoch: 6/20...  Training Step: 1010...  Training loss: 1.6583...  0.1560 sec/batch
Epoch: 6/20...  Training Step: 1011...  Training loss: 1.6561...  0.1490 sec/batch
Epoch: 6/20...  Training Step: 1012...  Training loss: 1.6874...  0.1510 sec/batch
Epoch: 6/20...  Training Step: 1013...  Training loss: 1.6428...  0.1540 sec/batch
Epoch: 6/20...  Training Step: 1014...  Training loss: 1.6571...  0.1510 sec/batch
Epoc

Epoch: 6/20...  Training Step: 1103...  Training loss: 1.6131...  0.1520 sec/batch
Epoch: 6/20...  Training Step: 1104...  Training loss: 1.6012...  0.1560 sec/batch
Epoch: 6/20...  Training Step: 1105...  Training loss: 1.5834...  0.1510 sec/batch
Epoch: 6/20...  Training Step: 1106...  Training loss: 1.5746...  0.1490 sec/batch
Epoch: 6/20...  Training Step: 1107...  Training loss: 1.6190...  0.1580 sec/batch
Epoch: 6/20...  Training Step: 1108...  Training loss: 1.6065...  0.1490 sec/batch
Epoch: 6/20...  Training Step: 1109...  Training loss: 1.6073...  0.1550 sec/batch
Epoch: 6/20...  Training Step: 1110...  Training loss: 1.5998...  0.1490 sec/batch
Epoch: 6/20...  Training Step: 1111...  Training loss: 1.6162...  0.1500 sec/batch
Epoch: 6/20...  Training Step: 1112...  Training loss: 1.5658...  0.1500 sec/batch
Epoch: 6/20...  Training Step: 1113...  Training loss: 1.5677...  0.1520 sec/batch
Epoch: 6/20...  Training Step: 1114...  Training loss: 1.6164...  0.1500 sec/batch
Epoc

Epoch: 7/20...  Training Step: 1203...  Training loss: 1.5707...  0.1520 sec/batch
Epoch: 7/20...  Training Step: 1204...  Training loss: 1.5629...  0.1500 sec/batch
Epoch: 7/20...  Training Step: 1205...  Training loss: 1.5837...  0.1510 sec/batch
Epoch: 7/20...  Training Step: 1206...  Training loss: 1.6046...  0.1510 sec/batch
Epoch: 7/20...  Training Step: 1207...  Training loss: 1.5820...  0.1550 sec/batch
Epoch: 7/20...  Training Step: 1208...  Training loss: 1.5936...  0.1520 sec/batch
Epoch: 7/20...  Training Step: 1209...  Training loss: 1.5696...  0.1500 sec/batch
Epoch: 7/20...  Training Step: 1210...  Training loss: 1.5934...  0.1500 sec/batch
Epoch: 7/20...  Training Step: 1211...  Training loss: 1.5554...  0.1540 sec/batch
Epoch: 7/20...  Training Step: 1212...  Training loss: 1.5731...  0.1520 sec/batch
Epoch: 7/20...  Training Step: 1213...  Training loss: 1.5847...  0.1530 sec/batch
Epoch: 7/20...  Training Step: 1214...  Training loss: 1.5326...  0.1500 sec/batch
Epoc

Epoch: 7/20...  Training Step: 1303...  Training loss: 1.5138...  0.1520 sec/batch
Epoch: 7/20...  Training Step: 1304...  Training loss: 1.5069...  0.1490 sec/batch
Epoch: 7/20...  Training Step: 1305...  Training loss: 1.5474...  0.1530 sec/batch
Epoch: 7/20...  Training Step: 1306...  Training loss: 1.5426...  0.1490 sec/batch
Epoch: 7/20...  Training Step: 1307...  Training loss: 1.5337...  0.1520 sec/batch
Epoch: 7/20...  Training Step: 1308...  Training loss: 1.5302...  0.1510 sec/batch
Epoch: 7/20...  Training Step: 1309...  Training loss: 1.5373...  0.1510 sec/batch
Epoch: 7/20...  Training Step: 1310...  Training loss: 1.5038...  0.1490 sec/batch
Epoch: 7/20...  Training Step: 1311...  Training loss: 1.5018...  0.1510 sec/batch
Epoch: 7/20...  Training Step: 1312...  Training loss: 1.5528...  0.1520 sec/batch
Epoch: 7/20...  Training Step: 1313...  Training loss: 1.5427...  0.1520 sec/batch
Epoch: 7/20...  Training Step: 1314...  Training loss: 1.5043...  0.1490 sec/batch
Epoc

Epoch: 8/20...  Training Step: 1403...  Training loss: 1.5286...  0.1510 sec/batch
Epoch: 8/20...  Training Step: 1404...  Training loss: 1.5390...  0.1510 sec/batch
Epoch: 8/20...  Training Step: 1405...  Training loss: 1.5306...  0.1630 sec/batch
Epoch: 8/20...  Training Step: 1406...  Training loss: 1.5391...  0.1490 sec/batch
Epoch: 8/20...  Training Step: 1407...  Training loss: 1.5074...  0.1540 sec/batch
Epoch: 8/20...  Training Step: 1408...  Training loss: 1.5336...  0.1510 sec/batch
Epoch: 8/20...  Training Step: 1409...  Training loss: 1.5120...  0.1530 sec/batch
Epoch: 8/20...  Training Step: 1410...  Training loss: 1.5219...  0.1490 sec/batch
Epoch: 8/20...  Training Step: 1411...  Training loss: 1.5274...  0.1530 sec/batch
Epoch: 8/20...  Training Step: 1412...  Training loss: 1.4812...  0.1530 sec/batch
Epoch: 8/20...  Training Step: 1413...  Training loss: 1.4777...  0.1540 sec/batch
Epoch: 8/20...  Training Step: 1414...  Training loss: 1.5279...  0.1490 sec/batch
Epoc

Epoch: 8/20...  Training Step: 1503...  Training loss: 1.4858...  0.1500 sec/batch
Epoch: 8/20...  Training Step: 1504...  Training loss: 1.4939...  0.1490 sec/batch
Epoch: 8/20...  Training Step: 1505...  Training loss: 1.4812...  0.1520 sec/batch
Epoch: 8/20...  Training Step: 1506...  Training loss: 1.4784...  0.1540 sec/batch
Epoch: 8/20...  Training Step: 1507...  Training loss: 1.4858...  0.1500 sec/batch
Epoch: 8/20...  Training Step: 1508...  Training loss: 1.4530...  0.1520 sec/batch
Epoch: 8/20...  Training Step: 1509...  Training loss: 1.4463...  0.1540 sec/batch
Epoch: 8/20...  Training Step: 1510...  Training loss: 1.4968...  0.1490 sec/batch
Epoch: 8/20...  Training Step: 1511...  Training loss: 1.4862...  0.1580 sec/batch
Epoch: 8/20...  Training Step: 1512...  Training loss: 1.4443...  0.1500 sec/batch
Epoch: 8/20...  Training Step: 1513...  Training loss: 1.4935...  0.1520 sec/batch
Epoch: 8/20...  Training Step: 1514...  Training loss: 1.4908...  0.1570 sec/batch
Epoc

Epoch: 9/20...  Training Step: 1603...  Training loss: 1.4806...  0.1510 sec/batch
Epoch: 9/20...  Training Step: 1604...  Training loss: 1.4903...  0.1600 sec/batch
Epoch: 9/20...  Training Step: 1605...  Training loss: 1.4552...  0.1520 sec/batch
Epoch: 9/20...  Training Step: 1606...  Training loss: 1.4925...  0.1540 sec/batch
Epoch: 9/20...  Training Step: 1607...  Training loss: 1.4610...  0.1560 sec/batch
Epoch: 9/20...  Training Step: 1608...  Training loss: 1.4876...  0.1490 sec/batch
Epoch: 9/20...  Training Step: 1609...  Training loss: 1.4694...  0.1500 sec/batch
Epoch: 9/20...  Training Step: 1610...  Training loss: 1.4274...  0.1500 sec/batch
Epoch: 9/20...  Training Step: 1611...  Training loss: 1.4442...  0.1500 sec/batch
Epoch: 9/20...  Training Step: 1612...  Training loss: 1.4847...  0.1530 sec/batch
Epoch: 9/20...  Training Step: 1613...  Training loss: 1.4804...  0.1510 sec/batch
Epoch: 9/20...  Training Step: 1614...  Training loss: 1.4829...  0.1510 sec/batch
Epoc

Epoch: 9/20...  Training Step: 1703...  Training loss: 1.4250...  0.1510 sec/batch
Epoch: 9/20...  Training Step: 1704...  Training loss: 1.4365...  0.1500 sec/batch
Epoch: 9/20...  Training Step: 1705...  Training loss: 1.4364...  0.1500 sec/batch
Epoch: 9/20...  Training Step: 1706...  Training loss: 1.4154...  0.1490 sec/batch
Epoch: 9/20...  Training Step: 1707...  Training loss: 1.3990...  0.1500 sec/batch
Epoch: 9/20...  Training Step: 1708...  Training loss: 1.4385...  0.1490 sec/batch
Epoch: 9/20...  Training Step: 1709...  Training loss: 1.4432...  0.1500 sec/batch
Epoch: 9/20...  Training Step: 1710...  Training loss: 1.4052...  0.1550 sec/batch
Epoch: 9/20...  Training Step: 1711...  Training loss: 1.4566...  0.1490 sec/batch
Epoch: 9/20...  Training Step: 1712...  Training loss: 1.4530...  0.1510 sec/batch
Epoch: 9/20...  Training Step: 1713...  Training loss: 1.4338...  0.1490 sec/batch
Epoch: 9/20...  Training Step: 1714...  Training loss: 1.4056...  0.1520 sec/batch
Epoc

Epoch: 10/20...  Training Step: 1803...  Training loss: 1.4215...  0.1530 sec/batch
Epoch: 10/20...  Training Step: 1804...  Training loss: 1.4426...  0.1490 sec/batch
Epoch: 10/20...  Training Step: 1805...  Training loss: 1.4143...  0.1520 sec/batch
Epoch: 10/20...  Training Step: 1806...  Training loss: 1.4322...  0.1510 sec/batch
Epoch: 10/20...  Training Step: 1807...  Training loss: 1.4211...  0.1510 sec/batch
Epoch: 10/20...  Training Step: 1808...  Training loss: 1.3802...  0.1500 sec/batch
Epoch: 10/20...  Training Step: 1809...  Training loss: 1.3914...  0.1500 sec/batch
Epoch: 10/20...  Training Step: 1810...  Training loss: 1.4371...  0.1490 sec/batch
Epoch: 10/20...  Training Step: 1811...  Training loss: 1.4262...  0.1510 sec/batch
Epoch: 10/20...  Training Step: 1812...  Training loss: 1.4366...  0.1520 sec/batch
Epoch: 10/20...  Training Step: 1813...  Training loss: 1.4072...  0.1500 sec/batch
Epoch: 10/20...  Training Step: 1814...  Training loss: 1.3878...  0.1490 se

Epoch: 10/20...  Training Step: 1901...  Training loss: 1.3955...  0.1530 sec/batch
Epoch: 10/20...  Training Step: 1902...  Training loss: 1.3839...  0.1490 sec/batch
Epoch: 10/20...  Training Step: 1903...  Training loss: 1.3956...  0.1500 sec/batch
Epoch: 10/20...  Training Step: 1904...  Training loss: 1.3617...  0.1610 sec/batch
Epoch: 10/20...  Training Step: 1905...  Training loss: 1.3369...  0.1530 sec/batch
Epoch: 10/20...  Training Step: 1906...  Training loss: 1.4008...  0.1500 sec/batch
Epoch: 10/20...  Training Step: 1907...  Training loss: 1.3909...  0.1560 sec/batch
Epoch: 10/20...  Training Step: 1908...  Training loss: 1.3490...  0.1510 sec/batch
Epoch: 10/20...  Training Step: 1909...  Training loss: 1.4042...  0.1530 sec/batch
Epoch: 10/20...  Training Step: 1910...  Training loss: 1.3980...  0.1490 sec/batch
Epoch: 10/20...  Training Step: 1911...  Training loss: 1.3818...  0.1580 sec/batch
Epoch: 10/20...  Training Step: 1912...  Training loss: 1.3618...  0.1490 se

Epoch: 11/20...  Training Step: 1999...  Training loss: 1.3868...  0.1510 sec/batch
Epoch: 11/20...  Training Step: 2000...  Training loss: 1.4113...  0.1490 sec/batch
Epoch: 11/20...  Training Step: 2001...  Training loss: 1.3730...  0.1640 sec/batch
Epoch: 11/20...  Training Step: 2002...  Training loss: 1.4035...  0.1710 sec/batch
Epoch: 11/20...  Training Step: 2003...  Training loss: 1.3703...  0.1500 sec/batch
Epoch: 11/20...  Training Step: 2004...  Training loss: 1.3918...  0.1500 sec/batch
Epoch: 11/20...  Training Step: 2005...  Training loss: 1.3902...  0.1490 sec/batch
Epoch: 11/20...  Training Step: 2006...  Training loss: 1.3344...  0.1510 sec/batch
Epoch: 11/20...  Training Step: 2007...  Training loss: 1.3591...  0.1500 sec/batch
Epoch: 11/20...  Training Step: 2008...  Training loss: 1.4051...  0.1490 sec/batch
Epoch: 11/20...  Training Step: 2009...  Training loss: 1.3961...  0.1570 sec/batch
Epoch: 11/20...  Training Step: 2010...  Training loss: 1.4043...  0.1510 se

Epoch: 11/20...  Training Step: 2097...  Training loss: 1.3786...  0.1520 sec/batch
Epoch: 11/20...  Training Step: 2098...  Training loss: 1.3776...  0.1570 sec/batch
Epoch: 11/20...  Training Step: 2099...  Training loss: 1.3574...  0.1500 sec/batch
Epoch: 11/20...  Training Step: 2100...  Training loss: 1.3667...  0.1500 sec/batch
Epoch: 11/20...  Training Step: 2101...  Training loss: 1.3632...  0.1520 sec/batch
Epoch: 11/20...  Training Step: 2102...  Training loss: 1.3343...  0.1500 sec/batch
Epoch: 11/20...  Training Step: 2103...  Training loss: 1.3139...  0.1490 sec/batch
Epoch: 11/20...  Training Step: 2104...  Training loss: 1.3709...  0.1490 sec/batch
Epoch: 11/20...  Training Step: 2105...  Training loss: 1.3592...  0.1520 sec/batch
Epoch: 11/20...  Training Step: 2106...  Training loss: 1.3199...  0.1500 sec/batch
Epoch: 11/20...  Training Step: 2107...  Training loss: 1.3826...  0.1520 sec/batch
Epoch: 11/20...  Training Step: 2108...  Training loss: 1.3811...  0.1520 se

Epoch: 12/20...  Training Step: 2195...  Training loss: 1.3666...  0.1510 sec/batch
Epoch: 12/20...  Training Step: 2196...  Training loss: 1.3896...  0.1500 sec/batch
Epoch: 12/20...  Training Step: 2197...  Training loss: 1.3645...  0.1520 sec/batch
Epoch: 12/20...  Training Step: 2198...  Training loss: 1.3785...  0.1520 sec/batch
Epoch: 12/20...  Training Step: 2199...  Training loss: 1.3576...  0.1560 sec/batch
Epoch: 12/20...  Training Step: 2200...  Training loss: 1.3740...  0.1510 sec/batch
Epoch: 12/20...  Training Step: 2201...  Training loss: 1.3455...  0.1630 sec/batch
Epoch: 12/20...  Training Step: 2202...  Training loss: 1.3575...  0.1530 sec/batch
Epoch: 12/20...  Training Step: 2203...  Training loss: 1.3578...  0.1550 sec/batch
Epoch: 12/20...  Training Step: 2204...  Training loss: 1.3124...  0.1510 sec/batch
Epoch: 12/20...  Training Step: 2205...  Training loss: 1.3327...  0.1590 sec/batch
Epoch: 12/20...  Training Step: 2206...  Training loss: 1.3732...  0.1520 se

Epoch: 12/20...  Training Step: 2293...  Training loss: 1.3143...  0.1510 sec/batch
Epoch: 12/20...  Training Step: 2294...  Training loss: 1.3116...  0.1500 sec/batch
Epoch: 12/20...  Training Step: 2295...  Training loss: 1.3487...  0.1510 sec/batch
Epoch: 12/20...  Training Step: 2296...  Training loss: 1.3538...  0.1570 sec/batch
Epoch: 12/20...  Training Step: 2297...  Training loss: 1.3291...  0.1500 sec/batch
Epoch: 12/20...  Training Step: 2298...  Training loss: 1.3309...  0.1570 sec/batch
Epoch: 12/20...  Training Step: 2299...  Training loss: 1.3418...  0.1540 sec/batch
Epoch: 12/20...  Training Step: 2300...  Training loss: 1.3132...  0.1500 sec/batch
Epoch: 12/20...  Training Step: 2301...  Training loss: 1.2911...  0.1500 sec/batch
Epoch: 12/20...  Training Step: 2302...  Training loss: 1.3496...  0.1520 sec/batch
Epoch: 12/20...  Training Step: 2303...  Training loss: 1.3357...  0.1520 sec/batch
Epoch: 12/20...  Training Step: 2304...  Training loss: 1.2951...  0.1500 se

Epoch: 13/20...  Training Step: 2391...  Training loss: 1.3202...  0.1510 sec/batch
Epoch: 13/20...  Training Step: 2392...  Training loss: 1.3188...  0.1520 sec/batch
Epoch: 13/20...  Training Step: 2393...  Training loss: 1.3494...  0.1490 sec/batch
Epoch: 13/20...  Training Step: 2394...  Training loss: 1.3591...  0.1510 sec/batch
Epoch: 13/20...  Training Step: 2395...  Training loss: 1.3350...  0.1490 sec/batch
Epoch: 13/20...  Training Step: 2396...  Training loss: 1.3507...  0.1540 sec/batch
Epoch: 13/20...  Training Step: 2397...  Training loss: 1.3382...  0.1570 sec/batch
Epoch: 13/20...  Training Step: 2398...  Training loss: 1.3597...  0.1490 sec/batch
Epoch: 13/20...  Training Step: 2399...  Training loss: 1.3268...  0.1490 sec/batch
Epoch: 13/20...  Training Step: 2400...  Training loss: 1.3404...  0.1480 sec/batch
Epoch: 13/20...  Training Step: 2401...  Training loss: 1.3381...  0.1580 sec/batch
Epoch: 13/20...  Training Step: 2402...  Training loss: 1.2797...  0.1550 se

Epoch: 13/20...  Training Step: 2489...  Training loss: 1.3286...  0.1510 sec/batch
Epoch: 13/20...  Training Step: 2490...  Training loss: 1.3066...  0.1500 sec/batch
Epoch: 13/20...  Training Step: 2491...  Training loss: 1.2946...  0.1500 sec/batch
Epoch: 13/20...  Training Step: 2492...  Training loss: 1.2801...  0.1490 sec/batch
Epoch: 13/20...  Training Step: 2493...  Training loss: 1.3235...  0.1500 sec/batch
Epoch: 13/20...  Training Step: 2494...  Training loss: 1.3300...  0.1490 sec/batch
Epoch: 13/20...  Training Step: 2495...  Training loss: 1.3044...  0.1530 sec/batch
Epoch: 13/20...  Training Step: 2496...  Training loss: 1.3080...  0.1590 sec/batch
Epoch: 13/20...  Training Step: 2497...  Training loss: 1.3150...  0.1500 sec/batch
Epoch: 13/20...  Training Step: 2498...  Training loss: 1.2833...  0.1480 sec/batch
Epoch: 13/20...  Training Step: 2499...  Training loss: 1.2685...  0.1500 sec/batch
Epoch: 13/20...  Training Step: 2500...  Training loss: 1.3215...  0.1500 se

Epoch: 14/20...  Training Step: 2587...  Training loss: 1.3147...  0.1510 sec/batch
Epoch: 14/20...  Training Step: 2588...  Training loss: 1.3311...  0.1610 sec/batch
Epoch: 14/20...  Training Step: 2589...  Training loss: 1.3037...  0.1540 sec/batch
Epoch: 14/20...  Training Step: 2590...  Training loss: 1.2858...  0.1500 sec/batch
Epoch: 14/20...  Training Step: 2591...  Training loss: 1.3414...  0.1560 sec/batch
Epoch: 14/20...  Training Step: 2592...  Training loss: 1.3406...  0.1520 sec/batch
Epoch: 14/20...  Training Step: 2593...  Training loss: 1.3271...  0.1570 sec/batch
Epoch: 14/20...  Training Step: 2594...  Training loss: 1.3415...  0.1490 sec/batch
Epoch: 14/20...  Training Step: 2595...  Training loss: 1.3115...  0.1520 sec/batch
Epoch: 14/20...  Training Step: 2596...  Training loss: 1.3289...  0.1490 sec/batch
Epoch: 14/20...  Training Step: 2597...  Training loss: 1.3089...  0.1500 sec/batch
Epoch: 14/20...  Training Step: 2598...  Training loss: 1.3185...  0.1530 se

Epoch: 14/20...  Training Step: 2685...  Training loss: 1.2900...  0.1530 sec/batch
Epoch: 14/20...  Training Step: 2686...  Training loss: 1.3019...  0.1530 sec/batch
Epoch: 14/20...  Training Step: 2687...  Training loss: 1.3026...  0.1580 sec/batch
Epoch: 14/20...  Training Step: 2688...  Training loss: 1.2880...  0.1510 sec/batch
Epoch: 14/20...  Training Step: 2689...  Training loss: 1.2759...  0.1490 sec/batch
Epoch: 14/20...  Training Step: 2690...  Training loss: 1.2653...  0.1540 sec/batch
Epoch: 14/20...  Training Step: 2691...  Training loss: 1.3044...  0.1510 sec/batch
Epoch: 14/20...  Training Step: 2692...  Training loss: 1.3211...  0.1510 sec/batch
Epoch: 14/20...  Training Step: 2693...  Training loss: 1.3020...  0.1510 sec/batch
Epoch: 14/20...  Training Step: 2694...  Training loss: 1.2934...  0.1490 sec/batch
Epoch: 14/20...  Training Step: 2695...  Training loss: 1.2946...  0.1510 sec/batch
Epoch: 14/20...  Training Step: 2696...  Training loss: 1.2604...  0.1510 se

Epoch: 15/20...  Training Step: 2783...  Training loss: 1.2751...  0.1480 sec/batch
Epoch: 15/20...  Training Step: 2784...  Training loss: 1.2964...  0.1506 sec/batch
Epoch: 15/20...  Training Step: 2785...  Training loss: 1.3090...  0.1517 sec/batch
Epoch: 15/20...  Training Step: 2786...  Training loss: 1.3067...  0.1520 sec/batch
Epoch: 15/20...  Training Step: 2787...  Training loss: 1.2890...  0.1650 sec/batch
Epoch: 15/20...  Training Step: 2788...  Training loss: 1.2706...  0.1479 sec/batch
Epoch: 15/20...  Training Step: 2789...  Training loss: 1.3121...  0.1528 sec/batch
Epoch: 15/20...  Training Step: 2790...  Training loss: 1.3243...  0.1500 sec/batch
Epoch: 15/20...  Training Step: 2791...  Training loss: 1.3057...  0.1530 sec/batch
Epoch: 15/20...  Training Step: 2792...  Training loss: 1.3170...  0.1500 sec/batch
Epoch: 15/20...  Training Step: 2793...  Training loss: 1.2909...  0.1540 sec/batch
Epoch: 15/20...  Training Step: 2794...  Training loss: 1.3169...  0.1500 se

Epoch: 15/20...  Training Step: 2881...  Training loss: 1.2749...  0.1510 sec/batch
Epoch: 15/20...  Training Step: 2882...  Training loss: 1.3012...  0.1490 sec/batch
Epoch: 15/20...  Training Step: 2883...  Training loss: 1.2729...  0.1500 sec/batch
Epoch: 15/20...  Training Step: 2884...  Training loss: 1.2805...  0.1490 sec/batch
Epoch: 15/20...  Training Step: 2885...  Training loss: 1.2900...  0.1500 sec/batch
Epoch: 15/20...  Training Step: 2886...  Training loss: 1.2810...  0.1510 sec/batch
Epoch: 15/20...  Training Step: 2887...  Training loss: 1.2557...  0.1510 sec/batch
Epoch: 15/20...  Training Step: 2888...  Training loss: 1.2485...  0.1490 sec/batch
Epoch: 15/20...  Training Step: 2889...  Training loss: 1.2944...  0.1510 sec/batch
Epoch: 15/20...  Training Step: 2890...  Training loss: 1.3027...  0.1490 sec/batch
Epoch: 15/20...  Training Step: 2891...  Training loss: 1.2850...  0.1520 sec/batch
Epoch: 15/20...  Training Step: 2892...  Training loss: 1.2820...  0.1490 se

Epoch: 16/20...  Training Step: 2979...  Training loss: 1.2867...  0.1500 sec/batch
Epoch: 16/20...  Training Step: 2980...  Training loss: 1.2645...  0.1490 sec/batch
Epoch: 16/20...  Training Step: 2981...  Training loss: 1.2681...  0.1560 sec/batch
Epoch: 16/20...  Training Step: 2982...  Training loss: 1.2703...  0.1510 sec/batch
Epoch: 16/20...  Training Step: 2983...  Training loss: 1.2898...  0.1530 sec/batch
Epoch: 16/20...  Training Step: 2984...  Training loss: 1.2922...  0.1560 sec/batch
Epoch: 16/20...  Training Step: 2985...  Training loss: 1.2610...  0.1510 sec/batch
Epoch: 16/20...  Training Step: 2986...  Training loss: 1.2641...  0.1490 sec/batch
Epoch: 16/20...  Training Step: 2987...  Training loss: 1.2915...  0.1560 sec/batch
Epoch: 16/20...  Training Step: 2988...  Training loss: 1.3001...  0.1520 sec/batch
Epoch: 16/20...  Training Step: 2989...  Training loss: 1.2801...  0.1500 sec/batch
Epoch: 16/20...  Training Step: 2990...  Training loss: 1.3081...  0.1490 se

Epoch: 16/20...  Training Step: 3077...  Training loss: 1.2727...  0.1490 sec/batch
Epoch: 16/20...  Training Step: 3078...  Training loss: 1.2689...  0.1490 sec/batch
Epoch: 16/20...  Training Step: 3079...  Training loss: 1.2585...  0.1530 sec/batch
Epoch: 16/20...  Training Step: 3080...  Training loss: 1.2840...  0.1490 sec/batch
Epoch: 16/20...  Training Step: 3081...  Training loss: 1.2578...  0.1510 sec/batch
Epoch: 16/20...  Training Step: 3082...  Training loss: 1.2797...  0.1560 sec/batch
Epoch: 16/20...  Training Step: 3083...  Training loss: 1.2771...  0.1500 sec/batch
Epoch: 16/20...  Training Step: 3084...  Training loss: 1.2632...  0.1510 sec/batch
Epoch: 16/20...  Training Step: 3085...  Training loss: 1.2389...  0.1570 sec/batch
Epoch: 16/20...  Training Step: 3086...  Training loss: 1.2433...  0.1490 sec/batch
Epoch: 16/20...  Training Step: 3087...  Training loss: 1.2663...  0.1510 sec/batch
Epoch: 16/20...  Training Step: 3088...  Training loss: 1.2861...  0.1520 se

Epoch: 17/20...  Training Step: 3175...  Training loss: 1.2638...  0.1510 sec/batch
Epoch: 17/20...  Training Step: 3176...  Training loss: 1.2639...  0.1510 sec/batch
Epoch: 17/20...  Training Step: 3177...  Training loss: 1.2662...  0.1520 sec/batch
Epoch: 17/20...  Training Step: 3178...  Training loss: 1.2620...  0.1490 sec/batch
Epoch: 17/20...  Training Step: 3179...  Training loss: 1.2562...  0.1520 sec/batch
Epoch: 17/20...  Training Step: 3180...  Training loss: 1.2690...  0.1500 sec/batch
Epoch: 17/20...  Training Step: 3181...  Training loss: 1.2735...  0.1500 sec/batch
Epoch: 17/20...  Training Step: 3182...  Training loss: 1.2790...  0.1490 sec/batch
Epoch: 17/20...  Training Step: 3183...  Training loss: 1.2371...  0.1500 sec/batch
Epoch: 17/20...  Training Step: 3184...  Training loss: 1.2365...  0.1510 sec/batch
Epoch: 17/20...  Training Step: 3185...  Training loss: 1.2754...  0.1540 sec/batch
Epoch: 17/20...  Training Step: 3186...  Training loss: 1.2681...  0.1490 se

Epoch: 17/20...  Training Step: 3273...  Training loss: 1.2521...  0.1520 sec/batch
Epoch: 17/20...  Training Step: 3274...  Training loss: 1.2480...  0.1560 sec/batch
Epoch: 17/20...  Training Step: 3275...  Training loss: 1.2649...  0.1490 sec/batch
Epoch: 17/20...  Training Step: 3276...  Training loss: 1.2682...  0.1510 sec/batch
Epoch: 17/20...  Training Step: 3277...  Training loss: 1.2549...  0.1510 sec/batch
Epoch: 17/20...  Training Step: 3278...  Training loss: 1.2741...  0.1500 sec/batch
Epoch: 17/20...  Training Step: 3279...  Training loss: 1.2411...  0.1490 sec/batch
Epoch: 17/20...  Training Step: 3280...  Training loss: 1.2623...  0.1630 sec/batch
Epoch: 17/20...  Training Step: 3281...  Training loss: 1.2624...  0.1510 sec/batch
Epoch: 17/20...  Training Step: 3282...  Training loss: 1.2515...  0.1500 sec/batch
Epoch: 17/20...  Training Step: 3283...  Training loss: 1.2319...  0.1510 sec/batch
Epoch: 17/20...  Training Step: 3284...  Training loss: 1.2297...  0.1490 se

Epoch: 18/20...  Training Step: 3371...  Training loss: 1.2393...  0.1570 sec/batch
Epoch: 18/20...  Training Step: 3372...  Training loss: 1.2270...  0.1510 sec/batch
Epoch: 18/20...  Training Step: 3373...  Training loss: 1.2494...  0.1510 sec/batch
Epoch: 18/20...  Training Step: 3374...  Training loss: 1.2441...  0.1490 sec/batch
Epoch: 18/20...  Training Step: 3375...  Training loss: 1.2642...  0.1500 sec/batch
Epoch: 18/20...  Training Step: 3376...  Training loss: 1.2519...  0.1530 sec/batch
Epoch: 18/20...  Training Step: 3377...  Training loss: 1.2402...  0.1510 sec/batch
Epoch: 18/20...  Training Step: 3378...  Training loss: 1.2446...  0.1530 sec/batch
Epoch: 18/20...  Training Step: 3379...  Training loss: 1.2619...  0.1490 sec/batch
Epoch: 18/20...  Training Step: 3380...  Training loss: 1.2615...  0.1510 sec/batch
Epoch: 18/20...  Training Step: 3381...  Training loss: 1.2439...  0.1510 sec/batch
Epoch: 18/20...  Training Step: 3382...  Training loss: 1.2329...  0.1500 se

Epoch: 18/20...  Training Step: 3469...  Training loss: 1.2376...  0.1540 sec/batch
Epoch: 18/20...  Training Step: 3470...  Training loss: 1.2393...  0.1510 sec/batch
Epoch: 18/20...  Training Step: 3471...  Training loss: 1.2386...  0.1510 sec/batch
Epoch: 18/20...  Training Step: 3472...  Training loss: 1.2457...  0.1520 sec/batch
Epoch: 18/20...  Training Step: 3473...  Training loss: 1.2501...  0.1510 sec/batch
Epoch: 18/20...  Training Step: 3474...  Training loss: 1.2523...  0.1510 sec/batch
Epoch: 18/20...  Training Step: 3475...  Training loss: 1.2310...  0.1500 sec/batch
Epoch: 18/20...  Training Step: 3476...  Training loss: 1.2566...  0.1490 sec/batch
Epoch: 18/20...  Training Step: 3477...  Training loss: 1.2289...  0.1500 sec/batch
Epoch: 18/20...  Training Step: 3478...  Training loss: 1.2532...  0.1510 sec/batch
Epoch: 18/20...  Training Step: 3479...  Training loss: 1.2438...  0.1500 sec/batch
Epoch: 18/20...  Training Step: 3480...  Training loss: 1.2314...  0.1510 se

Epoch: 19/20...  Training Step: 3567...  Training loss: 1.2406...  0.1510 sec/batch
Epoch: 19/20...  Training Step: 3568...  Training loss: 1.2649...  0.1500 sec/batch
Epoch: 19/20...  Training Step: 3569...  Training loss: 1.2227...  0.1530 sec/batch
Epoch: 19/20...  Training Step: 3570...  Training loss: 1.2045...  0.1490 sec/batch
Epoch: 19/20...  Training Step: 3571...  Training loss: 1.2360...  0.1520 sec/batch
Epoch: 19/20...  Training Step: 3572...  Training loss: 1.2455...  0.1510 sec/batch
Epoch: 19/20...  Training Step: 3573...  Training loss: 1.2480...  0.1530 sec/batch
Epoch: 19/20...  Training Step: 3574...  Training loss: 1.2373...  0.1490 sec/batch
Epoch: 19/20...  Training Step: 3575...  Training loss: 1.2336...  0.1560 sec/batch
Epoch: 19/20...  Training Step: 3576...  Training loss: 1.2368...  0.1500 sec/batch
Epoch: 19/20...  Training Step: 3577...  Training loss: 1.2480...  0.1620 sec/batch
Epoch: 19/20...  Training Step: 3578...  Training loss: 1.2623...  0.1500 se

Epoch: 19/20...  Training Step: 3665...  Training loss: 1.2247...  0.1510 sec/batch
Epoch: 19/20...  Training Step: 3666...  Training loss: 1.2229...  0.1510 sec/batch
Epoch: 19/20...  Training Step: 3667...  Training loss: 1.2428...  0.1500 sec/batch
Epoch: 19/20...  Training Step: 3668...  Training loss: 1.2224...  0.1510 sec/batch
Epoch: 19/20...  Training Step: 3669...  Training loss: 1.2402...  0.1500 sec/batch
Epoch: 19/20...  Training Step: 3670...  Training loss: 1.2300...  0.1500 sec/batch
Epoch: 19/20...  Training Step: 3671...  Training loss: 1.2475...  0.1500 sec/batch
Epoch: 19/20...  Training Step: 3672...  Training loss: 1.2531...  0.1500 sec/batch
Epoch: 19/20...  Training Step: 3673...  Training loss: 1.2269...  0.1550 sec/batch
Epoch: 19/20...  Training Step: 3674...  Training loss: 1.2543...  0.1510 sec/batch
Epoch: 19/20...  Training Step: 3675...  Training loss: 1.2137...  0.1490 sec/batch
Epoch: 19/20...  Training Step: 3676...  Training loss: 1.2424...  0.1630 se

Epoch: 20/20...  Training Step: 3763...  Training loss: 1.3617...  0.1700 sec/batch
Epoch: 20/20...  Training Step: 3764...  Training loss: 1.2456...  0.1510 sec/batch
Epoch: 20/20...  Training Step: 3765...  Training loss: 1.2233...  0.1540 sec/batch
Epoch: 20/20...  Training Step: 3766...  Training loss: 1.2570...  0.1500 sec/batch
Epoch: 20/20...  Training Step: 3767...  Training loss: 1.2169...  0.1510 sec/batch
Epoch: 20/20...  Training Step: 3768...  Training loss: 1.1962...  0.1480 sec/batch
Epoch: 20/20...  Training Step: 3769...  Training loss: 1.2298...  0.1490 sec/batch
Epoch: 20/20...  Training Step: 3770...  Training loss: 1.2270...  0.1500 sec/batch
Epoch: 20/20...  Training Step: 3771...  Training loss: 1.2334...  0.1510 sec/batch
Epoch: 20/20...  Training Step: 3772...  Training loss: 1.2307...  0.1490 sec/batch
Epoch: 20/20...  Training Step: 3773...  Training loss: 1.2205...  0.1530 sec/batch
Epoch: 20/20...  Training Step: 3774...  Training loss: 1.2331...  0.1500 se

Epoch: 20/20...  Training Step: 3861...  Training loss: 1.2089...  0.1530 sec/batch
Epoch: 20/20...  Training Step: 3862...  Training loss: 1.2081...  0.1550 sec/batch
Epoch: 20/20...  Training Step: 3863...  Training loss: 1.2213...  0.1580 sec/batch
Epoch: 20/20...  Training Step: 3864...  Training loss: 1.2195...  0.1520 sec/batch
Epoch: 20/20...  Training Step: 3865...  Training loss: 1.2271...  0.1550 sec/batch
Epoch: 20/20...  Training Step: 3866...  Training loss: 1.2130...  0.1570 sec/batch
Epoch: 20/20...  Training Step: 3867...  Training loss: 1.2164...  0.1550 sec/batch
Epoch: 20/20...  Training Step: 3868...  Training loss: 1.2202...  0.1510 sec/batch
Epoch: 20/20...  Training Step: 3869...  Training loss: 1.2346...  0.1570 sec/batch
Epoch: 20/20...  Training Step: 3870...  Training loss: 1.2293...  0.1510 sec/batch
Epoch: 20/20...  Training Step: 3871...  Training loss: 1.2131...  0.1530 sec/batch
Epoch: 20/20...  Training Step: 3872...  Training loss: 1.2390...  0.1580 se

Epoch: 20/20...  Training Step: 3959...  Training loss: 1.2148...  0.1500 sec/batch
Epoch: 20/20...  Training Step: 3960...  Training loss: 1.2188...  0.1560 sec/batch


### Saved checkpoints

In [17]:
tf.train.get_checkpoint_state('checkpoints')

model_checkpoint_path: "checkpoints\\i3960_l512.ckpt"
all_model_checkpoint_paths: "checkpoints\\i200_l512.ckpt"
all_model_checkpoint_paths: "checkpoints\\i400_l512.ckpt"
all_model_checkpoint_paths: "checkpoints\\i600_l512.ckpt"
all_model_checkpoint_paths: "checkpoints\\i800_l512.ckpt"
all_model_checkpoint_paths: "checkpoints\\i1000_l512.ckpt"
all_model_checkpoint_paths: "checkpoints\\i1200_l512.ckpt"
all_model_checkpoint_paths: "checkpoints\\i1400_l512.ckpt"
all_model_checkpoint_paths: "checkpoints\\i1600_l512.ckpt"
all_model_checkpoint_paths: "checkpoints\\i1800_l512.ckpt"
all_model_checkpoint_paths: "checkpoints\\i2000_l512.ckpt"
all_model_checkpoint_paths: "checkpoints\\i2200_l512.ckpt"
all_model_checkpoint_paths: "checkpoints\\i2400_l512.ckpt"
all_model_checkpoint_paths: "checkpoints\\i2600_l512.ckpt"
all_model_checkpoint_paths: "checkpoints\\i2800_l512.ckpt"
all_model_checkpoint_paths: "checkpoints\\i3000_l512.ckpt"
all_model_checkpoint_paths: "checkpoints\\i3200_l512.ckpt"
all_mo

## Sampling
Now that the network is trained, we'll can use it to generate new text. The idea is that we pass in a character, then the network will predict the next character. We can use the new one, to predict the next one. And we keep doing this to generate all new text. I also included some functionality to prime the network with some text by passing in a string and building up a state from that.

The network gives us predictions for each character. To reduce noise and make things a little less random, I'm going to only choose a new character from the top N most likely characters.

In [18]:
def pick_top_n(preds, vocab_size, top_n=5):
    p = np.squeeze(preds)
    p[np.argsort(p)[:-top_n]] = 0
    p = p / np.sum(p)
    c = np.random.choice(vocab_size, 1, p=p)[0]
    return c

In [19]:
def sample(checkpoint, n_samples, lstm_size, vocab_size, prime="The "):
    samples = [c for c in prime]
    model = CharRNN(len(vocab), lstm_size=lstm_size, sampling=True)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, checkpoint)
        new_state = sess.run(model.initial_state)
        for c in prime:
            x = np.zeros((1, 1))
            x[0,0] = vocab_to_int[c]
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([model.prediction, model.final_state], 
                                         feed_dict=feed)

        c = pick_top_n(preds, len(vocab))
        samples.append(int_to_vocab[c])

        for i in range(n_samples):
            x[0,0] = c
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([model.prediction, model.final_state], 
                                         feed_dict=feed)

            c = pick_top_n(preds, len(vocab))
            samples.append(int_to_vocab[c])
        
    return ''.join(samples)

Here, pass in the path to a checkpoint and sample from the network.

In [20]:
tf.train.latest_checkpoint('checkpoints')

'checkpoints\\i3960_l512.ckpt'

In [21]:
checkpoint = tf.train.latest_checkpoint('checkpoints')
samp = sample(checkpoint, 2000, lstm_size, len(vocab), prime="Far")
print(samp)

INFO:tensorflow:Restoring parameters from checkpoints\i3960_l512.ckpt
Farther that in the
days of her subjects of the first teash, and she had seen their soul,
because he was in his wife, and he had been and she seemed to see her that the
man had that lade to be simply angry. At the same as a maturing of husband
to bring the best tervors. He had the prayer was so ashes of simply than
he would be delighted; and that he would no more her soft and all, and
went to his brother, and was several to the proform of his wheels and
attention of anyence.

"If you can did not certain you and so so sat down," she asked.

"I have been all right, to be at the poor of all the chorse. Thyse confries
is not at abternating in that subject, that I can't both her one arouse,
have the party of the figure of the such thought.
I think it's a sensable condition of that is sometices another
to home, that he has suppressed. That's the sace well all were to be in sicture
in which it he have to be controlled to he