# Projeto: Escreva como um diplomata

In [133]:
# Imports
import numpy as np
import pandas as pd
from collections import Counter
import tensorflow as tf

In [134]:
def get_data(filepath):
    return open(filepath, 'r').read()

def preprocess(text):

    # Replace punctuation with tokens so we can use them in our model
    text = text.lower()
    text = text.replace('.', ' <PERIOD> ')
    text = text.replace(',', ' <COMMA> ')
    text = text.replace('"', ' <QUOTATION_MARK> ')
    text = text.replace(';', ' <SEMICOLON> ')
    text = text.replace('!', ' <EXCLAMATION_MARK> ')
    text = text.replace('?', ' <QUESTION_MARK> ')
    text = text.replace('(', ' <LEFT_PAREN> ')
    text = text.replace(')', ' <RIGHT_PAREN> ')
    text = text.replace('--', ' <HYPHENS> ')
    text = text.replace('?', ' <QUESTION_MARK> ')
    # text = text.replace('\n', ' <NEW_LINE> ')
    text = text.replace(':', ' <COLON> ')
    words = text.split()
    vocab = set(words)
    
    counts = Counter(words)
    vocab = sorted(counts, key=counts.get, reverse=True)
    vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}
    int_to_vocab = {ii: word for ii, word in vocab_to_int.items()}
    
    text_ints = []
    for word in words:
        text_ints.append(vocab_to_int[word])
    
    text_ints = np.array(text_ints)
    labels_ints = np.zeros_like(text_ints)
    labels_ints[:-1], labels_ints[-1] = text_ints[1:], text_ints[0]

    return text_ints, labels_ints, int_to_vocab, vocab_to_int


features, labels, int_to_vocab, vocab_to_int = preprocess(get_data('data/model.txt'))

In [135]:
inputs[:100], targets[:100]

(array([   6,   70,   98, 2441, 3841,    2, 2442, 1508,  262,    4, 1396,
          28,   92,  114,   12,  203,   25, 4791,    8, 3197,    5,  209,
          30,  387,   17, 4792,    1,   17, 2172,    5,   10, 1953, 2173,
           1,   26,    9,  121, 3198,   24, 1954,   58, 3842,    3, 1623,
        3843,    1,  424,    2, 2174,   48, 6741,   30,  242,  120,    9,
         294,   55, 3844,   54,    1,   12,  366,  110,  624, 6742,    5,
         263,    2, 3845,    9, 6743,   66, 1955, 3846,    1,   25,  589,
        3199,   28,   92,  114,   12,   16,  523,   55, 6744,   54,    1,
          12,    4, 3200,    2,  373, 1397,    1,  253,    2,  294,    1,
         110]),
 array([  70,   98, 2441, 3841,    2, 2442, 1508,  262,    4, 1396,   28,
          92,  114,   12,  203,   25, 4791,    8, 3197,    5,  209,   30,
         387,   17, 4792,    1,   17, 2172,    5,   10, 1953, 2173,    1,
          26,    9,  121, 3198,   24, 1954,   58, 3842,    3, 1623, 3843,
           1,  424,   

# Get Batches



Implement get_batches to create batches of input and targets using int_text. The batches should be a Numpy array with the shape (number of batches, 2, batch size, sequence length). Each batch contains two elements:
The first element is a single batch of input with the shape [batch size, sequence length]
The second element is a single batch of targets with the shape [batch size, sequence length]
If you can't fill the last batch with enough data, drop the last batch.
For exmple, get_batches([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], 3, 2) would return a 

Numpy array of the following:

[
  #First Batch
  [
    #Batch of Input
    [[ 1  2], [ 7  8], [13 14]]
    #Batch of targets
    [[ 2  3], [ 8  9], [14 15]]
  ]

  #Second Batch
  [
    #Batch of Input
    [[ 3  4], [ 9 10], [15 16]]
    #Batch of targets
    [[ 4  5], [10 11], [16 17]]
  ]

  #Third Batch
  [
    #Batch of Input
    [[ 5  6], [11 12], [17 18]]
    #Batch of targets
    [[ 6  7], [12 13], [18  1]]
  ]
  
]


Notice that the last target value in the last batch is the first input value of the first batch. In this case, 1. This is a common technique used when creating sequence batches, although it is rather unintuitive.

In [136]:
# Batch
def get_batches(features, targets, batch_size, seq_length):
    assert len(features) == len(targets), "Features and labels must have the same shape."
    
    # 1. Calculate the number of batches
    n_elements = batch_size * seq_length
    n_batches = len(features) // n_elements
    
    # 2. Trim features and targets to keep only full batches
    # 3. Reshape features and targets to a matrix (num_of_batches, batch_size * seq_length)
    features = np.reshape(features[:n_batches*n_elements], (n_batches, -1))
    targets = np.reshape(targets[:n_batches*n_elements], (n_batches, -1))
    
    # 4. Iterate over the num of batches and reshape each batch in (batch_size X seq_length)
    batches = []
    for i in range(n_batches):
        batch_of_features = features[i].reshape(batch_size,seq_length)
        batch_of_targets = targets[i].reshape(batch_size,seq_length)
        batches.append([batch_of_features,batch_of_targets])

    return np.array(batches)


## Hyperparameters

In [137]:
# Number of Epochs
num_epochs = 128
# Batch Size
batch_size = 256
# RNN Size
rnn_size = 512
# Embedding Dimension Size
embed_dim = 300
# Sequence Length
seq_length = 14
# Learning Rate
learning_rate = 0.001
# Show stats for every n number of batches
show_every_n_batches = 5


## Build Graph

In [138]:
train_graph = tf.Graph()
with train_graph.as_default():
    # Get Inputs
    input_text = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets') 
    lr = tf.placeholder(tf.float32, name='learning_rate') 
    
    # Get Init Cell
    num_of_lstm_layers = 1
    keep_prob = 0.3
    cell = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    dropout = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)
    input_data_shape = tf.shape(input_text) # esse parametro é o batch_size da função get_init_cell
    cell = tf.contrib.rnn.MultiRNNCell([dropout] * num_of_lstm_layers)
    initial_state = tf.identity(cell.zero_state(batch_size, tf.int32), name = 'initial_state')
    
    
    # Build NN
    
    ## Get Embed
    vocab_size = len(int_to_vocab)
    n_embedding = 200 # Number of embedding features 
    embedding = tf.Variable(tf.random_uniform((vocab_size, embed_dim), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, input_text)
    
    ## Build RNN
    output, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state=None, dtype=tf.float32)
    final_state = tf.identity(final_state, name="final_state")
    
    ## Fully connected layer with a linear activation and vocab_size as the number of outputs 
    logits = tf.contrib.layers.fully_connected(
        output,
        vocab_size,
        weights_initializer=tf.truncated_normal_initializer(stddev=0.1),
        biases_initializer=tf.zeros_initializer(),
        activation_fn=None,
    )
    
    
    # One-hot encode targets and reshape to match logits, one row per batch_size per step
    y_one_hot = tf.one_hot(targets, vocab_size)
    #y_reshaped = tf.reshape(y_one_hot, logits.get_shape())
    
    # Softmax cross entropy loss
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_one_hot)
    loss = tf.reduce_mean(loss)
    
    # Optimizer
    optimizer = tf.train.AdamOptimizer(lr)
    
    # Gradient Clipping
    gradients = optimizer.compute_gradients(loss)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

## Train

In [141]:
print(targets)
batches = get_batches(features, targets, batch_size, seq_length)

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_i in range(num_epochs):
        state = sess.run(initial_state, {input_text: batches[0][0]})
        
        for batch_i, (x, y) in enumerate(batches):
            feed = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: learning_rate}
            
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed)
            # Show every <show_every_n_batches> batches
            if (epoch_i * len(batches) + batch_i) % show_every_n_batches == 0:
                print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}'.format(
                    epoch_i,
                    batch_i,
                    len(batches),
                    train_loss))

Tensor("targets:0", shape=(?, ?), dtype=int32)


TypeError: object of type 'Tensor' has no len()