# Projeto: Escreva como um diplomata

In [54]:
# Imports
import numpy as np
import pandas as pd
from collections import Counter

In [55]:
def get_data(filepath):
    """Return the entire contents of the text file at *filepath*.

    Args:
        filepath: Path of the file to read.

    Returns:
        The file contents as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even if read() fails; the
    # previous version left the file open until garbage collection.
    # NOTE(review): the platform default encoding is used, as before --
    # confirm whether the corpus needs an explicit encoding.
    with open(filepath, 'r') as source:
        return source.read()

def preprocess(text):
    """Tokenize *text* and encode it as integer ids with shifted labels.

    The text is lower-cased, punctuation marks are replaced by stand-alone
    tokens such as ``<PERIOD>``, and the result is split on whitespace
    (newlines are therefore treated as ordinary separators). Every distinct
    word receives an id starting at 1, with more frequent words getting
    smaller ids; ties keep the order of first appearance.

    Args:
        text: The raw text to encode.

    Returns:
        A tuple ``(text_ints, labels_ints, vocab, vocab_to_int)`` where
        ``text_ints`` is a 1-D integer array with one id per word,
        ``labels_ints`` is ``text_ints`` shifted left by one position with
        the first id wrapped around to the end, ``vocab`` is the list of
        words sorted by descending frequency and ``vocab_to_int`` maps each
        word to its id. Empty input yields empty arrays, an empty list and
        an empty dict.
    """
    # Replace punctuation with tokens so we can use them in our model.
    # The surrounding spaces make each token split off as its own word.
    replacements = (
        ('.', ' <PERIOD> '),
        (',', ' <COMMA> '),
        ('"', ' <QUOTATION_MARK> '),
        (';', ' <SEMICOLON> '),
        ('!', ' <EXCLAMATION_MARK> '),
        ('?', ' <QUESTION_MARK> '),
        ('(', ' <LEFT_PAREN> '),
        (')', ' <RIGHT_PAREN> '),
        ('--', ' <HYPHENS> '),
        (':', ' <COLON> '),
    )
    text = text.lower()
    for mark, token in replacements:
        text = text.replace(mark, token)
    words = text.split()

    # sorted() is stable, so equally frequent words keep first-seen order.
    counts = Counter(words)
    vocab = sorted(counts, key=counts.get, reverse=True)
    vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

    # An explicit integer dtype keeps the array integral for empty input.
    text_ints = np.array([vocab_to_int[word] for word in words], dtype=int)

    # The label of each word is the word that follows it; the last word
    # wraps around to the first. np.roll also copes with an empty array.
    labels_ints = np.roll(text_ints, -1)

    return text_ints, labels_ints, vocab, vocab_to_int


# Read the corpus from disk and encode it: word ids, next-word labels,
# the frequency-sorted vocabulary and the word -> id lookup table.
features, targets, vocab, vocab_to_int = preprocess(
    get_data('data/model.txt')
)

In [56]:
# Preview the first 100 word ids next to their next-word labels.
# (`inputs` is never defined in this notebook; the encoded text is `features`.)
features[:100], targets[:100]

(array([   6,   70,   98, 2441, 3841,    2, 2442, 1508,  262,    4, 1396,
          28,   92,  114,   12,  203,   25, 4791,    8, 3197,    5,  209,
          30,  387,   17, 4792,    1,   17, 2172,    5,   10, 1953, 2173,
           1,   26,    9,  121, 3198,   24, 1954,   58, 3842,    3, 1623,
        3843,    1,  424,    2, 2174,   48, 6741,   30,  242,  120,    9,
         294,   55, 3844,   54,    1,   12,  366,  110,  624, 6742,    5,
         263,    2, 3845,    9, 6743,   66, 1955, 3846,    1,   25,  589,
        3199,   28,   92,  114,   12,   16,  523,   55, 6744,   54,    1,
          12,    4, 3200,    2,  373, 1397,    1,  253,    2,  294,    1,
         110]),
 array([  70,   98, 2441, 3841,    2, 2442, 1508,  262,    4, 1396,   28,
          92,  114,   12,  203,   25, 4791,    8, 3197,    5,  209,   30,
         387,   17, 4792,    1,   17, 2172,    5,   10, 1953, 2173,    1,
          26,    9,  121, 3198,   24, 1954,   58, 3842,    3, 1623, 3843,
           1,  424,   

# Get Batches



In [84]:
# Batch
def get_batches(features, targets, batch_size, seq_length):
    """Split parallel feature/target sequences into full batches.

    Trailing elements that do not fill a complete batch of
    ``batch_size * seq_length`` items are dropped.

    Args:
        features: 1-D sequence of input ids.
        targets: 1-D sequence of label ids, the same length as *features*.
        batch_size: Number of sequences per batch (positive).
        seq_length: Number of items per sequence (positive).

    Returns:
        An array of shape ``(n_batches, 2, batch_size, seq_length)``.
        ``result[i][0]`` holds the features of batch ``i`` and
        ``result[i][1]`` the matching targets. When the input is shorter
        than one full batch, ``n_batches`` is 0 and the array is empty.

    Raises:
        AssertionError: If *features* and *targets* differ in length.
        ValueError: If *batch_size* or *seq_length* is not positive.
    """
    assert len(features) == len(targets), "Features and labels must have the same shape."
    if batch_size <= 0 or seq_length <= 0:
        raise ValueError("batch_size and seq_length must be positive.")

    # Calculate the number of full batches and how many items they cover.
    n_elements = batch_size * seq_length
    n_batches = len(features) // n_elements
    n_kept = n_batches * n_elements

    # Trim to full batches and reshape in one step. Spelling the shape out
    # (instead of using -1) keeps the reshape valid when n_batches is 0.
    batch_shape = (n_batches, batch_size, seq_length)
    feature_batches = np.reshape(features[:n_kept], batch_shape)
    target_batches = np.reshape(targets[:n_kept], batch_shape)

    # Pair each feature batch with its target batch along a new axis 1.
    return np.stack([feature_batches, target_batches], axis=1)

    

# Build batches of 256 sequences with 14 word ids each and display them.
get_batches(features, targets, batch_size=256, seq_length=14)

[[[   6   70   98 ...,   28   92  114]
  [  12  203   25 ..., 4792    1   17]
  [2172    5   10 ..., 1954   58 3842]
  ..., 
  [   1   33   72 ..., 2198    2  677]
  [ 752   13  560 ...,   44    1    4]
  [ 778   14    2 ..., 1414    1    2]]

 [[  70   98 2441 ...,   92  114   12]
  [ 203   25 4791 ...,    1   17 2172]
  [   5   10 1953 ...,   58 3842    3]
  ..., 
  [  33   72  168 ...,    2  677  752]
  [  13  560    9 ...,    1    4  778]
  [  14    2  145 ...,    1    2  145]]]


3034
