# NLU Project

In [1]:
from ramble.on import *

In [2]:
#? config
cfg['required_files'].append('model.py')

In [3]:
%%writefile model.py
# -*- coding: utf-8 -*-

"""
O código original deste arquivo está em https://github.com/dennybritz/cnn-text-classification-tf/blob/master/text_cnn.py
e foi adaptado por Peterson Katagiri Zilli <peterson.zilli@gmail.com>
"""

import tensorflow as tf
import numpy as np

class TextCNN(object):
    """
    A CNN for text classification.
    Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer.
    """
    def __init__(
      self, sequence_length, num_classes, vocab_size,
      embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0):

        # Placeholders for input, output and dropout
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        # Keeping track of l2 regularization loss (optional)
        l2_loss = tf.constant(0.0)

        # Embedding layer
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            self.W = tf.Variable(
                tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
                name="W")
            self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Maxpooling over the outputs
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # Add dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)

        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            W = tf.get_variable(
                "W",
                shape=[num_filters_total, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # CalculateMean cross-entropy loss
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

Overwriting model.py


In [4]:
%%writefile data_helpers.py
# -*- coding: utf-8 -*-
# baseado no original em: https://github.com/dennybritz/cnn-text-classification-tf/blob/master/data_helpers.py

import re
import codecs
import numpy as np

def clean_str(string):
    """
    Tokenização/limpeza do dataset baseado no trabalho do yoonkin
    em https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    # tratamento dos caracteres especiais em português
    string = re.sub(r"ç", "c", string)
    string = re.sub(r"ã", "a", string)
    string = re.sub(r"á", "a", string)
    string = re.sub(r"à", "a", string)
    string = re.sub(r"â", "a", string)
    string = re.sub(r"é", "e", string)
    string = re.sub(r"ê", "e", string)
    string = re.sub(r"í", "i", string)
    string = re.sub(r"õ", "o", string)
    string = re.sub(r"ó", "o", string)
    string = re.sub(r"ô", "o", string)
    string = re.sub(r"ú", "u", string)

    # substitui tudo que não for esses caracteres abaixo por espaço
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)    

    # parte do treinamento original do yoonkin para inglês    
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)

    # tratamento da pontuação
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\"", " \" ", string)
    string = re.sub(r"\?", " \? ", string)
    
    # tratamento dos espaços duplicados
    string = re.sub(r"\s{2,}", " ", string)
    
    return string.strip().lower()

def compact_str(string):
    # Detecção: oi, oooi, ooooieeeee, olaaa, olar
    string = re.sub(r"^(o+i+e*|o+l+a+r*)", "oie", string)
    # Detecção: hahahahaha, heheheh, kkkk, rsrs
    string = re.sub(r"([aei]?(h[aei]){2,}h?|k{3,})|(rs){2,}", "hahaha", string)
    # Detecção: nooooossa --> nossa (mais de 2 caracteres consecutivos viram 1 só)
    # Resultado aparentemente insatisfatório.
    #string = re.sub(r"(.)\1{2,}", "$1", string)
    return string

def gen_label(size, index):
    """
    Gera uma lista de tamanho 'size' toda de zeros, com excessão de colocar um '1' na posição index
    """
    aux = [0] * size
    aux[index] = 1
    return aux
    
def load_data_and_labels(data_folder):
    """
    Carrega os dados de arquivos de dados na pasta, faz o split dos textos em palavras e gera os labels
    Baseado no original em: https://github.com/dennybritz/cnn-text-classification-tf/blob/master/data_helpers.py
    Retorna as frases splitadas e os labels
    """

    import os
    files = [file for file in os.listdir(data_folder) if file.lower().endswith(".txt")]
    num_files = len(files)

    x_text = []
    y = []

    for index_file, file in enumerate(files):
        text_examples = list(codecs.open(os.path.join(data_folder, file), "r", "utf-8").readlines())
        text_examples = [compact_str(clean_str(s)).strip() for s in text_examples]
        text_labels = [gen_label(num_files, index_file) for _ in text_examples]
        x_text += text_examples
        y += text_labels
        
    y = np.array(y)

    return [files, x_text, y]


def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((len(data)-1)/batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]


Overwriting data_helpers.py


In [43]:
%%writefile train.py
# -*- coding: utf-8 -*-
# !/usr/bin/env python

"""
Adaptado de https://github.com/dennybritz/cnn-text-classification-tf/blob/master/train.py
por Peterson Katagiri Zilli <peterson.zilli@gmail.com>
"""

import datetime
import os
import time

import numpy as np
import tensorflow as tf
from tensorflow.contrib import learn

import data_helpers
from model import TextCNN

import sys

# Parameters
# ==================================================

#reseting parameters
import argparse as _argparse
tf.flags._global_parser = _argparse.ArgumentParser()

# Train data and epochs params
tf.flags.DEFINE_string("data_folder", "./data", "Data source folder.")
tf.flags.DEFINE_integer("num_epochs", 50, "Number of training epochs (default: 50)")

# Data loading params
tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation")

# Model Hyperparameters
tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)")
tf.flags.DEFINE_string("filter_sizes", "2,3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)")

# Training parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")
tf.flags.DEFINE_integer("checkpoint_every", 10, "Save model after this many steps (default: 10)")
tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)")

# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()

print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

Overwriting train.py


In [44]:
%%writefile -a train.py
# Data Preparation
# ==================================================

# Load data
print("Loading data...")
categx, x_text, y = data_helpers.load_data_and_labels(FLAGS.data_folder)
categx2int = {c: i for (i, c) in enumerate(categx)}

# Testing new way of reading files
# print("X:")
# print(x_text)
# print("Y:")
# print(y)
# sys.exit()

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Extract word:id mapping from the object.
vocab_dict_word_2_int = vocab_processor.vocabulary_._mapping
vocab_dict_int_2_word = {i: w for w, i in vocab_dict_word_2_int.items()}

# Randomly shuffle data
np.random.seed(7)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))


Appending to train.py


In [45]:
%%writefile -a train.py

# Training
# ==================================================

with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        cnn = TextCNN(
            sequence_length=x_train.shape[1],
            num_classes=y_train.shape[1],
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=FLAGS.embedding_dim,
            filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
            num_filters=FLAGS.num_filters,
            l2_reg_lambda=FLAGS.l2_reg_lambda)

        # Define Training procedure
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # Keep track of gradient values and sparsity (optional)
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name.replace(':','_')), g)
                sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name.replace(':','_')), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.summary.merge(grad_summaries)

        # Output directory for models and summaries
        timestamp = str(int(time.time()))
        current_dir = os.path.dirname(os.path.realpath(__file__))
        out_dir = os.path.abspath(os.path.join(current_dir, "runs", timestamp))
        print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.summary.scalar("loss", cnn.loss)
        acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

        # Train Summaries
        train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

        # Dev summaries
        dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

        # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

        # Write vocabulary
        vocab_processor.save(os.path.join(out_dir, "vocab"))

        # Initialize all variables
        sess.run(tf.global_variables_initializer())


        # Carrega o modelo já treinado anteriormente
        # load_checkpoint_from = os.path.join(".\\runs\\1499531641\\checkpoints\\", 'model-1800')
        # saver = tf.train.import_meta_graph("{}.meta".format(load_checkpoint_from))
        # saver.restore(sess, load_checkpoint_from)
        # print("Loading checkpoint from {}\n".format(load_checkpoint_from))

        def train_step(x_batch, y_batch):
            """
            A single training step
            """
            feed_dict = {
                cnn.input_x: x_batch,
                cnn.input_y: y_batch,
                cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
            }
            _, step, summaries, loss, accuracy = sess.run(
                [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            train_summary_writer.add_summary(summaries, step)


        def dev_step(x_batch, y_batch, writer=None):
            """
            Evaluates model on a dev set
            """
            feed_dict = {
                cnn.input_x: x_batch,
                cnn.input_y: y_batch,
                cnn.dropout_keep_prob: 1.0
            }
            step, summaries, loss, accuracy, y_preds = sess.run(
                [global_step, dev_summary_op, cnn.loss, cnn.accuracy, cnn.predictions],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            # print(x_batch)
            # print(y_batch)
            # print(y_preds)
            # print(x_batch.shape)
            # print(vocab_dict_int_2_word)
            # vocab_dict_word_2_int
            for i in range(x_batch.shape[0]):
                frase = ""
                categoria_original = ""
                categoria_predicao = ""
                for w in range(x_batch.shape[1]):
                    frase += vocab_dict_int_2_word[x_batch[i][w]] + " "
                categoria_original = "".join(["" if a == 0 else b for (a, b) in zip(y_batch[i], categx)])

                print("Sample: {}\t -- categ_original: {}({})\t categ_predicao: {}({})".format(
                    frase.encode(sys.stdout.encoding, errors='replace'), categx2int[categoria_original],
                    categoria_original, y_preds[i], categx[y_preds[i]]))

            if writer:
                writer.add_summary(summaries, step)

            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))


        # Generate batches
        batches = data_helpers.batch_iter(
            list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)

        print("Num Total Iters:", (int((len(x_train) - 1) / FLAGS.batch_size) + 1) * FLAGS.num_epochs)

        # Training loop. For each batch...
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
            if current_step % FLAGS.evaluate_every == 0:
                print("\nEvaluation:")
                dev_step(x_dev, y_dev, writer=dev_summary_writer)
                print("")
            if current_step % FLAGS.checkpoint_every == 0:
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                print("Saved model checkpoint to {}\n".format(path))


Appending to train.py


In [46]:
%run -i train.py


Parameters:
ALLOW_SOFT_PLACEMENT=True
BATCH_SIZE=64
CHECKPOINT_DIR=./runs/1516586601/checkpoints/
CHECKPOINT_EVERY=10
DATA_FOLDER=./data
DEV_SAMPLE_PERCENTAGE=0.1
DROPOUT_KEEP_PROB=0.5
EMBEDDING_DIM=128
EVAL_TRAIN=True
EVALUATE_EVERY=100
FILTER_SIZES=2,3,4,5
L2_REG_LAMBDA=0.0
LOG_DEVICE_PLACEMENT=False
NUM_CHECKPOINTS=5
NUM_EPOCHS=50
NUM_FILTERS=128

Loading data...
Vocabulary Size: 86
Train/Dev split: 34/3
Writing to C:\Users\peter\Projects\NLU\src\runs\1516587033

Num Total Iters: 50
2018-01-22T00:10:39.006212: step 1, loss 3.21829, acc 0.294118
2018-01-22T00:10:39.061860: step 2, loss 2.08111, acc 0.382353
2018-01-22T00:10:39.107481: step 3, loss 1.93059, acc 0.529412
2018-01-22T00:10:39.148591: step 4, loss 0.957532, acc 0.676471
2018-01-22T00:10:39.193715: step 5, loss 0.387543, acc 0.794118
2018-01-22T00:10:39.234824: step 6, loss 0.910093, acc 0.705882
2018-01-22T00:10:39.278948: step 7, loss 0.413751, acc 0.882353
2018-01-22T00:10:39.330093: step 8, loss 0.51228, acc 0.882353


In [18]:
%%writefile eval.py
# -*- coding: utf-8 -*-
#!/usr/bin/env python

# baseado no original em: https://github.com/dennybritz/cnn-text-classification-tf/blob/master/eval.py

import os

import numpy as np
import tensorflow as tf
from tensorflow.contrib import learn
import csv
import sys


import data_helpers




# Parameters
# ==================================================

#reseting parameters
import argparse as _argparse
tf.flags._global_parser = _argparse.ArgumentParser()


# Data Parameters
tf.flags.DEFINE_string("data_folder", "./data", "Data source folder.")


# Eval Parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
tf.flags.DEFINE_string("checkpoint_dir", "", "Checkpoint directory from training run")
tf.flags.DEFINE_boolean("eval_train", True, "Evaluate on all training data")
tf.flags.DEFINE_string("x", "", "x (default: '')")

# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

# A variável FLAGs deve ser definida nos parâmetros de entrada do app
# O comando executado deve conter o checkpoint desejado, como no exemplo:
# python eval.py --eval_train --checkpoint_dir="./runs/1499500000/checkpoints/"
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()

# Caso FLAG não seja definido inicialmente
# checkpoint_dir = último timestamp
if FLAGS.checkpoint_dir == "":
    print("Flag não declarada.")
    this_file_path = os.path.dirname(os.path.realpath(__file__))
    runs = os.listdir(os.path.join(this_file_path, "runs"))
    runs = [os.path.join(this_file_path+"\\runs", f) for f in runs]
    runs.sort(key=lambda x: os.path.getmtime(x))
    last_run = runs[-1]
    print("Usando run '"+str(last_run)+"' no lugar.")
    checkpoint_file = os.path.join(this_file_path, "runs", last_run, "checkpoints")
    print(checkpoint_file)
    FLAGS.checkpoint_dir = checkpoint_file

print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# CHANGE THIS: Load data. Load your own data here
if FLAGS.eval_train:
    categs_raw, x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.data_folder)
    y_test = np.argmax(y_test, axis=1)
else:
    #categs_raw = ['categ1', 'categ2']
    #x_raw = ["a masterpiece four years in the making", "everything is off."]
    #y_test = [1, 0]
    categs_raw, _, _ = data_helpers.load_data_and_labels(FLAGS.data_folder)
    x_raw = [FLAGS.x]
    y_test = [0]
    
categx2int = { c : i for (i, c) in enumerate(categs_raw)}


# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

# Extract word:id mapping from the object.
vocab_dict_word_2_int = vocab_processor.vocabulary_._mapping
vocab_dict_int_2_word = {i: w for w, i in vocab_dict_word_2_int.items()}

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Load the saved meta graph and restore variables
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)

        # Get the placeholders from the graph by name
        input_x = graph.get_operation_by_name("input_x").outputs[0]
        # input_y = graph.get_operation_by_name("input_y").outputs[0]
        dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]

        # Tensors we want to evaluate
        predictions = graph.get_operation_by_name("output/predictions").outputs[0]

        # Generate batches for one epoch
        batches = data_helpers.batch_iter(list(zip(x_test, y_test)), FLAGS.batch_size, 1, shuffle=False)

        # Collect the predictions here
        all_predictions = []

        for batch in batches:
            x_test_batch, y_test_batch = zip(*batch)
            batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0})
            all_predictions = np.concatenate([all_predictions, batch_predictions])
            
            #print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            #print(x_test_batch)
            #print(y_test_batch)
            #print(batch_predictions)
            #print(x_batch.shape)
            #print(vocab_dict_int_2_word)
            #vocab_dict_word_2_int
            for i in range(len(x_test_batch[:10])):

                frase = ""
                categoria_predicao = ""
                for w in range(x_test_batch[0].shape[0]):
                    frase += (vocab_dict_int_2_word[x_test_batch[i][w]] if vocab_dict_int_2_word[x_test_batch[i][
                        w]] != "<UNK>" else "") + " "
                categoria_original = categs_raw[y_test_batch[i]]
                if FLAGS.eval_train:
                    if categx2int[categoria_original] != batch_predictions[i]:
                        print("FALSE MATCH: {}\t -- original: {}({})\t predicao: {}({})".format(frase.encode(sys.stdout.encoding, errors='replace'),
                            categx2int[categoria_original], categoria_original, batch_predictions[i], categs_raw[batch_predictions[i]]))
                else:
                    print("PREDICTION {}\t predicao: {}({})".format(frase.encode(sys.stdout.encoding, errors='replace'),
                            batch_predictions[i], categs_raw[batch_predictions[i]]))
                

# Print accuracy if y_test is defined
if y_test is not None:
    correct_predictions = float(sum(all_predictions == y_test))
    print("Total number of test examples: {}".format(len(y_test)))
    if FLAGS.eval_train:
        print("Accuracy: {:g}".format(correct_predictions / float(len(y_test))))

# Save the evaluation to a csv
predictions_human_readable = np.column_stack((np.array(x_raw), all_predictions))
out_path = os.path.join(FLAGS.checkpoint_dir, "..", "prediction.csv")
print("Saving evaluation to {0}".format(out_path))

import codecs
with codecs.open(out_path, 'w', "utf-8") as f:
    csv.writer(f).writerows(predictions_human_readable)

Overwriting eval.py


In [10]:
%run -i eval.py --eval_train=False --checkpoint_dir="./runs/1516586601/checkpoints/" --x="como faco um investimento?"


Parameters:
ALLOW_SOFT_PLACEMENT=True
BATCH_SIZE=64
CHECKPOINT_DIR=./runs/1516586601/checkpoints/
DATA_FOLDER=./data
EVAL_TRAIN=False
LOG_DEVICE_PLACEMENT=False
X=como faco um investimento?


Evaluating...

INFO:tensorflow:Restoring parameters from C:\Users\peter\Projects\NLU\src\runs\1516586601\checkpoints\model-10
PREDICTION b'como faco um investimento           '	 predicao: 2(Informação Investimento.txt)
Total number of test examples: 1
Accuracy: 0
Saving evaluation to ./runs/1516586601/checkpoints/..\prediction.csv


In [50]:
%run -i eval.py --eval_train=True --checkpoint_dir="./runs/1516586601/checkpoints/" --x="teste"


Parameters:
ALLOW_SOFT_PLACEMENT=True
BATCH_SIZE=64
CHECKPOINT_DIR=./runs/1516586601/checkpoints/
CHECKPOINT_EVERY=10
DATA_FOLDER=./data
DEV_SAMPLE_PERCENTAGE=0.1
DROPOUT_KEEP_PROB=0.5
EMBEDDING_DIM=128
EVAL_TRAIN=True
EVALUATE_EVERY=100
FILTER_SIZES=2,3,4,5
L2_REG_LAMBDA=0.0
LOG_DEVICE_PLACEMENT=False
NUM_CHECKPOINTS=5
NUM_EPOCHS=50
NUM_FILTERS=128


Evaluating...

INFO:tensorflow:Restoring parameters from C:\Users\peter\Projects\NLU\src\runs\1516586601\checkpoints\model-10
FALSE MATCH: b'mostra minha fatura            '	 -- original: 0(Consulta Fatura.txt)	 predicao: 2(Informação Investimento.txt)
Total number of test examples: 37
Accuracy: 0.972973
Saving evaluation to ./runs/1516586601/checkpoints/..\prediction.csv


In [None]:
# %load ./runs/1516586601/checkpoints/..\prediction.csv
"fatura digital , me mostra \?",0.0
me mostra a fatura do cartao,0.0
qria ver minha fatura do cartao,0.0
mostra minha fatura \?,2.0
queo ver minha fatura digital,0.0
"pra ver a fatura , voce me mostra \?",0.0
traz a fatura digital pra mim,0.0
poe a fatura na tela \?,0.0
tem como ver mostrar fatura agora \?,0.0
quero ver minha fatura desse mes,0.0
preciso ver meus gastos do mes passado na fatura digital,0.0
quero ver os gastos do cartao agora,0.0
mostra na tela os gastos no cartao \?,0.0
"oie , gostaria de saber sobre a minha fatura digital pode me ajudar \?",1.0
tem informacoes da fatura digital \? onde posso encontrar \?,1.0
em que lugar do site consigo informacoes sobre quanto gastei no cartao \?,1.0
onde tem informacoes sobre o que gastei no cartao \?,1.0
tem mais informacoes sobre fatura digital \?,1.0
tem jeito de ver a fatura no site \?,1.0
pode mandar a fatura por e mail ou nao \?,1.0
como que faz pra saber mais sobre a fatutra do cartao \?,1.0
tem jeito de informar mais sobre a fatura digital,1.0
como contrato o servico de fatura digital \?,1.0
onde encontro informacoes sobre a fatura do cartao \?,1.0
pode me falar mais sobre fatura digital \?,1.0
o que e a fatura digital \?,1.0
"e os gastos do cartao , como faco pra ver \?",1.0
onde invisto \?,2.0
como faco pra investir na poupanca \?,2.0
como investir em acoes \?,2.0
tem como investir em acoes \?,2.0
poupanca e um bom investimento \?,2.0
"e se eu investir em titulos , quanto que rende \?",2.0
rende muito investir em poupanca \?,2.0
qual o rendimento de titulos publicos \?,2.0
voce pode me explicar sobre investimentos \?,2.0
sabe de investimentos \?,2.0
