In [1]:
from __future__ import print_function
from __future__ import division
from importlib import reload

from nltk.tokenize.treebank import TreebankWordTokenizer
import collections
import nltk
import numpy as np
import glob
import math

import pandas as pd
import tensorflow as tf
import os, sys, re, json, time, datetime, shutil
import random

# Helper libraries
from w266_common import utils, vocabulary
from w266_common import patched_numpy_io

from sentence import Sentence
from embedding import *
from training_set import TrainingSet 

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


In [2]:
KID_INAPPROPRIATE = 0
KID_APPROPRIATE = 1

labels = [KID_INAPPROPRIATE, KID_APPROPRIATE]


In [3]:
book_files = glob.glob('/home/peterg/mids-w266-final-project/final_proj/books/*.txt')
fourchan_files = glob.glob('/home/peterg/mids-w266-final-project/final_proj/4chan/*.txt')
single_4chan_file = ["/home/peterg/mids-w266-final-project/final_proj/4chan/4chan_s4s.txt",]
single_simple_wiki_file = ["/home/peterg/mids-w266-final-project/final_proj/wiki/simple.txt",]
single_normal_wiki_file = ["/home/peterg/mids-w266-final-project/final_proj/wiki/normal.txt",]

In [4]:
NUM_EXAMPLES = 1500
# NUM_EXAMPLES = 150000
MAX_VOCAB = 70000
PAD_LEN = 40
tokenizer = TreebankWordTokenizer()

test_frac = 0.2
train_frac = 0.8

In [5]:
comment_set = TrainingSet(book_files, single_4chan_file, tokenizer, NUM_EXAMPLES)
wiki_set = TrainingSet(single_simple_wiki_file, single_normal_wiki_file, tokenizer, NUM_EXAMPLES)


In [6]:
comment_set.generate_vocab(MAX_VOCAB)
wiki_set.generate_vocab(MAX_VOCAB)

In [7]:
comment_set.prep_sents(PAD_LEN)
wiki_set.prep_sents(PAD_LEN)


In [8]:
x = comment_set.pos_sentences[0]
print(x.tokens, x.padded_tokens, x.token_ids, x.padded_tokens_ids)

print("\n\n")
x = wiki_set.pos_sentences[0]
print(x.tokens, x.padded_tokens, x.token_ids, x.padded_tokens_ids)

['the', 'song', 'of', 'hiawatha', 'henry', 'w.', 'longfellow', 'contents', 'introductory', 'note', 'introduction', 'i', '.'] ['the', 'song', 'of', 'hiawatha', 'henry', 'w.', 'longfellow', 'contents', 'introductory', 'note', 'introduction', 'i', '.', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'] [5, 237, 7, 29, 1362, 2204, 1141, 2205, 1705, 1363, 1706, 24, 8] [5, 237, 7, 29, 1362, 2204, 1141, 2205, 1705, 1363, 1706, 24, 8, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]



['it', 'is', 'the', 'county', 'seat', 'of', 'alfalfa', 'county', '.'] ['it', 'is', 'the', 'county', 'seat', 'of', 'alfalfa', 'county', '.', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', 

In [9]:
test_frac = 0.2
train_frac = 0.8

comment_set.divide_test_train(test_frac, train_frac)
wiki_set.divide_test_train(test_frac, train_frac)

comment_set.prep_sets()
wiki_set.prep_sets()

In [10]:
import models;

# ts = comment_set
ts = wiki_set

x, ns, y, ext_y = ts.trains

# Training params
batch_size = 64
model_params = dict(V=ts.vocab, embed_dim=90, hidden_dims=[50,25], num_classes=len(labels),
                    encoder_type='bow',
                    lr=0.1, optimizer='adagrad', beta=0.01)
model_fn = models.classifier_model_fn

# training
total_batches = 0
total_examples = 0
total_loss = 0
loss_ema = np.log(2)  # track exponential-moving-average of loss
ema_decay = np.exp(-1/10)  # decay parameter for moving average = np.exp(-1/history_length)
with tf.Graph().as_default(), tf.Session() as sess:
    ##
    # Construct the graph here. No session.run calls - just wiring up Tensors.
    ##
    # Add placeholders so we can feed in data.
    x_ph_  = tf.placeholder(tf.int32, shape=[None, x.shape[1]])  # [batch_size, max_len]
    ns_ph_ = tf.placeholder(tf.int32, shape=[None])              # [batch_size]
    y_ph_  = tf.placeholder(tf.int32, shape=[None])              # [batch_size]
    
    # Construct the graph using model_fn
    features = {"ids": x_ph_, "ns": ns_ph_}  # note that values are Tensors
    estimator_spec = model_fn(features, labels=y_ph_, mode=tf.estimator.ModeKeys.TRAIN,
                              params=model_params)
    loss_     = estimator_spec.loss
    train_op_ = estimator_spec.train_op
    
    ##
    # Done constructing the graph, now we can make session.run calls.
    ##
    sess.run(tf.global_variables_initializer())
    
    # Run a single epoch
    t0 = time.time()
    
    for (bx, bns, by) in utils.multi_batch_generator(batch_size, x, ns, y):
        # feed NumPy arrays into the placeholder Tensors
        feed_dict = {x_ph_: bx, ns_ph_: bns, y_ph_: by}
        batch_loss, _ = sess.run([loss_, train_op_], feed_dict=feed_dict)
        
        # Compute some statistics
        total_batches += 1
        total_examples += len(bx)
        total_loss += batch_loss * len(bx)  # re-scale, since batch loss is mean
        # Compute moving average to smooth out noisy per-batch loss
        loss_ema = ema_decay * loss_ema + (1 - ema_decay) * batch_loss
        
        if (total_batches % 25 == 0):
            print("{:5,} examples, moving-average loss {:.2f}".format(total_examples, 
                                                                      loss_ema))    
    print("Completed one epoch in {:s}".format(utils.pretty_timedelta(since=t0)))


1,600 examples, moving-average loss 1.79
Completed one epoch in 0:00:00


In [11]:
# Specify model hyperparameters as used by model_fn
model_params = dict(V=ts.vocab, embed_dim=70, hidden_dims=[75,50,25], num_classes=2,
                    encoder_type='bow',
                    lr=0.1, optimizer='adagrad', beta=0.01)

checkpoint_dir = "/tmp/tf_bow_sst_" + datetime.datetime.now().strftime("%Y%m%d-%H%M")
if os.path.isdir(checkpoint_dir):
    shutil.rmtree(checkpoint_dir)
# Write vocabulary to file, so TensorBoard can label embeddings.
# creates checkpoint_dir/projector_config.pbtxt and checkpoint_dir/metadata.tsv
ts.vocab.write_projector_config(checkpoint_dir, "Encoder/Embedding_Layer/W_embed")

model = tf.estimator.Estimator(model_fn=models.classifier_model_fn, 
                               params=model_params,
                               model_dir=checkpoint_dir)
print("")
print("To view training (once it starts), run:\n")
print("    tensorboard --logdir='{:s}' --port 6006".format(checkpoint_dir))
print("\nThen in your browser, open: http://localhost:6006")

Vocabulary (7,969 words) written to '/tmp/tf_bow_sst_20180416-0220/metadata.tsv'
Projector config written to /tmp/tf_bow_sst_20180416-0220/projector_config.pbtxt
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tf_bow_sst_20180416-0220', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f11d7201630>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

To view training (once it starts), run:

    tensorboard --logdir='/tmp/tf_bow_sst_20180416-0220' --port 6006

Then in your browser, open: http://localhost:6006


In [12]:
# Training params, just used in this cell for the input_fn-s
train_params = dict(batch_size=32, total_epochs=10, eval_every=5)
assert(train_params['total_epochs'] % train_params['eval_every'] == 0)

# Construct and train the model, saving checkpoints to the directory above.
# Input function for training set batches
# Do 'eval_every' epochs at once, followed by evaluating on the dev set.
# NOTE: use patch_numpy_io.numpy_input_fn instead of tf.estimator.inputs.numpy_input_fn
train_input_fn = patched_numpy_io.numpy_input_fn(
                    x={"ids": x, "ns": ns}, y=y,
                    batch_size=train_params['batch_size'], 
                    num_epochs=train_params['eval_every'], shuffle=True, seed=42
                 )


In [13]:
# Input function for dev set batches. As above, but:
# - Don't randomize order
# - Iterate exactly once (one epoch)
dev_x, dev_ns, dev_y, dev_y_ext = ts.devs
dev_input_fn = tf.estimator.inputs.numpy_input_fn(
                    x={"ids": dev_x, "ns": dev_ns}, y=dev_y,
                    batch_size=128, num_epochs=1, shuffle=False
                )

for _ in range(train_params['total_epochs'] // train_params['eval_every']):
    # Train for a few epochs, then evaluate on dev
    model.train(input_fn=train_input_fn)
    eval_metrics = model.evaluate(input_fn=dev_input_fn, name="dev")

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tf_bow_sst_20180416-0220/model.ckpt.
INFO:tensorflow:loss = 1.822403, step = 1
INFO:tensorflow:global_step/sec: 79.7305
INFO:tensorflow:loss = 1.3861363, step = 101 (1.258 sec)
INFO:tensorflow:global_step/sec: 81.3898
INFO:tensorflow:loss = 1.3492652, step = 201 (1.229 sec)
INFO:tensorflow:Saving checkpoints for 300 into /tmp/tf_bow_sst_20180416-0220/model.ckpt.
INFO:tensorflow:Loss for final step: 1.1903294.
INFO:tensorflow:Starting evaluation at 2018-04-16-02:20:24
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20180416-0220/model.ckpt-300
INFO:tensorflow:Finished evaluation at 2018-04-16-02:20:24
INFO:tensorflow:Saving dict for global step 300: accuracy = 0.475, cross_entropy_loss = 0.7243004, global_step = 300, loss = 1.1484821
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20180416-0220/model.ckpt-300
INFO:tensorflow:Savin

In [14]:
test_arr = [("ass and titties", KID_INAPPROPRIATE),
("fuck the police", KID_INAPPROPRIATE),
("cats are nice", KID_APPROPRIATE),
("one day I saw a horse on a hill and I liked it", KID_APPROPRIATE),
("mrs. tayler is a whore", KID_INAPPROPRIATE),
("I'd like to take you someplace nice and quiet", KID_INAPPROPRIATE),
("You gay ? Proceeds to chase him that's how bullying works", KID_INAPPROPRIATE),
("transfer responsibility dad temperature earn voter impossible radiation.", KID_APPROPRIATE),
("JADE CHYNOWETH AND JOSH KILLACKY LOVETEAM PLS", KID_APPROPRIATE),
("Lol, looks like the same generic crap you come to expect from these types of shows.  Guess it doesn't get old for some people.", KID_INAPPROPRIATE),
("Disney sold the rights to this?", KID_APPROPRIATE),
("Only step up 1 and 2 were good. All of the other ones have just been corny.", KID_APPROPRIATE),
("i rather not call it step up, its another level with different camera techniques and its so Channing Tatum style", KID_APPROPRIATE),
("I love the series its very good, im waiting for 2 one", KID_APPROPRIATE),
("GREAT", KID_APPROPRIATE),
("Job negotiate set alternative little introduction apparent crazy proper used care free.", KID_APPROPRIATE),
("Oxygen identify member dependent translate else card might handful.", KID_APPROPRIATE),
("Release accompany pole general something widely fly cup detective personnel.", KID_APPROPRIATE),
("Silly largely obstacle warrior charge flavor diabetes medal.", KID_APPROPRIATE),
("We need moose back", KID_APPROPRIATE),
("Exciting time to be alive bro watching our Prez Trump vs. the DeepState/Swamp that's the deal profs to him.", KID_APPROPRIATE),
("Idk if I want to watch it or not : ^/", KID_APPROPRIATE),
("Maximum testing blanket absolutely shock until actress sex liability.", KID_INAPPROPRIATE),
("3 was the best.", KID_APPROPRIATE),
("I knew they would have a all men gay dance group", KID_INAPPROPRIATE),
("Same old garbage; I'm sick of it !", KID_APPROPRIATE),
("GAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAYYYYYYYYYYYYYYYYYYYYYYYYY",KID_INAPPROPRIATE),
("Millenials = The worst generation",KID_APPROPRIATE),
("that opening statement is the most overused sentence i've seen",KID_APPROPRIATE),
("Pan spokesman via guard campaign characteristic movement expert attend garage.",KID_APPROPRIATE),
("ayyy https://youtu.be/wZwyQZDfyHY",KID_APPROPRIATE),
("WELP....the end of the step up franchise met a very disappointing end. Jesus this looks worse than most dumpster fires Ive seen",KID_APPROPRIATE),
("blacks that look white and wiggas.  yeah, looks great. (not!)",KID_INAPPROPRIATE),
("Rich And Diverse artistry- Advocating White Genocide. How is enrichment going on in South Africa?",KID_INAPPROPRIATE),
("step up <3",KID_APPROPRIATE),
("When is episode 5 coming out",KID_APPROPRIATE),
("This series is actually pretty good...watched it and was so happy I gave it a shot; I loved it! The storyline is actually stronger than I expected, and especially where they left off this first season, I'm so excited for what's to come; I hope they keep up the good work and produce even more (authentic stuff next season.",KID_APPROPRIATE),
("https://vk.com/club162181675 вступайте",KID_APPROPRIATE),
("lunch argument across expansion free govern healthy afternoon.",KID_APPROPRIATE),
("Modern day version of Fame!",KID_APPROPRIATE),
("Neyo!",KID_APPROPRIATE),
("I wasn't expecting this from YouTube but it's really good",KID_APPROPRIATE),
("The",KID_APPROPRIATE),
("what garbage i this want to be step up is more like stomp the yard",KID_APPROPRIATE),
("if you're actually thinking of watching this show... do it. i recommend it. it's so much more than what we ever got to see from step up or any dance movies at all. so much diversity & so many topics it's talking about. i was surprised at how much i actually enjoyed the show & how bad i need a (second season. just give it a try since the first 4 episodes are for free on here, i don't think y'all are gonna regret it",KID_APPROPRIATE),
("Aren’t u guys releasing more seasons and episodes??? we still need more of ‘em",KID_APPROPRIATE),
("Have Chris brown in the step up movies it’ll be dope",KID_APPROPRIATE),
("http://hikmatblogs.blogspot.com/2018/02/10-ways-to-improve-your-finances-today.html",KID_APPROPRIATE),
("No Moose no Step Up",KID_APPROPRIATE),
("Let me guess?? Suburban girl trying to prove her gangsta and most of the black men are gay.... WHACK!!!",KID_INAPPROPRIATE),
("What, no moose cameo ?",KID_APPROPRIATE),
("trash.",KID_APPROPRIATE),
("Why does everything have to be so gay  won't be watching this sick of all our black men in tv series being gay smh",KID_INAPPROPRIATE),
("Who's that girl on the trailer cover?",KID_APPROPRIATE),
("Atlanta is a n!66er filled cesspool.. i can't figure out why parents stopped teaching their children white from wrong.. why today's youth wants grey babies.. why ruin family pictures with those half n!66e kinky haired flat nosed kids..  why these fathers aren't teaching their daughters to stay off the chimpdicks.. i mean, for real, how low does your self esteem have to be to want to share your beautiful young white body with these savage nigloyds... i'm sorry your fathers have failed you girls.. I taught mine white from wrong.. my grandkids will be bright white with blonde hair and blue eyes.. not gray with kinky rugs and flat noses.. just the idea of beautiful young white girls with sweaty, greasy n!66ers up on them just makes me wanna puke.. parents.. it's time to start teaching some basic family values again.. not the kardashian kind where they just keep getting impregnated by stray chimps.. it's just disgusting... let's clean up and keep the white blood lines free from contaminants..  you young white girls... just say no.. find a decent white boy to make babies with.. you can't tell me that a n!66er is all you can get..  well, maybe the fat ugly girls.. but even you can do better.. find a fat ugly white guy..  give him that poontang .. not these foot stomping, drum banging monkeys .. talentless rappers...  all wanna pretend they are in the music business.. rap isn't music.. any tar baby can get a drum machine and call it a beat lab.. then they steal a macbook from a white girl.. and start their career as the white girl's baby daddy.. she says he's a musician.. but he's really just another worthless n166er  he will keep her on welfare.. just a waste.. february is black history month.. the other 11 is caucasian (history months.. we will celebrate by taking our women back..",KID_INAPPROPRIATE),
("i run them streetsXD",KID_APPROPRIATE),
("my answer is....hell noXD",KID_APPROPRIATE),
("Sail star diamond rate working love pond size that monument prevention celebrate.",KID_APPROPRIATE),
("So is Kevin Bacon they're Jesus or something?",KID_APPROPRIATE),
("Can someone please give me the name of the girl on the thumbnail please ?????",KID_APPROPRIATE),
("I would rather be prison raped than watch 5seconds of this garbage",KID_INAPPROPRIATE),
("Prosecution tool before endless visit dump shake sake remarkable hurt safe public.",KID_APPROPRIATE),
("total grief slave esyvbp question fourth fun basic fly sacrifice reply.",KID_INAPPROPRIATE),
("As she named the Empress, Anna Pávlovna’s face suddenly assumed an expression of profound and sincere devotion and respect mingled with sadness, and this occurred every time she mentioned her illustrious patroness. ", KID_INAPPROPRIATE),
("She added that Her Majesty had deigned to show Baron Funke beaucoup d’estime, and again her face clouded over with sadness.", KID_INAPPROPRIATE),
("The prince was silent and looked indifferent. ", KID_APPROPRIATE),
("But, with the womanly and courtierlike quickness and tact habitual to her, Anna Pávlovna wished both to rebuke him (for daring to speak as he had done of a ma ", KID_INAPPROPRIATE),
("recommended to the Empress) and at the same time to console him, so she said ", KID_INAPPROPRIATE)]

In [15]:

test_set = TrainingSet()

test_set.load_test_arr(test_arr,tokenizer)
test_set.set_vocab(ts.vocab)
test_set.prep_sents(PAD_LEN)
test_set.prep_sets()


sd_test_x, sd_test_ns, sd_test_y, sd_test_y_ext = ts.sds
test_x, test_ns, test_y, test_y_ext = test_set.ext_tests


In [16]:
test_input_fn = tf.estimator.inputs.numpy_input_fn(
                    x={"ids": sd_test_x, "ns": sd_test_ns}, y=sd_test_y,
                    batch_size=128, num_epochs=1, shuffle=False
                )

eval_metrics = model.evaluate(input_fn=test_input_fn, name="eval")

#### END(YOUR CODE) ####
print("Accuracy on same domain test set: {:.02%}".format(eval_metrics['accuracy']))
eval_metrics

INFO:tensorflow:Starting evaluation at 2018-04-16-02:20:32
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20180416-0220/model.ckpt-600
INFO:tensorflow:Finished evaluation at 2018-04-16-02:20:32
INFO:tensorflow:Saving dict for global step 600: accuracy = 0.385, cross_entropy_loss = 1.0315592, global_step = 600, loss = 1.3168074
Accuracy on same domain test set: 38.50%


{'accuracy': 0.385,
 'cross_entropy_loss': 1.0315592,
 'global_step': 600,
 'loss': 1.3168074}

In [17]:
test_input_fn = tf.estimator.inputs.numpy_input_fn(
                    x={"ids": test_x, "ns": test_ns}, y=test_y,
                    batch_size=128, num_epochs=1, shuffle=False
                )

eval_metrics = model.evaluate(input_fn=test_input_fn, name="eval")

#### END(YOUR CODE) ####
print("Accuracy on generalized test set: {:.02%}".format(eval_metrics['accuracy']))
eval_metrics

INFO:tensorflow:Starting evaluation at 2018-04-16-02:20:33
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20180416-0220/model.ckpt-600
INFO:tensorflow:Finished evaluation at 2018-04-16-02:20:33
INFO:tensorflow:Saving dict for global step 600: accuracy = 0.5294118, cross_entropy_loss = 0.7024518, global_step = 600, loss = 0.9120548
Accuracy on generalized test set: 52.94%


{'accuracy': 0.5294118,
 'cross_entropy_loss': 0.7024518,
 'global_step': 600,
 'loss': 0.9120548}

In [18]:
from sklearn.metrics import accuracy_score
predictions = list(model.predict(test_input_fn))  # list of dicts
y_pred = [p['max'] for p in predictions]
acc = accuracy_score(y_pred, test_y)
print("Accuracy on test set: {:.02%}".format(acc))

INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20180416-0220/model.ckpt-600
Accuracy on test set: 52.94%


# CNN

In [19]:
EMBED_DIM = 90
CELL_SIZE = 40
RNN_NUM = 3
learning_rate = 0
dropout_keep_prob = 0
lam = 1
train_embedding = False
embedding_path = ""
model_dir = "runs"
summaries_dir = "summary"

In [20]:
import tensorflow as tf
import numpy as np
import os
import time
import datetime

from text_cnn import TextCNN
from tensorflow.contrib import learn

# Parameters
# ==================================================

# Data loading params
tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation")
tf.flags.DEFINE_string("positive_data_file", "./data/rt-polaritydata/rt-polarity.pos", "Data source for the positive data.")
tf.flags.DEFINE_string("negative_data_file", "./data/rt-polaritydata/rt-polarity.neg", "Data source for the negative data.")

# Model Hyperparameters
tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)")
tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)")

# Training parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
tf.flags.DEFINE_integer("num_epochs", 10, "Number of training epochs (default: 10)")
tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")
tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")
tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")


Parameters:
ALLOW_SOFT_PLACEMENT=True
BATCH_SIZE=64
CHECKPOINT_EVERY=100
DEV_SAMPLE_PERCENTAGE=0.1
DROPOUT_KEEP_PROB=0.5
EMBEDDING_DIM=128
EVALUATE_EVERY=100
FILTER_SIZES=3,4,5
L2_REG_LAMBDA=0.0
LOG_DEVICE_PLACEMENT=False
NEGATIVE_DATA_FILE=./data/rt-polaritydata/rt-polarity.neg
NUM_CHECKPOINTS=5
NUM_EPOCHS=10
NUM_FILTERS=128
POSITIVE_DATA_FILE=./data/rt-polaritydata/rt-polarity.pos



In [21]:
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((len(data)-1)/batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]

In [22]:

x, _, y, y_ext = ts.trains
dev_x, _, dev_y, dev_y_ext = ts.trains


print(y_ext.shape)
print(x.shape)
print(ts.vocab.size)


(1920, 2)
(1920, 40)
7969


In [23]:
# Training
# ==================================================
last_path = ""
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        cnn = TextCNN(
            sequence_length=x.shape[1],
#             num_classes=y.shape[1],
            num_classes=len(labels),
            vocab=ts.vocab,
            embedding_size=FLAGS.embedding_dim,
            filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
            num_filters=FLAGS.num_filters,
            l2_reg_lambda=FLAGS.l2_reg_lambda)

        # Define Training procedure
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # Keep track of gradient values and sparsity (optional)
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.summary.merge(grad_summaries)

        # Output directory for models and summaries
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.summary.scalar("loss", cnn.loss)
        acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

        # Train Summaries
        train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

        # Dev summaries
        dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

        # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

        # Write vocabulary
        ts.vocab.write_flat_file(os.path.join(out_dir, "vocab"))

        # Initialize all variables
        sess.run(tf.global_variables_initializer())

        def train_step(x_batch, y_batch):
            """
            A single training step
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
            }
            _, step, summaries, loss, accuracy = sess.run(
                [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            train_summary_writer.add_summary(summaries, step)

        def dev_step(x_batch, y_batch, writer=None):
            """
            Evaluates model on a dev set
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: 1.0
            }
            step, summaries, loss, accuracy = sess.run(
                [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            if writer:
                writer.add_summary(summaries, step)

        # Generate batches
        batches = batch_iter(
            list(zip(x, y_ext)), FLAGS.batch_size, FLAGS.num_epochs)
        # Training loop. For each batch...
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
            if current_step % FLAGS.evaluate_every == 0:
                print("\nEvaluation:")
                dev_step(dev_x, dev_y_ext, writer=dev_summary_writer)
                print("")
            if current_step % FLAGS.checkpoint_every == 0:
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                print("Saved model checkpoint to {}\n".format(path))
                last_path = path

INFO:tensorflow:Summary name embedding/W:0/grad/hist is illegal; using embedding/W_0/grad/hist instead.
INFO:tensorflow:Summary name embedding/W:0/grad/sparsity is illegal; using embedding/W_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-3/W:0/grad/hist is illegal; using conv-maxpool-3/W_0/grad/hist instead.
INFO:tensorflow:Summary name conv-maxpool-3/W:0/grad/sparsity is illegal; using conv-maxpool-3/W_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-3/b:0/grad/hist is illegal; using conv-maxpool-3/b_0/grad/hist instead.
INFO:tensorflow:Summary name conv-maxpool-3/b:0/grad/sparsity is illegal; using conv-maxpool-3/b_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-4/W:0/grad/hist is illegal; using conv-maxpool-4/W_0/grad/hist instead.
INFO:tensorflow:Summary name conv-maxpool-4/W:0/grad/sparsity is illegal; using conv-maxpool-4/W_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-4/b:0/grad/hist is illegal; using 

2018-04-16T02:21:03.905597: step 95, loss 0.563653, acc 0.75
2018-04-16T02:21:04.197422: step 96, loss 0.595382, acc 0.671875
2018-04-16T02:21:04.476307: step 97, loss 0.601663, acc 0.671875
2018-04-16T02:21:04.765507: step 98, loss 0.57107, acc 0.796875
2018-04-16T02:21:05.046015: step 99, loss 0.539371, acc 0.765625
2018-04-16T02:21:05.331472: step 100, loss 0.551375, acc 0.765625

Evaluation:
2018-04-16T02:21:07.411206: step 100, loss 0.521785, acc 0.827604

Saved model checkpoint to /home/peterg/mids-w266-final-project/final_proj/runs/1523845235/checkpoints/model-100

2018-04-16T02:21:07.911048: step 101, loss 0.537428, acc 0.796875
2018-04-16T02:21:08.261921: step 102, loss 0.567793, acc 0.703125
2018-04-16T02:21:08.652906: step 103, loss 0.508657, acc 0.875
2018-04-16T02:21:08.946237: step 104, loss 0.553952, acc 0.75
2018-04-16T02:21:09.237971: step 105, loss 0.547054, acc 0.78125
2018-04-16T02:21:09.530860: step 106, loss 0.565306, acc 0.6875
2018-04-16T02:21:09.822501: step 10

2018-04-16T02:21:43.790956: step 216, loss 0.284216, acc 0.84375
2018-04-16T02:21:44.073293: step 217, loss 0.228443, acc 0.890625
2018-04-16T02:21:44.355817: step 218, loss 0.233113, acc 0.90625
2018-04-16T02:21:44.636650: step 219, loss 0.221548, acc 0.859375
2018-04-16T02:21:44.915887: step 220, loss 0.291929, acc 0.859375
2018-04-16T02:21:45.199265: step 221, loss 0.250575, acc 0.84375
2018-04-16T02:21:45.481218: step 222, loss 0.244447, acc 0.921875
2018-04-16T02:21:45.766302: step 223, loss 0.332786, acc 0.8125
2018-04-16T02:21:46.047944: step 224, loss 0.240679, acc 0.875
2018-04-16T02:21:46.327267: step 225, loss 0.286028, acc 0.84375
2018-04-16T02:21:46.627604: step 226, loss 0.288934, acc 0.8125
2018-04-16T02:21:47.006869: step 227, loss 0.249216, acc 0.875
2018-04-16T02:21:47.295828: step 228, loss 0.314726, acc 0.828125
2018-04-16T02:21:47.576116: step 229, loss 0.265601, acc 0.859375
2018-04-16T02:21:47.858757: step 230, loss 0.329729, acc 0.828125
2018-04-16T02:21:48.1411

In [24]:
test_x, _, test_y, test_y_ext = test_set.ext_tests
sd_test_x, _, sd_test_y, sd_test_y_ext = ts.sds

In [25]:
def test(test_x, test_y):
#     checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    checkpoint_file = path
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
          allow_soft_placement=FLAGS.allow_soft_placement,
          log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name("output/predictions").outputs[0]

            # Generate batches for one epoch
            batches = batch_iter(list(test_x), FLAGS.batch_size, 1, shuffle=False)

            # Collect the predictions here
            all_predictions = []

            for x_test_batch in batches:
                batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0})
                all_predictions = np.concatenate([all_predictions, batch_predictions])

    # Print accuracy if y_test is defined
    if test_y is not None:
        correct_predictions = float(sum(all_predictions == test_y))
        print("Total number of test examples: {}".format(len(test_y)))
        print("Accuracy: {:g}".format(correct_predictions/float(len(test_y))))

In [26]:
test(sd_test_x, sd_test_y)

INFO:tensorflow:Restoring parameters from /home/peterg/mids-w266-final-project/final_proj/runs/1523845235/checkpoints/model-300
Total number of test examples: 600
Accuracy: 0.34


In [27]:
test(test_x, test_y)

INFO:tensorflow:Restoring parameters from /home/peterg/mids-w266-final-project/final_proj/runs/1523845235/checkpoints/model-300
Total number of test examples: 68
Accuracy: 0.485294


-- https://www.tensorflow.org/tutorials/text_classification_with_tf_hub
-- https://github.com/dennybritz/cnn-text-classification-tf
-- With word2vec
-- https://github.com/cahya-wirawan/cnn-text-classification-tf