In [1]:
from __future__ import print_function
from __future__ import division
from importlib import reload

from nltk.tokenize.treebank import TreebankWordTokenizer
import collections
import nltk
import numpy as np
import glob
import math

import pandas as pd
import tensorflow as tf
import os, sys, re, json, time, datetime, shutil

# Helper libraries
from w266_common import utils, vocabulary
from w266_common import patched_numpy_io

  return f(*args, **kwds)


### Constants

In [2]:
# Binary class labels used for the training, dev and test targets.
KID_INAPPROPRIATE = 0  # negative class: not appropriate for kids
KID_APPROPRIATE = 1    # positive class: appropriate for kids

### Grab Data

In [3]:
# Root of the project data. Set KID_TEXT_DATA_DIR to run the notebook on another
# machine; the default is the original author's checkout.
DATA_DIR = os.environ.get(
    "KID_TEXT_DATA_DIR",
    "/home/peterg/2018-spring-main/assignment/final_proj",
)

# glob.glob() returns matches in arbitrary (filesystem) order. Sort the results so
# that slices such as book_files[:60] select the same files on every run.
book_files = sorted(glob.glob(os.path.join(DATA_DIR, "books", "*.txt")))
fourchan_files = sorted(glob.glob(os.path.join(DATA_DIR, "4chan", "*.txt")))

In [4]:
pos_examples = []
neg_examples = []

# Positive (appropriate for kids) examples: every line of the first 60 book
# files, which come from the children's bookshelf of Project Gutenberg.
for book_path in book_files[:60]:
    with open(book_path, "r") as book_handle:
        pos_examples.extend(book_handle)

# Negative (inappropriate for kids) training examples: every line of the
# "shit 4chan says" board dump from 4chan.org.
single_4chan_file = ["/home/peterg/2018-spring-main/assignment/final_proj/4chan/4chan_s4s.txt",]
for board_path in single_4chan_file:
    with open(board_path, 'r') as board_handle:
        neg_examples.extend(board_handle)

# Report how many raw lines were collected for each class.
print(len(pos_examples), len(neg_examples))

154277 2760399


### Tokenization

In [5]:
# Word-level tokenizer (NLTK's TreebankWordTokenizer), reused later for the test sentences.
tokenizer = TreebankWordTokenizer()

# Upper bound on how many lines of each class are tokenized; a class with
# fewer lines than this simply contributes all of them (slicing never overruns).
num_sents = 250000

pos_tokens = list(map(tokenizer.tokenize, pos_examples[:num_sents]))
neg_tokens = list(map(tokenizer.tokenize, neg_examples[:num_sents]))

In [6]:
# Eyeball the first five raw lines of each class (positive first, then negative).
for raw_lines in (pos_examples, neg_examples):
    print(raw_lines[:5])

['      THE SONG OF HIAWATHA       Henry W. Longfellow CONTENTS Introductory Note Introduction I.\n', 'The Peace-Pipe II.\n', 'The Four Winds III.\n', "Hiawatha's Childhood IV.\n", "Hiawatha and Mudjekeewis V.    Hiawatha's Fasting VI.\n"]
["'I only browse on iphone\n", " (check your desktop privilege) How do I create memes on mobile? Does anyone know of an app that is similar to ms paint? Any basic art app that is free and good?'\n", "'( ͡° ͜ʖ ͡°)'\n", "'epin; simply ebin'\n", "'shit not again'\n"]


### Canonicalization

In [7]:
# Canonicalize each tokenized line with the course helper
# (utils.canonicalize_words is applied to one token list at a time).
pos_tokens_canonical = list(map(utils.canonicalize_words, pos_tokens))
neg_tokens_canonical = list(map(utils.canonicalize_words, neg_tokens))

### Generate Vocabulary

In [8]:
# The vocabulary is built over both classes together.
all_tokens_canonical = pos_tokens_canonical + neg_tokens_canonical

# Flatten the list of token lists into one long token stream.
all_tokens_flattened = [
    token
    for sentence_tokens in all_tokens_canonical
    for token in sentence_tokens
]

vocab = vocabulary.Vocabulary(all_tokens_flattened, size=70000)  # size=None means unlimited
print("Vocabulary size: {:,}".format(vocab.size))

# Optional round-trip check of the id mapping (kept disabled):
# print("Vocabulary dict: ", vocab.word_to_id)
# x_ids = vocab.words_to_ids(all_tokens_flattened)
# print("x_ids =", x_ids)
# x_tokens_recovered = vocab.ids_to_words(x_ids)
# x_tokens_recovered

Vocabulary size: 70,000


### Generate training set

In [9]:
def pad_array(array, pad_len, pad_id=0):
    """Pad or truncate a sequence of ids to exactly ``pad_len`` entries.

    The input is never modified. The previous implementation appended the
    padding to the caller's list in place, so calling it twice on the same
    list reported ``pad_len`` as the "original" length the second time.

    Args:
        array: sequence of token ids (any sliceable sequence).
        pad_len: number of entries in the returned list.
        pad_id: filler value for the missing positions (default 0).

    Returns:
        A ``(padded, length)`` tuple. ``padded`` is a new list with exactly
        ``pad_len`` items. ``length`` is ``len(array)`` before any padding or
        truncation, so it is larger than ``pad_len`` when the input was cut.
    """
    length = len(array)
    # Copy (and truncate if needed) so the caller's list is left untouched.
    padded = list(array[:pad_len])
    # No-op when the input already filled pad_len entries.
    padded.extend([pad_id] * (pad_len - len(padded)))
    return padded, length

In [10]:
# Map every canonicalized token list to its list of vocabulary ids.
pos_ids = [vocab.words_to_ids(x) for x in pos_tokens_canonical]
neg_ids = [vocab.words_to_ids(x) for x in neg_tokens_canonical]

train_x, train_ns, train_y = [], [], []

# Every example is padded/truncated to this many token ids.
pad_len = 40

train_frac = 0.5
dev_frac = 0.2

# train_frac + dev_frac must be <= 1
assert(train_frac + dev_frac <= 1.0)

# Examples are added in positive/negative pairs, so size both splits from the
# smaller class; sizing from pos_ids alone would index past the end of neg_ids
# whenever the negative class is the shorter one.
num_pairs = min(len(pos_ids), len(neg_ids))
num_sents = math.floor(num_pairs * train_frac)
num_dev_sents = math.floor(num_pairs * dev_frac)

# Training set: pair i takes the i-th sequence from the FRONT of each class,
# adding one positive and one negative example per step.
for i in range(num_sents):
    if i % 10000 == 0:
        print("Done with", i)
    for ids, label in ((pos_ids[i], KID_APPROPRIATE),
                       (neg_ids[i], KID_INAPPROPRIATE)):
        padded, orig_len = pad_array(ids, pad_len)
        train_x.append(padded)
        train_ns.append(orig_len)
        train_y.append(label)


dev_x, dev_ns, dev_y = [], [], []

# Dev set: pair i takes the i-th sequence counted from the BACK of each class.
# The index must be -(i + 1): with a plain -i the first step reads index -0 == 0,
# i.e. the very first *training* pair, leaking it into the dev set and never
# using the last element of each list.
for i in range(num_dev_sents):
    if i % 10000 == 0:
        print("Done with", i)
    for ids, label in ((pos_ids[-(i + 1)], KID_APPROPRIATE),
                       (neg_ids[-(i + 1)], KID_INAPPROPRIATE)):
        padded, orig_len = pad_array(ids, pad_len)
        dev_x.append(padded)
        dev_ns.append(orig_len)
        dev_y.append(label)

# print(train_x[:3])
# print(vocab.ids_to_words(train_x[1]))
# print(train_ns[:3])
# print(train_y[:3])

# NOTE(review): despite its name, `batches` holds the number of training
# examples, not a batch count.
batches = len(train_x)
max_len = pad_len


Done with 0
Done with 10000
Done with 20000
Done with 30000
Done with 40000
Done with 50000
Done with 60000
Done with 70000
Done with 0
Done with 10000
Done with 20000
Done with 30000


In [11]:
# Convert the Python lists of the training split (ids, lengths, labels)
# into NumPy arrays.
train_x_np, train_ns_np, train_y_np = (
    np.array(column) for column in (train_x, train_ns, train_y)
)

# Same conversion for the dev split.
dev_x_np, dev_ns_np, dev_y_np = (
    np.array(column) for column in (dev_x, dev_ns, dev_y)
)

In [12]:
# Hand-rolled TensorFlow training loop: wires models.classifier_model_fn into a
# fresh graph, feeds the training arrays batch by batch, and prints a smoothed
# loss every 25 batches.
# reload() re-executes the already-imported `models` module so edits to it are
# picked up without restarting the kernel.
import models; reload(models)
# NOTE(review): these short module-level names are easy to clobber from other cells.
x, ns, y = train_x_np, train_ns_np, train_y_np
batch_size = 32


# Specify model hyperparameters as used by model_fn
model_params = dict(V=vocab.size, embed_dim=90, hidden_dims=[50,25], num_classes=2,
                    encoder_type='bow',
                    lr=0.1, optimizer='adagrad', beta=0.01)
model_fn = models.classifier_model_fn

# Running statistics, updated once per batch inside the loop below.
total_batches = 0
total_examples = 0
total_loss = 0  # accumulated below but not printed in this cell
# Starting value ln(2) — presumably the loss of a uniform guess over the two classes.
loss_ema = np.log(2)  # track exponential-moving-average of loss
ema_decay = np.exp(-1/10)  # decay parameter for moving average = np.exp(-1/history_length)
with tf.Graph().as_default(), tf.Session() as sess:
    ##
    # Construct the graph here. No session.run calls - just wiring up Tensors.
    ##
    # Add placeholders so we can feed in data.
    x_ph_  = tf.placeholder(tf.int32, shape=[None, x.shape[1]])  # [batch_size, max_len]
    ns_ph_ = tf.placeholder(tf.int32, shape=[None])              # [batch_size]
    y_ph_  = tf.placeholder(tf.int32, shape=[None])              # [batch_size]
    
    # Construct the graph using model_fn
    features = {"ids": x_ph_, "ns": ns_ph_}  # note that values are Tensors
    estimator_spec = model_fn(features, labels=y_ph_, mode=tf.estimator.ModeKeys.TRAIN,
                              params=model_params)
    # Only the loss tensor and the training op of the returned spec are used here.
    loss_     = estimator_spec.loss
    train_op_ = estimator_spec.train_op
    
    ##
    # Done constructing the graph, now we can make session.run calls.
    ##
    sess.run(tf.global_variables_initializer())
    
    # Run a single epoch
    t0 = time.time()
    # presumably yields aligned (ids, lengths, labels) slices of at most batch_size rows — verify in utils
    for (bx, bns, by) in utils.multi_batch_generator(batch_size, x, ns, y):
        # feed NumPy arrays into the placeholder Tensors
        feed_dict = {x_ph_: bx, ns_ph_: bns, y_ph_: by}
        # One optimizer step; the train op's own return value is discarded.
        batch_loss, _ = sess.run([loss_, train_op_], feed_dict=feed_dict)
        
        # Compute some statistics
        total_batches += 1
        total_examples += len(bx)
        total_loss += batch_loss * len(bx)  # re-scale, since batch loss is mean
        # Compute moving average to smooth out noisy per-batch loss
        loss_ema = ema_decay * loss_ema + (1 - ema_decay) * batch_loss
        
        # Report progress every 25 batches.
        if (total_batches % 25 == 0):
            print("{:5,} examples, moving-average loss {:.2f}".format(total_examples, 
                                                                      loss_ema))    
    print("Completed one epoch in {:s}".format(utils.pretty_timedelta(since=t0)))

  800 examples, moving-average loss 1.18
1,600 examples, moving-average loss 0.83
2,400 examples, moving-average loss 0.80
3,200 examples, moving-average loss 0.70
4,000 examples, moving-average loss 0.74
4,800 examples, moving-average loss 0.58
5,600 examples, moving-average loss 0.52
6,400 examples, moving-average loss 0.51
7,200 examples, moving-average loss 0.42
8,000 examples, moving-average loss 0.37
8,800 examples, moving-average loss 0.35
9,600 examples, moving-average loss 0.33
10,400 examples, moving-average loss 0.33
11,200 examples, moving-average loss 0.35
12,000 examples, moving-average loss 0.28
12,800 examples, moving-average loss 0.34
13,600 examples, moving-average loss 0.33
14,400 examples, moving-average loss 0.25
15,200 examples, moving-average loss 0.27
16,000 examples, moving-average loss 0.21
16,800 examples, moving-average loss 0.22
17,600 examples, moving-average loss 0.21
18,400 examples, moving-average loss 0.19
19,200 examples, moving-average loss 0.19
20,0

In [13]:
import models; reload(models)

# Hyperparameters consumed by the model_fn.
model_params = {
    "V": vocab.size,
    "embed_dim": 70,
    "hidden_dims": [75, 50, 25],
    "num_classes": 2,
    "encoder_type": 'bow',
    "lr": 0.1,
    "optimizer": 'adagrad',
    "beta": 0.01,
}

# Checkpoints go to a directory stamped to the minute; a leftover directory with
# the same stamp is removed first so the run starts clean.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M")
checkpoint_dir = "/tmp/tf_bow_sst_" + run_stamp
if os.path.isdir(checkpoint_dir):
    shutil.rmtree(checkpoint_dir)
# Write the vocabulary to file so TensorBoard can label embeddings
# (creates checkpoint_dir/projector_config.pbtxt and checkpoint_dir/metadata.tsv).
vocab.write_projector_config(checkpoint_dir, "Encoder/Embedding_Layer/W_embed")

model = tf.estimator.Estimator(
    model_fn=models.classifier_model_fn,
    params=model_params,
    model_dir=checkpoint_dir,
)

# Tell the reader how to watch training in TensorBoard.
for message in (
    "",
    "To view training (once it starts), run:\n",
    "    tensorboard --logdir='{:s}' --port 6006".format(checkpoint_dir),
    "\nThen in your browser, open: http://localhost:6006",
):
    print(message)

Vocabulary (70,000 words) written to '/tmp/tf_bow_sst_20180327-2137/metadata.tsv'
Projector config written to /tmp/tf_bow_sst_20180327-2137/projector_config.pbtxt
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tf_bow_sst_20180327-2137', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f2db998cf60>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

To view training (once it starts), run:

    tensorboard --logdir='/tmp/tf_bow_sst_20180327-2137' --port 6006

Then in your browser, open: http://localhost:6006


### Training

In [14]:
# Training params, just used in this cell for the input_fn-s
train_params = dict(batch_size=32, total_epochs=20, eval_every=5)
assert(train_params['total_epochs'] % train_params['eval_every'] == 0)

# Construct and train the model, saving checkpoints to the directory above.
# Input function for training set batches
# Do 'eval_every' epochs at once, followed by evaluating on the dev set.
# NOTE: use patch_numpy_io.numpy_input_fn instead of tf.estimator.inputs.numpy_input_fn
train_input_fn = patched_numpy_io.numpy_input_fn(
                    x={"ids": train_x_np, "ns": train_ns_np}, y=train_y_np,
                    batch_size=train_params['batch_size'], 
                    num_epochs=train_params['eval_every'], shuffle=True, seed=42
                 )



In [15]:
# Input function for dev set batches. As above, but:
# - Don't randomize order
# - Iterate exactly once (one epoch)
dev_features = {"ids": dev_x_np, "ns": dev_ns_np}
dev_input_fn = tf.estimator.inputs.numpy_input_fn(
    x=dev_features,
    y=dev_y_np,
    batch_size=128,
    num_epochs=1,
    shuffle=False,
)

# Alternate between a chunk of training and one evaluation on the dev set.
# Only the metrics of the final round remain in eval_metrics afterwards.
num_rounds = train_params['total_epochs'] // train_params['eval_every']
for _ in range(num_rounds):
    model.train(input_fn=train_input_fn)
    eval_metrics = model.evaluate(input_fn=dev_input_fn, name="dev")

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tf_bow_sst_20180327-2137/model.ckpt.
INFO:tensorflow:loss = 1.82126, step = 1
INFO:tensorflow:global_step/sec: 59.9164
INFO:tensorflow:loss = 0.73191, step = 101 (1.672 sec)
INFO:tensorflow:global_step/sec: 71.2278
INFO:tensorflow:loss = 0.418399, step = 201 (1.404 sec)
INFO:tensorflow:global_step/sec: 68.7876
INFO:tensorflow:loss = 0.292477, step = 301 (1.454 sec)
INFO:tensorflow:global_step/sec: 64.8233
INFO:tensorflow:loss = 0.26894, step = 401 (1.544 sec)
INFO:tensorflow:global_step/sec: 62.5292
INFO:tensorflow:loss = 0.218901, step = 501 (1.598 sec)
INFO:tensorflow:global_step/sec: 70.955
INFO:tensorflow:loss = 0.179217, step = 601 (1.409 sec)
INFO:tensorflow:global_step/sec: 70.9919
INFO:tensorflow:loss = 0.33531, step = 701 (1.408 sec)
INFO:tensorflow:global_step/sec: 71.4076
INFO:tensorflow:loss = 0.13961, step = 801 (1.401 sec)
INFO:tensorflow:global_step/sec: 70.3332
INFO:tensorflow

INFO:tensorflow:loss = 0.306516, step = 8301 (1.491 sec)
INFO:tensorflow:global_step/sec: 68.3291
INFO:tensorflow:loss = 0.118575, step = 8401 (1.464 sec)
INFO:tensorflow:global_step/sec: 67.3678
INFO:tensorflow:loss = 0.127384, step = 8501 (1.485 sec)
INFO:tensorflow:global_step/sec: 63.0802
INFO:tensorflow:loss = 0.291643, step = 8601 (1.584 sec)
INFO:tensorflow:global_step/sec: 61.8418
INFO:tensorflow:loss = 0.0927649, step = 8701 (1.618 sec)
INFO:tensorflow:global_step/sec: 62.7107
INFO:tensorflow:loss = 0.16933, step = 8801 (1.595 sec)
INFO:tensorflow:global_step/sec: 59.2764
INFO:tensorflow:loss = 0.0947926, step = 8901 (1.686 sec)
INFO:tensorflow:global_step/sec: 68.4779
INFO:tensorflow:loss = 0.110266, step = 9001 (1.461 sec)
INFO:tensorflow:global_step/sec: 68.5699
INFO:tensorflow:loss = 0.109463, step = 9101 (1.458 sec)
INFO:tensorflow:global_step/sec: 66.9974
INFO:tensorflow:loss = 0.104234, step = 9201 (1.492 sec)
INFO:tensorflow:global_step/sec: 68.2664
INFO:tensorflow:los

INFO:tensorflow:loss = 0.094587, step = 16601 (1.492 sec)
INFO:tensorflow:global_step/sec: 69.3807
INFO:tensorflow:loss = 0.0920861, step = 16701 (1.441 sec)
INFO:tensorflow:global_step/sec: 69.8942
INFO:tensorflow:loss = 0.102028, step = 16801 (1.431 sec)
INFO:tensorflow:global_step/sec: 69.21
INFO:tensorflow:loss = 0.0951169, step = 16901 (1.445 sec)
INFO:tensorflow:global_step/sec: 70.4526
INFO:tensorflow:loss = 0.0951219, step = 17001 (1.419 sec)
INFO:tensorflow:global_step/sec: 64.4283
INFO:tensorflow:loss = 0.0940232, step = 17101 (1.554 sec)
INFO:tensorflow:global_step/sec: 61.0432
INFO:tensorflow:loss = 0.0928903, step = 17201 (1.638 sec)
INFO:tensorflow:global_step/sec: 68.3677
INFO:tensorflow:loss = 0.0983105, step = 17301 (1.462 sec)
INFO:tensorflow:global_step/sec: 66.4525
INFO:tensorflow:loss = 0.0883767, step = 17401 (1.506 sec)
INFO:tensorflow:global_step/sec: 68.3418
INFO:tensorflow:loss = 0.121454, step = 17501 (1.461 sec)
INFO:tensorflow:global_step/sec: 67.6507
INFO:

INFO:tensorflow:loss = 0.0881441, step = 24107
INFO:tensorflow:global_step/sec: 53.283
INFO:tensorflow:loss = 0.0992826, step = 24207 (1.883 sec)
INFO:tensorflow:global_step/sec: 59.0611
INFO:tensorflow:loss = 0.0839459, step = 24307 (1.694 sec)
INFO:tensorflow:global_step/sec: 61.1848
INFO:tensorflow:loss = 0.1021, step = 24407 (1.633 sec)
INFO:tensorflow:global_step/sec: 58.1607
INFO:tensorflow:loss = 0.096521, step = 24507 (1.720 sec)
INFO:tensorflow:global_step/sec: 59.8408
INFO:tensorflow:loss = 0.0848337, step = 24607 (1.672 sec)
INFO:tensorflow:global_step/sec: 57.984
INFO:tensorflow:loss = 0.0991274, step = 24707 (1.724 sec)
INFO:tensorflow:global_step/sec: 61.7515
INFO:tensorflow:loss = 0.365957, step = 24807 (1.618 sec)
INFO:tensorflow:global_step/sec: 53.9429
INFO:tensorflow:loss = 0.105823, step = 24907 (1.854 sec)
INFO:tensorflow:global_step/sec: 60.941
INFO:tensorflow:loss = 0.108318, step = 25007 (1.642 sec)
INFO:tensorflow:global_step/sec: 59.7932
INFO:tensorflow:loss =

INFO:tensorflow:loss = 0.304508, step = 32407 (1.751 sec)
INFO:tensorflow:global_step/sec: 58.3115
INFO:tensorflow:loss = 0.130737, step = 32507 (1.717 sec)
INFO:tensorflow:global_step/sec: 61.0827
INFO:tensorflow:loss = 0.102515, step = 32607 (1.636 sec)
INFO:tensorflow:global_step/sec: 61.7995
INFO:tensorflow:loss = 0.167181, step = 32707 (1.619 sec)
INFO:tensorflow:global_step/sec: 67.0282
INFO:tensorflow:loss = 0.0950027, step = 32807 (1.491 sec)
INFO:tensorflow:global_step/sec: 61.287
INFO:tensorflow:loss = 0.091853, step = 32907 (1.631 sec)
INFO:tensorflow:global_step/sec: 69.0114
INFO:tensorflow:loss = 0.0914278, step = 33007 (1.449 sec)
INFO:tensorflow:global_step/sec: 69.1819
INFO:tensorflow:loss = 0.100931, step = 33107 (1.446 sec)
INFO:tensorflow:global_step/sec: 66.9411
INFO:tensorflow:loss = 0.0952016, step = 33207 (1.495 sec)
INFO:tensorflow:global_step/sec: 63.6204
INFO:tensorflow:loss = 0.0937562, step = 33307 (1.571 sec)
INFO:tensorflow:global_step/sec: 64.3807
INFO:te

INFO:tensorflow:global_step/sec: 67.9237
INFO:tensorflow:loss = 0.0932385, step = 40707 (1.471 sec)
INFO:tensorflow:global_step/sec: 66.8155
INFO:tensorflow:loss = 0.0895199, step = 40807 (1.497 sec)
INFO:tensorflow:global_step/sec: 61.627
INFO:tensorflow:loss = 0.0928513, step = 40907 (1.622 sec)
INFO:tensorflow:global_step/sec: 69.1186
INFO:tensorflow:loss = 0.100634, step = 41007 (1.446 sec)
INFO:tensorflow:global_step/sec: 66.816
INFO:tensorflow:loss = 0.0948818, step = 41107 (1.497 sec)
INFO:tensorflow:global_step/sec: 66.6894
INFO:tensorflow:loss = 0.0960813, step = 41207 (1.499 sec)
INFO:tensorflow:global_step/sec: 68.6018
INFO:tensorflow:loss = 0.0871315, step = 41307 (1.458 sec)
INFO:tensorflow:global_step/sec: 70.4233
INFO:tensorflow:loss = 0.0937586, step = 41407 (1.420 sec)
INFO:tensorflow:global_step/sec: 67.4795
INFO:tensorflow:loss = 0.0885613, step = 41507 (1.483 sec)
INFO:tensorflow:global_step/sec: 66.4095
INFO:tensorflow:loss = 0.151837, step = 41607 (1.505 sec)
INFO

INFO:tensorflow:loss = 0.0864086, step = 48213
INFO:tensorflow:global_step/sec: 66.0535
INFO:tensorflow:loss = 0.0984625, step = 48313 (1.518 sec)
INFO:tensorflow:global_step/sec: 66.2534
INFO:tensorflow:loss = 0.0833657, step = 48413 (1.510 sec)
INFO:tensorflow:global_step/sec: 68.9222
INFO:tensorflow:loss = 0.104249, step = 48513 (1.450 sec)
INFO:tensorflow:global_step/sec: 63.3666
INFO:tensorflow:loss = 0.092096, step = 48613 (1.578 sec)
INFO:tensorflow:global_step/sec: 63.0965
INFO:tensorflow:loss = 0.0839222, step = 48713 (1.586 sec)
INFO:tensorflow:global_step/sec: 68.655
INFO:tensorflow:loss = 0.098169, step = 48813 (1.457 sec)
INFO:tensorflow:global_step/sec: 62.3499
INFO:tensorflow:loss = 0.370568, step = 48913 (1.603 sec)
INFO:tensorflow:global_step/sec: 69.1853
INFO:tensorflow:loss = 0.111697, step = 49013 (1.447 sec)
INFO:tensorflow:global_step/sec: 70.6979
INFO:tensorflow:loss = 0.110739, step = 49113 (1.414 sec)
INFO:tensorflow:global_step/sec: 66.8782
INFO:tensorflow:los

INFO:tensorflow:global_step/sec: 69.6248
INFO:tensorflow:loss = 0.299293, step = 56513 (1.436 sec)
INFO:tensorflow:global_step/sec: 74.4273
INFO:tensorflow:loss = 0.137544, step = 56613 (1.344 sec)
INFO:tensorflow:global_step/sec: 72.6233
INFO:tensorflow:loss = 0.103776, step = 56713 (1.377 sec)
INFO:tensorflow:global_step/sec: 67.097
INFO:tensorflow:loss = 0.158614, step = 56813 (1.490 sec)
INFO:tensorflow:global_step/sec: 66.5007
INFO:tensorflow:loss = 0.0947195, step = 56913 (1.503 sec)
INFO:tensorflow:global_step/sec: 74.0417
INFO:tensorflow:loss = 0.0883076, step = 57013 (1.350 sec)
INFO:tensorflow:global_step/sec: 69.0387
INFO:tensorflow:loss = 0.0896186, step = 57113 (1.449 sec)
INFO:tensorflow:global_step/sec: 69.4898
INFO:tensorflow:loss = 0.0968862, step = 57213 (1.441 sec)
INFO:tensorflow:global_step/sec: 65.0495
INFO:tensorflow:loss = 0.0959887, step = 57313 (1.538 sec)
INFO:tensorflow:global_step/sec: 68.9368
INFO:tensorflow:loss = 0.0929928, step = 57413 (1.449 sec)
INFO:

INFO:tensorflow:global_step/sec: 73.6302
INFO:tensorflow:loss = 0.0926713, step = 64813 (1.357 sec)
INFO:tensorflow:global_step/sec: 66.7114
INFO:tensorflow:loss = 0.0886536, step = 64913 (1.501 sec)
INFO:tensorflow:global_step/sec: 62.4177
INFO:tensorflow:loss = 0.0945891, step = 65013 (1.602 sec)
INFO:tensorflow:global_step/sec: 68.0916
INFO:tensorflow:loss = 0.098644, step = 65113 (1.468 sec)
INFO:tensorflow:global_step/sec: 68.6829
INFO:tensorflow:loss = 0.0956822, step = 65213 (1.457 sec)
INFO:tensorflow:global_step/sec: 67.4809
INFO:tensorflow:loss = 0.0926666, step = 65313 (1.480 sec)
INFO:tensorflow:global_step/sec: 68.5582
INFO:tensorflow:loss = 0.0862508, step = 65413 (1.459 sec)
INFO:tensorflow:global_step/sec: 70.3236
INFO:tensorflow:loss = 0.0925355, step = 65513 (1.423 sec)
INFO:tensorflow:global_step/sec: 71.0532
INFO:tensorflow:loss = 0.0877538, step = 65613 (1.406 sec)
INFO:tensorflow:global_step/sec: 68.1811
INFO:tensorflow:loss = 0.152816, step = 65713 (1.468 sec)
IN

INFO:tensorflow:loss = 0.0855288, step = 72319
INFO:tensorflow:global_step/sec: 55.4357
INFO:tensorflow:loss = 0.0996805, step = 72419 (1.808 sec)
INFO:tensorflow:global_step/sec: 62.1319
INFO:tensorflow:loss = 0.0837202, step = 72519 (1.611 sec)
INFO:tensorflow:global_step/sec: 59.2915
INFO:tensorflow:loss = 0.102883, step = 72619 (1.686 sec)
INFO:tensorflow:global_step/sec: 50.4674
INFO:tensorflow:loss = 0.0924919, step = 72719 (1.980 sec)
INFO:tensorflow:global_step/sec: 63.1222
INFO:tensorflow:loss = 0.0845939, step = 72819 (1.584 sec)
INFO:tensorflow:global_step/sec: 62.2588
INFO:tensorflow:loss = 0.0967581, step = 72919 (1.607 sec)
INFO:tensorflow:global_step/sec: 60.6902
INFO:tensorflow:loss = 0.201908, step = 73019 (1.647 sec)
INFO:tensorflow:global_step/sec: 63.0817
INFO:tensorflow:loss = 0.112613, step = 73119 (1.585 sec)
INFO:tensorflow:global_step/sec: 62.8377
INFO:tensorflow:loss = 0.113561, step = 73219 (1.591 sec)
INFO:tensorflow:global_step/sec: 68.5825
INFO:tensorflow:

INFO:tensorflow:loss = 0.294715, step = 80619 (1.487 sec)
INFO:tensorflow:global_step/sec: 73.65
INFO:tensorflow:loss = 0.140154, step = 80719 (1.358 sec)
INFO:tensorflow:global_step/sec: 67.7882
INFO:tensorflow:loss = 0.105468, step = 80819 (1.476 sec)
INFO:tensorflow:global_step/sec: 70.8939
INFO:tensorflow:loss = 0.157327, step = 80919 (1.410 sec)
INFO:tensorflow:global_step/sec: 63.5012
INFO:tensorflow:loss = 0.0953487, step = 81019 (1.576 sec)
INFO:tensorflow:global_step/sec: 65.5663
INFO:tensorflow:loss = 0.087736, step = 81119 (1.524 sec)
INFO:tensorflow:global_step/sec: 69.1492
INFO:tensorflow:loss = 0.090086, step = 81219 (1.448 sec)
INFO:tensorflow:global_step/sec: 66.9819
INFO:tensorflow:loss = 0.100764, step = 81319 (1.491 sec)
INFO:tensorflow:global_step/sec: 71.2857
INFO:tensorflow:loss = 0.0989997, step = 81419 (1.402 sec)
INFO:tensorflow:global_step/sec: 72.6901
INFO:tensorflow:loss = 0.0927463, step = 81519 (1.376 sec)
INFO:tensorflow:global_step/sec: 70.3409
INFO:tens

INFO:tensorflow:global_step/sec: 64.4701
INFO:tensorflow:loss = 0.0928883, step = 88919 (1.551 sec)
INFO:tensorflow:global_step/sec: 59.2592
INFO:tensorflow:loss = 0.0880541, step = 89019 (1.689 sec)
INFO:tensorflow:global_step/sec: 65.017
INFO:tensorflow:loss = 0.0935747, step = 89119 (1.536 sec)
INFO:tensorflow:global_step/sec: 62.3077
INFO:tensorflow:loss = 0.0989972, step = 89219 (1.605 sec)
INFO:tensorflow:global_step/sec: 67.078
INFO:tensorflow:loss = 0.0962449, step = 89319 (1.493 sec)
INFO:tensorflow:global_step/sec: 68.1937
INFO:tensorflow:loss = 0.0911696, step = 89419 (1.465 sec)
INFO:tensorflow:global_step/sec: 66.0295
INFO:tensorflow:loss = 0.085607, step = 89519 (1.513 sec)
INFO:tensorflow:global_step/sec: 57.9822
INFO:tensorflow:loss = 0.0925982, step = 89619 (1.726 sec)
INFO:tensorflow:global_step/sec: 62.4238
INFO:tensorflow:loss = 0.0875162, step = 89719 (1.601 sec)
INFO:tensorflow:global_step/sec: 61.5051
INFO:tensorflow:loss = 0.15195, step = 89819 (1.627 sec)
INFO:

### Testing

Note -- many test examples contain explicit content

In [16]:
test_sents = [("ass and titties", KID_INAPPROPRIATE),
("fuck the police", KID_INAPPROPRIATE),
("cats are nice", KID_APPROPRIATE),
("one day I saw a horse on a hill and I liked it", KID_APPROPRIATE),
("mrs. tayler is a whore", KID_INAPPROPRIATE),
("I'd like to take you someplace nice and quiet", KID_INAPPROPRIATE),
("You gay ? Proceeds to chase him that's how bullying works", KID_INAPPROPRIATE),
("transfer responsibility dad temperature earn voter impossible radiation.", KID_APPROPRIATE),
("JADE CHYNOWETH AND JOSH KILLACKY LOVETEAM PLS", KID_APPROPRIATE),
("Lol, looks like the same generic crap you come to expect from these types of shows.  Guess it doesn't get old for some people.", KID_INAPPROPRIATE),
("Disney sold the rights to this?", KID_APPROPRIATE),
("Only step up 1 and 2 were good. All of the other ones have just been corny.", KID_APPROPRIATE),
("i rather not call it step up, its another level with different camera techniques and its so Channing Tatum style", KID_APPROPRIATE),
("I love the series its very good, im waiting for 2 one", KID_APPROPRIATE),
("GREAT", KID_APPROPRIATE),
("Job negotiate set alternative little introduction apparent crazy proper used care free.", KID_APPROPRIATE),
("Oxygen identify member dependent translate else card might handful.", KID_APPROPRIATE),
("Release accompany pole general something widely fly cup detective personnel.", KID_APPROPRIATE),
("Silly largely obstacle warrior charge flavor diabetes medal.", KID_APPROPRIATE),
("We need moose back", KID_APPROPRIATE),
("Exciting time to be alive bro watching our Prez Trump vs. the DeepState/Swamp that's the deal profs to him.", KID_APPROPRIATE),
("Idk if I want to watch it or not : ^/", KID_APPROPRIATE),
("Maximum testing blanket absolutely shock until actress sex liability.", KID_INAPPROPRIATE),
("3 was the best.", KID_APPROPRIATE),
("I knew they would have a all men gay dance group", KID_INAPPROPRIATE),
("Same old garbage; I'm sick of it !", KID_APPROPRIATE),
("GAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAYYYYYYYYYYYYYYYYYYYYYYYYY",KID_INAPPROPRIATE),
("Millenials = The worst generation",KID_APPROPRIATE),
("that opening statement is the most overused sentence i've seen",KID_APPROPRIATE),
("Pan spokesman via guard campaign characteristic movement expert attend garage.",KID_APPROPRIATE),
("ayyy https://youtu.be/wZwyQZDfyHY",KID_APPROPRIATE),
("WELP....the end of the step up franchise met a very disappointing end. Jesus this looks worse than most dumpster fires Ive seen",KID_APPROPRIATE),
("blacks that look white and wiggas.  yeah, looks great. (not!)",KID_INAPPROPRIATE),
("Rich And Diverse artistry- Advocating White Genocide. How is enrichment going on in South Africa?",KID_INAPPROPRIATE),
("step up <3",KID_APPROPRIATE),
("When is episode 5 coming out",KID_APPROPRIATE),
("This series is actually pretty good...watched it and was so happy I gave it a shot; I loved it! The storyline is actually stronger than I expected, and especially where they left off this first season, I'm so excited for what's to come; I hope they keep up the good work and produce even more (authentic stuff next season.",KID_APPROPRIATE),
("https://vk.com/club162181675 вступайте",KID_APPROPRIATE),
("lunch argument across expansion free govern healthy afternoon.",KID_APPROPRIATE),
("Modern day version of Fame!",KID_APPROPRIATE),
("Neyo!",KID_APPROPRIATE),
("I wasn't expecting this from YouTube but it's really good",KID_APPROPRIATE),
("The",KID_APPROPRIATE),
("what garbage i this want to be step up is more like stomp the yard",KID_APPROPRIATE),
("if you're actually thinking of watching this show... do it. i recommend it. it's so much more than what we ever got to see from step up or any dance movies at all. so much diversity & so many topics it's talking about. i was surprised at how much i actually enjoyed the show & how bad i need a (second season. just give it a try since the first 4 episodes are for free on here, i don't think y'all are gonna regret it",KID_APPROPRIATE),
("Aren’t u guys releasing more seasons and episodes??? we still need more of ‘em",KID_APPROPRIATE),
("Have Chris brown in the step up movies it’ll be dope",KID_APPROPRIATE),
("http://hikmatblogs.blogspot.com/2018/02/10-ways-to-improve-your-finances-today.html",KID_APPROPRIATE),
("No Moose no Step Up",KID_APPROPRIATE),
("Let me guess?? Suburban girl trying to prove her gangsta and most of the black men are gay.... WHACK!!!",KID_INAPPROPRIATE),
("What, no moose cameo ?",KID_APPROPRIATE),
("trash.",KID_APPROPRIATE),
("Why does everything have to be so gay  won't be watching this sick of all our black men in tv series being gay smh",KID_INAPPROPRIATE),
("Who's that girl on the trailer cover?",KID_APPROPRIATE),
("Atlanta is a n!66er filled cesspool.. i can't figure out why parents stopped teaching their children white from wrong.. why today's youth wants grey babies.. why ruin family pictures with those half n!66e kinky haired flat nosed kids..  why these fathers aren't teaching their daughters to stay off the chimpdicks.. i mean, for real, how low does your self esteem have to be to want to share your beautiful young white body with these savage nigloyds... i'm sorry your fathers have failed you girls.. I taught mine white from wrong.. my grandkids will be bright white with blonde hair and blue eyes.. not gray with kinky rugs and flat noses.. just the idea of beautiful young white girls with sweaty, greasy n!66ers up on them just makes me wanna puke.. parents.. it's time to start teaching some basic family values again.. not the kardashian kind where they just keep getting impregnated by stray chimps.. it's just disgusting... let's clean up and keep the white blood lines free from contaminants..  you young white girls... just say no.. find a decent white boy to make babies with.. you can't tell me that a n!66er is all you can get..  well, maybe the fat ugly girls.. but even you can do better.. find a fat ugly white guy..  give him that poontang .. not these foot stomping, drum banging monkeys .. talentless rappers...  all wanna pretend they are in the music business.. rap isn't music.. any tar baby can get a drum machine and call it a beat lab.. then they steal a macbook from a white girl.. and start their career as the white girl's baby daddy.. she says he's a musician.. but he's really just another worthless n166er  he will keep her on welfare.. just a waste.. february is black history month.. the other 11 is caucasian (history months.. we will celebrate by taking our women back..",KID_INAPPROPRIATE),
("i run them streetsXD",KID_APPROPRIATE),
("my answer is....hell noXD",KID_APPROPRIATE),
("Sail star diamond rate working love pond size that monument prevention celebrate.",KID_APPROPRIATE),
("So is Kevin Bacon they're Jesus or something?",KID_APPROPRIATE),
("Can someone please give me the name of the girl on the thumbnail please ?????",KID_APPROPRIATE),
("I would rather be prison raped than watch 5seconds of this garbage",KID_INAPPROPRIATE),
("Prosecution tool before endless visit dump shake sake remarkable hurt safe public.",KID_APPROPRIATE),
("total grief slave esyvbp question fourth fun basic fly sacrifice reply.",KID_INAPPROPRIATE),
("As she named the Empress, Anna Pávlovna’s face suddenly assumed an expression of profound and sincere devotion and respect mingled with sadness, and this occurred every time she mentioned her illustrious patroness. ", KID_INAPPROPRIATE),
("She added that Her Majesty had deigned to show Baron Funke beaucoup d’estime, and again her face clouded over with sadness.", KID_INAPPROPRIATE),
("The prince was silent and looked indifferent. ", KID_APPROPRIATE),
("But, with the womanly and courtierlike quickness and tact habitual to her, Anna Pávlovna wished both to rebuke him (for daring to speak as he had done of a ma ", KID_INAPPROPRIATE),
("recommended to the Empress) and at the same time to console him, so she said ", KID_INAPPROPRIATE)]

In [17]:
# Run the hand-labelled sentences through the same pipeline as the training
# data: tokenize -> canonicalize -> vocabulary ids -> pad/truncate to pad_len.
test_tokens = [tokenizer.tokenize(pair[0]) for pair in test_sents]
test_tokens_canonical = [utils.canonicalize_words(tokens) for tokens in test_tokens]
test_ids = [vocab.words_to_ids(tokens) for tokens in test_tokens_canonical]

test_x, test_ns, test_y = [], [], []

# test_ids is index-aligned with test_sents, so the two can be walked together.
for ids, (_sentence, label) in zip(test_ids, test_sents):
    padded, orig_len = pad_array(ids, pad_len)
    test_x.append(padded)
    test_ns.append(orig_len)
    test_y.append(label)

test_x_np, test_ns_np, test_y_np = (
    np.array(column) for column in (test_x, test_ns, test_y)
)

In [18]:
# Input function for the hand-labelled test set: fixed order, a single pass.
test_features = {"ids": test_x_np, "ns": test_ns_np}
test_input_fn = tf.estimator.inputs.numpy_input_fn(
    x=test_features,
    y=test_y_np,
    batch_size=128,
    num_epochs=1,
    shuffle=False,
)

eval_metrics = model.evaluate(input_fn=test_input_fn, name="eval")

print("Accuracy on test set: {:.02%}".format(eval_metrics['accuracy']))
# Last expression of the cell: display the full metrics dict.
eval_metrics

INFO:tensorflow:Starting evaluation at 2018-03-27-22:03:03
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20180327-2137/model.ckpt-96424
INFO:tensorflow:Finished evaluation at 2018-03-27-22:03:03
INFO:tensorflow:Saving dict for global step 96424: accuracy = 0.411765, cross_entropy_loss = 2.22641, global_step = 96424, loss = 2.31485
Accuracy on test set: 41.18%


{'accuracy': 0.41176471,
 'cross_entropy_loss': 2.2264147,
 'global_step': 96424,
 'loss': 2.3148451}

In [19]:
from sklearn.metrics import accuracy_score

# Cross-check the Estimator's accuracy by scoring its predictions by hand.
predictions = list(model.predict(test_input_fn))  # list of dicts
# The 'max' entry of each prediction dict is used as the predicted class label.
y_pred = [p['max'] for p in predictions]
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy happens to be
# symmetric, but keeping the documented order avoids silently wrong numbers if
# this line is ever adapted to precision, recall or a confusion matrix.
acc = accuracy_score(test_y, y_pred)
print("Accuracy on test set: {:.02%}".format(acc))

INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20180327-2137/model.ckpt-96424
Accuracy on test set: 41.18%


### Error Analysis

In [20]:
# Fixed-width display names so the predicted/actual columns line up.
param_names = {
    KID_APPROPRIATE:   "Appropriate  ",
    KID_INAPPROPRIATE: "Inappropriate",
}
template = "Sentence  :   {}\nPredicted :   {} \nActual    :   {}  \n\n"

# Print every test sentence next to its predicted and hand-assigned label.
for i, predicted in enumerate(y_pred):
    sentence, actual = test_sents[i]
    print(template.format(sentence, param_names[predicted], param_names[actual]))
    
    

Sentence  :   ass and titties
Predicted :   Inappropriate 
Actual    :   Inappropriate  


Sentence  :   fuck the police
Predicted :   Inappropriate 
Actual    :   Inappropriate  


Sentence  :   cats are nice
Predicted :   Inappropriate 
Actual    :   Appropriate    


Sentence  :   one day I saw a horse on a hill and I liked it
Predicted :   Appropriate   
Actual    :   Appropriate    


Sentence  :   mrs. tayler is a whore
Predicted :   Inappropriate 
Actual    :   Inappropriate  


Sentence  :   I'd like to take you someplace nice and quiet
Predicted :   Inappropriate 
Actual    :   Inappropriate  


Sentence  :   You gay ? Proceeds to chase him that's how bullying works
Predicted :   Inappropriate 
Actual    :   Inappropriate  


Sentence  :   transfer responsibility dad temperature earn voter impossible radiation.
Predicted :   Inappropriate 
Actual    :   Appropriate    


Sentence  :   JADE CHYNOWETH AND JOSH KILLACKY LOVETEAM PLS
Predicted :   Inappropriate 
Actual    :   Appr

#### Error Notes
-  Are long sentences incorrectly categorized?
-  Misses innuendo (but innuendo is rare)
-  Frequently misclassifies sentences made of random words, but that's probably ok 
-  We're training book sentences against (potentially) multi-sentence 4chan comments. Is that ok?
-  Would sentiment analysis help?
-  Would another embedding help?
-  Would readability scores help?
-  Worth pursuing a hybrid sensitivity/complexity model, then combine at classification-time?

#### Eng Notes
- How can we improve speed?
- How can we improve accuracy?
- Would adding a Wikipedia dataset help?
- How should we adjust the hyperparameters?
- How should we deal with memory issues?