In [34]:
from __future__ import print_function
from __future__ import division
from importlib import reload

from nltk.tokenize.treebank import TreebankWordTokenizer
import collections
import nltk
import numpy as np
import glob
import math

import pandas as pd
import tensorflow as tf
import os, sys, re, json, time, datetime, shutil

# Helper libraries
from w266_common import utils, vocabulary
from w266_common import patched_numpy_io

### Constants

In [35]:
# Binary class labels used as classifier targets throughout the notebook:
# 1 marks text that is appropriate for kids, 0 marks text that is not.
KID_APPROPRIATE, KID_INAPPROPRIATE = 1, 0

### Grab Data

In [36]:
# Root directory of the project data.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own checkout before running the notebook elsewhere.
DATA_DIR = '/home/peterg/mids-w266-final-project/final_proj'

# glob.glob() returns matches in arbitrary (filesystem-dependent) order.
# A later cell takes book_files[:60], so sort the results to make the
# selected subset the same on every run and every machine.
book_files = sorted(glob.glob(os.path.join(DATA_DIR, 'books', '*.txt')))
fourchan_files = sorted(glob.glob(os.path.join(DATA_DIR, '4chan', '*.txt')))

In [37]:
pos_examples, neg_examples = [], []

# Positive (appropriate for kids) examples come from the children's
# bookshelf of Project Gutenberg. Only the first 60 book files are read,
# and every line of a file becomes one example (trailing newline included).
#
# The encoding is passed explicitly: the data contains non-ASCII text, and
# without it open() falls back to the locale's default encoding, which makes
# the notebook fail or produce mojibake on machines with a non-UTF-8 locale.
for book in book_files[:60]:
    with open(book, "r", encoding="utf-8") as pos_file:
        pos_examples.extend(pos_file)

# Negative (inappropriate for kids) training examples come
# from the "shit 4chan says" board on 4chan.org. Only this single board
# file is used.
single_4chan_file = ["/home/peterg/mids-w266-final-project/final_proj/4chan/4chan_s4s.txt",]
for board in single_4chan_file:
    with open(board, 'r', encoding="utf-8") as neg_file:
        neg_examples.extend(neg_file)

# Sanity check: number of positive and negative lines loaded.
print(len(pos_examples), len(neg_examples))

154277 2760399


### Tokenization

In [38]:
# One shared Treebank tokenizer instance, reused by later cells as well.
tokenizer = TreebankWordTokenizer()

# Upper bound on how many examples of each class are tokenized. Slicing
# past the end of a list is harmless, so a class with fewer examples than
# this is simply tokenized in full.
num_sents = 250000

# Each example string becomes a list of token strings.
pos_tokens = list(map(tokenizer.tokenize, pos_examples[:num_sents]))
neg_tokens = list(map(tokenizer.tokenize, neg_examples[:num_sents]))

In [39]:
# Eyeball the first five raw lines of each class (positive first).
for examples in (pos_examples, neg_examples):
    print(examples[:5])

['      THE SONG OF HIAWATHA       Henry W. Longfellow CONTENTS Introductory Note Introduction I.\n', 'The Peace-Pipe II.\n', 'The Four Winds III.\n', "Hiawatha's Childhood IV.\n", "Hiawatha and Mudjekeewis V.    Hiawatha's Fasting VI.\n"]
["'I only browse on iphone\n", " (check your desktop privilege) How do I create memes on mobile? Does anyone know of an app that is similar to ms paint? Any basic art app that is free and good?'\n", "'( ͡° ͜ʖ ͡°)'\n", "'epin; simply ebin'\n", "'shit not again'\n"]


### Canonicalization

In [40]:
# Run every tokenized example through the shared canonicalizer so both
# classes are normalized the same way before the vocabulary is built.
# NOTE(review): the exact normalization is defined in w266_common.utils,
# which is not visible here.
pos_tokens_canonical = list(map(utils.canonicalize_words, pos_tokens))
neg_tokens_canonical = list(map(utils.canonicalize_words, neg_tokens))

### Generate Vocabulary

In [8]:
# Positive examples first, then negative ones.
all_tokens_canonical = pos_tokens_canonical + neg_tokens_canonical

# Flatten the per-example token lists into one long token stream, which is
# what the Vocabulary constructor is given.
all_tokens_flattened = [
    token
    for example_tokens in all_tokens_canonical
    for token in example_tokens
]

# Vocabulary capped at 70,000 entries (per the original note, size=None
# would mean unlimited).
vocab = vocabulary.Vocabulary(all_tokens_flattened, size=70000)
print("Vocabulary size: {:,}".format(vocab.size))

Vocabulary size: 70,000


### Generate training set

In [9]:
def pad_array(array, pad_len):
    """Return a copy of ``array`` padded or truncated to ``pad_len`` items.

    The input is never modified. Earlier versions padded by appending to the
    caller's list, so calling the function a second time on the same list
    (e.g. when re-running a notebook cell) reported the padded length
    instead of the real one.

    Args:
        array: Sequence of token ids.
        pad_len: Number of items the returned list must contain.

    Returns:
        A ``(padded, length)`` tuple. ``padded`` is a new list of exactly
        ``pad_len`` items: the leading items of ``array``, followed by as
        many ``0`` entries as needed. ``length`` is ``len(array)`` before
        padding or truncation, so it is larger than ``pad_len`` when the
        input was truncated.
    """
    length = len(array)
    # Slicing copies, so the caller's sequence is left untouched.
    padded = list(array[:pad_len])
    padded.extend([0] * (pad_len - len(padded)))
    return padded, length

In [10]:
# Convert every canonicalized example into a list of vocabulary ids.
pos_ids = [vocab.words_to_ids(x) for x in pos_tokens_canonical]
neg_ids = [vocab.words_to_ids(x) for x in neg_tokens_canonical]

# Every example is padded / truncated to this many ids.
pad_len = 40

# Fractions of the positive examples used for training and for dev.
train_frac = 0.5
dev_frac = 0.2

# train_frac + dev_frac must be <= 1
assert(train_frac + dev_frac <= 1.0)

# NOTE: this rebinds `num_sents`, which an earlier cell uses as the
# tokenization cap; re-run that cell before re-tokenizing.
num_sents = math.floor(len(pos_ids) * train_frac)
num_dev_sents = math.floor(len(pos_ids) * dev_frac)

# Training examples are taken from the front and dev examples from the back
# of BOTH id lists, so both lists must be long enough for the two ranges
# not to overlap.
assert num_sents + num_dev_sents <= min(len(pos_ids), len(neg_ids)), \
    "not enough examples for disjoint train and dev splits"


def _build_split(indices):
    """Build a class-balanced split from the given example indices.

    For each index, one positive and then one negative example is padded to
    `pad_len` and appended, so the result alternates between the classes.

    Args:
        indices: Iterable of indices into `pos_ids` / `neg_ids`.

    Returns:
        (xs, ns, ys): padded id lists, the lengths reported by `pad_array`,
        and the class labels, all in the same order.
    """
    xs, ns, ys = [], [], []
    for step, idx in enumerate(indices):
        if step % 10000 == 0:
            print("Done with", step)
        for ids, label in ((pos_ids[idx], KID_APPROPRIATE),
                           (neg_ids[idx], KID_INAPPROPRIATE)):
            # Pad a copy so pos_ids / neg_ids keep their original lengths
            # and this cell gives the same result when it is re-run.
            # NOTE(review): the reported length can exceed pad_len for
            # truncated examples; confirm the model tolerates that.
            padded, orig_len = pad_array(list(ids), pad_len)
            xs.append(padded)
            ns.append(orig_len)
            ys.append(label)
    return xs, ns, ys


# Training examples: indices 0 .. num_sents - 1.
train_x, train_ns, train_y = _build_split(range(num_sents))

# Dev examples: indices -1, -2, ..., -num_dev_sents, i.e. strictly from the
# back of the arrays. (Indexing with -i starting at i == 0 would select
# index 0, which is a training example, and leak it into the dev set.)
dev_x, dev_ns, dev_y = _build_split(range(-1, -num_dev_sents - 1, -1))

assert len(train_x) == 2 * num_sents
assert len(dev_x) == 2 * num_dev_sents

batches = len(train_x)
max_len = pad_len


Done with 0
Done with 10000
Done with 20000
Done with 30000
Done with 40000
Done with 50000
Done with 60000
Done with 70000
Done with 0
Done with 10000
Done with 20000
Done with 30000


In [11]:
# Convert the Python lists of padded ids, lengths and labels into NumPy
# arrays, first for the training split and then for the dev split.
train_x_np, train_ns_np, train_y_np = (
    np.array(values) for values in (train_x, train_ns, train_y)
)
dev_x_np, dev_ns_np, dev_y_np = (
    np.array(values) for values in (dev_x, dev_ns, dev_y)
)

In [12]:
# Smoke test: wire the classifier graph up by hand and run one pass over the
# training arrays in a plain tf.Session, printing a smoothed loss as it goes.
# This cell's own code writes no checkpoint; the session is closed when the
# `with` block exits.
import models; reload(models)
# Short aliases for the training arrays used below.
# NOTE(review): a later cell reuses `x` as a loop variable, so these aliases
# are only valid right after this cell has run.
x, ns, y = train_x_np, train_ns_np, train_y_np
batch_size = 32


# Specify model hyperparameters as used by model_fn
# NOTE(review): the meaning of each key is defined by
# models.classifier_model_fn, which is not visible here.
model_params = dict(V=vocab.size, embed_dim=90, hidden_dims=[50,25], num_classes=2,
                    encoder_type='bow',
                    lr=0.1, optimizer='adagrad', beta=0.01)
model_fn = models.classifier_model_fn

# Running statistics for the epoch.
# NOTE(review): total_loss is accumulated below but never reported.
total_batches = 0
total_examples = 0
total_loss = 0
# Starts at ln(2) -- presumably the cross-entropy of a uniform guess over the
# two classes -- so the average begins from a chance-level value.
loss_ema = np.log(2)  # track exponential-moving-average of loss
ema_decay = np.exp(-1/10)  # decay parameter for moving average = np.exp(-1/history_length)
# A fresh graph is made the default so the ops built here do not land in any
# graph created by other cells.
with tf.Graph().as_default(), tf.Session() as sess:
    ##
    # Construct the graph here. No session.run calls - just wiring up Tensors.
    ##
    # Add placeholders so we can feed in data.
    # The second dimension of the ids placeholder is fixed to x.shape[1];
    # the batch dimension is left open.
    x_ph_  = tf.placeholder(tf.int32, shape=[None, x.shape[1]])  # [batch_size, max_len]
    ns_ph_ = tf.placeholder(tf.int32, shape=[None])              # [batch_size]
    y_ph_  = tf.placeholder(tf.int32, shape=[None])              # [batch_size]
    
    # Construct the graph using model_fn
    features = {"ids": x_ph_, "ns": ns_ph_}  # note that values are Tensors
    estimator_spec = model_fn(features, labels=y_ph_, mode=tf.estimator.ModeKeys.TRAIN,
                              params=model_params)
    # Only the loss tensor and the training op of the returned spec are used.
    loss_     = estimator_spec.loss
    train_op_ = estimator_spec.train_op
    
    ##
    # Done constructing the graph, now we can make session.run calls.
    ##
    sess.run(tf.global_variables_initializer())
    
    # Run a single epoch
    # NOTE(review): assumes utils.multi_batch_generator yields aligned
    # (x, ns, y) slices and covers the data once -- confirm in w266_common.
    t0 = time.time()
    for (bx, bns, by) in utils.multi_batch_generator(batch_size, x, ns, y):
        # feed NumPy arrays into the placeholder Tensors
        feed_dict = {x_ph_: bx, ns_ph_: bns, y_ph_: by}
        # One optimization step; the train op's own return value is discarded.
        batch_loss, _ = sess.run([loss_, train_op_], feed_dict=feed_dict)
        
        # Compute some statistics
        total_batches += 1
        total_examples += len(bx)
        total_loss += batch_loss * len(bx)  # re-scale, since batch loss is mean
        # Compute moving average to smooth out noisy per-batch loss
        loss_ema = ema_decay * loss_ema + (1 - ema_decay) * batch_loss
        
        # Report progress every 25 batches.
        if (total_batches % 25 == 0):
            print("{:5,} examples, moving-average loss {:.2f}".format(total_examples, 
                                                                      loss_ema))    
    print("Completed one epoch in {:s}".format(utils.pretty_timedelta(since=t0)))

  800 examples, moving-average loss 1.18
1,600 examples, moving-average loss 0.83
2,400 examples, moving-average loss 0.80
3,200 examples, moving-average loss 0.70
4,000 examples, moving-average loss 0.74
4,800 examples, moving-average loss 0.58
5,600 examples, moving-average loss 0.52
6,400 examples, moving-average loss 0.51
7,200 examples, moving-average loss 0.42
8,000 examples, moving-average loss 0.37
8,800 examples, moving-average loss 0.35
9,600 examples, moving-average loss 0.33
10,400 examples, moving-average loss 0.33
11,200 examples, moving-average loss 0.35
12,000 examples, moving-average loss 0.28
12,800 examples, moving-average loss 0.34
13,600 examples, moving-average loss 0.33
14,400 examples, moving-average loss 0.25
15,200 examples, moving-average loss 0.27
16,000 examples, moving-average loss 0.21
16,800 examples, moving-average loss 0.22
17,600 examples, moving-average loss 0.21
18,400 examples, moving-average loss 0.19
19,200 examples, moving-average loss 0.19
20,0

In [13]:
# Pick up any edits made to models.py since it was last imported.
import models
reload(models)

# Hyperparameters handed to the model function through `params`.
model_params = {
    "V": vocab.size,
    "embed_dim": 70,
    "hidden_dims": [75, 50, 25],
    "num_classes": 2,
    "encoder_type": 'bow',
    "lr": 0.1,
    "optimizer": 'adagrad',
    "beta": 0.01,
}

# Checkpoints go to a directory stamped with the current minute. If a run
# started within the same minute left one behind, remove it so training
# starts from scratch.
checkpoint_dir = "/tmp/tf_bow_sst_" + datetime.datetime.now().strftime("%Y%m%d-%H%M")
if os.path.isdir(checkpoint_dir):
    shutil.rmtree(checkpoint_dir)

# Write the vocabulary next to the checkpoints so TensorBoard can label the
# embeddings (creates checkpoint_dir/projector_config.pbtxt and
# checkpoint_dir/metadata.tsv).
vocab.write_projector_config(checkpoint_dir, "Encoder/Embedding_Layer/W_embed")

model = tf.estimator.Estimator(
    model_fn=models.classifier_model_fn,
    params=model_params,
    model_dir=checkpoint_dir,
)

# Tell the reader how to follow training in TensorBoard.
print(
    "",
    "To view training (once it starts), run:\n",
    "    tensorboard --logdir='{:s}' --port 6006".format(checkpoint_dir),
    "\nThen in your browser, open: http://localhost:6006",
    sep="\n",
)

Vocabulary (70,000 words) written to '/tmp/tf_bow_sst_20180407-1945/metadata.tsv'
Projector config written to /tmp/tf_bow_sst_20180407-1945/projector_config.pbtxt
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tf_bow_sst_20180407-1945', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f1b714061d0>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

To view training (once it starts), run:

    tensorboard --logdir='/tmp/tf_bow_sst_20180407-1945' --port 6006

Then in your browser, open: http://localhost:6006


### Training

In [14]:
# Training params, just used in this cell for the input_fn-s
train_params = dict(batch_size=32, total_epochs=20, eval_every=5)
assert(train_params['total_epochs'] % train_params['eval_every'] == 0)

# Construct and train the model, saving checkpoints to the directory above.
# Input function for training set batches
# Do 'eval_every' epochs at once, followed by evaluating on the dev set.
# NOTE: use patch_numpy_io.numpy_input_fn instead of tf.estimator.inputs.numpy_input_fn
train_input_fn = patched_numpy_io.numpy_input_fn(
                    x={"ids": train_x_np, "ns": train_ns_np}, y=train_y_np,
                    batch_size=train_params['batch_size'], 
                    num_epochs=train_params['eval_every'], shuffle=True, seed=42
                 )



In [15]:
# Input function for dev set batches. As above, but:
# - Don't randomize order
# - Iterate exactly once (one epoch)
dev_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"ids": dev_x_np, "ns": dev_ns_np},
    y=dev_y_np,
    batch_size=128,
    num_epochs=1,
    shuffle=False,
)

# Alternate between a chunk of training (`eval_every` epochs, as configured
# in train_input_fn) and one evaluation pass over the dev set, until
# `total_epochs` epochs have been trained. Only the metrics of the last
# evaluation remain in `eval_metrics`.
rounds_left = train_params['total_epochs'] // train_params['eval_every']
while rounds_left > 0:
    model.train(input_fn=train_input_fn)
    eval_metrics = model.evaluate(input_fn=dev_input_fn, name="dev")
    rounds_left -= 1

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tf_bow_sst_20180407-1945/model.ckpt.
INFO:tensorflow:loss = 1.8212612, step = 1
INFO:tensorflow:global_step/sec: 70.1104
INFO:tensorflow:loss = 0.73191017, step = 101 (1.430 sec)
INFO:tensorflow:global_step/sec: 72.2771
INFO:tensorflow:loss = 0.4183992, step = 201 (1.383 sec)
INFO:tensorflow:global_step/sec: 71.1118
INFO:tensorflow:loss = 0.2924765, step = 301 (1.406 sec)
INFO:tensorflow:global_step/sec: 72.6891
INFO:tensorflow:loss = 0.26893952, step = 401 (1.376 sec)
INFO:tensorflow:global_step/sec: 73.063
INFO:tensorflow:loss = 0.21890113, step = 501 (1.368 sec)
INFO:tensorflow:global_step/sec: 69.1736
INFO:tensorflow:loss = 0.17921704, step = 601 (1.446 sec)
INFO:tensorflow:global_step/sec: 72.4554
INFO:tensorflow:loss = 0.3353104, step = 701 (1.381 sec)
INFO:tensorflow:global_step/sec: 72.8401
INFO:tensorflow:loss = 0.1396102, step = 801 (1.372 sec)
INFO:tensorflow:global_step/sec: 69.12

INFO:tensorflow:global_step/sec: 77.3291
INFO:tensorflow:loss = 0.12246691, step = 8201 (1.293 sec)
INFO:tensorflow:global_step/sec: 73.3492
INFO:tensorflow:loss = 0.30651557, step = 8301 (1.364 sec)
INFO:tensorflow:global_step/sec: 74.7583
INFO:tensorflow:loss = 0.11857522, step = 8401 (1.337 sec)
INFO:tensorflow:global_step/sec: 76.6271
INFO:tensorflow:loss = 0.127384, step = 8501 (1.305 sec)
INFO:tensorflow:global_step/sec: 76.3148
INFO:tensorflow:loss = 0.29164302, step = 8601 (1.311 sec)
INFO:tensorflow:global_step/sec: 72.77
INFO:tensorflow:loss = 0.092764914, step = 8701 (1.374 sec)
INFO:tensorflow:global_step/sec: 71.418
INFO:tensorflow:loss = 0.1693297, step = 8801 (1.401 sec)
INFO:tensorflow:global_step/sec: 76.2913
INFO:tensorflow:loss = 0.09479257, step = 8901 (1.311 sec)
INFO:tensorflow:global_step/sec: 70.7773
INFO:tensorflow:loss = 0.11026623, step = 9001 (1.413 sec)
INFO:tensorflow:global_step/sec: 73.5978
INFO:tensorflow:loss = 0.10946289, step = 9101 (1.359 sec)
INFO:

INFO:tensorflow:global_step/sec: 76.9318
INFO:tensorflow:loss = 0.09070624, step = 16401 (1.299 sec)
INFO:tensorflow:global_step/sec: 75.9481
INFO:tensorflow:loss = 0.11203512, step = 16501 (1.317 sec)
INFO:tensorflow:global_step/sec: 77.5254
INFO:tensorflow:loss = 0.09458701, step = 16601 (1.290 sec)
INFO:tensorflow:global_step/sec: 78.7782
INFO:tensorflow:loss = 0.09208614, step = 16701 (1.270 sec)
INFO:tensorflow:global_step/sec: 72.0388
INFO:tensorflow:loss = 0.10202843, step = 16801 (1.388 sec)
INFO:tensorflow:global_step/sec: 74.4737
INFO:tensorflow:loss = 0.09511693, step = 16901 (1.344 sec)
INFO:tensorflow:global_step/sec: 76.1341
INFO:tensorflow:loss = 0.09512192, step = 17001 (1.312 sec)
INFO:tensorflow:global_step/sec: 75.4755
INFO:tensorflow:loss = 0.0940232, step = 17101 (1.326 sec)
INFO:tensorflow:global_step/sec: 74.5323
INFO:tensorflow:loss = 0.092890345, step = 17201 (1.340 sec)
INFO:tensorflow:global_step/sec: 74.8635
INFO:tensorflow:loss = 0.098310485, step = 17301 (

INFO:tensorflow:Saving dict for global step 24106: accuracy = 0.9432831, cross_entropy_loss = 0.18783706, global_step = 24106, loss = 0.3112657
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20180407-1945/model.ckpt-24106
INFO:tensorflow:Saving checkpoints for 24107 into /tmp/tf_bow_sst_20180407-1945/model.ckpt.
INFO:tensorflow:loss = 0.08814412, step = 24107
INFO:tensorflow:global_step/sec: 73.0021
INFO:tensorflow:loss = 0.09928262, step = 24207 (1.373 sec)
INFO:tensorflow:global_step/sec: 72.7828
INFO:tensorflow:loss = 0.08394588, step = 24307 (1.374 sec)
INFO:tensorflow:global_step/sec: 72.4215
INFO:tensorflow:loss = 0.10210018, step = 24407 (1.381 sec)
INFO:tensorflow:global_step/sec: 76.9567
INFO:tensorflow:loss = 0.09652099, step = 24507 (1.300 sec)
INFO:tensorflow:global_step/sec: 73.9364
INFO:tensorflow:loss = 0.08483375, step = 24607 (1.352 sec)
INFO:tensorflow:global_step/sec: 72.6802
INFO:tensorflow:loss = 0.09912737, st

INFO:tensorflow:global_step/sec: 73.242
INFO:tensorflow:loss = 0.095132865, step = 31907 (1.366 sec)
INFO:tensorflow:global_step/sec: 69.9948
INFO:tensorflow:loss = 0.10588285, step = 32007 (1.429 sec)
INFO:tensorflow:global_step/sec: 66.8518
INFO:tensorflow:loss = 0.15578082, step = 32107 (1.496 sec)
INFO:tensorflow:global_step/sec: 72.7154
INFO:tensorflow:loss = 0.11114587, step = 32207 (1.375 sec)
INFO:tensorflow:global_step/sec: 73.3189
INFO:tensorflow:loss = 0.13507344, step = 32307 (1.365 sec)
INFO:tensorflow:global_step/sec: 75.5054
INFO:tensorflow:loss = 0.30450794, step = 32407 (1.324 sec)
INFO:tensorflow:global_step/sec: 72.5039
INFO:tensorflow:loss = 0.13073687, step = 32507 (1.379 sec)
INFO:tensorflow:global_step/sec: 73.3795
INFO:tensorflow:loss = 0.102515236, step = 32607 (1.363 sec)
INFO:tensorflow:global_step/sec: 72.4624
INFO:tensorflow:loss = 0.16718143, step = 32707 (1.380 sec)
INFO:tensorflow:global_step/sec: 70.7979
INFO:tensorflow:loss = 0.095002726, step = 32807 

INFO:tensorflow:loss = 0.1017332, step = 40007 (1.413 sec)
INFO:tensorflow:global_step/sec: 65.3758
INFO:tensorflow:loss = 0.09847071, step = 40107 (1.530 sec)
INFO:tensorflow:global_step/sec: 70.1457
INFO:tensorflow:loss = 0.08969128, step = 40207 (1.425 sec)
INFO:tensorflow:global_step/sec: 72.5478
INFO:tensorflow:loss = 0.08733855, step = 40307 (1.379 sec)
INFO:tensorflow:global_step/sec: 74.0109
INFO:tensorflow:loss = 0.091902316, step = 40407 (1.352 sec)
INFO:tensorflow:global_step/sec: 74.4494
INFO:tensorflow:loss = 0.08986504, step = 40507 (1.342 sec)
INFO:tensorflow:global_step/sec: 79.0441
INFO:tensorflow:loss = 0.109724134, step = 40607 (1.265 sec)
INFO:tensorflow:global_step/sec: 80.3539
INFO:tensorflow:loss = 0.09323846, step = 40707 (1.245 sec)
INFO:tensorflow:global_step/sec: 79.7279
INFO:tensorflow:loss = 0.0895199, step = 40807 (1.254 sec)
INFO:tensorflow:global_step/sec: 77.4958
INFO:tensorflow:loss = 0.09285126, step = 40907 (1.290 sec)
INFO:tensorflow:global_step/sec

INFO:tensorflow:global_step/sec: 77.5732
INFO:tensorflow:loss = 0.0858433, step = 48207 (1.289 sec)
INFO:tensorflow:Saving checkpoints for 48212 into /tmp/tf_bow_sst_20180407-1945/model.ckpt.
INFO:tensorflow:Loss for final step: 0.095300265.
INFO:tensorflow:Starting evaluation at 2018-04-07-19:56:58
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20180407-1945/model.ckpt-48212
INFO:tensorflow:Finished evaluation at 2018-04-07-19:57:08
INFO:tensorflow:Saving dict for global step 48212: accuracy = 0.94015557, cross_entropy_loss = 0.20572443, global_step = 48212, loss = 0.32540253
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20180407-1945/model.ckpt-48212
INFO:tensorflow:Saving checkpoints for 48213 into /tmp/tf_bow_sst_20180407-1945/model.ckpt.
INFO:tensorflow:loss = 0.08640856, step = 48213
INFO:tensorflow:global_step/sec: 45.2644
INFO:tensorflow:loss = 0.098462485, step = 48313 (2.213 sec)
INFO:tensorflow:global_step/se

INFO:tensorflow:global_step/sec: 72.0689
INFO:tensorflow:loss = 0.09110436, step = 55613 (1.387 sec)
INFO:tensorflow:global_step/sec: 77.0421
INFO:tensorflow:loss = 0.09484466, step = 55713 (1.297 sec)
INFO:tensorflow:global_step/sec: 75.6906
INFO:tensorflow:loss = 0.08519053, step = 55813 (1.321 sec)
INFO:tensorflow:global_step/sec: 74.2109
INFO:tensorflow:loss = 0.40039152, step = 55913 (1.347 sec)
INFO:tensorflow:global_step/sec: 76.9561
INFO:tensorflow:loss = 0.095851324, step = 56013 (1.299 sec)
INFO:tensorflow:global_step/sec: 78.2357
INFO:tensorflow:loss = 0.10430713, step = 56113 (1.279 sec)
INFO:tensorflow:global_step/sec: 76.7095
INFO:tensorflow:loss = 0.14231339, step = 56213 (1.303 sec)
INFO:tensorflow:global_step/sec: 77.4968
INFO:tensorflow:loss = 0.107990995, step = 56313 (1.291 sec)
INFO:tensorflow:global_step/sec: 75.1665
INFO:tensorflow:loss = 0.12576553, step = 56413 (1.330 sec)
INFO:tensorflow:global_step/sec: 78.4003
INFO:tensorflow:loss = 0.29929343, step = 56513 

INFO:tensorflow:loss = 0.09551202, step = 63713 (1.317 sec)
INFO:tensorflow:global_step/sec: 74.406
INFO:tensorflow:loss = 0.12051158, step = 63813 (1.346 sec)
INFO:tensorflow:global_step/sec: 69.4098
INFO:tensorflow:loss = 0.11057994, step = 63913 (1.439 sec)
INFO:tensorflow:global_step/sec: 77.7101
INFO:tensorflow:loss = 0.09327662, step = 64013 (1.287 sec)
INFO:tensorflow:global_step/sec: 78.1082
INFO:tensorflow:loss = 0.102683745, step = 64113 (1.281 sec)
INFO:tensorflow:global_step/sec: 77.7361
INFO:tensorflow:loss = 0.10087173, step = 64213 (1.286 sec)
INFO:tensorflow:global_step/sec: 75.4329
INFO:tensorflow:loss = 0.08997594, step = 64313 (1.326 sec)
INFO:tensorflow:global_step/sec: 75.0673
INFO:tensorflow:loss = 0.08707526, step = 64413 (1.332 sec)
INFO:tensorflow:global_step/sec: 75.7815
INFO:tensorflow:loss = 0.09069418, step = 64513 (1.319 sec)
INFO:tensorflow:global_step/sec: 74.6087
INFO:tensorflow:loss = 0.08974802, step = 64613 (1.341 sec)
INFO:tensorflow:global_step/sec

INFO:tensorflow:global_step/sec: 80.3847
INFO:tensorflow:loss = 0.086648464, step = 71913 (1.244 sec)
INFO:tensorflow:global_step/sec: 77.1416
INFO:tensorflow:loss = 0.10935925, step = 72013 (1.298 sec)
INFO:tensorflow:global_step/sec: 74.4365
INFO:tensorflow:loss = 0.0946792, step = 72113 (1.342 sec)
INFO:tensorflow:global_step/sec: 74.7865
INFO:tensorflow:loss = 0.09238808, step = 72213 (1.337 sec)
INFO:tensorflow:global_step/sec: 49.1426
INFO:tensorflow:loss = 0.08585795, step = 72313 (2.035 sec)
INFO:tensorflow:Saving checkpoints for 72318 into /tmp/tf_bow_sst_20180407-1945/model.ckpt.
INFO:tensorflow:Loss for final step: 0.09543381.
INFO:tensorflow:Starting evaluation at 2018-04-07-20:02:49
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20180407-1945/model.ckpt-72318
INFO:tensorflow:Finished evaluation at 2018-04-07-20:03:00
INFO:tensorflow:Saving dict for global step 72318: accuracy = 0.9380003, cross_entropy_loss = 0.21380843, global_step = 72318, loss = 0.33344015
IN

INFO:tensorflow:global_step/sec: 74.2406
INFO:tensorflow:loss = 0.10018317, step = 79319 (1.347 sec)
INFO:tensorflow:global_step/sec: 71.4065
INFO:tensorflow:loss = 0.09114133, step = 79419 (1.401 sec)
INFO:tensorflow:global_step/sec: 74.475
INFO:tensorflow:loss = 0.15363598, step = 79519 (1.342 sec)
INFO:tensorflow:global_step/sec: 70.9277
INFO:tensorflow:loss = 0.08986954, step = 79619 (1.411 sec)
INFO:tensorflow:global_step/sec: 71.7895
INFO:tensorflow:loss = 0.09133643, step = 79719 (1.393 sec)
INFO:tensorflow:global_step/sec: 74.3831
INFO:tensorflow:loss = 0.0954697, step = 79819 (1.344 sec)
INFO:tensorflow:global_step/sec: 73.4147
INFO:tensorflow:loss = 0.08460758, step = 79919 (1.362 sec)
INFO:tensorflow:global_step/sec: 72.7621
INFO:tensorflow:loss = 0.124873504, step = 80019 (1.374 sec)
INFO:tensorflow:global_step/sec: 76.4077
INFO:tensorflow:loss = 0.0957057, step = 80119 (1.309 sec)
INFO:tensorflow:global_step/sec: 76.0039
INFO:tensorflow:loss = 0.10236323, step = 80219 (1.3

INFO:tensorflow:loss = 0.36181876, step = 87419 (1.432 sec)
INFO:tensorflow:global_step/sec: 75.977
INFO:tensorflow:loss = 0.095850796, step = 87519 (1.317 sec)
INFO:tensorflow:global_step/sec: 73.4808
INFO:tensorflow:loss = 0.085042864, step = 87619 (1.361 sec)
INFO:tensorflow:global_step/sec: 72.9129
INFO:tensorflow:loss = 0.14848384, step = 87719 (1.371 sec)
INFO:tensorflow:global_step/sec: 74.9575
INFO:tensorflow:loss = 0.09718335, step = 87819 (1.334 sec)
INFO:tensorflow:global_step/sec: 75.9466
INFO:tensorflow:loss = 0.12048769, step = 87919 (1.315 sec)
INFO:tensorflow:global_step/sec: 72.7846
INFO:tensorflow:loss = 0.11522512, step = 88019 (1.375 sec)
INFO:tensorflow:global_step/sec: 77.4855
INFO:tensorflow:loss = 0.09210886, step = 88119 (1.289 sec)
INFO:tensorflow:global_step/sec: 79.8561
INFO:tensorflow:loss = 0.10007846, step = 88219 (1.253 sec)
INFO:tensorflow:global_step/sec: 67.7996
INFO:tensorflow:loss = 0.10119109, step = 88319 (1.477 sec)
INFO:tensorflow:global_step/se

INFO:tensorflow:global_step/sec: 79.6533
INFO:tensorflow:loss = 0.10137315, step = 95619 (1.256 sec)
INFO:tensorflow:global_step/sec: 73.9678
INFO:tensorflow:loss = 0.097521104, step = 95719 (1.352 sec)
INFO:tensorflow:global_step/sec: 57.1992
INFO:tensorflow:loss = 0.09194061, step = 95819 (1.751 sec)
INFO:tensorflow:global_step/sec: 62.2841
INFO:tensorflow:loss = 0.12049995, step = 95919 (1.602 sec)
INFO:tensorflow:global_step/sec: 65.6685
INFO:tensorflow:loss = 0.085889496, step = 96019 (1.524 sec)
INFO:tensorflow:global_step/sec: 74.9519
INFO:tensorflow:loss = 0.10970117, step = 96119 (1.333 sec)
INFO:tensorflow:global_step/sec: 75.5042
INFO:tensorflow:loss = 0.09593667, step = 96219 (1.324 sec)
INFO:tensorflow:global_step/sec: 76.1362
INFO:tensorflow:loss = 0.09309052, step = 96319 (1.313 sec)
INFO:tensorflow:global_step/sec: 77.226
INFO:tensorflow:loss = 0.085669324, step = 96419 (1.295 sec)
INFO:tensorflow:Saving checkpoints for 96424 into /tmp/tf_bow_sst_20180407-1945/model.ckp

### Testing

**Note:** many of the test examples below contain explicit or offensive content.

In [16]:
test_sents = [("ass and titties", KID_INAPPROPRIATE),
("fuck the police", KID_INAPPROPRIATE),
("cats are nice", KID_APPROPRIATE),
("one day I saw a horse on a hill and I liked it", KID_APPROPRIATE),
("mrs. tayler is a whore", KID_INAPPROPRIATE),
("I'd like to take you someplace nice and quiet", KID_INAPPROPRIATE),
("You gay ? Proceeds to chase him that's how bullying works", KID_INAPPROPRIATE),
("transfer responsibility dad temperature earn voter impossible radiation.", KID_APPROPRIATE),
("JADE CHYNOWETH AND JOSH KILLACKY LOVETEAM PLS", KID_APPROPRIATE),
("Lol, looks like the same generic crap you come to expect from these types of shows.  Guess it doesn't get old for some people.", KID_INAPPROPRIATE),
("Disney sold the rights to this?", KID_APPROPRIATE),
("Only step up 1 and 2 were good. All of the other ones have just been corny.", KID_APPROPRIATE),
("i rather not call it step up, its another level with different camera techniques and its so Channing Tatum style", KID_APPROPRIATE),
("I love the series its very good, im waiting for 2 one", KID_APPROPRIATE),
("GREAT", KID_APPROPRIATE),
("Job negotiate set alternative little introduction apparent crazy proper used care free.", KID_APPROPRIATE),
("Oxygen identify member dependent translate else card might handful.", KID_APPROPRIATE),
("Release accompany pole general something widely fly cup detective personnel.", KID_APPROPRIATE),
("Silly largely obstacle warrior charge flavor diabetes medal.", KID_APPROPRIATE),
("We need moose back", KID_APPROPRIATE),
("Exciting time to be alive bro watching our Prez Trump vs. the DeepState/Swamp that's the deal profs to him.", KID_APPROPRIATE),
("Idk if I want to watch it or not : ^/", KID_APPROPRIATE),
("Maximum testing blanket absolutely shock until actress sex liability.", KID_INAPPROPRIATE),
("3 was the best.", KID_APPROPRIATE),
("I knew they would have a all men gay dance group", KID_INAPPROPRIATE),
("Same old garbage; I'm sick of it !", KID_APPROPRIATE),
("GAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAYYYYYYYYYYYYYYYYYYYYYYYYY",KID_INAPPROPRIATE),
("Millenials = The worst generation",KID_APPROPRIATE),
("that opening statement is the most overused sentence i've seen",KID_APPROPRIATE),
("Pan spokesman via guard campaign characteristic movement expert attend garage.",KID_APPROPRIATE),
("ayyy https://youtu.be/wZwyQZDfyHY",KID_APPROPRIATE),
("WELP....the end of the step up franchise met a very disappointing end. Jesus this looks worse than most dumpster fires Ive seen",KID_APPROPRIATE),
("blacks that look white and wiggas.  yeah, looks great. (not!)",KID_INAPPROPRIATE),
("Rich And Diverse artistry- Advocating White Genocide. How is enrichment going on in South Africa?",KID_INAPPROPRIATE),
("step up <3",KID_APPROPRIATE),
("When is episode 5 coming out",KID_APPROPRIATE),
("This series is actually pretty good...watched it and was so happy I gave it a shot; I loved it! The storyline is actually stronger than I expected, and especially where they left off this first season, I'm so excited for what's to come; I hope they keep up the good work and produce even more (authentic stuff next season.",KID_APPROPRIATE),
("https://vk.com/club162181675 вступайте",KID_APPROPRIATE),
("lunch argument across expansion free govern healthy afternoon.",KID_APPROPRIATE),
("Modern day version of Fame!",KID_APPROPRIATE),
("Neyo!",KID_APPROPRIATE),
("I wasn't expecting this from YouTube but it's really good",KID_APPROPRIATE),
("The",KID_APPROPRIATE),
("what garbage i this want to be step up is more like stomp the yard",KID_APPROPRIATE),
("if you're actually thinking of watching this show... do it. i recommend it. it's so much more than what we ever got to see from step up or any dance movies at all. so much diversity & so many topics it's talking about. i was surprised at how much i actually enjoyed the show & how bad i need a (second season. just give it a try since the first 4 episodes are for free on here, i don't think y'all are gonna regret it",KID_APPROPRIATE),
("Aren’t u guys releasing more seasons and episodes??? we still need more of ‘em",KID_APPROPRIATE),
("Have Chris brown in the step up movies it’ll be dope",KID_APPROPRIATE),
("http://hikmatblogs.blogspot.com/2018/02/10-ways-to-improve-your-finances-today.html",KID_APPROPRIATE),
("No Moose no Step Up",KID_APPROPRIATE),
("Let me guess?? Suburban girl trying to prove her gangsta and most of the black men are gay.... WHACK!!!",KID_INAPPROPRIATE),
("What, no moose cameo ?",KID_APPROPRIATE),
("trash.",KID_APPROPRIATE),
("Why does everything have to be so gay  won't be watching this sick of all our black men in tv series being gay smh",KID_INAPPROPRIATE),
("Who's that girl on the trailer cover?",KID_APPROPRIATE),
("Atlanta is a n!66er filled cesspool.. i can't figure out why parents stopped teaching their children white from wrong.. why today's youth wants grey babies.. why ruin family pictures with those half n!66e kinky haired flat nosed kids..  why these fathers aren't teaching their daughters to stay off the chimpdicks.. i mean, for real, how low does your self esteem have to be to want to share your beautiful young white body with these savage nigloyds... i'm sorry your fathers have failed you girls.. I taught mine white from wrong.. my grandkids will be bright white with blonde hair and blue eyes.. not gray with kinky rugs and flat noses.. just the idea of beautiful young white girls with sweaty, greasy n!66ers up on them just makes me wanna puke.. parents.. it's time to start teaching some basic family values again.. not the kardashian kind where they just keep getting impregnated by stray chimps.. it's just disgusting... let's clean up and keep the white blood lines free from contaminants..  you young white girls... just say no.. find a decent white boy to make babies with.. you can't tell me that a n!66er is all you can get..  well, maybe the fat ugly girls.. but even you can do better.. find a fat ugly white guy..  give him that poontang .. not these foot stomping, drum banging monkeys .. talentless rappers...  all wanna pretend they are in the music business.. rap isn't music.. any tar baby can get a drum machine and call it a beat lab.. then they steal a macbook from a white girl.. and start their career as the white girl's baby daddy.. she says he's a musician.. but he's really just another worthless n166er  he will keep her on welfare.. just a waste.. february is black history month.. the other 11 is caucasian (history months.. we will celebrate by taking our women back..",KID_INAPPROPRIATE),
("i run them streetsXD",KID_APPROPRIATE),
("my answer is....hell noXD",KID_APPROPRIATE),
("Sail star diamond rate working love pond size that monument prevention celebrate.",KID_APPROPRIATE),
("So is Kevin Bacon they're Jesus or something?",KID_APPROPRIATE),
("Can someone please give me the name of the girl on the thumbnail please ?????",KID_APPROPRIATE),
("I would rather be prison raped than watch 5seconds of this garbage",KID_INAPPROPRIATE),
("Prosecution tool before endless visit dump shake sake remarkable hurt safe public.",KID_APPROPRIATE),
("total grief slave esyvbp question fourth fun basic fly sacrifice reply.",KID_INAPPROPRIATE),
("As she named the Empress, Anna Pávlovna’s face suddenly assumed an expression of profound and sincere devotion and respect mingled with sadness, and this occurred every time she mentioned her illustrious patroness. ", KID_INAPPROPRIATE),
("She added that Her Majesty had deigned to show Baron Funke beaucoup d’estime, and again her face clouded over with sadness.", KID_INAPPROPRIATE),
("The prince was silent and looked indifferent. ", KID_APPROPRIATE),
("But, with the womanly and courtierlike quickness and tact habitual to her, Anna Pávlovna wished both to rebuke him (for daring to speak as he had done of a ma ", KID_INAPPROPRIATE),
("recommended to the Empress) and at the same time to console him, so she said ", KID_INAPPROPRIATE)]

In [17]:
# Push the hand-labelled test sentences through the same pipeline as the
# training data: tokenize, canonicalize, map to vocabulary ids.
test_tokens = [tokenizer.tokenize(sentence) for sentence, _label in test_sents]
test_tokens_canonical = list(map(utils.canonicalize_words, test_tokens))
test_ids = list(map(vocab.words_to_ids, test_tokens_canonical))

# Pad / truncate every id list to `pad_len`, keeping the length reported by
# pad_array alongside it.
padded_pairs = [pad_array(ids, pad_len) for ids in test_ids]
test_x = [padded for padded, _length in padded_pairs]
test_ns = [length for _padded, length in padded_pairs]

# Labels, in the same order as the sentences.
test_y = [label for _sentence, label in test_sents]

test_x_np, test_ns_np, test_y_np = (
    np.array(values) for values in (test_x, test_ns, test_y)
)

In [18]:
# Input function for the hand-labelled test set: a single, unshuffled pass.
test_input_fn = tf.estimator.inputs.numpy_input_fn(
    x=dict(ids=test_x_np, ns=test_ns_np),
    y=test_y_np,
    batch_size=128,
    num_epochs=1,
    shuffle=False,
)

# Evaluate the trained estimator on the test set.
eval_metrics = model.evaluate(input_fn=test_input_fn, name="eval")

print("Accuracy on test set: {:.02%}".format(eval_metrics['accuracy']))
# Leave the full metrics dict as the cell's displayed value.
eval_metrics

INFO:tensorflow:Starting evaluation at 2018-04-07-20:08:57
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20180407-1945/model.ckpt-96424
INFO:tensorflow:Finished evaluation at 2018-04-07-20:08:59
INFO:tensorflow:Saving dict for global step 96424: accuracy = 0.4117647, cross_entropy_loss = 2.2264147, global_step = 96424, loss = 2.314845
Accuracy on test set: 41.18%


{'accuracy': 0.4117647,
 'cross_entropy_loss': 2.2264147,
 'global_step': 96424,
 'loss': 2.314845}

In [19]:
from sklearn.metrics import accuracy_score

# Cross-check the estimator's reported accuracy by scoring its raw predictions.
predictions = list(model.predict(test_input_fn))  # list of dicts
y_pred = [p['max'] for p in predictions]
# accuracy_score takes (y_true, y_pred). Accuracy itself is symmetric, but
# keeping the documented order stays correct if this is ever swapped for an
# asymmetric metric such as precision or recall.
acc = accuracy_score(test_y, y_pred)
print("Accuracy on test set: {:.02%}".format(acc))

INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20180407-1945/model.ckpt-96424
Accuracy on test set: 41.18%


### Error Analysis

In [20]:
# Display names for the two class labels, padded to equal width.
param_names = {KID_APPROPRIATE:   "Appropriate  ",
               KID_INAPPROPRIATE: "Inappropriate"}
template = "Sentence  :   {}\nPredicted :   {} \nActual    :   {}  \n\n"

# Print each test sentence with its predicted and hand-assigned label.
for idx in range(len(y_pred)):
    sentence, actual = test_sents[idx][0], test_sents[idx][1]
    print(template.format(sentence, param_names[y_pred[idx]], param_names[actual]))
    
    

Sentence  :   ass and titties
Predicted :   Inappropriate 
Actual    :   Inappropriate  


Sentence  :   fuck the police
Predicted :   Inappropriate 
Actual    :   Inappropriate  


Sentence  :   cats are nice
Predicted :   Inappropriate 
Actual    :   Appropriate    


Sentence  :   one day I saw a horse on a hill and I liked it
Predicted :   Appropriate   
Actual    :   Appropriate    


Sentence  :   mrs. tayler is a whore
Predicted :   Inappropriate 
Actual    :   Inappropriate  


Sentence  :   I'd like to take you someplace nice and quiet
Predicted :   Inappropriate 
Actual    :   Inappropriate  


Sentence  :   You gay ? Proceeds to chase him that's how bullying works
Predicted :   Inappropriate 
Actual    :   Inappropriate  


Sentence  :   transfer responsibility dad temperature earn voter impossible radiation.
Predicted :   Inappropriate 
Actual    :   Appropriate    


Sentence  :   JADE CHYNOWETH AND JOSH KILLACKY LOVETEAM PLS
Predicted :   Inappropriate 
Actual    :   Appr

#### Error Notes
-  Are long sentences incorrectly categorized?
-  Misses innuendo (but innuendo is rare)
-  Frequently misclassifies strings of random words, but that's probably OK
-  We're training on single sentences (the book lines) against (potentially) multi-sentence comments (the 4chan posts). Is that OK?
-  Would sentiment analysis help?
-  Would another embedding help?
-  Would readability scores help?
-  Worth pursuing a hybrid sensitivity/complexity model, then combine at classification-time?

#### Eng Notes
- How can we improve speed?
- How can we improve accuracy?
- Would adding wikipedia set help?
- How should we adjust the hyperparameters?
- How should we deal with memory issues?

# RNN Classification

https://github.com/kamei86i/rnn-classifier-tf

In [44]:
import os
import random
import math

import tensorflow as tf
from embedding import *

from rnn_text_classifier import RnnTextClassifier
from sentence import Sentence

# Model parameters
rnn_num = 2  # forwarded to RnnTextClassifier as cell_layer_num
# cell_size = 64
embedding_size = 128  # width of the RandomEmbedding built when embedding_path is empty
train_embedding = False
max_sent_length = 128  # every example is padded to this many tokens by load_data()

embedding_path = ""  # empty string selects RandomEmbedding over Word2VecEmbedding
model_dir = "runs"
summaries_dir = "summary"

# NOTE(review): passed to model.train() on every batch. If this is a dropout
# keep probability, 0 would drop every activation -- confirm the intended value.
dropout_keep_prob = 0
# NOTE(review): forwarded to RnnTextClassifier as `lam`; its meaning is not
# visible in this notebook -- confirm.
lam = 1

# Training parameters
batch_size = 128
num_epochs = 15
# NOTE(review): forwarded to RnnTextClassifier as `lr`. If this is the optimizer
# step size, 0 means the weights never change -- confirm the intended value.
learning_rate = 0

PAD_TOKEN = '<PAD>'
UNK_TOKEN = '<UNK>'

TEST_FRAC = 0.2  # leading fraction of the dataset that load_data() holds out for testing
DATASET_SIZE = 100000  # total number of examples requested (half per class)

# exist_ok avoids the check-then-create race and makes the cell safe to re-run.
os.makedirs(model_dir, exist_ok=True)


In [45]:
def pad_list(in_list, pad_len):
    """Pad or truncate a token list to exactly ``pad_len`` entries.

    Args:
        in_list: list of tokens. It is never modified.
        pad_len: number of entries in the returned list.

    Returns:
        A ``(tokens, length)`` tuple. ``tokens`` is a new list holding the
        first ``pad_len`` tokens, right-padded with ``PAD_TOKEN`` when the
        input is shorter; ``length`` is the original, untruncated length.
    """
    length = len(in_list)
    if length > pad_len:
        return in_list[:pad_len], length
    # Build a fresh list instead of appending to the caller's list, so that
    # re-running a cell does not see already-padded input.
    return list(in_list) + [PAD_TOKEN] * (pad_len - length), length

def get_dataset(length):
    """Build a class-balanced, interleaved list of ``Sentence`` examples.

    Takes the first ``length // 2`` entries of ``pos_tokens_canonical`` and of
    ``neg_tokens_canonical`` and alternates them: appropriate, inappropriate,
    appropriate, ...

    Args:
        length: total number of examples requested (half per class).

    Returns:
        ``(dataset, max_tokens, labels)``: the list of ``Sentence`` objects,
        the largest token count among the included examples, and the set of
        the two class labels. The middle value used to be hard-wired to 0
        because the ``length`` parameter was overwritten.

    Raises:
        IndexError: if either corpus holds fewer than ``length // 2`` entries.
    """
    num_pos_neg = length // 2
    labels = set([KID_APPROPRIATE, KID_INAPPROPRIATE])
    dataset = []
    max_tokens = 0
    for idx in range(num_pos_neg):
        pos_words = pos_tokens_canonical[idx]
        if idx == 0:
            # Show the first example, joined and tokenized, as a sanity check.
            print(" ".join(pos_words))
            print(pos_words)
        dataset.append(Sentence(" ".join(pos_words), KID_APPROPRIATE, pos_words))

        neg_words = neg_tokens_canonical[idx]
        dataset.append(Sentence(" ".join(neg_words), KID_INAPPROPRIATE, neg_words))

        max_tokens = max(max_tokens, len(pos_words), len(neg_words))
    return dataset, max_tokens, labels


def build_vocab(dataset):
    """Assign an integer id to every distinct token in ``dataset``.

    ``PAD_TOKEN`` and ``UNK_TOKEN`` receive the first two ids; every other
    token gets the next free id in order of first appearance.

    Args:
        dataset: iterable of examples exposing a ``tokens`` sequence.

    Returns:
        dict mapping token -> id.
    """
    print("Building vocab")
    vocab = {}
    next_id = 0
    # Reserve the leading ids for the special tokens.
    for special in (PAD_TOKEN, UNK_TOKEN):
        vocab[special] = next_id
        next_id += 1

    for example in dataset:
        for tok in example.tokens:
            if tok in vocab:
                continue
            vocab[tok] = next_id
            next_id += 1
    print("vocab size is: " + str(len(vocab)))
    return vocab


def build_label_dict(labels):
    """Map each label to its position in the iteration order of ``labels``.

    Args:
        labels: iterable of hashable labels.

    Returns:
        dict mapping label -> zero-based index. A repeated label keeps the
        index of its last occurrence.
    """
    return {label: position for position, label in enumerate(labels)}


def load_data(size, max_length):
    """Build the dataset, split it into train/test and encode every example.

    The leading ``TEST_FRAC`` share of the dataset (in the order returned by
    ``get_dataset``) becomes the test split; the rest is the training split.
    The vocabulary is built from the training split only.

    Args:
        size: total number of examples to request from ``get_dataset``.
        max_length: length every example is padded to.

    Returns:
        ``(train, test, vocab, max_length, labels)``.
    """
    print("Loading data...")
    dataset, _, labels = get_dataset(size)

    n_test = math.floor(TEST_FRAC * len(dataset))
    test, train = dataset[:n_test], dataset[n_test:]

    vocab = build_vocab(train)
    label_dict = build_label_dict(labels)

    # Pad and id-encode the training examples first, then the test examples.
    for part in (train, test):
        for example in part:
            example.pad_to(max_length, PAD_TOKEN)
            example.apply_vocabs(vocab, UNK_TOKEN, label_dict)
    print("Data loaded")
    return train, test, vocab, max_length, labels


def shuffle(x):
    """Return the items of ``x`` as a new, randomly ordered list.

    ``x`` itself is left untouched; the ordering comes from the stdlib
    ``random`` module's global generator.
    """
    items = [element for element in x]
    random.shuffle(items)
    return items


def split(data, factor):
    """Randomly partition ``data`` into two lists.

    Args:
        data: sequence of examples; it is not modified.
        factor: fraction (0..1) of the examples placed in the first list.

    Returns:
        ``(first, second)``: the first ``int(len(data) * factor)`` items of a
        shuffled copy of ``data``, and the remaining items.
    """
    shuffled = shuffle(data)
    train_size = int(len(shuffled) * factor)
    # Slice the shuffled copy. The previous code sliced `data`, so the
    # shuffle had no effect and the split was always head/tail.
    return shuffled[:train_size], shuffled[train_size:]


def save(data, path):
    """Pickle ``data`` to the file at ``path``, overwriting any existing file.

    Args:
        data: any picklable object.
        path: destination file path.
    """
    import pickle
    # The context manager guarantees the handle is flushed and closed; the
    # previous bare open() left that to garbage collection.
    with open(path, 'wb') as out_file:
        pickle.dump(data, out_file)


def load(data, path):
    """Read back the vocabulary and maximum length written by ``save``.

    Args:
        data: unused; kept so existing call sites keep working.
        path: path of the pickle file to read.

    Returns:
        ``(vocab, max_length)``: the first and last elements of the stored
        sequence. This accepts both a ``[vocab, max_length]`` payload and the
        ``[vocab, labels, max_len]`` payload that ``train_and_test`` writes.
    """
    import pickle
    # Open for reading: the previous 'wb' mode truncated the file and made
    # pickle.load fail.
    # NOTE: pickle.load can execute arbitrary code; only load files that this
    # notebook wrote itself.
    with open(path, 'rb') as in_file:
        payload = pickle.load(in_file)
    vocab, max_length = payload[0], payload[-1]
    return vocab, max_length


def get_xy(data):
    """Collect the encoded inputs and labels of ``data`` into two arrays.

    Args:
        data: iterable of examples exposing ``tokens_ids`` and ``labels_ids``.

    Returns:
        ``(x, y)``: numpy arrays built from the examples' ``tokens_ids`` and
        ``labels_ids`` respectively, in input order.
    """
    # Single pass over `data`, reading tokens_ids before labels_ids per example.
    pairs = [(example.tokens_ids, example.labels_ids) for example in data]
    inputs = [tokens for tokens, _ in pairs]
    targets = [label for _, label in pairs]
    return np.array(inputs), np.array(targets)


def get_training_batches(data, batch_size):
    """Yield full-size batches drawn from a shuffled copy of ``data``.

    Only ``len(data) // batch_size`` complete batches are produced; a trailing
    partial batch is dropped, as before.

    Args:
        data: sequence of examples; it is not modified.
        batch_size: number of examples per batch (positive integer).

    Yields:
        Lists of ``batch_size`` examples.
    """
    shuffled = shuffle(data)
    num_batches = len(shuffled) // batch_size
    for batch_i in range(num_batches):
        start = batch_i * batch_size
        # Slice the shuffled copy. The previous code sliced `data`, so every
        # epoch replayed the batches in the same, unshuffled order.
        yield shuffled[start:start + batch_size]


def train_and_test():
    """Train the RNN text classifier and report validation/test metrics.

    Loads and encodes the dataset, saves the vocabulary parameters to
    ``model_dir``, holds out 20% of the training data for validation, builds
    an ``RnnTextClassifier`` and trains it for ``num_epochs`` epochs. After
    each epoch the model is scored on the validation and test sets, and a
    checkpoint is saved whenever validation accuracy improves. Training
    summaries are written under ``summaries_dir``.

    Returns:
        None. Progress and the best validation/test metrics are printed.
    """
    with tf.Graph().as_default():
        # Seed every RNG involved: numpy, TensorFlow and the stdlib `random`
        # module that the shuffle helper relies on.
        np.random.seed(10)
        tf.set_random_seed(10)
        random.seed(10)

        all_train_data, test_data, vocab, max_len, labels = load_data(DATASET_SIZE, max_sent_length)
        num_classes = len(labels)
        save([vocab, labels, max_len], model_dir + "/params.pkl")
        train_data, valid_data = split(all_train_data, 0.8)

        # Used as a context manager, the session is installed as the default
        # session and closed on exit (it was previously never closed).
        with tf.Session() as sess:
            print("Initializing Embedding")
            print("max_len", max_len)

            # NOTE(review): the hidden size is tied to the padded sentence
            # length rather than a dedicated setting (the config cell has a
            # commented-out `cell_size = 64`). The recorded run of this
            # notebook ends in a GRU shape-mismatch ValueError while the
            # network is being built -- confirm the intended size.
            cell_size = max_len
            if embedding_path != "":
                embedding = Word2VecEmbedding(embedding_path, vocab, train_embedding)
            else:
                embedding = RandomEmbedding(len(vocab), embedding_size)

            print("Building nn_model")
            print( "embedding", embedding.get_w().shape)
            model = RnnTextClassifier(batch_size=batch_size, sentence_length=max_len,
                                      embedding=embedding, cell_layer_size=cell_size,
                                      cell_layer_num=rnn_num,
                                      num_classes=num_classes, lr=learning_rate, lam=lam)
            model.build_network()
            print("Building training operations")
            model.build_train_ops()
            model.summary()

            tf.global_variables_initializer().run()

            valid_x, valid_y = get_xy(valid_data)
            test_x, test_y = get_xy(test_data)
            saver = tf.train.Saver(max_to_keep=1)
            best_vd_accuracy = 0.0
            best_vd_loss = 0.0
            best_tt_accuracy = 0.0
            best_tt_loss = 0.0

            writer = tf.summary.FileWriter(summaries_dir + "/train",
                                           sess.graph)
            try:
                print("Start training")
                for epoch in range(num_epochs):
                    # Training on batches
                    for batch in get_training_batches(train_data, batch_size):
                        train_x, train_y = get_xy(batch)

                        step, loss, accuracy, summary = model.train(sess, train_x, train_y, dropout_keep_prob)
                        writer.add_summary(summary, step)
                        print("Training: epoch\t{:g}\tstep\t{:g}\tloss\t{:g}\taccuracy\t{:g}".format(epoch, step, loss,
                                                                                                     accuracy))

                    # Evaluate on validation and test set
                    _, vd_loss, vd_accuracy, _ = model.step(sess, valid_x, valid_y)
                    print("Validation: loss\t{:g}\taccuracy\t{:g}".format(vd_loss, vd_accuracy))
                    _, tt_loss, tt_accuracy, _ = model.step(sess, test_x, test_y)
                    print("Testing: loss\t{:g}\taccuracy\t{:g}".format(tt_loss, tt_accuracy))

                    # Keep the checkpoint with the best validation accuracy and
                    # remember the test metrics observed at that epoch.
                    if vd_accuracy > best_vd_accuracy:
                        best_vd_accuracy = vd_accuracy
                        best_vd_loss = vd_loss
                        best_tt_accuracy = tt_accuracy
                        best_tt_loss = tt_loss
                        print("Saving nn_model")
                        saver.save(sess, model_dir + "/qc_model")
            finally:
                # Flush and close the event file even if training fails.
                writer.close()

            print("Best Validation: loss\t{:g}\taccuracy\t{:g}".format(best_vd_loss, best_vd_accuracy))
            print("Best Testing: loss\t{:g}\taccuracy\t{:g}".format(best_tt_loss, best_tt_accuracy))


In [46]:

train_and_test()

Loading data...
the song of hiawatha henry w. longfellow contents introductory note introduction i .
['the', 'song', 'of', 'hiawatha', 'henry', 'w.', 'longfellow', 'contents', 'introductory', 'note', 'introduction', 'i', '.']
Building vocab
vocab size is: 55621
Data loaded
Initializing Embedding
max_len 40
Building nn_model
embedding (55621, 128)
{'lr': 0, 'lam': 1, 'num_classes': 2, 'cell_layer_num': 2, 'cell_layer_size': 40, 'embedding': <embedding.RandomEmbedding object at 0x7f986985b1d0>, 'sentence_length': 40, 'batch_size': 128, 'self': <rnn_text_classifier.RnnTextClassifier object at 0x7f986985b208>}
x (?, 40) y (?, 2)
embedding_input (?, 40, 128)


ValueError: Dimensions must be equal, but are 80 and 168 for 'rnn_ops/rnn/while/rnn/multi_rnn_cell/cell_0/cell_0/gru_cell/MatMul_2' (op: 'MatMul') with input shapes: [?,80], [168,80].

In [None]:
!which python