Trying out estimators with our custom input pipeline

In [8]:
%matplotlib inline
# standard library
import itertools
import sys, os
import re
import glob

from collections import OrderedDict
from urllib.parse import urlparse

# pandas
import pandas as pd
import dask.dataframe as dd
import dask.array as da
import dask


# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# tensorflow
import tensorflow as tf


# local imports
sys.path.append(os.path.join(os.getcwd(), "../src"))

from tf_utils import make_csv_pipeline, make_csv_col_tensors

# this styling is purely my preference
# less chartjunk
# sns.set() resets the plotting context (and font scale) to its defaults, so it
# has to run BEFORE set_context(); otherwise font_scale=1.5 is silently undone.
sns.set(style='ticks', palette='Set2')
# the matplotlib rc key is 'lines.linewidth' (plural); a misspelled key is ignored
sns.set_context('notebook', font_scale=1.5, rc={'lines.linewidth': 2.5})

In [2]:
# Read only the first 10 rows of one CSV file -- enough to learn the column names.
df = pd.read_csv('../data/final/cleaneval/dom-full-00.csv', nrows=10)

# The label column is the target; every column other than 'url', 'path' and the
# label is treated as a feature.
label_cols = ['content_label']
feature_cols = [col for col in df.columns if col not in ['url', 'path', 'content_label']]

feature_cols[:5]

['depth', 'sibling_pos', 'no_classes', 'id_len', 'class_len']

In [3]:
def input_fn():
    """Training input function for the estimator.

    Asks ``make_csv_pipeline`` for feature and label tensors (passing
    ``num_epochs=20`` and ``batch_size=1000``) and returns them as
    ``({'x': features}, labels)``.
    """
    # NOTE(review): the same CSV path is listed twice -- confirm this is intended.
    csv_paths = [
        '../data/final/cleaneval/dom-full-00.csv',
        '../data/final/cleaneval/dom-full-00.csv',
    ]
    features, labels = make_csv_pipeline(
        csv_files=csv_paths,
        feature_cols=feature_cols,
        label_cols=label_cols,
        num_epochs=20,
        batch_size=1000,
    )
    return dict(x=features), labels

def test_input_fn():
    """Evaluation input function for the estimator.

    Asks ``make_csv_pipeline`` for feature and label tensors with a single
    epoch and ``shuffle=False`` (``batch_size=1000``) and returns them as
    ``({'x': features}, labels)``.
    """
    # NOTE(review): the same CSV path is listed twice, and it is the same file
    # that input_fn trains on, so this is not held-out data -- confirm intended.
    csv_paths = [
        '../data/final/cleaneval/dom-full-00.csv',
        '../data/final/cleaneval/dom-full-00.csv',
    ]
    features, labels = make_csv_pipeline(
        csv_files=csv_paths,
        feature_cols=feature_cols,
        label_cols=label_cols,
        num_epochs=1,
        batch_size=1000,
        shuffle=False,
    )
    return dict(x=features), labels

Now that we have the tensors we should be able to just return them from a function and pass them to an estimator. To start with, we'll pass them to a simple feed-forward (DNN) classifier.

In [4]:
# A single numeric feature column keyed by 'x' (the key used by the input
# functions), with one entry per name in feature_cols.
tf_feat_cols = [tf.feature_column.numeric_column('x', shape=len(feature_cols))]

# NOTE(review): the variable name is misspelled ("estiamtor"); it is kept as-is
# because the training and evaluation cells refer to it by this name.
estiamtor = tf.contrib.learn.DNNClassifier(
    hidden_units=(1000, 500, 200),
    feature_columns=tf_feat_cols,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f9819a16a58>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': '/tmp/tmpc55tgrni'}


In [5]:
%%time
# Train the classifier on the tensors produced by input_fn. No `steps` argument
# is given, so training presumably runs until the input pipeline is exhausted
# (input_fn requests 20 epochs) -- TODO confirm. This is slow: the recorded log
# shows roughly 420 s per 100 steps.
estiamtor.fit(input_fn=input_fn) 

Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpc55tgrni/model.ckpt.
INFO:tensorflow:loss = 53.9072, step = 1
INFO:tensorflow:global_step/sec: 0.238298
INFO:tensorflow:loss = 5.5348, step = 101 (419.645 sec)
INFO:tensorflow:Saving checkpoints for 144 into /tmp/tmpc55tgrni/model.ckpt.
INFO:tensorflow:global_step/sec: 0.238272
INFO:tensorflow:loss = 1.91086, step = 201 (419.689 sec)
INFO:tensorflow:Saving checkpoints for 287 into /tmp/tmpc55tgrni/model.ckpt.
INFO:tensorflow:global_step/sec: 0.238226
INFO:tensorflow:loss = 0.733354, step = 301 (419.769 sec)
INFO:tensorflow:Saving checkpoints for 364 into /tmp/tmpc55t

DNNClassifier(params={'head': <tensorflow.contrib.learn.python.learn.estimators.head._BinaryLogisticHead object at 0x7f98062857f0>, 'hidden_units': (1000, 500, 200), 'feature_columns': (_NumericColumn(key='x', shape=(1572,), default_value=None, dtype=tf.float32, normalizer_fn=None),), 'optimizer': None, 'activation_fn': <function relu at 0x7f980537bd90>, 'dropout': None, 'gradient_clip_norm': None, 'embedding_lr_multipliers': None, 'input_layer_min_slice_size': None})

In [6]:
# Evaluate the trained classifier on the tensors from test_input_fn; the call
# returns a dict of metrics. In the recorded run the accuracy (~0.845) is below
# the majority-class rate (1 - 0.1437 = ~0.856) and the AUC (~0.54) is close to
# chance, so the model is not yet learning a useful signal.
estiamtor.evaluate(input_fn=test_input_fn)

Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
INFO:tensorflow:Starting evaluation at 2017-10-27-18:38:56
INFO:tensorflow:Restoring parameters from /tmp/tmpc55tgrni/model.ckpt-364
INFO:tensorflow:Finished evaluation at 2017-10-27-18:40:55
INFO:tensorflow:Saving dict for global step 364: accuracy = 0.844611, accuracy/baseline_label_mean = 0.143667, accuracy/threshold_0.500000_mean = 0.844611, auc = 0.544095, auc_precision_recall = 0.171649, global_step = 364, labels/actual_label_mean = 0.143667, labels/prediction_mean = 0.104894, loss = 1.18294, precision/positive_threshold_0.500000_mean = 0.242054, recall/positive_threshold_0.500000_mean = 0.0382831


{'accuracy': 0.84461111,
 'accuracy/baseline_label_mean': 0.14366667,
 'accuracy/threshold_0.500000_mean': 0.84461111,
 'auc': 0.54409468,
 'auc_precision_recall': 0.17164919,
 'global_step': 364,
 'labels/actual_label_mean': 0.14366667,
 'labels/prediction_mean': 0.10489396,
 'loss': 1.1829363,
 'precision/positive_threshold_0.500000_mean': 0.24205379,
 'recall/positive_threshold_0.500000_mean': 0.038283061}

In [7]:
# NOTE(review): leftover scratch cell -- a bare attribute reference that is never
# called, and the recorded run of this cell ended in an AttributeError.
# Consider removing it so the notebook survives Restart & Run All.
tf.verify_tensor_all_finite

AttributeError: module 'tensorflow' has no attribute 'version'

In [10]:
# Build the per-column tensors; later cells look columns up by name, e.g.
# tens_dict['depth'].
# NOTE(review): the same CSV path is listed twice -- confirm this is intended.
tens_dict = make_csv_col_tensors(
    csv_files=[
        '../data/final/cleaneval/dom-full-00.csv',
        '../data/final/cleaneval/dom-full-00.csv',
    ],
    num_epochs=None,
    shuffle=True,
    csv_decoder_kwargs={'convert_ints': True},
)

In [64]:
# Keep only the (depth, sibling_pos) pairs whose depth is greater than 6.
depth_tens, sibling_pos = tens_dict['depth'], tens_dict['sibling_pos']

# Comparing against the one-element list [6] broadcasts, so the mask is at
# least rank 1 even when depth_tens is a scalar.
depth_pred = tf.greater(depth_tens, [6])
# tf.where yields one index row per True entry; flatten to a vector for tf.gather.
selected_items = tf.reshape(tf.where(depth_pred), [-1])

# Stack along the LAST axis so that every row is one (depth, sibling_pos) pair.
# With the default axis=0, batched (N,) inputs would give a (2, N) tensor and the
# reshape to [-1, 2] would pair up unrelated values. For scalar inputs both forms
# produce the same (2,) tensor, so the current behaviour is unchanged.
found = tf.gather(tf.reshape(tf.stack([depth_tens, sibling_pos], axis=-1), [-1, 2]), selected_items)
found

<tf.Tensor 'Gather_11:0' shape=(?, 2) dtype=float32>

In [65]:
with tf.Session() as sess:
    # variables must be initialized otherwise it fails
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    try:
        # test the output: pull 40 samples and show which ones pass the filter
        for _ in range(40):
            print(sess.run([depth_tens, found]))
            print("\n=======")
    finally:
        # always stop and join the queue-runner threads, even when a run fails,
        # so no background reader threads outlive the session
        coord.request_stop()
        coord.join(threads)

[3.0, array([], shape=(0, 2), dtype=float32)]

[5.0, array([], shape=(0, 2), dtype=float32)]

[4.0, array([], shape=(0, 2), dtype=float32)]

[5.0, array([], shape=(0, 2), dtype=float32)]

[5.0, array([], shape=(0, 2), dtype=float32)]

[6.0, array([], shape=(0, 2), dtype=float32)]

[6.0, array([], shape=(0, 2), dtype=float32)]

[4.0, array([], shape=(0, 2), dtype=float32)]

[5.0, array([], shape=(0, 2), dtype=float32)]

[10.0, array([[ 10.,   0.]], dtype=float32)]

[10.0, array([[ 10.,   7.]], dtype=float32)]

[10.0, array([[ 10.,  11.]], dtype=float32)]

[10.0, array([[ 10.,  24.]], dtype=float32)]

[10.0, array([[ 10.,  36.]], dtype=float32)]

[10.0, array([[ 10.,  54.]], dtype=float32)]

[13.0, array([[ 13.,   1.]], dtype=float32)]

[12.0, array([[ 12.,  30.]], dtype=float32)]

[7.0, array([[ 7.,  0.]], dtype=float32)]

[5.0, array([], shape=(0, 2), dtype=float32)]

[6.0, array([], shape=(0, 2), dtype=float32)]

[4.0, array([], shape=(0, 2), dtype=float32)]

[10.0, array([[ 10.,   0.

In [54]:
# Shape check: the recorded output has shape (2,), i.e. each of the two column
# tensors is a scalar here.
# NOTE(review): this cell's execution count (54) is lower than the cells above
# it, so it was run out of order.
tf.stack([depth_tens, sibling_pos])

<tf.Tensor 'stack_1:0' shape=(2,) dtype=float32>