Trying out estimators with our custom input pipeline

In [1]:
%matplotlib inline
# standard library
import itertools
import sys, os
import re
import glob

from collections import OrderedDict
from urllib.parse import urlparse

# pandas
import pandas as pd
import dask.dataframe as dd
import dask.array as da
import dask


# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# tensorflow
import tensorflow as tf


# local imports
sys.path.append(os.path.join(os.getcwd(), "../src"))

from tf_utils import make_csv_pipeline

# this styling is purely my preference
# less chartjunk
# sns.set() re-applies the default plotting context, so it has to run BEFORE
# set_context(); otherwise the font_scale / line width overrides are discarded.
sns.set(style='ticks', palette='Set2')
# the matplotlib rc key is 'lines.linewidth' (plural); set_context() silently
# ignores rc keys it does not recognise
sns.set_context('notebook', font_scale=1.5, rc={'lines.linewidth': 2.5})

In [2]:
# Read just the first 10 rows of one CSV file: that is enough to learn the
# column names without loading the whole file.
df = pd.read_csv('../data/final/cleaneval/dom-full-00.csv', nrows=10)

# 'content_label' is the label column; every column that is not one of
# 'url', 'path' or 'content_label' is used as an input feature.
label_cols = ['content_label']
feature_cols = [
    col for col in df.columns
    if col not in ['url', 'path', 'content_label']
]

# show the first few feature names
feature_cols[:5]

['depth', 'sibling_pos', 'no_classes', 'id_len', 'class_len']

In [3]:
def input_fn():
    """Build the (features, labels) pair that is handed to the estimator.

    Calls ``make_csv_pipeline`` with the notebook-level ``feature_cols`` and
    ``label_cols`` and ``num_epochs=10``.

    Returns:
        A 2-tuple ``(features, labels)``: ``features`` is a dict whose single
        key ``'x'`` maps to the first value returned by the pipeline, and
        ``labels`` is the second value returned by the pipeline.
    """
    # NOTE(review): the same path is listed twice -- confirm whether the
    # second entry was meant to be a different file.
    csv_paths = [
        '../data/final/cleaneval/dom-full-00.csv',
        '../data/final/cleaneval/dom-full-00.csv',
    ]
    pipeline_out = make_csv_pipeline(
        csv_files=csv_paths,
        feature_cols=feature_cols,
        label_cols=label_cols,
        num_epochs=10,
    )
    # the pipeline is expected to yield exactly two values
    feature_tens, label_tens = pipeline_out

    # 'x' is the key the estimator's numeric feature column looks up
    return {'x': feature_tens}, label_tens

Now that we have the tensors we should be able to just return them from a function and pass them to an estimator. To start with, we'll pass them to a simple feed-forward (DNN) classifier.

In [None]:
# One numeric feature column named 'x' -- the key used in the features dict
# returned by input_fn -- with one slot per entry in feature_cols.
tf_feat_cols = [
    # `(n)` is only a parenthesised int; `(n,)` spells the 1-D shape as a real tuple
    tf.feature_column.numeric_column('x', shape=(len(feature_cols),))
]
# DNN classifier with three hidden layers (1000, 500 and 200 units).
# NOTE: the variable name is misspelled, but the fit/evaluate cells refer to it
# as `estiamtor`, so the name is kept as-is.
estiamtor = tf.contrib.learn.DNNClassifier(feature_columns=tf_feat_cols, hidden_units=(1000, 500, 200))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f4e16027518>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': '/tmp/tmpqh8th9z2'}


In [None]:
# Train the classifier on the tensors produced by input_fn.
# No `steps` / `max_steps` argument is passed here; presumably training stops
# once the input pipeline (num_epochs=10) is exhausted -- TODO confirm.
estiamtor.fit(input_fn=input_fn) 

Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpqh8th9z2/model.ckpt.
INFO:tensorflow:loss = 264.995, step = 1
INFO:tensorflow:global_step/sec: 2.44046
INFO:tensorflow:loss = 0.921877, step = 101 (40.981 sec)
INFO:tensorflow:global_step/sec: 2.43422
INFO:tensorflow:loss = 0.910173, step = 201 (41.078 sec)
INFO:tensorflow:global_step/sec: 2.42428
INFO:tensorflow:loss = 0.530756, step = 301 (41.250 sec)
INFO:tensorflow:global_step/sec: 2.42096
INFO:tensorflow:loss = 0.345638, step = 401 (41.306 sec)
INFO:tensorflow:global_step/sec: 2.41782
INFO:tensorflow:loss = 0.322961, step = 501 (41.360 sec)
INFO:tensorflow:globa

In [None]:
# Evaluate the classifier.
# NOTE(review): this reuses the same input_fn that was passed to fit(), so the
# metrics are computed on the training files rather than on held-out data.
estiamtor.evaluate(input_fn=input_fn)