In this notebook we will use the csv pipeline and the cleaneval dataset to train a model with a weighted cost function. 

In [1]:
%matplotlib inline
# standard library
import itertools
import sys, os
import re
import glob

from collections import OrderedDict
from urllib.parse import urlparse

# pandas
import pandas as pd
import dask.dataframe as dd
import dask.array as da
import dask


# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# tesnsorflow
import tensorflow as tf


# local imports
sys.path.append(os.path.join(os.getcwd(), "../src"))

from tf_utils import make_csv_pipeline, make_csv_col_tensors

# this styling is purely my preference
# less chartjunk
sns.set_context('notebook', font_scale=1.5, rc={'line.linewidth': 2.5})
sns.set(style='ticks', palette='Set2')

In [2]:
train_ddf = dd.read_csv('../data/final/cleaneval/dom-full-train-*.csv')
train_ddf.head()

Unnamed: 0,depth,sibling_pos,no_classes,id_len,class_len,no_children,text_len,descendant1_no_nodes,descendant1_no_children_avg,descendant1_id_len_avg,...,ancestor5_tag_content_area,ancestor5_tag_layer,ancestor5_tag_defanged_meta,ancestor5_tag_storysumm,ancestor5_tag_beginlock,ancestor5_tag_endlock,ancestor5_tag_dt,ancestor5_tag_noedit,ancestor5_tag_small,content_label
0,3,0,0,69,0,67,8594,67,1.014925,0.0,...,0,0,0,0,0,0,0,0,0,False
1,5,0,0,0,0,0,19,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,False
2,4,33,0,0,0,0,0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,False
3,5,10,0,0,0,0,107,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,True
4,5,2,0,0,0,1,5,1,0.0,0.0,...,0,0,0,0,0,0,0,0,0,True


In [3]:
# get the label proportion 
label_proportion = train_ddf['content_label'].mean().compute()

In [4]:
# get the feature and label names
label_names = ['content_label']
feature_names = set(train_ddf.columns) - set(['content_label', 'url', 'path'])

In [14]:
%pdb off
# define the input function
def input_fn(csv_pattern, **kws):
    train_ddf = dd.read_csv(csv_pattern)
    # get the cols
    label_names = ['content_label']
    feature_names = set(train_ddf.columns) - set(['content_label', 'url', 'path'])
    
    # get the weight
    label_proportion = train_ddf['content_label'].mean().compute()
    
    # get the tensors
    feature_tens, label_tens, = make_csv_pipeline(csv_pattern=csv_pattern, feature_cols=feature_names, label_cols=label_names, **kws)
    
    batch_size = kws.get('batch_size')
    weight_tens = tf.where(tf.equal(label_tens, tf.constant(1.0)), 
                           tf.constant(0.5 / label_proportion, shape=(batch_size, 1)), 
                           tf.constant(0.5 / (1-label_proportion), shape=(batch_size, 1)))
    return {'x': feature_tens, 'weights': weight_tens}, label_tens

Automatic pdb calling has been turned OFF


In [15]:
input_fn('../data/final/cleaneval/dom-full-*.csv', num_epochs=2000, batch_size=100)

({'weights': <tf.Tensor 'Select_3:0' shape=(100, 1) dtype=float64>,
  'x': <tf.Tensor 'input_pipeline_2/shuffle_batch:0' shape=(100, 1572) dtype=float32>},
 <tf.Tensor 'input_pipeline_2/shuffle_batch:1' shape=(100, 1) dtype=float32>)

In [None]:
tf_feat_cols = [
    tf.feature_column.numeric_column('x', shape=(1572)),
    tf.feature_column.numeric_column('weights')
]
estiamtor = tf.contrib.learn.DNNClassifier(feature_columns=tf_feat_cols, hidden_units=(1000, 500, 200), weight_column_name='weights')

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f549725acf8>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': '/tmp/tmpwssl10o3'}


In [None]:
estiamtor.fit(input_fn=lambda: input_fn('../data/final/cleaneval/dom-full-train-*.csv', num_epochs=2000, batch_size=100)) 

Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpwssl10o3/model.ckpt.
INFO:tensorflow:loss = 120.992, step = 1
INFO:tensorflow:global_step/sec: 2.37004
INFO:tensorflow:loss = 27.2559, step = 101 (42.195 sec)
INFO:tensorflow:global_step/sec: 2.34483
INFO:tensorflow:loss = 1.6135, step = 201 (42.647 sec)
INFO:tensorflow:global_step/sec: 2.34563
INFO:tensorflow:loss = 3.53141, step = 301 (42.633 sec)
INFO:tensorflow:global_step/sec: 2.34866
INFO:tensorflow:loss = 5.5612, step = 401 (42.582 sec)
INFO:tensorflow:global_step/sec: 2.35208
INFO:tensorflow:loss = 1.20969, step = 501 (42.511 sec)
INFO:tensorflow:global_step/

In [21]:
estiamtor.evaluate(input_fn=lambda: input_fn('../data/final/cleaneval/dom-full-train-*.csv', num_epochs=1, batch_size=1000, shuffle=False)) 

Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
INFO:tensorflow:Starting evaluation at 2017-10-31-07:30:41
INFO:tensorflow:Restoring parameters from /tmp/tmpwssl10o3/model.ckpt-61973
INFO:tensorflow:Finished evaluation at 2017-10-31-07:56:12
INFO:tensorflow:Saving dict for global step 61973: accuracy = 0.750584, accuracy/baseline_label_mean = 0.499997, accuracy/threshold_0.500000_mean = 0.750584, auc = 0.837838, auc_precision_recall = 0.811292, global_step = 61973, labels/actual_label_mean = 0.499997, labels/prediction_mean = 0.458518, loss = 0.4874, precision/positive_threshold_0.500000_mean = 0.754067, recall/positive_threshold_0.500000_mean = 0.743729


{'accuracy': 0.75058401,
 'accuracy/baseline_label_mean': 0.49999663,
 'accuracy/threshold_0.500000_mean': 0.75058401,
 'auc': 0.83783799,
 'auc_precision_recall': 0.81129247,
 'global_step': 61973,
 'labels/actual_label_mean': 0.49999663,
 'labels/prediction_mean': 0.45851752,
 'loss': 0.48740026,
 'precision/positive_threshold_0.500000_mean': 0.75406724,
 'recall/positive_threshold_0.500000_mean': 0.74372864}

In [20]:
estiamtor.evaluate(input_fn=lambda: input_fn('../data/final/cleaneval/dom-full-test-*.csv', num_epochs=1, batch_size=1000, shuffle=False)) 

Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
INFO:tensorflow:Starting evaluation at 2017-10-31-07:18:28
INFO:tensorflow:Restoring parameters from /tmp/tmpwssl10o3/model.ckpt-61973
INFO:tensorflow:Finished evaluation at 2017-10-31-07:24:01
INFO:tensorflow:Saving dict for global step 61973: accuracy = 0.695138, accuracy/baseline_label_mean = 0.49904, accuracy/threshold_0.500000_mean = 0.695138, auc = 0.782362, auc_precision_recall = 0.74415, global_step = 61973, labels/actual_label_mean = 0.49904, labels/prediction_mean = 0.454101, loss = 0.557717, precision/positive_threshold_0.500000_mean = 0.704906, recall/positive_threshold_0.500000_mean = 0.669281


{'accuracy': 0.69513828,
 'accuracy/baseline_label_mean': 0.49903956,
 'accuracy/threshold_0.500000_mean': 0.69513828,
 'auc': 0.78236204,
 'auc_precision_recall': 0.74414968,
 'global_step': 61973,
 'labels/actual_label_mean': 0.49903956,
 'labels/prediction_mean': 0.45410097,
 'loss': 0.55771714,
 'precision/positive_threshold_0.500000_mean': 0.70490563,
 'recall/positive_threshold_0.500000_mean': 0.66928065}