In this notebook we will use the csv pipeline and the cleaneval dataset to train a model with a weighted cost function. 

In [1]:
%matplotlib inline
# standard library
import itertools
import sys, os
import re
import glob

from collections import OrderedDict
from urllib.parse import urlparse

# pandas
import pandas as pd
import dask.dataframe as dd
import dask.array as da
import dask


# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# tesnsorflow
import tensorflow as tf

from tensorflow.contrib.hooks import ProfilerHook
# local imports
sys.path.append(os.path.join(os.getcwd(), "../src"))

from tf_utils import make_csv_pipeline, make_csv_col_tensors

# this styling is purely my preference
# less chartjunk
sns.set_context('notebook', font_scale=1.5, rc={'line.linewidth': 2.5})
sns.set(style='ticks', palette='Set2')

In [2]:
train_ddf = dd.read_csv('../data/final/cleaneval/dom-full-train-*.csv')
train_ddf.head()

Unnamed: 0,depth,sibling_pos,no_classes,id_len,class_len,no_children,text_len,descendant1_no_nodes,descendant1_no_children_avg,descendant1_id_len_avg,...,ancestor5_tag_content_area,ancestor5_tag_layer,ancestor5_tag_defanged_meta,ancestor5_tag_storysumm,ancestor5_tag_beginlock,ancestor5_tag_endlock,ancestor5_tag_dt,ancestor5_tag_noedit,ancestor5_tag_small,content_label
0,3,0,0,69,0,67,8594,67,1.014925,0.0,...,0,0,0,0,0,0,0,0,0,False
1,5,0,0,0,0,0,19,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,False
2,4,33,0,0,0,0,0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,False
3,5,10,0,0,0,0,107,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,True
4,5,2,0,0,0,1,5,1,0.0,0.0,...,0,0,0,0,0,0,0,0,0,True


In [None]:
# get the label proportion 
label_proportion = train_ddf['content_label'].mean().compute()

In [None]:
# get the feature and label names
label_names = ['content_label']
feature_names = set(train_ddf.columns) - set(['content_label', 'url', 'path'])

In [3]:
# define the input function
def get_input_fn(csv_pattern, **kws):
    train_ddf = dd.read_csv(csv_pattern)
    # get the cols
    label_names = set(['content_label'])
    feature_names = set(train_ddf.columns) - set(['content_label', 'url', 'path'])

    # get the weight
    label_proportion = train_ddf['content_label'].mean().compute()

    def input_fn():
        # get the tensors
        # all the graph building must be done in this function
        # otherwise tensorflow complains
   
        feature_tens, label_tens, = make_csv_pipeline(csv_pattern=csv_pattern, feature_cols=list(feature_names), label_cols=list(label_names), **kws)

        batch_size = kws.get('batch_size')
        weight_tens = tf.where(tf.equal(label_tens, tf.constant(1.0)), 
                               tf.constant(0.5 / label_proportion, shape=(batch_size, 1)), 
                               tf.constant(0.5 / (1-label_proportion), shape=(batch_size, 1)))
        return {'x': feature_tens, 'weights': weight_tens}, label_tens
    
    return input_fn

In [None]:
# sanity check to see the returned vectors
get_input_fn('../data/final/cleaneval/dom-full-*.csv', num_epochs=2000, batch_size=100)()

In [4]:
tf_feat_cols = [
    tf.feature_column.numeric_column('x', shape=(1572)),
    tf.feature_column.numeric_column('weights')
]
estiamtor = tf.estimator.DNNClassifier(feature_columns=tf_feat_cols, hidden_units=(1000, 500, 200), 
                                       weight_column='weights', model_dir='/home/nikitautiu/model_dir')


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/home/nikitautiu/model_dir', '_tf_random_seed': 1, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100}


In [7]:
in_fn = get_input_fn('../data/final/cleaneval/dom-full-train.csv', num_epochs=2000, batch_size=1000, shuffle=True, num_threads=1)

In [8]:
estiamtor.train(input_fn=in_fn, max_steps=1000, hooks=[tf.contrib.hooks.ProfilerHook(save_steps=100, output_dir='/home/nikitautiu/traces/')]) 

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from /home/nikitautiu/model_dir/model.ckpt-100
INFO:tensorflow:Saving checkpoints for 101 into /home/nikitautiu/model_dir/model.ckpt.
INFO:tensorflow:Saving timeline for 101 into '/home/nikitautiu/traces/timeline-101.json'.
INFO:tensorflow:loss = 4160.24, step = 101
INFO:tensorflow:global_step/sec: 0.213692
INFO:tensorflow:Saving timeline for 201 into '/home/nikitautiu/traces/timeline-201.json'.
INFO:tensorflow:loss = 847.706, step = 201 (467.942 sec)
INFO:tensorflow:Saving checkpoints for 231 into /home/nikitautiu/model_dir/model.ckpt.
INFO:tensorflow:global_step/sec: 0.214258
INFO:tensorflow:Saving timeline for 301 into '/home/nikitautiu/traces/timeline-301.json'.
INFO:tensorflow:loss = 644.73, step = 301 (466.739 sec)


KeyboardInterrupt: 

In [None]:
estiamtor.evaluate(input_fn=get_input_fn('../data/final/cleaneval/dom-full-train.csv', num_epochs=1, batch_size=1000, shuffle=False)) 

In [None]:
estiamtor.evaluate(input_fn=get_input_fn('../data/final/cleaneval/dom-full-test.csv', num_epochs=1, batch_size=1000, shuffle=False)) 