# Tensorflow model
Continuation of the last prototypes. We will try to feed the input data using the TensorFlow Dataset API.

In [7]:
%matplotlib inline
# standard library
import itertools
import sys, os
import re
import glob

from collections import OrderedDict
from urllib.parse import urlparse

# pandas
import pandas as pd
import dask.dataframe as dd
import dask.array as da
import dask


# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# tensorflow
import tensorflow as tf

from tensorflow.contrib.hooks import ProfilerHook
# local imports
sys.path.append(os.path.join(os.getcwd(), "../src"))

from tf_utils import make_csv_pipeline, make_csv_col_tensors, csv_dataset, csv_to_tf_types, tfrecord_dataset

# this styling is purely my preference
# less chartjunk
# sns.set() resets the plotting context to its defaults, so it has to run
# before set_context(); otherwise the font scale and line width set there
# are discarded. The matplotlib rc key is 'lines.linewidth'.
sns.set(style='ticks', palette='Set2')
sns.set_context('notebook', font_scale=1.5, rc={'lines.linewidth': 2.5})

## CSV dataset

In [10]:
def build_fn(csv_pattern, num_epochs=1000, batch_size=100, shuffle=True, num_parallel_calls=4):
    """Build the CSV input pipeline and return one batch of tensors.

    The entire graph has to be built inside this function, so it is meant to
    be wrapped in a zero-argument callable and handed over as an ``input_fn``.

    Args:
        csv_pattern: path or glob pattern of the CSV files to read.
        num_epochs: number of times the batched dataset is repeated.
        batch_size: number of examples per batch.
        shuffle: whether to shuffle the examples (buffer of 10000) before
            batching.
        num_parallel_calls: parallelism passed to every ``Dataset.map`` call.

    Returns:
        A ``(features, labels)`` tuple taken from a one-shot iterator.
        ``features`` is a dict without the ``'url'`` and ``'path'`` columns
        and with an extra ``'weights'`` entry.
    """
    # the entire graph has to be built here
    ddf = dd.read_csv(csv_pattern)
#     max_vals = ddf.max(numeric_only=True).compute()  # for normalization
#     min_vals = ddf.min(numeric_only=True).compute()

    # add the weight column based on proportions:
    # positives get 0.5 / p and negatives 0.5 / (1 - p), where p is the mean
    # of the label column, so both classes carry the same total weight
    label_proportion = ddf['content_label'].mean().compute()
    positive_label_val = tf.constant(1.0)
    positive_proportion = tf.constant(0.5 / label_proportion, shape=())
    negative_proportion = tf.constant(0.5 / (1-label_proportion), shape=())
    def add_weights(tens_dict, label_tens):
        # the weights are added as a conditional based on the corresponding label
        weight_tens = tf.where(tf.equal(label_tens, positive_label_val), 
                               positive_proportion, 
                               negative_proportion)
        tens_dict['weights'] = weight_tens
        return tens_dict, label_tens
        
    def drop_strings(tens_dict, label):
        # drops the string columns and strips ':' from the remaining keys
        return {k.strip(':'): v for k,v in tens_dict.items() if k not in ['url', 'path']}, label
    
    def normalize_features(tens_dict, label):
        # NOTE: unfinished and never applied to the dataset; max_vals and
        # min_vals only exist in the commented-out lines above
        # normalize values
        # dirty workaround
        max_vals.columns = [k.strip(':') for k in max_vals.columns]
        min_vals.columns = [k.strip(':') for k in min_vals.columns]
        # TODO: finish
        
    dataset = csv_dataset(csv_pattern, 'content_label')  # decode the csv
    # honour the caller's parallelism instead of a hard-coded value
    dataset = dataset.map(drop_strings, num_parallel_calls=num_parallel_calls)  # drop redundants
    dataset = dataset.map(add_weights, num_parallel_calls=num_parallel_calls)  # add weight col

    if shuffle:
        dataset = dataset.shuffle(buffer_size=10000)  # shuffle the input
        
    # batch, repeat, iterate
    dataset = dataset.batch(batch_size)
    dataset = dataset.repeat(num_epochs)
    iterator = dataset.make_one_shot_iterator()
    
    # return the next batch from the iterator
    features, labels = iterator.get_next()
    return features, labels

In [11]:
# Infer the real-valued feature columns from the training CSVs
# (one epoch, batches of one, no shuffling), then build the classifier.
def csv_schema_input_fn():
    return build_fn('../data/final/cleaneval/dom-full-train-*.csv',
                    num_epochs=1, batch_size=1, shuffle=False)

tf_feat_cols = tf.contrib.learn.infer_real_valued_columns_from_input_fn(csv_schema_input_fn)
estimator = tf.estimator.DNNClassifier(
    feature_columns=tf_feat_cols,
    hidden_units=(1000, 500, 200),
    weight_column='weights',
    model_dir='/home/nikitautiu/model_dir',
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/home/nikitautiu/model_dir', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f7dd805bdd8>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [12]:
# Train on the CSV shards in batches of 100 (default epochs and shuffling).
def csv_train_input_fn():
    return build_fn('../data/final/cleaneval/dom-full-train-*.csv', batch_size=100)

estimator.train(input_fn=csv_train_input_fn)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /home/nikitautiu/model_dir/model.ckpt.
INFO:tensorflow:loss = 67096.2, step = 1
INFO:tensorflow:global_step/sec: 1.62328
INFO:tensorflow:loss = 250.422, step = 101 (61.576 sec)


KeyboardInterrupt: 

In [6]:
# Evaluate on the training data, read from the single dom-full-train.csv file:
# one unshuffled pass in batches of 1000.
def csv_train_eval_input_fn():
    return build_fn('../data/final/cleaneval/dom-full-train.csv',
                    batch_size=1000, num_epochs=1, shuffle=False)

estimator.evaluate(input_fn=csv_train_eval_input_fn)

INFO:tensorflow:Starting evaluation at 2017-11-03-09:38:44
INFO:tensorflow:Restoring parameters from /home/nikitautiu/model_dir/model.ckpt-1697
INFO:tensorflow:Finished evaluation at 2017-11-03-09:59:46
INFO:tensorflow:Saving dict for global step 1697: accuracy = 0.65604, accuracy_baseline = 0.5, auc = 0.715335, auc_precision_recall = 0.690727, average_loss = 0.8081, global_step = 1697, label/mean = 0.5, loss = 804.761, prediction/mean = 0.44946


{'accuracy': 0.65603995,
 'accuracy_baseline': 0.5,
 'auc': 0.71533465,
 'auc_precision_recall': 0.69072664,
 'average_loss': 0.80810004,
 'global_step': 1697,
 'label/mean': 0.5,
 'loss': 804.76068,
 'prediction/mean': 0.44945988}

In [7]:
# Evaluate on the held-out test CSV: one unshuffled pass in batches of 1000.
def csv_test_eval_input_fn():
    return build_fn('../data/final/cleaneval/dom-full-test.csv',
                    batch_size=1000, num_epochs=1, shuffle=False)

estimator.evaluate(input_fn=csv_test_eval_input_fn)

INFO:tensorflow:Starting evaluation at 2017-11-03-10:06:22
INFO:tensorflow:Restoring parameters from /home/nikitautiu/model_dir/model.ckpt-1697
INFO:tensorflow:Finished evaluation at 2017-11-03-10:10:08
INFO:tensorflow:Saving dict for global step 1697: accuracy = 0.578467, accuracy_baseline = 0.50836, auc = 0.623614, auc_precision_recall = 0.586798, average_loss = 0.920987, global_step = 1697, label/mean = 0.50836, loss = 928.914, prediction/mean = 0.443241


{'accuracy': 0.57846659,
 'accuracy_baseline': 0.50835955,
 'auc': 0.62361437,
 'auc_precision_recall': 0.58679771,
 'average_loss': 0.92098683,
 'global_step': 1697,
 'label/mean': 0.50835955,
 'loss': 928.91357,
 'prediction/mean': 0.443241}

## TFRecords dataset

In [2]:
def build_tfrecords_fn(tfrecords_files, csv_pattern, num_epochs=1000, batch_size=100, shuffle=True, num_parallel_calls=4):
    """Build the TFRecords input pipeline and return one batch of tensors.

    The entire graph has to be built inside this function, so it is meant to
    be wrapped in a zero-argument callable and handed over as an ``input_fn``.

    Args:
        tfrecords_files: TFRecords file(s) passed on to ``tfrecord_dataset``.
        csv_pattern: path or glob pattern of CSV files; used to compute the
            label proportion for the weights and to derive the column types
            through ``csv_to_tf_types``.
        num_epochs: number of times the batched dataset is repeated.
        batch_size: number of examples per batch.
        shuffle: whether to shuffle the examples (buffer of 10000) before
            batching.
        num_parallel_calls: parallelism passed to every ``Dataset.map`` call.

    Returns:
        A ``(features, labels)`` tuple taken from a one-shot iterator.
        ``features`` is a dict without the ``'url'`` and ``'path'`` columns
        and with an extra ``'weights'`` entry.
    """
    # the entire graph has to be built here
    # add the weight column based on proportions:
    # positives get 0.5 / p and negatives 0.5 / (1 - p), where p is the mean
    # of the label column, so both classes carry the same total weight
    ddf = dd.read_csv(csv_pattern)
    label_proportion = ddf['content_label'].mean().compute()

    def add_weights(tens_dict, label_tens):
        # the weights are added as a conditional based on the corresponding label
        weight_tens = tf.where(tf.equal(label_tens, tf.constant(1, dtype=tf.int64)), 
                               tf.constant(0.5 / label_proportion, shape=(), dtype=tf.float64), 
                               tf.constant(0.5 / (1-label_proportion), shape=(), dtype=tf.float64))
        tens_dict['weights'] = weight_tens
        return tens_dict, label_tens
        
    def drop_strings(tens_dict, label):
        # drops the string columns and strips ':' from the remaining keys
        return {k.strip(':'): v for k,v in tens_dict.items() if k not in ['url', 'path']}, label

    # get the tf types
    tf_types = csv_to_tf_types(csv_pattern)

    dataset = tfrecord_dataset(tfrecords_files, 'content_label', tf_types=tf_types)  # decode the tfrecords
    # honour the caller's parallelism instead of a hard-coded value
    dataset = dataset.map(drop_strings, num_parallel_calls=num_parallel_calls)  # drop redundants
    dataset = dataset.map(add_weights, num_parallel_calls=num_parallel_calls)  # add weight col

    if shuffle:
        dataset = dataset.shuffle(buffer_size=10000)  # shuffle the input
        
    # batch, repeat, iterate
    dataset = dataset.batch(batch_size)
    dataset = dataset.repeat(num_epochs)
    iterator = dataset.make_one_shot_iterator()
    
    # return the next batch from the iterator
    features, labels = iterator.get_next()
    return features, labels

In [3]:
# Infer the real-valued feature columns from the training TFRecords
# (one epoch, batches of one, no shuffling), then build the classifier.
def tfrecords_schema_input_fn():
    return build_tfrecords_fn('../data/final/cleaneval/dom-full-train.tfrecords',
                              '../data/final/cleaneval/dom-full-00.csv',
                              num_epochs=1, batch_size=1, shuffle=False)

tf_feat_cols = tf.contrib.learn.infer_real_valued_columns_from_input_fn(tfrecords_schema_input_fn)
estimator = tf.estimator.DNNClassifier(
    feature_columns=tf_feat_cols,
    hidden_units=(1000, 500, 200),
    weight_column='weights',
    model_dir='/home/nikitautiu/model_dir',
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/home/nikitautiu/model_dir', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f7e86ad9c88>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [5]:
# Train on the TFRecords in batches of 2000, saving a profiler timeline
# every 50 steps.
def tfrecords_train_input_fn():
    return build_tfrecords_fn('../data/final/cleaneval/dom-full-train.tfrecords',
                              '../data/final/cleaneval/dom-full-00.csv',
                              batch_size=2000)

profiler_hook = tf.train.ProfilerHook(save_steps=50, output_dir='/home/nikitautiu/traces/')
estimator.train(input_fn=tfrecords_train_input_fn, hooks=[profiler_hook])

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from /home/nikitautiu/model_dir/model.ckpt-1654
INFO:tensorflow:Saving checkpoints for 1655 into /home/nikitautiu/model_dir/model.ckpt.
INFO:tensorflow:Saving timeline for 1655 into '/home/nikitautiu/traces/timeline-1655.json'.
INFO:tensorflow:loss = 2427.34, step = 1655
INFO:tensorflow:Saving checkpoints for 1697 into /home/nikitautiu/model_dir/model.ckpt.
INFO:tensorflow:Saving timeline for 1705 into '/home/nikitautiu/traces/timeline-1705.json'.
INFO:tensorflow:Saving checkpoints for 1737 into /home/nikitautiu/model_dir/model.ckpt.
INFO:tensorflow:global_step/sec: 0.0681418
INFO:tensorflow:Saving timeline for 1755 into '/home/nikitautiu/traces/timeline-1755.json'.
INFO:tensorflow:loss = 1387.27, step = 1755 (1467.687 sec)
INFO:tensorflow:Saving checkpoints for 1782 into /home/nikitautiu/model_dir/model.ckpt.
INFO:tensorflow:Saving timeline for 1805 into '/home/nikitautiu/traces/timeline-1805.json'.
INFO:

KeyboardInterrupt: 