# Anatomy of the TensorFlow Experiment Class
[tf.contrib.learn.Experiment](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/Experiment)

In [15]:
from tensorflow.contrib.learn.python.learn.utils import saved_model_export_utils
import tensorflow.contrib.learn as tflearn
import tensorflow.contrib.layers as tflayers
import tensorflow.contrib.metrics as tfmetrics
import tensorflow as tf
import numpy as np


## Provide an input function
A TensorFlow Experiment needs a callback function that takes no arguments and returns the features and labels.

In [16]:
# Names of the CSV columns, in file order.
CSV_COLUMNS = [
    'ontime',
    'dep_delay',
    'taxiout',
    'distance',
    'avg_dep_delay',
    'avg_arr_delay',
    'carrier',
    'dep_lat',
    'dep_lon',
    'arr_lat',
    'arr_lon',
    'origin',
    'dest',
]

# The column that read_dataset() separates from the features and returns as the label.
LABEL_COLUMN = 'ontime'

# One default per entry of CSV_COLUMNS, in the same order; read_dataset()
# passes this list to tf.decode_csv as record_defaults.
DEFAULTS = [
    [0.0],   # ontime
    [0.0],   # dep_delay
    [0.0],   # taxiout
    [0.0],   # distance
    [0.0],   # avg_dep_delay
    [0.0],   # avg_arr_delay
    ['na'],  # carrier
    [0.0],   # dep_lat
    [0.0],   # dep_lon
    [0.0],   # arr_lat
    [0.0],   # arr_lon
    ['na'],  # origin
    ['na'],  # dest
]

def read_dataset(filename, mode=tf.contrib.learn.ModeKeys.EVAL, batch_size=512, num_training_epochs=10):
  """Build a zero-argument input function that reads flight records from CSV.

  Args:
    filename: path to one CSV file, or a file pattern.
    mode: a tf.contrib.learn.ModeKeys value. In TRAIN mode the files are
      read `num_training_epochs` times; in any other mode, once.
    batch_size: upper bound on the number of lines read per call.
    num_training_epochs: number of passes over the data in TRAIN mode.

  Returns:
    A function taking no arguments and returning `(features, label)`, where
    `features` is a dict keyed by the CSV_COLUMNS names other than
    LABEL_COLUMN and `label` is the LABEL_COLUMN tensor.
  """

  def _input_fn():
    # This closure is the callable that is handed to TensorFlow.
    if mode == tf.contrib.learn.ModeKeys.TRAIN:
      num_epochs = num_training_epochs
    else:
      num_epochs = 1

    # `filename` may name a single file or be a pattern matching several.
    matched_files = tf.train.match_filenames_once(filename)
    file_queue = tf.train.string_input_producer(
        matched_files, num_epochs=num_epochs, shuffle=True)

    # Read up to `batch_size` raw text lines; the record keys are not needed.
    line_reader = tf.TextLineReader()
    lines = line_reader.read_up_to(file_queue, num_records=batch_size)[1]

    # Add a trailing dimension, then split each line into its CSV fields.
    lines_column = tf.expand_dims(lines, -1)
    parsed = tf.decode_csv(lines_column, record_defaults=DEFAULTS)

    # Pair every parsed column with its name and pull the label out.
    features = dict(zip(CSV_COLUMNS, parsed))
    label = features.pop(LABEL_COLUMN)
    return features, label

  return _input_fn

# Provide a model
## First: define a helper function
Here we can select the features to build models on.

In [17]:
def get_features_raw():
    """Create the base feature columns for the flights data.

    Returns:
      A `(real, sparse)` pair of dicts keyed by column name: `real` holds
      real-valued columns for the numeric inputs, `sparse` holds the
      categorical columns for carrier, origin and dest.
    """
    numeric_names = [
        'dep_delay', 'taxiout', 'distance', 'avg_dep_delay', 'avg_arr_delay',
        'dep_lat', 'dep_lon', 'arr_lat', 'arr_lon',
    ]
    real = {}
    for colname in numeric_names:
        real[colname] = tflayers.real_valued_column(colname)

    # Carrier is looked up in a fixed vocabulary; origin and dest are hashed.
    carrier_codes = ['AS', 'VX', 'F9', 'UA', 'US', 'WN', 'HA',
                     'EV', 'MQ', 'DL', 'OO', 'B6', 'NK', 'AA']
    sparse = {
        'carrier': tflayers.sparse_column_with_keys('carrier', keys=carrier_codes),
        'origin': tflayers.sparse_column_with_hash_bucket(
            'origin', hash_bucket_size=1000),  # FIXME
        'dest': tflayers.sparse_column_with_hash_bucket(
            'dest', hash_bucket_size=1000),  # FIXME
    }
    return real, sparse

def get_features():
    """Return the `(real, sparse)` feature-column dicts the models are built on.

    At present this simply forwards the result of get_features_raw().
    """
    real, sparse = get_features_raw()
    return real, sparse

## Wide and Deep

In [18]:
def parse_hidden_units(s):
    """Turn a comma-separated string such as '64,32' into a list of ints.

    Raises:
      ValueError: if any comma-separated piece is not a valid integer.
    """
    return list(map(int, s.split(',')))

def wide_and_deep_model(output_dir, nbuckets=5, hidden_units='64,32', learning_rate=0.01):
    """Build a wide-and-deep (DNNLinearCombinedClassifier) estimator.

    Args:
      output_dir: model directory handed to the estimator.
      nbuckets: number of evenly spaced boundaries used to discretize each
        latitude and longitude column.
      hidden_units: comma-separated layer sizes for the DNN part, e.g. '64,32'.
      learning_rate: accepted for interface compatibility but currently NOT
        used; the optimizer arguments that would consume it are disabled
        (see the note next to the estimator construction).

    Returns:
      The configured estimator.
    """
    real, sparse = get_features()

    # the lat/lon columns can be discretized to yield "air traffic corridors"
    lat_boundaries = np.linspace(20.0, 50.0, nbuckets).tolist()     # USA
    lon_boundaries = np.linspace(-120.0, -70.0, nbuckets).tolist()  # USA
    disc = {}
    for key, boundaries in (('dep_lat', lat_boundaries),
                            ('arr_lat', lat_boundaries),
                            ('dep_lon', lon_boundaries),
                            ('arr_lon', lon_boundaries)):
        disc['d_{}'.format(key)] = tflayers.bucketized_column(real[key], boundaries)

    # cross columns that make sense in combination
    dep_loc = tflayers.crossed_column(
        [disc['d_dep_lat'], disc['d_dep_lon']], nbuckets * nbuckets)
    sparse['dep_loc'] = dep_loc
    arr_loc = tflayers.crossed_column(
        [disc['d_arr_lat'], disc['d_arr_lon']], nbuckets * nbuckets)
    sparse['arr_loc'] = arr_loc
    sparse['dep_arr'] = tflayers.crossed_column([dep_loc, arr_loc], nbuckets ** 4)
    sparse['ori_dest'] = tflayers.crossed_column(
        [sparse['origin'], sparse['dest']], hash_bucket_size=1000)

    # The deep part gets the real-valued columns plus an embedding of every
    # sparse column (including the crosses created above).
    for colname, col in sparse.items():
        real[colname] = create_embed(col)

    # NOTE: the optimizers are left at the estimator's defaults. The disabled
    # arguments that would have applied `learning_rate` were:
    #   linear_optimizer=tf.train.FtrlOptimizer(learning_rate=learning_rate)
    #   dnn_optimizer=tf.train.AdagradOptimizer(learning_rate=learning_rate*0.25)
    estimator = tflearn.DNNLinearCombinedClassifier(
        model_dir=output_dir,
        linear_feature_columns=sparse.values(),
        dnn_feature_columns=real.values(),
        dnn_hidden_units=parse_hidden_units(hidden_units))

    # FIXME: hack -- overrides a private attribute of the estimator's head.
    head = estimator.params["head"]
    head._thresholds = [0.7]
    return estimator

## Linear Model

In [19]:
def linear_model(output_dir):
    """Build a LinearClassifier over all real-valued and sparse feature columns.

    Args:
      output_dir: model directory handed to the estimator.

    Returns:
      The configured estimator.
    """
    real, sparse = get_features()

    # Merge both groups of columns into one dict: real first, then sparse.
    columns = dict(real)
    columns.update(sparse)

    estimator = tflearn.LinearClassifier(
        model_dir=output_dir,
        feature_columns=columns.values())

    # FIXME: hack -- overrides a private attribute of the estimator's head.
    head = estimator.params["head"]
    head._thresholds = [0.7]
    return estimator

## Deep Learning Model

In [20]:
def create_embed(sparse_col):
    """Wrap a sparse column in an embedding column.

    The embedding dimension is `1 + round(log2(bucket_size))` when the column
    exposes a `bucket_size` attribute that is not None, and 10 otherwise.

    Args:
      sparse_col: the column to embed.

    Returns:
      The embedding column created by `tflayers.embedding_column`.
    """
    default_dim = 10
    nbins = sparse_col.bucket_size if hasattr(sparse_col, 'bucket_size') else None
    if nbins is None:
        dim = default_dim
    else:
        dim = 1 + int(round(np.log2(nbins)))
    return tflayers.embedding_column(sparse_col, dimension=dim)

def dnn_model(output_dir):
    """Build a DNNClassifier with hidden layers of 64, 16 and 4 units.

    The real-valued columns are used directly; each sparse column is first
    turned into an embedding via create_embed().

    Args:
      output_dir: model directory handed to the estimator.

    Returns:
      The configured estimator.
    """
    real, sparse = get_features()

    # Start from the real-valued columns, then add one embedding per sparse column.
    columns = dict(real)
    for colname, col in sparse.items():
        columns[colname] = create_embed(col)

    estimator = tflearn.DNNClassifier(
        model_dir=output_dir,
        feature_columns=columns.values(),
        hidden_units=[64, 16, 4])

    # FIXME: hack -- overrides a private attribute of the estimator's head.
    head = estimator.params["head"]
    head._thresholds = [0.7]
    return estimator

## Select the actual model

In [21]:
def get_model(output_dir, nbuckets, hidden_units, learning_rate):
    """Select the estimator to train; currently the wide-and-deep model.

    All four arguments are forwarded unchanged to wide_and_deep_model().
    """
    # Alternatives that only need the output directory:
    #   return linear_model(output_dir)
    #   return dnn_model(output_dir)
    return wide_and_deep_model(
        output_dir,
        nbuckets=nbuckets,
        hidden_units=hidden_units,
        learning_rate=learning_rate)

# Provide a function for REST API
Inference will be requested on data arriving as JSON.

In [22]:
def serving_input_fn():
    """Describe the inputs the exported model accepts at serving time.

    One placeholder of shape [None] is created per input feature: float32 for
    the numeric features and string for carrier, origin and dest. The tensors
    handed to the model are those placeholders with a trailing dimension added.

    Returns:
      An `InputFnOps` holding the expanded features, no labels, and the
      placeholders themselves as the default inputs.
    """
    numeric_keys = [
        'dep_delay', 'taxiout', 'distance', 'avg_dep_delay', 'avg_arr_delay',
        'dep_lat', 'dep_lon', 'arr_lat', 'arr_lon',
    ]
    string_keys = ['carrier', 'origin', 'dest']

    feature_placeholders = {}
    for key in numeric_keys:
        feature_placeholders[key] = tf.placeholder(tf.float32, [None])
    for key in string_keys:
        feature_placeholders[key] = tf.placeholder(tf.string, [None])

    features = {}
    for key, tensor in feature_placeholders.items():
        features[key] = tf.expand_dims(tensor, -1)

    return tflearn.utils.input_fn_utils.InputFnOps(
        features, None, feature_placeholders)

# Add custom metrics

In [23]:
def my_rmse(predictions, labels, **args):
  """Streaming RMSE between column 1 of `predictions` and `labels`.

  Column 1 is treated as the on-time probability (assumes `predictions` is a
  2-D tensor of per-class probabilities -- TODO confirm against the head).
  Any extra keyword arguments are forwarded to the underlying metric.
  """
  return tfmetrics.streaming_root_mean_squared_error(
      predictions[:, 1], labels, **args)

# Create the Experiment instance

In [24]:
def make_experiment_fn(traindata, evaldata, num_training_epochs,
                       batch_size, nbuckets, hidden_units, learning_rate, **args):
  """Create the function that builds the Experiment for an output directory.

  Args:
    traindata: training CSV file or file pattern.
    evaldata: evaluation CSV file or file pattern.
    num_training_epochs: number of passes over the training data.
    batch_size: batch size used for the training input only; the evaluation
      input uses read_dataset()'s defaults.
    nbuckets, hidden_units, learning_rate: forwarded to get_model().
    **args: extra keyword arguments forwarded to tflearn.Experiment.

  Returns:
    A function `output_dir -> tflearn.Experiment`.
  """
  def _experiment_fn(output_dir):
    estimator = get_model(output_dir, nbuckets, hidden_units, learning_rate)

    train_input_fn = read_dataset(
        traindata,
        mode=tf.contrib.learn.ModeKeys.TRAIN,
        num_training_epochs=num_training_epochs,
        batch_size=batch_size)
    eval_input_fn = read_dataset(evaldata)

    # Export a SavedModel for serving, keeping only one export.
    export_strategy = saved_model_export_utils.make_export_strategy(
        serving_input_fn,
        default_output_alternative_key=None,
        exports_to_keep=1)

    # The same RMSE metric is reported under two names.
    eval_metrics = {
        'rmse': tflearn.MetricSpec(
            metric_fn=my_rmse, prediction_key='probabilities'),
        'training/hptuning/metric': tflearn.MetricSpec(
            metric_fn=my_rmse, prediction_key='probabilities'),
    }

    return tflearn.Experiment(
        estimator,
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        export_strategies=[export_strategy],
        eval_metrics=eval_metrics,
        min_eval_frequency=100,
        **args)

  return _experiment_fn

## Run a small training session on datalab

In [25]:
import os

# Expose the Cloud Storage bucket name to the %%bash cell that downloads the data.
os.environ.update(BUCKET='telemar-flights')

In [26]:
%%bash
# Stop at the first failing command and treat an unset variable (e.g. BUCKET)
# as an error, so a failed download cannot leave missing or empty data files.
set -euo pipefail

echo "reading from $BUCKET"

# Start from an empty local data directory.
DATA_DIR=data/flights
rm -rf "$DATA_DIR"
mkdir -p "$DATA_DIR"

# For each split, download shard 00001 and keep only its first 10003 lines.
# The pattern is quoted so that gsutil, not the local shell, expands the wildcard.
for STEP in train test; do
  gsutil cp "gs://$BUCKET/flights/chapter8/output/${STEP}Flights-00001*.csv" full.csv
  head -10003 full.csv > "$DATA_DIR/${STEP}.csv"
  rm full.csv
done

ls -l "$DATA_DIR"

reading from telemar-flights
total 1684
-rw-r--r-- 1 root root 749937 Jun  8 15:52 test.csv
-rw-r--r-- 1 root root 969431 Jun  8 15:52 train.csv


Copying gs://telemar-flights/flights/chapter8/output/trainFlights-00001-of-00007.csv...
/ [0 files][    0.0 B/108.4 MiB]                                                -- [0 files][ 93.3 MiB/108.4 MiB]                                                - [1 files][108.4 MiB/108.4 MiB]                                                \
Operation completed over 1 objects/108.4 MiB.                                    
Copying gs://telemar-flights/flights/chapter8/output/testFlights-00001-of-00007.csv...
/ [0 files][    0.0 B/732.4 KiB]                                                / [1 files][732.4 KiB/732.4 KiB]                                                
Operation completed over 1 objects/732.4 KiB.                                    


In [27]:
%%bash
# Delete the output of any previous run; 'trained_model' is the directory the
# training cell below writes to.
rm -rf trained_model

In [28]:
import json
import os

import tensorflow as tf
from tensorflow.contrib.learn.python.learn import learn_runner

# Settings for a small local run; the names match make_experiment_fn's parameters.
arguments = dict(
    traindata='data/flights/train.csv',
    evaldata='data/flights/test.csv',
    num_training_epochs=10,
    batch_size=100,
    nbuckets=5,
    hidden_units='64,64,64,16,4',  # Architecture of DNN part of wide-and-deep network
    learning_rate=0.001)

# when hp-tuning, we need to use different output directories for different runs:
# the 'trial' entry of TF_CONFIG's 'task' section (empty when absent) is
# appended to the base directory.
output_dir = os.path.join(
    'trained_model',
    json.loads(os.environ.get('TF_CONFIG', '{}')).get('task', {}).get('trial', ''))

# run
tf.reset_default_graph()
tf.logging.set_verbosity(tf.logging.INFO)
learn_runner.run(make_experiment_fn(**arguments), output_dir)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 0, '_keep_checkpoint_max': 5, '_task_type': None, '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f44e7b16f90>, '_model_dir': 'trained_model/', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': None, '_tf_random_seed': None, '_save_summary_steps': 100, '_environment': 'local', '_num_worker_replicas': 0, '_task_id': 0, '_log_step_count_steps': 100, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_evaluation_master': '', '_master': ''}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 2 into trained_model/model.ckpt.
INFO:tensorflow:loss = 0.69159335, step = 2
INFO:tensorflow:

({'accuracy': 0.8189666,
  'accuracy/baseline_label_mean': 0.8189666,
  'accuracy/threshold_0.700000_mean': 0.6219751,
  'auc': 0.53855383,
  'auc_precision_recall': 0.8462137,
  'global_step': 2020,
  'labels/actual_label_mean': 0.8189666,
  'labels/prediction_mean': 0.7377406,
  'loss': 0.493171,
  'precision/positive_threshold_0.700000_mean': 0.826584,
  'recall/positive_threshold_0.700000_mean': 0.6813608,
  'rmse': 0.3957664,
  'training/hptuning/metric': 0.3957664},
 ['trained_model/export/Servo/1528473207'])