# Anatomy of the TensorFlow Experiment Class
[tf.contrib.learn.Experiment](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/Experiment)

In [53]:
from tensorflow.contrib.learn.python.learn.utils import saved_model_export_utils
import tensorflow.contrib.learn as tflearn
import tensorflow.contrib.layers as tflayers
import tensorflow.contrib.metrics as tfmetrics
import tensorflow as tf
import numpy as np
import os


## Provide an input function
A TensorFlow Experiment needs a callback function that takes no arguments and returns the features and labels.

In [54]:
# Names of the CSV fields, in the order in which they appear on each line.
CSV_COLUMNS = [
    'ontime', 'dep_delay', 'taxiout', 'distance', 'avg_dep_delay', 'avg_arr_delay',
    'carrier', 'dep_lat', 'dep_lon', 'arr_lat', 'arr_lon', 'origin', 'dest',
]

# Field that is removed from the features and returned as the label.
LABEL_COLUMN = 'ontime'

# One default per CSV field, used as record_defaults when decoding:
# 0.0 for the numeric fields, 'na' for the string fields.
DEFAULTS = [
    [0.0],   # ontime
    [0.0],   # dep_delay
    [0.0],   # taxiout
    [0.0],   # distance
    [0.0],   # avg_dep_delay
    [0.0],   # avg_arr_delay
    ['na'],  # carrier
    [0.0],   # dep_lat
    [0.0],   # dep_lon
    [0.0],   # arr_lat
    [0.0],   # arr_lon
    ['na'],  # origin
    ['na'],  # dest
]

def read_dataset(filename, mode=tf.estimator.ModeKeys.EVAL, batch_size=512, num_training_epochs=10):
  """Build a no-argument input function that reads flight records from CSV.

  Args:
    filename: path to one file, or a file pattern; handed to
      tf.train.match_filenames_once.
    mode: a tf.estimator.ModeKeys value. Only TRAIN reads the data more
      than once.
    batch_size: maximum number of lines read per call (num_records of
      read_up_to).
    num_training_epochs: number of passes over the files in TRAIN mode.

  Returns:
    A callable taking no arguments and returning (features, label), where
    features is a dict keyed by the CSV_COLUMNS names without LABEL_COLUMN.
  """

  def _input_fn():
    # Training cycles through the files several times; any other mode reads them once.
    if mode == tf.estimator.ModeKeys.TRAIN:
      num_epochs = num_training_epochs
    else:
      num_epochs = 1

    # Expand the path / pattern and queue the matching file names (shuffled).
    matched_files = tf.train.match_filenames_once(filename)
    file_queue = tf.train.string_input_producer(
        matched_files, num_epochs=num_epochs, shuffle=True)

    # Read up to batch_size raw text lines from the queue.
    _, lines = tf.TextLineReader().read_up_to(file_queue, num_records=batch_size)

    # Add a trailing dimension, then decode every line into one tensor per column.
    parsed = tf.decode_csv(tf.expand_dims(lines, -1), record_defaults=DEFAULTS)

    features = dict(zip(CSV_COLUMNS, parsed))
    label = features.pop(LABEL_COLUMN)
    return features, label

  return _input_fn

# Provide a model
## First: define a helper function
Here we can select the features to build models on.

In [55]:
def get_features_raw():
    """Create the feature columns the models are built on.

    Returns:
      A (real, sparse) pair of dicts keyed by column name. `real` holds one
      numeric column per continuous input; `sparse` holds the categorical
      columns, which at the moment is only 'carrier'.
    """
    numeric_names = [
        'dep_delay', 'taxiout', 'distance', 'avg_dep_delay', 'avg_arr_delay',
        'dep_lat', 'dep_lon', 'arr_lat', 'arr_lon',
    ]
    real = {}
    for colname in numeric_names:
        real[colname] = tf.feature_column.numeric_column(colname)

    carrier_codes = ['AS', 'VX', 'F9', 'UA', 'US', 'WN', 'HA',
                     'EV', 'MQ', 'DL', 'OO', 'B6', 'NK', 'AA']
    sparse = {
        'carrier': tf.feature_column.categorical_column_with_vocabulary_list(
            'carrier', vocabulary_list=carrier_codes, dtype=tf.string),
    }
    # FIXME: 'origin' and 'dest' (hash-bucket columns) are not considered for the moment.
    return real, sparse

def get_features():
    """Return the (real, sparse) feature-column dicts used to build the model.

    Currently a thin indirection over get_features_raw().
    """
    real, sparse = get_features_raw()
    return real, sparse

## Wide and Deep

In [56]:
def parse_hidden_units(s):
    """Turn a comma-separated string such as '64,32' into a list of ints."""
    return list(map(int, s.split(',')))

def create_embed(sparse_col):
    """Wrap a categorical column in an embedding column.

    The embedding dimension is 1 + round(log2(number of buckets)) when the
    column exposes a positive bucket count, and 10 otherwise.

    Args:
      sparse_col: the categorical feature column to embed.

    Returns:
      The result of tf.feature_column.embedding_column for `sparse_col`.
    """
    dim = 10 # default
    # Only 'bucket_size' used to be probed, so columns exposing their size
    # under another name always fell back to the default dimension.
    # NOTE(review): the extra attribute names are the ones tf.feature_column
    # categorical columns are expected to expose -- confirm against the
    # installed TensorFlow version.
    for attr in ('bucket_size', 'num_buckets', '_num_buckets', 'hash_bucket_size'):
        nbins = getattr(sparse_col, attr, None)
        if nbins is None:
            continue
        # log2 is only meaningful for a positive count; otherwise keep the default.
        if nbins > 0:
            dim = 1 + int(round(np.log2(nbins)))
        break
    return tf.feature_column.embedding_column(sparse_col, dimension=dim)
  
def wide_and_deep_model(output_dir, nbuckets=5, hidden_units='64,32', learning_rate=0.01):
    """Assemble a DNNLinearCombinedClassifier over the flight features.

    Args:
      output_dir: passed to the estimator as model_dir.
      nbuckets: number of boundaries used to discretize latitude / longitude.
      hidden_units: comma-separated layer sizes of the DNN part, e.g. '64,32'.
      learning_rate: accepted but not used while the explicit optimizers
        stay disabled.

    Returns:
      The configured tf.estimator.DNNLinearCombinedClassifier.
    """
    real, sparse = get_features()

    # The lat/lon columns can be discretized to yield "air traffic corridors".
    # Nothing consumes `disc` yet: the crossed columns that would use it
    # (dep_loc, arr_loc, dep_arr, ori_dest) are not considered for the moment.
    lat_edges = np.linspace(20.0, 50.0, nbuckets).tolist()      # USA
    lon_edges = np.linspace(-120.0, -70.0, nbuckets).tolist()   # USA
    disc = {}
    for key, edges in (('dep_lat', lat_edges), ('arr_lat', lat_edges),
                       ('dep_lon', lon_edges), ('arr_lon', lon_edges)):
        disc['d_{}'.format(key)] = tf.feature_column.bucketized_column(real[key], edges)

    # The deep part gets every numeric column plus an embedding of each sparse column.
    for colname, col in sparse.items():
        real[colname] = create_embed(col)

    # updated following: https://github.com/tensorflow/tensorflow/blob/r1.8/tensorflow/contrib/learn/README.md
    # The explicit linear (Ftrl) and DNN (Adagrad) optimizers driven by
    # learning_rate are currently disabled, and so is the hack that set
    # custom thresholds on the head (it did not seem to be a valid member).
    estimator = tf.estimator.DNNLinearCombinedClassifier(
        model_dir=output_dir,
        linear_feature_columns=sparse.values(),
        dnn_feature_columns=real.values(),
        dnn_hidden_units=parse_hidden_units(hidden_units),
        loss_reduction=tf.losses.Reduction.SUM_OVER_BATCH_SIZE)  # see README
    return estimator

## Select the actual model

In [None]:
"""
NOT NEEDED AT THE MOMENT

def get_model(output_dir, nbuckets, hidden_units, learning_rate):
    #return linear_model(output_dir)
    #return dnn_model(output_dir)
    return wide_and_deep_model(output_dir, nbuckets, hidden_units, learning_rate)
"""

# Provide a function for REST API
Inference will be requested on data arriving as JSON.

In [57]:
# OK with: https://github.com/GoogleCloudPlatform/training-data-analyst/blob/master/courses/machine_learning/tensorflow/d_traineval.ipynb
def serving_input_fn():
    """Build the serving input receiver function used when exporting the model.

    This returns a *function* (the result of
    tf.estimator.export.build_raw_serving_input_receiver_fn), so the value to
    hand over as `serving_input_receiver_fn` is `serving_input_fn()`, with
    the call.

    Returns:
      The receiver function built from one placeholder of shape [None] per
      input: float32 for the numeric columns, string for carrier, origin
      and dest.
    """
    numeric_keys = ('dep_delay,taxiout,distance,avg_dep_delay,avg_arr_delay' +
                    ',dep_lat,dep_lon,arr_lat,arr_lon').split(',')
    string_keys = 'carrier,origin,dest'.split(',')

    feature_placeholders = {
        key: tf.placeholder(tf.float32, [None]) for key in numeric_keys
    }
    feature_placeholders.update({
        key: tf.placeholder(tf.string, [None]) for key in string_keys
    })

    # An expand_dims copy of every placeholder used to be built here but was
    # never returned or used; it only added dead ops to the graph, so it is
    # no longer created.
    return tf.estimator.export.build_raw_serving_input_receiver_fn(feature_placeholders)

# Add custom metrics

In [58]:
def my_rmse(predictions, labels, **args):
  """Extra evaluation metric: RMSE between the column-1 probabilities and the labels.

  Args:
    predictions: dict of model predictions; must contain 'probabilities'.
    labels: the labels to compare against.
    **args: accepted and ignored.

  Returns:
    A dict with the single key 'rmse', mapped to the result of
    tf.metrics.root_mean_squared_error.
  """
  probabilities = predictions['probabilities']
  # Column 1 of the probabilities is treated as the probability of "ontime".
  prob_ontime = probabilities[:, 1]
  metric = tf.metrics.root_mean_squared_error(prob_ontime, labels)
  return {'rmse': metric}

## Run a small training session on datalab

In [59]:
import os

# Export the bucket name so that the %%bash cells can read it as $BUCKET.
os.environ.update(BUCKET='telemar-flights')

In [None]:
%%bash
# Copy a small sample of the train / test flight data from GCS into a local directory.
# Stop at the first failing command or unset variable instead of carrying on
# with a missing or partial sample.
set -euo pipefail

echo "reading from $BUCKET"

DATA_DIR=data/flights
rm -rf "$DATA_DIR"
mkdir -p "$DATA_DIR"

for STEP in train test; do
  # The wildcard is quoted so that gsutil, not the shell, expands it.
  gsutil cp "gs://${BUCKET}/flights/chapter8/output/${STEP}Flights-00001*.csv" full.csv
  # Keep only the first 10003 lines of the downloaded shard.
  head -10003 full.csv > "$DATA_DIR/${STEP}.csv"
  rm full.csv
done

ls -l "$DATA_DIR"

In [60]:
%%bash
#rm -rf trained_model
# Remove the output of a previous run, if there is one.
# NOTE(review): a plain `gsutil rm` is expected to exit non-zero when nothing
# matches the URL (e.g. on the very first run), which would fail this cell;
# hence the existence check.
OUTPUT_URL="gs://${BUCKET}/flights/chapter9/output_estimator/"
if gsutil ls "$OUTPUT_URL" > /dev/null 2>&1; then
  gsutil -m rm -r "$OUTPUT_URL"
else
  echo "nothing to remove at $OUTPUT_URL"
fi

Removing gs://telemar-flights/flights/chapter9/output_estimator/#1529166067462328...
Removing gs://telemar-flights/flights/chapter9/output_estimator/Servo/#1529166076080610...
Removing gs://telemar-flights/flights/chapter9/output_estimator/Servo/1529166074/#1529166080229882...
Removing gs://telemar-flights/flights/chapter9/output_estimator/Servo/1529166074/saved_model.pb#1529166080370508...
Removing gs://telemar-flights/flights/chapter9/output_estimator/Servo/1529166074/variables/#1529166080552529...
Removing gs://telemar-flights/flights/chapter9/output_estimator/Servo/1529166074/variables/variables.data-00000-of-00001#1529166080702624...
/ [1/17 objects]   5% Done                                                      Removing gs://telemar-flights/flights/chapter9/output_estimator/Servo/1529166074/variables/variables.index#1529166080839529...
/ [2/17 objects]  11% Done                                                      / [3/17 objects]  17% Done                                      

In [61]:
import json
import os

import tensorflow as tf
from tensorflow.contrib.learn.python.learn import learn_runner

# Settings for a quick training session on the sampled data.
arguments = {'traindata': 'data/flights/train.csv',
             'evaldata': 'data/flights/test.csv',
             'num_training_epochs': 10,
             'batch_size': 100,
             'nbuckets': 5,
             'hidden_units': '64,64,64,16,4', # Architecture of DNN part of wide-and-deep network
             'learning_rate': 0.001 }

# Use the same bucket as the %%bash cells ($BUCKET) instead of a second
# hard-coded copy of its name; fall back to the original bucket when the
# variable is not set.
output_dir = 'gs://{}/flights/chapter9/output_estimator/'.format(
    os.environ.get('BUCKET', 'telemar-flights'))
# when hp-tuning, we need to use different output directories for different runs
output_dir = os.path.join(
    output_dir,
    json.loads(
        os.environ.get('TF_CONFIG', '{}')
    ).get('task', {}).get('trial', '')
)


# run
tf.reset_default_graph()
tf.logging.set_verbosity(tf.logging.INFO)

# create estimator
estimator = wide_and_deep_model(output_dir,
                                arguments['nbuckets'],
                                arguments['hidden_units'],
                                arguments['learning_rate'])

# attach the custom RMSE metric to the evaluation results
estimator = tf.contrib.estimator.add_metrics(estimator,
                                             my_rmse)

train_spec = tf.estimator.TrainSpec(input_fn=read_dataset(arguments['traindata'],
                                                          mode=tf.estimator.ModeKeys.TRAIN,
                                                          batch_size=arguments['batch_size'],
                                                          num_training_epochs=arguments['num_training_epochs']),
                                    max_steps=200) # FOR A QUICK RUN

# No exporter is attached to the EvalSpec; the model is exported explicitly
# after training instead (see the last statement of this cell).
#exporter = tf.estimator.LatestExporter('exporter',
#serving_input_receiver_fn=serving_input_fn())


eval_spec = tf.estimator.EvalSpec(input_fn=read_dataset(arguments['evaldata']),
                                  steps = None,
                                  start_delay_secs = 1, # start evaluating after N seconds
                                  throttle_secs = 10)#,  # evaluate every N seconds
#exporters = exporter)

tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

# serving_input_fn() is called here: it returns the receiver function itself.
# Kept as the last expression so that the export path is shown as the cell output.
estimator.export_savedmodel(os.path.join(output_dir,'Servo'),
                            serving_input_receiver_fn=serving_input_fn())

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f349ab7b250>, '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': 'gs://telemar-flights/flights/chapter9/output_estimator/', '_global_id_in_cluster': 0, '_save_summary_steps': 100}
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f349b7c6a90>, '_evaluation_master': 

'gs://telemar-flights/flights/chapter9/output_estimator/Servo/1529166915'