In [19]:
import numpy as np
import pandas as pd
import tensorflow as tf

## In this notebook,I will learn the high level of tensorflow API invoke :
    1.using predefine Model
    2.create dataset out of memory
    3.distribution train
    4.monitor training processing
    5.deploy your model

## 1. PreDefine Model
<a href='https://www.tensorflow.org/api_docs/python/tf/estimator/LinearRegressor#train'>LinearRegressor</a>

In [3]:
class MyModel():
    def __init__(self,filename='outputs/tf_week2/model1'):
        self.columns=['pickuplon','pickuplat','dropofflon','dropofflat','passengers']
        features=[tf.feature_column.numeric_column(f) for f in self.columns]
        self.model=tf.estimator.LinearRegressor(feature_columns=features,model_dir=filename)
        tf.logging.set_verbosity(tf.logging.INFO)
    def make_train_input_fn(self,epochs):
        df_train=pd.read_csv('../AI_database/trips/taxi-train.csv')
        return tf.estimator.inputs.pandas_input_fn(x=df_train,
                                        y=df_train['fare_amount'],
                                        batch_size=128,
                                        num_epochs=epochs,
                                        shuffle=True,
                                        queue_capacity=1000)
    def make_valid_input_fn(self):
        df_dev=pd.read_csv('../AI_database/trips/taxi-valid.csv')
        return tf.estimator.inputs.pandas_input_fn(x=df_dev,
                                              y=df_dev["fare_amount"],
                                              batch_size=128,
                                              num_epochs=1,
                                              shuffle=True,
                                              queue_capacity=1000)
    def make_predict_input_fn(self):
        df_test=pd.read_csv('../AI_database/trips/taxi-test.csv')
        return tf.estimator.inputs.pandas_input_fn(x=df_test,y=None,
                                              batch_size=128,num_epochs=1,
                                              shuffle=False)
    def fit(self,epochs=10):
        self.model.train(self.make_train_input_fn(epochs))
    def evaluate(self):
        result=self.model.evaluate(self.make_valid_input_fn())
        print('Eval rms is {}'.format(np.sqrt(result['average_loss'])))
    def predict(self):
        return self.model.predict(self.make_predict_input_fn())

In [4]:
model=MyModel('outputs/tf_week2/model1')
print('Train  Step:')
model.fit(1)
print('Evaluatuin Step:')
model.evaluate()

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from outputs/tf_week2/model1/model.ckpt-2973
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 2974 into outputs/tf_week2/model1/model.ckpt.
INFO:tensorflow:loss = 2080.8, step = 2974
INFO:tensorflow:global_step/sec: 382.047
INFO:tensorflow:loss = 240.499, step = 3074 (0.263 sec)
INFO:tensorflow:global_step/sec: 524.864
INFO:tensorflow:loss = 1500.62, step = 3174 (0.192 sec)
INFO:tensorflow:global_step/sec: 633.036
INFO:tensorflow:loss = 84.3914, step = 3274 (0.157 sec)
INFO:tensorflow:global_step/sec: 481.793
INFO:tensorflow:loss = 9.38618, step = 3374 (0.209 sec)
INFO:tensorflow:global_step/sec: 648
INFO:tensorflow:loss = 54372.4, step = 3474 (0.152 sec)
INFO:tensorflow:Saving checkpoints for 3546 into outputs/tf_week2/model1

Train  Step:
Evaluatuin Step:

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-09-11-03:26:59
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from outputs/tf_week2/model1/model.ckpt-3546
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-09-11-03:26:59
INFO:tensorflow:Saving dict for global step 3546: average_loss = 111.996, global_step = 3546, loss = 14317.8



Eval rms is 10.582818984985352


In [5]:
#take a look at train,valid,test input function
print(model.make_train_input_fn(1)()) #give 2 node, features(map) and label(a tensor)
print('--------------------------------------')
print(model.make_valid_input_fn()()) #give 2 node, features(map) and label(a tensor)
print('--------------------------------------')
print(model.make_predict_input_fn()())

({'fare_amount': <tf.Tensor 'random_shuffle_queue_DequeueUpTo:1' shape=(?,) dtype=float64>, 'pickuplon': <tf.Tensor 'random_shuffle_queue_DequeueUpTo:2' shape=(?,) dtype=float64>, 'passengers': <tf.Tensor 'random_shuffle_queue_DequeueUpTo:6' shape=(?,) dtype=float64>, 'dropofflat': <tf.Tensor 'random_shuffle_queue_DequeueUpTo:5' shape=(?,) dtype=float64>, 'key': <tf.Tensor 'random_shuffle_queue_DequeueUpTo:7' shape=(?,) dtype=int64>, 'pickuplat': <tf.Tensor 'random_shuffle_queue_DequeueUpTo:3' shape=(?,) dtype=float64>, 'dropofflon': <tf.Tensor 'random_shuffle_queue_DequeueUpTo:4' shape=(?,) dtype=float64>}, <tf.Tensor 'random_shuffle_queue_DequeueUpTo:8' shape=(?,) dtype=float64>)
--------------------------------------
({'fare_amount': <tf.Tensor 'random_shuffle_queue_DequeueUpTo_1:1' shape=(?,) dtype=float64>, 'pickuplon': <tf.Tensor 'random_shuffle_queue_DequeueUpTo_1:2' shape=(?,) dtype=float64>, 'passengers': <tf.Tensor 'random_shuffle_queue_DequeueUpTo_1:6' shape=(?,) dtype=float

## 2. training on large dataset,using tf.data.* instead of  tf.estimator.input.pandas_input_fn 

In [6]:
#input a raw line,return features and labels
def decoder(row):
    cols=tf.decode_csv(row,[[0.0],[0.0],[0.0],[0.0],[0.0],[0.0],[0.0]])
    features={'pickuplon':cols[1],'pockuplat':cols[2]}
    label=cols[0]
    return features,label

### first let's see how dataset tensor work

In [7]:
! head ../AI_database/trips/taxi-train.csv -n11

fare_amount,pickuplon,pickuplat,dropofflon,dropofflat,passengers,key
2.5,-74.005559,40.740673,-73.956682,40.663655,2.0,0
2.5,-73.970619,40.764209,-73.965857,40.789753,2.0,1
2.5,-74.005465,40.740818,-73.973177,40.752603,2.0,2
2.5,-73.966332,40.758127,-73.966482,40.75808,2.0,3
2.5,-73.977088,40.774907,-73.977378,40.77444,2.0,4
2.5,-73.966111,40.762474,-73.945065,40.782581,2.0,5
2.5,-73.993483,40.72108,-73.993397,40.720987,2.0,6
2.5,-73.864043,40.765769,-73.864043,40.765769,2.0,7
2.5,-73.987632,40.702428,-73.987548,40.702332,2.0,8
2.5,-73.790031,40.645675,-73.789944,40.644809,2.0,9


In [8]:
#the workflow look like follow:
'''
    filename->TextDataset->shuffle->batch->repeat->map->iterator->next
    all about are node or tensor in TF
    
    the output should have the format for model.train()
    {featurename:tensor},label tensor
'''
dataset=tf.data.TextLineDataset('../AI_database/trips/taxi-train.csv')
dataset=dataset.skip(1)

dataset=dataset.map(decoder)
dataset=dataset.batch(3)
dataset.shuffle(1000)
mynext=dataset.make_one_shot_iterator().get_next()
with tf.Session() as sess:
    for i in range(5):
        print(sess.run(mynext))

({'pockuplat': array([ 40.74067307,  40.76420975,  40.74081802], dtype=float32), 'pickuplon': array([-74.00556183, -73.9706192 , -74.00546265], dtype=float32)}, array([ 2.5,  2.5,  2.5], dtype=float32))
({'pockuplat': array([ 40.75812531,  40.77490616,  40.76247406], dtype=float32), 'pickuplon': array([-73.96633148, -73.97708893, -73.96611023], dtype=float32)}, array([ 2.5,  2.5,  2.5], dtype=float32))
({'pockuplat': array([ 40.72108078,  40.76576996,  40.70242691], dtype=float32), 'pickuplon': array([-73.9934845 , -73.86404419, -73.98763275], dtype=float32)}, array([ 2.5,  2.5,  2.5], dtype=float32))
({'pockuplat': array([ 40.64567566,  40.72658539,  40.76021576], dtype=float32), 'pickuplon': array([-73.79003143, -73.98907471, -73.97946167], dtype=float32)}, array([ 2.5,  2.5,  3. ], dtype=float32))
({'pockuplat': array([ 40.77260971,  40.7104187 ,  40.76414871], dtype=float32), 'pickuplon': array([-73.96065521, -73.95875549, -73.97486877], dtype=float32)}, array([ 3.,  3.,  3.], dtyp

In [9]:
class MyModel2(MyModel):
    def decoder_line(self,row):
        DEFAULTS = [[0.0], [-74.0], [40.0], [-74.0], [40.7], [1.0], ['nokey']]
        cols=tf.decode_csv(row,DEFAULTS)
        features={self.columns[i]:cols[i+1] for i in range(0,len(self.columns))}
        labels=cols[0]
        return features,labels
    def _getDataset(self,filename,epochs):
        def ret_func():
            dataset=tf.data.Dataset.list_files(filename)
            dataset=dataset.flat_map(tf.data.TextLineDataset)
            dataset=dataset.skip(1)
            dataset=dataset.shuffle(1000)
            dataset=dataset.batch(128)
            dataset=dataset.repeat(epochs)
            dataset=dataset.map(lambda r:self.decoder_line(r))   
            features,labels=dataset.make_one_shot_iterator().get_next()
            return features,labels
        return ret_func
    
    def make_train_input_fn(self,epochs):
        print('======================Model 2 using large dataset======================')
        return self._getDataset('../AI_database/trips/taxi-train.csv',epochs)
    def make_valid_input_fn(self):
        print('======================Model 2 using large dataset======================')
        return self._getDataset('../AI_database/trips/taxi-valid.csv',1)
    def make_predict_input_fn(self):
        print('======================Model 2 using large dataset======================')
        return self._getDataset('../AI_database/trips/taxi-test.csv',1)

In [10]:
model=MyModel2('outputs/tf_week2/model2')
print('Train step:')
model.fit(1)
print('Evaluation step:')
model.evaluate()

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_keep_checkpoint_max': 5, '_evaluation_master': '', '_save_checkpoints_secs': 600, '_task_id': 0, '_tf_random_seed': None, '_session_config': None, '_master': '', '_model_dir': 'outputs/tf_week2/model2', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fb93abd6b70>, '_service': None, '_is_chief': True, '_task_type': 'worker', '_num_ps_replicas': 0, '_log_step_count_steps': 100, '_save_summary_steps': 100, '_num_worker_replicas': 1, '_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_steps': None, '_global_id_in_cluster': 0}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from outputs/tf_week2/model2/model.ckpt-21777
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 2

Train step:
Evaluation step:

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-09-11-03:27:02
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from outputs/tf_week2/model2/model.ckpt-22350
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-09-11-03:27:02
INFO:tensorflow:Saving dict for global step 22350: average_loss = 95.8433, global_step = 22350, loss = 12252.8



Eval rms is 9.789959907531738


In [11]:
gen=model.predict()
for i in range(5):
    print(next(gen))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from outputs/tf_week2/model2/model.ckpt-22350
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


{'predictions': array([ 12.69274139], dtype=float32)}
{'predictions': array([ 12.69482231], dtype=float32)}
{'predictions': array([ 12.69048882], dtype=float32)}
{'predictions': array([ 12.69081974], dtype=float32)}
{'predictions': array([ 12.69208717], dtype=float32)}


In [12]:
for v_name in model.model.get_variable_names():
    print('{} : {}'.format(v_name,model.model.get_variable_value(v_name)))

global_step : 22350
linear/linear_model/bias_weights : [ 0.07966512]
linear/linear_model/bias_weights/part_0/Ftrl : [  7.45741568e+09]
linear/linear_model/bias_weights/part_0/Ftrl_1 : [-34397.93359375]
linear/linear_model/dropofflat/weights : [[ 0.0561062]]
linear/linear_model/dropofflat/weights/part_0/Ftrl : [[  1.23824589e+13]]
linear/linear_model/dropofflat/weights/part_0/Ftrl_1 : [[-987152.5625]]
linear/linear_model/dropofflon/weights : [[-0.04978582]]
linear/linear_model/dropofflon/weights/part_0/Ftrl : [[  4.08025710e+13]]
linear/linear_model/dropofflon/weights/part_0/Ftrl_1 : [[ 1590081.5]]
linear/linear_model/passengers/weights : [[ 0.91197729]]
linear/linear_model/passengers/weights/part_0/Ftrl : [[  2.66309734e+10]]
linear/linear_model/passengers/weights/part_0/Ftrl_1 : [[-744127.8125]]
linear/linear_model/pickuplat/weights : [[ 0.05218478]]
linear/linear_model/pickuplat/weights/part_0/Ftrl : [[  1.23815173e+13]]
linear/linear_model/pickuplat/weights/part_0/Ftrl_1 : [[-918122

In [13]:
pd.read_csv('../AI_database/trips/taxi-valid.csv').head(1)

Unnamed: 0,fare_amount,pickuplon,pickuplat,dropofflon,dropofflat,passengers,key
0,2.5,-73.988954,40.758612,-73.952118,40.776227,2.0,0


In [14]:
-73.988954*-0.03677155+40.758612*0.05220714+-73.952118*-0.04987059+40.776227*0.05605096*2*0.92487079+0.07902358

12.843309126564572

## 3.big jobs distribution training
        1.define a run_config 
        2.define your model LinearRegression
        3.Define trainSpec  <a href='http://ss'>TrainSpec</a>
        4.Define evalSpec <a href=''>EvalSpec</a>
        5.call tf.estimator.train_and_eval(model,train_spec,eval_spec)
<a href='https://www.tensorflow.org/api_docs/python/tf/estimator/RunConfig'>RunConfig</a> <a href='https://www.tensorflow.org/api_docs/python/tf/estimator/TrainSpec'>TrainSpec</a> <a href='https://www.tensorflow.org/api_docs/python/tf/estimator/EvalSpec#__new__'>EvalSpec</a>
 <a href='https://www.tensorflow.org/api_docs/python/tf/estimator/export/ServingInputReceiver#__new__'>ServingInputReceiver</a>
  <a href='https://www.tensorflow.org/api_docs/python/tf/estimator/LatestExporter'>LatestExporter</a>

In [15]:
class MyModel3(MyModel2):
    def __init__(self,filename='outputs/tf_week2/model1'):
        self.columns=['pickuplon','pickuplat','dropofflon','dropofflat','passengers']
        features=[tf.feature_column.numeric_column(f) for f in self.columns]
        runconfig=tf.estimator.RunConfig(filename,save_summary_steps=100,save_checkpoints_steps=1000)
        self.model=tf.estimator.LinearRegressor(feature_columns=features,config=runconfig)
    
    def fit(self,epoch):
        # Defines the expected shape of the JSON feed that the model,will receive once 
        #deployed behind a REST API in production.
        def serving_input_fn():
            feature_placeholders = {
                'pickuplon' : tf.placeholder(tf.float32, [None]),
                'pickuplat' : tf.placeholder(tf.float32, [None]),
                'dropofflat' : tf.placeholder(tf.float32, [None]),
                'dropofflon' : tf.placeholder(tf.float32, [None]),
                'passengers' : tf.placeholder(tf.float32, [None])
            }
            #You can transforma data here from the input format to the format expected by your model.
            features = feature_placeholders # no transformation needed
            return tf.estimator.export.ServingInputReceiver(features, feature_placeholders)

        train_spec=tf.estimator.TrainSpec(input_fn=self.make_train_input_fn(epoch),max_steps=5000)
        
        eval_spec=tf.estimator.EvalSpec(input_fn=self.make_valid_input_fn(),
                                        steps=572,
                                        exporters=tf.estimator.LatestExporter('exporter',serving_input_fn),
                                        throttle_secs=600)
        tf.estimator.train_and_evaluate(self.model,train_spec,eval_spec)

In [17]:
model=MyModel3('outputs/tf_week2/model3')

INFO:tensorflow:Using config: {'_keep_checkpoint_max': 5, '_num_ps_replicas': 0, '_evaluation_master': '', '_save_checkpoints_secs': None, '_log_step_count_steps': 100, '_save_summary_steps': 100, '_tf_random_seed': None, '_keep_checkpoint_every_n_hours': 10000, '_is_chief': True, '_session_config': None, '_task_id': 0, '_master': '', '_num_worker_replicas': 1, '_model_dir': 'outputs/tf_week2/model3', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fb93aa3e780>, '_global_id_in_cluster': 0, '_service': None, '_save_checkpoints_steps': 1000, '_task_type': 'worker'}


In [18]:
model.fit(1)

INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after 600 secs (eval_spec.throttle_secs) or training is finished.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into outputs/tf_week2/model3/model.ckpt.
INFO:tensorflow:loss = 2964.75, step = 1
INFO:tensorflow:global_step/sec: 372.574
INFO:tensorflow:loss = 8987.67, step = 101 (0.269 sec)
INFO:tensorflow:global_step/sec: 382.489
INFO:tensorflow:loss = 22.9861, step = 201 (0.262 sec)
INFO:tensorflow:global_step/sec: 468.161
INFO:tensorflow:loss = 45.6097, step = 301 (0.213 sec)
INFO:tensorflow:global_step/sec: 455.339
INFO:tensorflow:loss = 14534.3, step = 401 (0.220 sec)
INFO:tensorflow:global_step/sec: 434.32
INF

