In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from datetime import datetime
tf.enable_eager_execution()

  from ._conv import register_converters as _register_converters


In [2]:
#tf.logging.set_verbosity(tf.logging.ERROR)

Set up the variables that are required for the input function

In [3]:
# Training data file.
FILE_NAME = 'train.csv'

# Names given to the CSV fields, in file order. read_dataset zips these with
# the tensors decoded from each line, so the list must stay the same length
# and order as DEFAULTS.
CSV_COLUMNS = [
    'key',
    'fare_amount',
    'pickupdate',
    'pickuplon',
    'pickuplat',
    'dropofflon',
    'dropofflat',
    'passengers',
]

# Feature names for the parts extracted from the pickup timestamp.
DATE_COLUMNS = ['year', 'month', 'day', 'day_of_week', 'hours']

# Feature names for the engineered distance features.
DISTANCE_COLUMN = 'euclid_distance'
LAT_DISTANCE = 'latitude_diff'
LON_DISTANCE = 'longitude_diff'

# Column that is popped from the features and used as the label.
LABEL_COLUMN = 'fare_amount'

# Record defaults passed to tf.decode_csv, one single-element list per
# CSV column.
DEFAULTS = [
    ['nokey'],
    [0.0],
    [b'2001-01-01 00:00:01 UTC'],
    [-74.0],
    [40.0],
    [-74.0],
    [40.7],
    [1.0],
]

Preview data as Pandas data frame

In [4]:
# Read only the first 100 rows: this frame is just for eyeballing the data,
# the training pipeline reads the file separately through tf.data.
pd_dataset = pd.read_csv(FILE_NAME, nrows=100)
pd_dataset.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [5]:
pd_dataset.dtypes

key                   object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

Function that will convert datetime into Year, Month, Day, Weekday and Hours

In [6]:
def pd_date_time(date_time):
    """Split a timestamp into calendar components.

    Parameters
    ----------
    date_time : bytes or str
        Timestamp text such as b'2001-01-01 00:00:01 UTC'. Bytes input is
        decoded as UTF-8 before parsing.

    Returns
    -------
    list of numpy.int32
        [year, month, day, weekday, hour]. The weekday comes from
        Timestamp.weekday(), i.e. Monday is 0.
    """
    if isinstance(date_time, bytes):
        date_time = date_time.decode('utf-8')
    timestamp = pd.to_datetime(date_time)
    parts = (timestamp.year, timestamp.month, timestamp.day,
             timestamp.weekday(), timestamp.hour)
    # feature_engg calls this through tf.py_func with Tout=tf.int32. Plain
    # Python ints would be converted with NumPy's platform default integer
    # (int64 on Linux/macOS), which py_func rejects as a dtype mismatch, so
    # the dtype is pinned explicitly here.
    return [np.int32(part) for part in parts]

In [7]:
print(pd_date_time(b'2001-01-01 00:00:01 UTC'))

[2001, 1, 1, 0, 0]


Create feature engineering function that will be used in the input and serving input functions

In [8]:
def feature_engg(features):
    """Add engineered distance and date features to a feature dict.

    Reads the 'pickuplat', 'dropofflat', 'pickuplon', 'dropofflon' and
    'pickupdate' entries, then stores the latitude difference, longitude
    difference, their Euclidean norm and the five date parts produced by
    pd_date_time under the names held in LAT_DISTANCE, LON_DISTANCE,
    DISTANCE_COLUMN and DATE_COLUMNS.

    The dict passed in is modified in place and is also returned.
    """
    pickup_lat, dropoff_lat = features['pickuplat'], features['dropofflat']
    lat_delta = pickup_lat - dropoff_lat
    pickup_lon, dropoff_lon = features['pickuplon'], features['dropofflon']
    lon_delta = pickup_lon - dropoff_lon

    # Straight-line distance computed directly on the coordinate differences.
    straight_line = tf.sqrt((lat_delta * lat_delta) + (lon_delta * lon_delta))

    features[LAT_DISTANCE] = lat_delta
    features[LON_DISTANCE] = lon_delta
    features[DISTANCE_COLUMN] = straight_line

    # The timestamp is parsed in Python via py_func, which yields five int32
    # tensors in the same order as DATE_COLUMNS.
    date_parts = tf.py_func(pd_date_time, [features['pickupdate']], [tf.int32] * 5)
    for column_name, part in zip(DATE_COLUMNS, date_parts):
        features[column_name] = part

    return features

Create input function to load data into datasets

In [9]:
def read_dataset(filename, mode, batch_size = 512):
    """Create an input function that streams batches from a CSV file.

    Args:
        filename: Path handed to tf.data.TextLineDataset.
        mode: A tf.estimator.ModeKeys value. In TRAIN mode the rows are
            shuffled and repeated indefinitely; in any other mode the file
            is read exactly once, in order.
        batch_size: Number of rows per batch. The shuffle buffer used in
            TRAIN mode holds 10 * batch_size rows.

    Returns:
        A zero-argument function returning (batch_features, batch_labels).
    """
    def _input_fun():
        def parse_csv(file_data):
            # One tensor per CSV field, using DEFAULTS as the record defaults.
            fields = tf.decode_csv(file_data, DEFAULTS)
            row = dict(zip(CSV_COLUMNS, fields))
            target = row.pop(LABEL_COLUMN)
            return feature_engg(row), target

        training = mode == tf.estimator.ModeKeys.TRAIN
        # None makes repeat() loop forever; 1 gives a single pass.
        epochs = None if training else 1

        # skip(1) drops the first line of the file before parsing.
        examples = tf.data.TextLineDataset(filename).skip(1).map(parse_csv)
        if training:
            examples = examples.shuffle(buffer_size = 10 * batch_size)
        batches = examples.repeat(epochs).batch(batch_size)

        iterator = batches.make_one_shot_iterator()
        batch_features, batch_labels = iterator.get_next()
        return batch_features, batch_labels

    return _input_fun
        

These are the raw input columns, and will be provided for prediction also

In [10]:
# Feature columns, in the positional order that build_estimator unpacks.
INPUT_COLUMNS = [
    # Calendar features produced by pd_date_time. An identity column only
    # accepts ids in [0, num_buckets). month is 1..12 and day is 1..31, so
    # each needs one extra bucket (id 0 is simply never produced);
    # day_of_week (0..6) and hours (0..23) are already zero-based.
    tf.feature_column.categorical_column_with_identity('month', num_buckets = 13),
    tf.feature_column.categorical_column_with_identity('day', num_buckets = 32),
    tf.feature_column.categorical_column_with_identity('day_of_week', num_buckets = 7),
    tf.feature_column.categorical_column_with_identity('hours', num_buckets = 24),

    # Numeric columns
    tf.feature_column.numeric_column('pickuplat'),
    tf.feature_column.numeric_column('pickuplon'),
    tf.feature_column.numeric_column('dropofflat'),
    tf.feature_column.numeric_column('dropofflon'),
    tf.feature_column.numeric_column('passengers'),
    
    # Engineered features that are created in the input_fn
    tf.feature_column.numeric_column('latitude_diff'),
    tf.feature_column.numeric_column('longitude_diff'),
    tf.feature_column.numeric_column('euclid_distance')
]

Build the wide-n-deep regressor estimator

In [11]:
def build_estimator(model_dir, nbuckets, hidden_units):
    """Assemble the combined linear + DNN fare regressor.

    Args:
        model_dir: Passed to the estimator as its model_dir.
        nbuckets: Number of boundary points generated per latitude/longitude
            axis; also used to size the hash buckets of the location crosses
            (nbuckets * nbuckets per location, nbuckets ** 4 for the pair).
        hidden_units: Passed to the estimator as dnn_hidden_units.

    Returns:
        A tf.estimator.DNNLinearCombinedRegressor.
    """
    fc = tf.feature_column

    # Positional unpack: the order must match INPUT_COLUMNS. The month and
    # day columns are only needed by the month/day cross, which is disabled.
    (month_col, day_col, weekday_col, hour_col,
     pickup_lat, pickup_lon, dropoff_lat, dropoff_lon,
     passenger_col,
     lat_diff_col, lon_diff_col, distance_col) = INPUT_COLUMNS

    # Pickup and dropoff share the same bucket boundaries on each axis.
    lat_edges = np.linspace(38.0, 42.0, nbuckets).tolist()
    lon_edges = np.linspace(-76.0, -72.0, nbuckets).tolist()
    pickup_lat_b = fc.bucketized_column(pickup_lat, lat_edges)
    dropoff_lat_b = fc.bucketized_column(dropoff_lat, lat_edges)
    pickup_lon_b = fc.bucketized_column(pickup_lon, lon_edges)
    dropoff_lon_b = fc.bucketized_column(dropoff_lon, lon_edges)

    # Feature crosses: pickup cell, dropoff cell, the (pickup, dropoff) pair
    # and day-of-week x hour-of-day.
    grid_cells = nbuckets * nbuckets
    pickup_cell = fc.crossed_column([pickup_lat_b, pickup_lon_b], grid_cells)
    dropoff_cell = fc.crossed_column([dropoff_lat_b, dropoff_lon_b], grid_cells)
    trip_pair = fc.crossed_column([pickup_cell, dropoff_cell], nbuckets ** 4)
    weekday_hour = fc.crossed_column([weekday_col, hour_col], 24 * 7)
    # Disabled: a month x day cross with 12 * 31 hash buckets.

    # Linear ("wide") side: the crosses, the sparse time columns and the
    # passenger count.
    wide_columns = [
        dropoff_cell, pickup_cell, trip_pair,
        weekday_hour,
        weekday_col, hour_col,
        passenger_col,
    ]

    # DNN ("deep") side: 10-dimensional embeddings of the two largest
    # crosses plus the raw and engineered numeric columns.
    deep_columns = [
        fc.embedding_column(trip_pair, 10),
        fc.embedding_column(weekday_hour, 10),
        pickup_lat, pickup_lon, dropoff_lat, dropoff_lon,
        lat_diff_col, lon_diff_col, distance_col,
    ]

    return tf.estimator.DNNLinearCombinedRegressor(
        model_dir = model_dir,
        linear_feature_columns = wide_columns,
        dnn_feature_columns = deep_columns,
        dnn_hidden_units = hidden_units,
        dnn_optimizer='Adagrad',
    )

Variables for training

In [12]:
# Input pipeline
BATCH_SIZE = 512
# NOTE(review): HEADER_COLS is not referenced by any visible cell;
# read_dataset hard-codes skip(1) instead - confirm whether it is still needed.
HEADER_COLS = 1

# Model shape
NBBUCKETS = 10
HIDDEN_UNITS = [128, 32, 4]

# Training run
MAX_STEPS = 10000
MODEL_DIR = 'C:\\Rajesh\\training\\2\\'

In [13]:
# Clear any estimator bound by an earlier run of this cell, then build a
# fresh one from the training settings.
estimator = None
estimator = build_estimator(
    model_dir = MODEL_DIR,
    nbuckets = NBBUCKETS,
    hidden_units = HIDDEN_UNITS,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Rajesh\\training\\2\\', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000000015DB3630>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [14]:
# Shuffled, endlessly repeating batches from the training file; training
# stops once the global step reaches MAX_STEPS.
train_input_fn = read_dataset(
    filename = FILE_NAME,
    mode = tf.estimator.ModeKeys.TRAIN,
    batch_size = BATCH_SIZE,
)
estimator.train(input_fn = train_input_fn, max_steps = MAX_STEPS)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into C:\Rajesh\training\2\model.ckpt.
INFO:tensorflow:loss = 100908.36, step = 1
INFO:tensorflow:global_step/sec: 4.08347
INFO:tensorflow:loss = 61837.168, step = 101 (24.491 sec)
INFO:tensorflow:global_step/sec: 4.08347
INFO:tensorflow:loss = 60332.18, step = 201 (24.488 sec)
INFO:tensorflow:global_step/sec: 3.56519
INFO:tensorflow:loss = 35727.37, step = 301 (28.050 sec)
INFO:tensorflow:global_step/sec: 4.23012
INFO:tensorflow:loss = 52157.523, step = 401 (23.639 sec)
INFO:tensorflow:global_step/sec: 4.18725
INFO:tensorflow:loss = 45459.047, step = 501 (23.885 sec)
INFO:tensorflow:global_step/sec: 4.42008
INFO:tensorflow:loss = 36842.668, step = 601 (22.621 sec)
INFO:tensorflow:global_step/sec: 4.1088
INFO:tens

INFO:tensorflow:loss = 36890.555, step = 7701 (22.646 sec)
INFO:tensorflow:global_step/sec: 4.43833
INFO:tensorflow:loss = 34062.15, step = 7801 (22.530 sec)
INFO:tensorflow:global_step/sec: 4.4403
INFO:tensorflow:loss = 28230.3, step = 7901 (22.521 sec)
INFO:tensorflow:global_step/sec: 4.51141
INFO:tensorflow:loss = 38830.016, step = 8001 (22.166 sec)
INFO:tensorflow:global_step/sec: 4.48129
INFO:tensorflow:loss = 39847.04, step = 8101 (22.320 sec)
INFO:tensorflow:global_step/sec: 4.39445
INFO:tensorflow:loss = 30719.443, step = 8201 (22.751 sec)
INFO:tensorflow:global_step/sec: 4.45395
INFO:tensorflow:loss = 35645.344, step = 8301 (22.452 sec)
INFO:tensorflow:global_step/sec: 4.47527
INFO:tensorflow:loss = 34660.82, step = 8401 (22.345 sec)
INFO:tensorflow:global_step/sec: 4.35673
INFO:tensorflow:loss = 47399.6, step = 8501 (22.953 sec)
INFO:tensorflow:global_step/sec: 4.54401
INFO:tensorflow:loss = 28916.29, step = 8601 (22.007 sec)
INFO:tensorflow:global_step/sec: 4.40723
INFO:tens

<tensorflow.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedRegressor at 0x15db3e48>

In [15]:
TEST_FILE_NAME = 'test.csv'

In [16]:
# Single ordered pass over the test file. Nothing is evaluated in this cell:
# the model only runs when `predictions` is consumed by a later cell.
predict_input_fn = read_dataset(
    filename = TEST_FILE_NAME,
    mode = tf.estimator.ModeKeys.PREDICT,
    batch_size = BATCH_SIZE,
)
predictions = estimator.predict(input_fn = predict_input_fn)

In [17]:
# Building the frame consumes `predictions`, which is what actually restores
# the checkpoint and runs the model (see the log output of this cell).
predict_df = pd.DataFrame(predictions)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Rajesh\training\2\model.ckpt-10000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [18]:
predict_df.head()

Unnamed: 0,predictions
0,[10.746673]
1,[10.745453]
2,[10.335909]
3,[10.252174]
4,[10.255049]


In [19]:
# Each entry of the 'predictions' column is a one-element sequence; keep only
# the value it holds.
first_values = predict_df['predictions'].apply(lambda prediction: prediction[0])
predict_df_cleaned = pd.DataFrame(first_values)

In [20]:
predict_df_cleaned.head()

Unnamed: 0,predictions
0,10.746673
1,10.745453
2,10.335909
3,10.252174
4,10.255049


In [21]:
predict_df_cleaned.mean()

predictions    11.296935
dtype: float64

In [23]:
predict_df_cleaned.std()

predictions    2.457245
dtype: float64