# Kaggle Supervised Regression Machine Learning Problem
## Project: New York City Taxi Trip Duration

In [1]:
#Author: Nicholas Low
#Date 12/3/2017

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from datetime import datetime
from datetime import date
import xgboost as xgb

def rmsle(h, y): 
    """
    Compute the Root Mean Squared Log Error for hypthesis h and targets y

    Args:
        h - numpy array containing predictions with shape (n_samples, n_targets)
        y - numpy array containing targets with shape (n_samples, n_targets)
    """
    return np.sqrt(np.square(np.log(h + 1) - np.log(y + 1)).mean())

test_df = pd.read_csv('Data/test.csv')
train_df = pd.read_csv('Data/train.csv')

print "Training dataset has {} rows of data with {} variables each.".format(*train_df.shape)
print "Testing dataset has {} rows of data with {} variables each.".format(*test_df.shape)

Training dataset has 1458644 rows of data with 11 variables each.
Testing dataset has 625134 rows of data with 9 variables each.


In [2]:
train_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
id                    1458644 non-null object
vendor_id             1458644 non-null int64
pickup_datetime       1458644 non-null object
dropoff_datetime      1458644 non-null object
passenger_count       1458644 non-null int64
pickup_longitude      1458644 non-null float64
pickup_latitude       1458644 non-null float64
dropoff_longitude     1458644 non-null float64
dropoff_latitude      1458644 non-null float64
store_and_fwd_flag    1458644 non-null object
trip_duration         1458644 non-null int64
dtypes: float64(4), int64(3), object(4)
memory usage: 122.4+ MB


## Feature Cleaning/Imputation

In [4]:
#Not necessary in training
train_df = train_df.drop('dropoff_datetime', axis = 1)

for df in [train_df, test_df]:
    df['pickup_datetime'] = pd.to_datetime(train_df['pickup_datetime'])
    df['pickup_year']  = df['pickup_datetime'].dt.year
    df['pickup_month'] = df['pickup_datetime'].dt.month
    df['pickup_day']   = df['pickup_datetime'].dt.day
    df['pickup_hr']    = df['pickup_datetime'].dt.hour
    df['pickup_minute']= df['pickup_datetime'].dt.minute
    df['store_and_fwd_flag'] = 1 * (df.store_and_fwd_flag.values == 'Y')
    df['pickup_datetime'] = df['pickup_datetime'].astype('int64')
    
print test_df.head()

          id  vendor_id      pickup_datetime  passenger_count  \
0  id3004672          1  1457976295000000000                1   
1  id3505355          1  1465692215000000000                1   
2  id1217141          1  1453203324000000000                1   
3  id2150126          2  1459971151000000000                1   
4  id1598245          1  1458999055000000000                1   

   pickup_longitude  pickup_latitude  dropoff_longitude  dropoff_latitude  \
0        -73.988129        40.732029         -73.990173         40.756680   
1        -73.964203        40.679993         -73.959808         40.655403   
2        -73.997437        40.737583         -73.986160         40.729523   
3        -73.956070        40.771900         -73.986427         40.730469   
4        -73.970215        40.761475         -73.961510         40.755890   

   store_and_fwd_flag  pickup_year  pickup_month  pickup_day  pickup_hr  \
0                   0         2016             3          14         17

In [5]:
train_df = train_df.assign(log_trip_duration = np.log(train_df.trip_duration+1))

In [11]:
y = train_df['trip_duration']
x = train_df.drop(['id','trip_duration'], axis=1)
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size = .20, random_state = 0)
x_cv,x_v,y_cv,y_v = model_selection.train_test_split(x_test, y_test, test_size=0.5, random_state=1)

In [None]:
n_trees = 20
max_feat = 5
kfold = model_selection.KFold(n_splits = 10, random_state = 2)
r_model = RandomForestRegressor(n_estimators = n_trees, max_features = max_feat)
r_model.fit(x_train, y_train)
results = model_selection.cross_val_score(r_model, x_test, y_test)

In [None]:
print results
predictions = r_model.predict(x_test)
print predictions.round()
print rmsle(predictions.round(), y_test)

In [12]:
data_tr  = xgb.DMatrix(x_train, label=y_train)
data_cv  = xgb.DMatrix(x_cv   , label=y_cv)
evallist = [(data_tr, 'train'), (data_cv, 'valid')]
parms = {'max_depth':8, #maximum depth of a tree
         'objective':'reg:linear',
         'eta'      :0.3,
         'subsample':0.8,#SGD will use this percentage of data
         'lambda '  :4, #L2 regularization term,>1 more conservative 
         'colsample_bytree ':0.9,
         'colsample_bylevel':1,
         'min_child_weight': 10,
         'nthread'  :3}  #number of cpu core to use

model = xgb.train(parms, data_tr, num_boost_round=1000, evals = evallist,
                  early_stopping_rounds=30, maximize=False, 
                  verbose_eval=100)

[0]	train-rmse:3960.49	valid-rmse:9323.56
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 30 rounds.
[100]	train-rmse:2358.17	valid-rmse:7440.64
Stopping. Best iteration:
[109]	train-rmse:2360.23	valid-rmse:7336.03

