In [1]:
import sys
sys.version

'3.5.4 |Continuum Analytics, Inc.| (default, Aug 14 2017, 13:41:13) [MSC v.1900 64 bit (AMD64)]'

In [2]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np

import datetime as dt
from sklearn import linear_model
import math

In [3]:
import xgboost as xgb



In [4]:
train_raw = pd.read_csv('train.csv')

In [5]:
test = pd.read_csv('test.csv')

In [6]:
# IMPORTANT
test.pickup_datetime = pd.to_datetime(test.pickup_datetime, errors='coerce')
# IMPORTANT
train_raw.pickup_datetime = pd.to_datetime(train_raw.pickup_datetime, errors='coerce')
train_raw.dropoff_datetime = pd.to_datetime(train_raw.dropoff_datetime, errors='coerce')

In [None]:
train.describe()

### From previous notebook, the features with negative coeffs are:
* pickup_latitude      float64
* dropoff_longitude    float64
* dropoff_latitude     float64
* dayofweek              int64
* time_bin               int64

So let's remove outliers from the above latitude and longitude features.

In [7]:
# 95% Confident Interval for trip duration
np.percentile(train_raw.trip_duration, [2.5,97.5])

array([  135.,  2643.])

In [8]:
# 95% CI for pickup_latitude & long
print(np.percentile(train_raw.pickup_latitude, [2.5,97.5]) )
print(np.percentile(train_raw.pickup_longitude, [2.5,97.5]) )

[ 40.67717009  40.79689026]
[-74.00972748 -73.86276245]


In [9]:
# 95% CI for dropoff latitude & long
print(np.percentile(train_raw.dropoff_latitude, [2.5,97.5]) )
print(np.percentile(train_raw.dropoff_longitude, [2.5,97.5]) )

[ 40.67628126  40.80963516]
[-74.01110077 -73.87052155]


In [10]:
train_raw.loc[(train_raw.pickup_latitude > 40.5) & (train_raw.pickup_latitude < 40.9) ].count()

id                    1458446
vendor_id             1458446
pickup_datetime       1458446
dropoff_datetime      1458446
passenger_count       1458446
pickup_longitude      1458446
pickup_latitude       1458446
dropoff_longitude     1458446
dropoff_latitude      1458446
store_and_fwd_flag    1458446
trip_duration         1458446
dtype: int64

In [11]:
train = train_raw.loc[(train_raw.pickup_latitude > 40.5) & (train_raw.pickup_latitude < 40.9), ]

In [12]:
train = train.loc[(train.dropoff_latitude > 40.5) & (train.dropoff_latitude < 40.9), ]

In [13]:
# Now longitude
train = train.loc[(train.pickup_longitude > -74) & (train.pickup_longitude < -73.9), ]

In [14]:
train = train.loc[(train.trip_duration >= 130) & (train.trip_duration < 2645),]

In [15]:
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435
5,id0801584,2,2016-01-30 22:01:40,2016-01-30 22:09:03,6,-73.982857,40.742195,-73.992081,40.749184,N,443


In [16]:
train.describe()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
count,1151808.0,1151808.0,1151808.0,1151808.0,1151808.0,1151808.0,1151808.0
mean,1.534692,1.660232,-73.97705,40.75672,-73.9739,40.75499,744.8906
std,0.4987952,1.312826,0.01489399,0.02183101,0.02934384,0.02959476,468.7128
min,1.0,0.0,-73.99999,40.53193,-74.47315,40.53136,130.0
25%,1.0,1.0,-73.98861,40.744,-73.99002,40.74009,392.0
50%,2.0,1.0,-73.98013,40.75738,-73.97863,40.75677,629.0
75%,2.0,2.0,-73.96795,40.77001,-73.96252,40.77209,980.0
max,2.0,6.0,-73.90001,40.89445,-72.67115,40.89996,2644.0


Note: still have some apparent outliers for dropoff_longitude, dropoff_latitude and trip_duration

In [None]:
train[["pickup_datetime", "dropoff_datetime", "trip_duration"]].head()

In [None]:
train.dtypes

In [None]:
train.trip_duration.hist()
plt.show()

In [None]:
train.loc[train.trip_duration > 50000,]

In [17]:
#%%
# Remove rows with excessive drip_duration
train = train.loc[train.trip_duration < 50000,]
train.describe()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
count,1151808.0,1151808.0,1151808.0,1151808.0,1151808.0,1151808.0,1151808.0
mean,1.534692,1.660232,-73.97705,40.75672,-73.9739,40.75499,744.8906
std,0.4987952,1.312826,0.01489399,0.02183101,0.02934384,0.02959476,468.7128
min,1.0,0.0,-73.99999,40.53193,-74.47315,40.53136,130.0
25%,1.0,1.0,-73.98861,40.744,-73.99002,40.74009,392.0
50%,2.0,1.0,-73.98013,40.75738,-73.97863,40.75677,629.0
75%,2.0,2.0,-73.96795,40.77001,-73.96252,40.77209,980.0
max,2.0,6.0,-73.90001,40.89445,-72.67115,40.89996,2644.0


In [None]:
sum(train.isnull().any(axis=1))

In [None]:
train.dtypes

In [None]:
test.dtypes

In [18]:
for df in (train, test):
    #df["year"] = df.pickup_datetime.dt.year
    df["month"] = df.pickup_datetime.dt.month
    df["day"] =  df.pickup_datetime.dt.day
    df["hr"]  = df.pickup_datetime.dt.hour
    df["dayofweek"] = df.pickup_datetime.dt.weekday
    # Convert Y/N to 1/0 so that it's numeric.
    df['store_and_fwd_flag'] = 1 * (df.store_and_fwd_flag.values == 'Y')
    # df['minute'] = df.pickup_datetime.dt.minute
    # To prevent negative predictions convert longitudes to absolute numbers... Didn't help
    #df['pickup_longitude'] = -1 * df['pickup_longitude']
    #df['dropoff_longitude'] = -1 * df['dropoff_longitude']
    # Add new feature "time_bin" - which 15-min bin did time occur.
    df["time_bin"] = df["hr"] * 4 + df.pickup_datetime.dt.minute // 15 + 1


In [None]:
train.head()

In [19]:
# IMPORTANT
# Slice out the duration as our response variable
y_train = train.trip_duration

In [None]:
y_train.head()

In [20]:
# IMPORTANT
# Remove id and trip_duration from training data
# Also had to remove pickup_datetime and dropoff_datetime because linear regression didn't like datetime stamps
X_train = train.drop(labels=['id','trip_duration', 'pickup_datetime', 'dropoff_datetime'], axis=1)

In [None]:
X_train.head()

In [None]:
test.head()

In [None]:
test.dtypes

In [21]:
# IMPORTANT
# Slice and remove id's from test data. 
test_ids = test.id

In [None]:
test_ids.shape

In [None]:
test_ids.head()

In [22]:
# IMPORTANT
# Also remove pickup_datetime from test data.
X_test = test.drop(labels=['id', 'pickup_datetime'], axis=1)

Let's try XGBoost

In [23]:
#%%
# Call XGBoost 
dtrain = xgb.DMatrix(X_train, label=y_train)
#%%
dtest = xgb.DMatrix(X_test)

In [24]:
num_round = 10
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'reg:linear' }
#%%
# bst = xgb.train( plst, dtrain, num_round, evallist )
bst = xgb.train(param, dtrain, num_round)

In [25]:
predictions =  bst.predict(dtest)

In [26]:
# Count negative predictions
sum(predictions < 0)

0

In [28]:
y_predict = pd.Series(predictions,name='trip_duration')

In [29]:
y_predict.describe()

count    625134.000000
mean        763.514343
std         320.332611
min          75.562828
25%         566.990967
50%         679.828979
75%         864.914978
max        3253.544434
Name: trip_duration, dtype: float64

In [30]:
# Check if any negative predictions
sum(y_predict < 0)

0

In [31]:
results = pd.concat([test_ids, y_predict], axis=1)

In [32]:
results.head()

Unnamed: 0,id,trip_duration
0,id3004672,591.822083
1,id3505355,1081.255615
2,id1217141,808.470703
3,id2150126,656.690979
4,id1598245,473.344818


In [33]:
# Write results to file
results.to_csv('NSG102.csv', index=None)