In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from haversine import haversine

import pickle
from sklearn.model_selection import train_test_split

In [1]:
def featureEngineer(data):
    '''
    Function to pre-process and engineer features of the taxi trip data.

    The frame is modified IN PLACE and nothing is returned. Call it exactly
    once on a freshly loaded frame: a second call would divide
    "trip_duration" by 60 again and turn every already-encoded flag into 1.

    >> Input: data -- DataFrame with the columns "store_and_fwd_flag",
              "pickup_datetime", "pickup_latitude", "pickup_longitude",
              "dropoff_latitude" and "dropoff_longitude".
              "dropoff_datetime" and "trip_duration" are processed when
              present and skipped otherwise, so frames without the label
              (e.g. data to predict on) can be engineered the same way.
    >> Output: None (data is mutated)
    '''
    timestamp_format = '%Y-%m-%d %H:%M:%S'

    # Convert character variables to numeric: 'N' -> 0, anything else -> 1
    data["store_and_fwd_flag"] = (data["store_and_fwd_flag"] != 'N').astype("int64")

    # Convert datetime strings into datetime
    if "dropoff_datetime" in data.columns:
        data["dropoff_datetime"] = pd.to_datetime(data["dropoff_datetime"],
                                                  format=timestamp_format)
    data["pickup_datetime"] = pd.to_datetime(data["pickup_datetime"],
                                             format=timestamp_format)

    # Now construct other variables, like month, date, etc.
    # (weekday is 0 for Monday through 6 for Sunday)
    pickup = data["pickup_datetime"].dt
    data["pickup_month"] = pickup.month
    data["pickup_day"] = pickup.day
    data["pickup_weekday"] = pickup.weekday
    data["pickup_hour"] = pickup.hour
    data["pickup_minute"] = pickup.minute

    # Get latitude and longitude differences (dropoff minus pickup)
    data["latitude_difference"] = data["dropoff_latitude"] - data["pickup_latitude"]
    data["longitude_difference"] = data["dropoff_longitude"] - data["pickup_longitude"]

    # Convert duration to whole minutes for easier interpretation.
    # Series.round() rounds halves to the nearest even integer, exactly like
    # the built-in round() this replaces (90 s -> 2 min, 150 s -> 2 min).
    if "trip_duration" in data.columns:
        data["trip_duration"] = (data["trip_duration"] / 60).round().astype("int64")


In [6]:
def rmsle(y_true, y_pred):
    '''
    Function to define evaluation metric: root mean squared logarithmic
    error, sqrt(mean((log(1 + y_pred) - log(1 + y_true)) ** 2)).

    Inputs are converted to float arrays, so plain lists, tuples, NumPy
    arrays and pandas Series are all accepted and are compared element by
    element in positional order.

    >> Input: y_true -- ground truth labels, y_pred -- predicted labels
              (same length, every value greater than -1)
    >> Output: evaluation metric (a single float)
    '''
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    assert len(y_true) == len(y_pred)

    # log1p(x) evaluates log(1 + x) without losing precision for small x
    log_error = np.log1p(y_pred) - np.log1p(y_true)
    return np.sqrt(np.mean(np.square(log_error)))

In [16]:
def XGBmodel(X, y, nrounds=1000):
    '''
    Function to train a XGBoost machine learning model on the data.

    The data is split 80/20 into train+validation and test, and the first
    part is split 80/20 again into training and validation sets. The model
    is fitted on log(1 + y), so the RMSE reported on the watchlist during
    training is the RMSLE of the untransformed label. Test predictions are
    transformed back before the mean absolute error is computed, so the
    printed error is in the units of y.

    >> Input: X -- features, y -- label (values greater than -1),
              nrounds -- number of boosting rounds (default 1000)
    >> Output: the trained xgboost Booster; the test-set mean absolute error
               and the normalized feature importances are printed
    '''
    # Split the train data into training, test, and validation sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=2019)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=2019)

    # XGBoost parameters
    # The former "'feval': 'rmsle'" entry was dropped: feval is a callable
    # argument of xgb.train, not a booster parameter, so the string was never
    # used as a metric. RMSE on the log1p-transformed label already is RMSLE.
    # NOTE(review): 'reg:linear' and 'silent' are deprecated spellings of
    # 'reg:squarederror' and 'verbosity' in recent XGBoost releases; they are
    # kept so the installed version keeps working -- confirm the pinned
    # version before renaming them.
    params = {
        'n_jobs':             3,
        'booster':            'gbtree',
        'objective':          'reg:linear',
        'learning_rate':      0.05,
        'max_depth':          14,
        'subsample':          0.9,
        'colsample_bytree':   0.7,
        'colsample_bylevel':  0.7,
        'silent':             1,
        'eval_metric':        'rmse'
    }

    # Define train and validation sets (labels in log1p space)
    dtrain = xgb.DMatrix(X_train, label=np.log1p(y_train))
    dval = xgb.DMatrix(X_val, label=np.log1p(y_val))

    # this is for tracking the error
    watchlist = [(dval, 'eval'), (dtrain, 'train')]

    # Train model
    gbm = xgb.train(params, dtrain, num_boost_round=nrounds,
                    evals=watchlist, verbose_eval=True)

    # Test predictions, mapped back from log1p space to the units of y
    y_pred = np.expm1(gbm.predict(xgb.DMatrix(X_test)))

    # Use mean absolute error to get a basic estimate of the error
    mae = np.mean(np.abs(y_pred - np.asarray(y_test)))

    # Take a look at feature importance
    feature_scores = gbm.get_fscore()

    # Feature scaling: express every score as a share of the total.
    # Skipped when the total is zero so the division cannot fail.
    total_score = sum(feature_scores.values())
    if total_score:
        feature_scores = {name: score / total_score
                          for name, score in feature_scores.items()}

    print('Mean Absolute Error:', mae)
    print('Feature Importance:', feature_scores)

    return gbm

In [4]:
# Load the cleaned NYC taxi training data from the working directory
taxiDB = pd.read_csv('NYC-train-cleaned.csv')

# Engineer features. featureEngineer modifies taxiDB in place and returns
# nothing, and it rescales trip_duration each time it runs, so re-run this
# whole cell (reload + engineer) instead of calling it twice on one frame.
featureEngineer(taxiDB)

In [5]:
# Label: the trip duration as left by featureEngineer
y = taxiDB["trip_duration"]

# Features: every remaining column except the label itself, the raw
# timestamps (pickup is already expanded into month/day/weekday/hour/minute,
# and dropoff minus pickup would give away the duration) and the id columns
X = taxiDB.drop(
    columns=["trip_duration", "id", "vendor_id", "pickup_datetime", "dropoff_datetime"]
)

In [7]:
# Inspect the final feature set that is passed to the model.
# NOTE(review): 'distance' is not created by featureEngineer, so it presumably
# already exists in NYC-train-cleaned.csv -- confirm against the cleaning step.
X.columns

Index(['passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'pickup_month', 'pickup_day', 'pickup_weekday', 'pickup_hour',
       'pickup_minute', 'latitude_difference', 'longitude_difference',
       'distance'],
      dtype='object')

In [None]:
# Train XGB Model to our data
model = XGBmodel(X, y)

# Persist the trained model. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way through.
filename = "xgb_model.sav"
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

#### Mean Absolute Error (minutes): 2.624394634997983
#### Feature Importance (each score divided by the sum of all scores):
{
'distance': 0.07561965053695302,
'longitude_difference': 0.08164822606947014,
'pickup_latitude': 0.12563633790440462,
'dropoff_longitude': 0.113152761231232,
'latitude_difference': 0.08627949861853945,
'pickup_weekday': 0.03251633883382762,
'pickup_month': 0.03229480875009506,
'pickup_minute': 0.06701747104002434,
'passenger_count': 0.033071352225968076,
'pickup_hour': 0.050597048659518556,
'store_and_fwd_flag': 0.0009663886846974729,
'dropoff_latitude': 0.10042415485876995,
'pickup_day': 0.05986329032639646,
'pickup_longitude': 0.14091267226010326
}


In [45]:
# Normalized feature importances copied from the training output above,
# kept as (feature name, share of total score) pairs in the printed order
Feature_Importance = [
    ('distance', 0.07561965053695302),
    ('longitude_difference', 0.08164822606947014),
    ('pickup_latitude', 0.12563633790440462),
    ('dropoff_longitude', 0.113152761231232),
    ('latitude_difference', 0.08627949861853945),
    ('pickup_weekday', 0.03251633883382762),
    ('pickup_month', 0.03229480875009506),
    ('pickup_minute', 0.06701747104002434),
    ('passenger_count', 0.033071352225968076),
    ('pickup_hour', 0.050597048659518556),
    ('store_and_fwd_flag', 0.0009663886846974729),
    ('dropoff_latitude', 0.10042415485876995),
    ('pickup_day', 0.05986329032639646),
    ('pickup_longitude', 0.14091267226010326),
]

In [46]:
# Print the features from most to least important.
# sorted() leaves Feature_Importance untouched, so this cell can be re-run,
# and it also copes with scores that are zero or negative. The sort is
# stable, so features with equal scores keep their order from the list.
for feature in sorted(Feature_Importance, key=lambda pair: pair[1], reverse=True):
    print(feature)

('pickup_longitude', 0.14091267226010326)
('pickup_latitude', 0.12563633790440462)
('dropoff_longitude', 0.113152761231232)
('dropoff_latitude', 0.10042415485876995)
('latitude_difference', 0.08627949861853945)
('longitude_difference', 0.08164822606947014)
('distance', 0.07561965053695302)
('pickup_minute', 0.06701747104002434)
('pickup_day', 0.05986329032639646)
('pickup_hour', 0.050597048659518556)
('passenger_count', 0.033071352225968076)
('pickup_weekday', 0.03251633883382762)
('pickup_month', 0.03229480875009506)
('store_and_fwd_flag', 0.0009663886846974729)
