In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

%matplotlib inline



In [2]:
train = pd.read_csv('train.csv').drop(['Trip_ID'], axis = 1)
test = pd.read_csv('test.csv').drop(['Trip_ID'], axis = 1)

In [3]:
print train.dtypes

Trip_Distance                  float64
Type_of_Cab                     object
Customer_Since_Months          float64
Life_Style_Index               float64
Confidence_Life_Style_Index     object
Destination_Type                object
Customer_Rating                float64
Cancellation_Last_1Month         int64
Var1                           float64
Var2                             int64
Var3                             int64
Gender                          object
Surge_Pricing_Type               int64
dtype: object


In [4]:
train.describe()

Unnamed: 0,Trip_Distance,Customer_Since_Months,Life_Style_Index,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Surge_Pricing_Type
count,131662.0,125742.0,111469.0,131662.0,131662.0,60632.0,131662.0,131662.0,131662.0
mean,44.200909,6.016661,2.802064,2.849458,0.782838,64.202698,51.2028,75.099019,2.155747
std,25.522882,3.626887,0.225796,0.980675,1.037559,21.820447,4.986142,11.578278,0.738164
min,0.31,0.0,1.59638,0.00125,0.0,30.0,40.0,52.0,1.0
25%,24.58,3.0,2.65473,2.1525,0.0,46.0,48.0,67.0,2.0
50%,38.2,6.0,2.79805,2.895,0.0,61.0,50.0,74.0,2.0
75%,60.73,10.0,2.94678,3.5825,1.0,80.0,54.0,82.0,3.0
max,109.23,10.0,4.87511,5.0,8.0,210.0,124.0,206.0,3.0


In [5]:
for col in train.columns:
    print col, train[col].isnull().sum()

Trip_Distance 0
Type_of_Cab 20210
Customer_Since_Months 5920
Life_Style_Index 20193
Confidence_Life_Style_Index 20193
Destination_Type 0
Customer_Rating 0
Cancellation_Last_1Month 0
Var1 71030
Var2 0
Var3 0
Gender 0
Surge_Pricing_Type 0


So we have a lot of missing values. We can't simply drop them as then we would lose about 15% of the data, instead lets replace the missing the values with the most probable values. Here for continuous values I am gonna use mean and for categorical values I am gonna use the most frequent category

In [6]:
# Creating a combined data frame of initial processing
train_y = train['Surge_Pricing_Type']
train_x = train.drop(['Surge_Pricing_Type'], axis = 1)
test_x = test
all_Data = pd.concat([train_x, test_x])
all_Data.head(5)

Unnamed: 0,Trip_Distance,Type_of_Cab,Customer_Since_Months,Life_Style_Index,Confidence_Life_Style_Index,Destination_Type,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Gender
0,6.77,B,1.0,2.42769,A,A,3.905,0,40.0,46,60,Female
1,29.47,B,10.0,2.78245,B,A,3.45,0,38.0,56,78,Male
2,41.58,,10.0,,,E,3.50125,2,,56,77,Male
3,61.56,C,10.0,,,A,3.45375,0,,52,74,Male
4,54.95,C,10.0,3.03453,B,A,3.4025,4,51.0,49,102,Male


In [7]:
# filling in the missing values
cols_with_missing = ['Type_of_Cab', 'Customer_Since_Months', 'Life_Style_Index', 'Confidence_Life_Style_Index', 'Var1']

def calc_rep(col):
    if all_Data[col].dtype == object:
        return all_Data[col].value_counts().index[0]
    else:
        return all_Data[col].mean()
miss_fill = {}
for col in cols_with_missing:
    miss_fill[col] = calc_rep(col)
    all_Data[col] = all_Data[col].fillna(miss_fill[col])
all_Data.head()

Unnamed: 0,Trip_Distance,Type_of_Cab,Customer_Since_Months,Life_Style_Index,Confidence_Life_Style_Index,Destination_Type,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Gender
0,6.77,B,1.0,2.42769,A,A,3.905,0,40.0,46,60,Female
1,29.47,B,10.0,2.78245,B,A,3.45,0,38.0,56,78,Male
2,41.58,B,10.0,2.802594,B,E,3.50125,2,64.095972,56,77,Male
3,61.56,C,10.0,2.802594,B,A,3.45375,0,64.095972,52,74,Male
4,54.95,C,10.0,3.03453,B,A,3.4025,4,51.0,49,102,Male


In [8]:
# The Type of cab seems to be a good differentiator for the surge pricing as surge pricing is usually dependent on the type of car
# So for type of car we will create dummy variables and for the rest of the categorical values we use label encoding
labelEncoder = preprocessing.LabelEncoder()
# Encode categorical values
for column in list(all_Data.select_dtypes(include=['object']).columns):
    if column == 'Type_of_Cab':
        continue
    all_Data[column] = labelEncoder.fit_transform(all_Data[column])
all_Data = pd.get_dummies(all_Data)

In [9]:
# Getting the Train and Test Data set
train_x = all_Data[:train.shape[0]]
test_x = all_Data[train.shape[0]:]

# A holdout validation set with 10% of the data, to check the results of CV tuned Hyper parameters
train_val_x, test_val_x, train_val_y, test_val_y = train_test_split(train_x, train_y, test_size = 0.1)

In [105]:
base_model = XGBClassifier(max_depth = 3, n_estimators= 100)
param_grid = {'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.05], 'colsample_bylevel':[0.8, 0.6, 0.5]}
grid_search = GridSearchCV(base_model, param_grid=param_grid, cv = 3)
grid_search.fit(train_val_x, train_val_y)
grid_search.cv_results_

{'mean_fit_time': array([  5.08996344,   9.31447395,  12.69599199,   4.6182851 ,
          8.43597126,  12.0485576 ,   4.99836699,   7.78746796,
          9.87616698,   5.56180509,   7.26805202,   8.94019969,
          3.97367032,   6.19290209,   8.47522696,   3.85505199,
          6.36562165,   8.3887314 ]),
 'mean_score_time': array([ 0.12153125,  0.24317296,  0.30839928,  0.14059297,  0.23333875,
         0.31734435,  0.1704669 ,  0.22855139,  0.35822606,  0.166243  ,
         0.22780403,  0.30442731,  0.12924258,  0.22286391,  0.31203437,
         0.14059273,  0.227326  ,  0.29664771]),
 'mean_test_score': array([ 0.67355585,  0.67845901,  0.68379257,  0.68172497,  0.68818094,
         0.69154817,  0.67369931,  0.67780075,  0.68315119,  0.68139584,
         0.68764927,  0.69194481,  0.67346302,  0.67766572,  0.68295709,
         0.6815815 ,  0.68757332,  0.69158192]),
 'mean_train_score': array([ 0.67377105,  0.67945061,  0.68909236,  0.6825731 ,  0.69143845,
         0.70435461,  

In [10]:
# Checking the score of the model with the best CV Score on the validation set
base_model = grid_search.best_estimator_

base_model.fit(train_val_x, train_val_y)
predicts = base_model.predict(test_val_x)

print 'The accuracy on the validation set is: ', accuracy_score(test_val_y, predicts)

The accuracy on the validation set is:  0.690590111643


In [109]:
base_model.fit(train_x, train_y)
predicts = base_model.predict(test_x)

submission = pd.DataFrame()
submission['Surge_Pricing_Type'] = predicts
submission['Trip_ID'] = pd.read_csv('test.csv').Trip_ID
submission.to_csv('attempt_1.csv', index=False)