In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
import xgboost as xgb
import numpy as np

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
trips = pd.read_csv("trips.csv")
trips

Unnamed: 0,activityid,personid,modechoice
0,1,3,WALK
1,2,3,WALK
2,3,3,WALK
3,4,3,WALK
4,5,3,WALK
...,...,...,...
14047024,17147588,4113994,DRIVE_ALONE_FREE
14047025,17147589,4113994,DRIVE_ALONE_FREE
14047026,17147590,4113994,DRIVE_ALONE_FREE
14047027,17147592,4113995,SHARED_3_HOV


In [4]:
trips.isnull().sum()

activityid    0
personid      0
modechoice    0
dtype: int64

In [5]:
trips['modechoice'].value_counts()

DRIVE_ALONE_FREE    6156506
SHARED_2_HOV        3507285
SHARED_3_HOV        2814408
WALK                1022834
WALK_SET             214366
SCH_BUS              194123
BIKE                  69645
SHARED_3_PAY          17088
SHARED_2_PAY          15450
KNR_SET               13633
PNR_SET               11926
DRIVE_ALONE_PAY        9765
Name: modechoice, dtype: int64

In [6]:
utilityvars = pd.read_csv("utilityvars.csv")
utilityvars

Unnamed: 0,activityid,age,gender,autosuf,numhouseholdpersons,income,oduden,oempden,ototint,dempden,sovdrivetime,hovdrivetime,tolldrivetime,tollcostsov,tollcosthov2,tollcosthov3,walkttime,walktotransitutility,drivetotransitutility,parkingcost,parkingwalktime,sovcost,hovcost,tollcost,tourpurpose,tourmode,firststop,laststop,zerototalstops,targettripmode
0,12150818,4,True,2,2,4,9.633200,11.594058,4.0,67.659134,5.053318,5.053318,5.053318,0.0,0.0,0.0,10.677524,-999.0,-999.0,0.0,0.0,46.447229,46.447229,46.447229,0,5,True,False,False,1
1,3018709,6,False,2,2,1,8.304700,70.117912,12.0,32.273338,31.226185,31.226185,31.226185,0.0,0.0,0.0,113.844883,-999.0,-999.0,0.0,0.0,495.225231,495.225231,495.225231,0,1,False,False,False,1
2,8809200,6,False,2,6,3,10.189500,47.538746,3.0,13.567157,30.145569,30.145569,30.145569,0.0,0.0,0.0,106.238420,-999.0,-999.0,0.0,0.0,462.137117,462.137117,462.137117,0,1,False,False,True,1
3,11332534,4,False,2,1,2,13.393200,21.983759,4.0,51.544109,24.318232,24.318232,24.318232,0.0,0.0,0.0,78.687684,-999.0,-999.0,0.0,0.0,342.291418,342.291418,342.291418,0,1,False,False,False,1
4,6130767,6,False,2,3,2,13.274000,6.568953,3.0,8.480003,25.267041,25.267041,25.267041,0.0,0.0,0.0,71.307144,-999.0,-999.0,0.0,0.0,310.186071,310.186071,310.186071,0,1,False,False,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14047024,10138525,2,True,2,4,2,35.460899,286.328949,14.0,12.385718,56.546418,56.546418,56.546418,0.0,0.0,0.0,193.717741,-999.0,-999.0,0.0,0.0,842.672156,842.672156,842.672156,5,5,False,False,False,5
14047025,16270294,7,False,2,4,3,17.207199,19.868286,13.0,1.423494,1.403090,1.403090,1.403090,0.0,0.0,0.0,3.069109,-999.0,-999.0,0.0,0.0,13.350623,13.350623,13.350623,5,3,False,False,False,3
14047026,16270295,7,False,2,4,3,16.574200,1.423494,13.0,88.151642,12.287907,12.287907,12.287907,0.0,0.0,0.0,31.580930,-999.0,-999.0,0.0,0.0,137.377041,137.377041,137.377041,5,3,False,False,False,3
14047027,10138526,2,True,2,4,2,22.916100,12.385718,14.0,8.472905,1.173692,1.173692,1.173692,0.0,0.0,0.0,1.252347,-999.0,-999.0,0.0,0.0,5.447711,5.447711,5.447711,5,5,False,False,False,3


In [7]:
# is there any missingness?
utilityvars.isnull().sum()

activityid               0
age                      0
gender                   0
autosuf                  0
numhouseholdpersons      0
income                   0
oduden                   0
oempden                  0
ototint                  0
dempden                  0
sovdrivetime             0
hovdrivetime             0
tolldrivetime            0
tollcostsov              0
tollcosthov2             0
tollcosthov3             0
walkttime                0
walktotransitutility     0
drivetotransitutility    0
parkingcost              0
parkingwalktime          0
sovcost                  0
hovcost                  0
tollcost                 0
tourpurpose              0
tourmode                 0
firststop                0
laststop                 0
zerototalstops           0
targettripmode           0
dtype: int64

In [8]:
props = utilityvars['targettripmode'].value_counts()/14047029
props

1     0.438187
3     0.250810
5     0.201549
7     0.071458
9     0.015199
12    0.009771
8     0.007629
11    0.001731
2     0.001459
6     0.000806
10    0.000717
4     0.000683
Name: targettripmode, dtype: float64

In [9]:
# sample the whole dataset

# mode1 = utilityvars[utilityvars['targettripmode']==1].sample(n=1000, weights='walkttime', random_state=1)
# mode2 = utilityvars[utilityvars['targettripmode']==2].sample(n=1000, weights='walkttime', random_state=1)
# mode3 = utilityvars[utilityvars['targettripmode']==3].sample(n=1000, weights='walkttime', random_state=1)
# mode4 = utilityvars[utilityvars['targettripmode']==4].sample(n=1000, weights='walkttime', random_state=1)
# mode5 = utilityvars[utilityvars['targettripmode']==5].sample(n=1000, weights='walkttime', random_state=1)
# mode6 = utilityvars[utilityvars['targettripmode']==6].sample(n=1000, weights='walkttime', random_state=1)
# mode7 = utilityvars[utilityvars['targettripmode']==7].sample(n=1000, weights='walkttime', random_state=1)
# mode8 = utilityvars[utilityvars['targettripmode']==8].sample(n=1000, weights='walkttime', random_state=1)
# mode9 = utilityvars[utilityvars['targettripmode']==9].sample(n=1000, weights='walkttime', random_state=1)
# mode10 = utilityvars[utilityvars['targettripmode']==10].sample(n=1000, weights='walkttime', random_state=1)
# mode11 = utilityvars[utilityvars['targettripmode']==11].sample(n=1000, weights='walkttime', random_state=1)
# mode12 = utilityvars[utilityvars['targettripmode']==12].sample(n=1000, weights='walkttime', random_state=1)

In [10]:
# combined_subsamples = pd.concat([mode1,mode2,mode3,mode4,mode5,mode6,mode7,mode8,mode9,mode10,mode11,mode12], axis=0)

In [11]:
subsample1 = utilityvars.sample(n=1000000, random_state=1)

In [12]:
subsample1['targettripmode'].value_counts()/1000000

1     0.437538
3     0.250873
5     0.202280
7     0.071412
9     0.015142
12    0.009754
8     0.007588
11    0.001729
2     0.001481
6     0.000826
4     0.000692
10    0.000685
Name: targettripmode, dtype: float64

In [13]:
size = 500000
mode1 = utilityvars[utilityvars['targettripmode']==1].sample(n=int(round(size*props[1])), random_state=1)
mode2 = utilityvars[utilityvars['targettripmode']==2].sample(n=int(round(size*props[2])), random_state=1)
mode3 = utilityvars[utilityvars['targettripmode']==3].sample(n=int(round(size*props[3])), random_state=1)
mode4 = utilityvars[utilityvars['targettripmode']==4].sample(n=int(round(size*props[4])), random_state=1)
mode5 = utilityvars[utilityvars['targettripmode']==5].sample(n=int(round(size*props[5])), random_state=1)
mode6 = utilityvars[utilityvars['targettripmode']==6].sample(n=int(round(size*props[6])), random_state=1)
mode7 = utilityvars[utilityvars['targettripmode']==7].sample(n=int(round(size*props[7])), random_state=1)
mode8 = utilityvars[utilityvars['targettripmode']==8].sample(n=int(round(size*props[8])), random_state=1)
mode9 = utilityvars[utilityvars['targettripmode']==9].sample(n=int(round(size*props[9])), random_state=1)
mode10 = utilityvars[utilityvars['targettripmode']==10].sample(n=int(round(size*props[10])), random_state=1)
mode11 = utilityvars[utilityvars['targettripmode']==11].sample(n=int(round(size*props[11])), random_state=1)
mode12 = utilityvars[utilityvars['targettripmode']==12].sample(n=int(round(size*props[12])), random_state=1)

In [14]:
subsample2 = pd.concat([mode1,mode2,mode3,mode4,mode5,mode6,mode7,mode8,mode9,mode10,mode11,mode12], axis=0)

In [15]:
# One-hot encode the categorical variables (the variables only include 0 and -1 don't need to be encoded)
new_utilityvars1 = pd.get_dummies(subsample1, columns=["tourpurpose"], prefix=["tourpurpose"])
new_utilityvars1

Unnamed: 0,activityid,age,gender,autosuf,numhouseholdpersons,income,oduden,oempden,ototint,dempden,sovdrivetime,hovdrivetime,tolldrivetime,tollcostsov,tollcosthov2,tollcosthov3,walkttime,walktotransitutility,drivetotransitutility,parkingcost,parkingwalktime,sovcost,hovcost,tollcost,tourmode,firststop,laststop,zerototalstops,targettripmode,tourpurpose_0,tourpurpose_1,tourpurpose_2,tourpurpose_3,tourpurpose_4,tourpurpose_5,tourpurpose_6,tourpurpose_7
1387296,1564459,6,True,2,4,2,7.5717,36.388775,4.0,53.826412,16.346799,16.346799,16.346799,0.0,0.0,0.0,42.808210,-999.0,-999.0,0.0,0.0,186.215711,186.215711,186.215711,1,False,False,True,1,0,0,0,0,1,0,0,0
10382234,4077144,7,False,2,1,1,13.0945,5.733529,3.0,160.396484,13.076593,12.217715,13.076593,0.0,0.0,0.0,34.379486,-999.0,-999.0,0.0,0.0,149.550759,144.172397,149.550759,1,False,True,False,1,0,0,0,1,0,0,0,0
13391127,13696540,4,False,2,1,3,7.4989,12.729101,10.0,27.067551,13.678353,13.666029,13.678353,0.0,0.0,0.0,48.839710,-999.0,-999.0,0.0,0.0,212.452735,212.667810,212.452735,1,False,False,False,1,0,0,0,0,0,0,1,0
2068532,16094595,2,False,1,5,3,8.5809,96.584641,6.0,30.264742,5.973807,5.973807,5.973807,0.0,0.0,0.0,17.714907,-999.0,-999.0,0.0,0.0,77.059845,77.059845,77.059845,3,False,False,True,3,0,0,1,0,0,0,0,0
10861969,7204173,7,False,2,1,1,4.2448,11.421939,1.0,94.689743,5.519096,5.519096,5.519096,0.0,0.0,0.0,9.530084,-999.0,-999.0,0.0,0.0,41.455863,41.455863,41.455863,1,False,False,False,1,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1111184,10612140,1,True,1,6,5,4.2689,107.713318,2.0,24.240088,19.503379,19.503379,19.503379,0.0,0.0,0.0,52.896431,-999.0,-999.0,0.0,0.0,230.099469,230.099469,230.099469,3,True,False,False,3,0,0,1,0,0,0,0,0
7450224,10761986,4,True,2,2,2,0.5111,44.011829,1.0,2.223180,6.651095,6.651095,6.651095,0.0,0.0,0.0,14.117167,-999.0,-999.0,0.0,0.0,61.409677,61.409677,61.409677,8,False,False,False,8,1,0,0,0,0,0,0,0
4577494,13950333,6,True,1,2,5,8.2355,27.750124,3.0,32.060444,0.438984,0.438984,0.438984,0.0,0.0,0.0,1.168539,-999.0,-999.0,0.0,0.0,5.083145,5.083145,5.083145,3,False,False,False,1,0,0,0,0,0,1,0,0
7882379,8566521,7,False,2,1,3,0.8648,22.147526,26.0,5.116726,28.081127,28.081127,28.081127,0.0,0.0,0.0,95.469129,-999.0,-999.0,0.0,0.0,415.290702,415.290702,415.290702,1,False,False,True,1,0,0,0,0,0,1,0,0


In [16]:
new_utilityvars2 = pd.get_dummies(subsample2, columns=["tourpurpose"], prefix=["tourpurpose"])
new_utilityvars2

Unnamed: 0,activityid,age,gender,autosuf,numhouseholdpersons,income,oduden,oempden,ototint,dempden,sovdrivetime,hovdrivetime,tolldrivetime,tollcostsov,tollcosthov2,tollcosthov3,walkttime,walktotransitutility,drivetotransitutility,parkingcost,parkingwalktime,sovcost,hovcost,tollcost,tourmode,firststop,laststop,zerototalstops,targettripmode,tourpurpose_0,tourpurpose_1,tourpurpose_2,tourpurpose_3,tourpurpose_4,tourpurpose_5,tourpurpose_6,tourpurpose_7
389395,6846232,6,True,2,1,3,4.7632,27.730175,23.0,24.913393,5.347993,5.347993,5.347993,0.0,0.0,0.0,8.340726,-999.0,-999.0,0.0,0.0,36.282156,36.282156,36.282156,1,True,False,False,1,0,0,0,0,1,0,0,0
9670948,2110084,7,True,1,4,4,9.2246,15.671348,6.0,31.653795,2.035063,2.035063,2.035063,0.0,0.0,0.0,3.962483,-999.0,-999.0,0.0,0.0,17.236800,17.236800,17.236800,1,False,False,False,1,0,0,0,0,0,1,0,0
12239774,10040064,6,True,2,4,2,17.5495,41.898407,13.0,2.250410,14.014216,14.014216,14.014216,0.0,0.0,0.0,45.059798,-999.0,-999.0,0.0,0.0,196.010116,196.010116,196.010116,1,False,False,True,1,0,0,0,0,1,0,0,0
8333334,5200740,7,False,2,2,3,6.9557,33.792797,1.0,3.054741,14.571569,14.571569,14.571569,0.0,0.0,0.0,48.703855,-999.0,-999.0,0.0,0.0,211.861765,211.861765,211.861765,1,False,False,False,1,1,0,0,0,0,0,0,0
5401543,3080529,7,True,1,2,1,11.6963,5.626131,6.0,6.404369,6.707609,6.707609,6.707609,0.0,0.0,0.0,18.757141,-999.0,-999.0,0.0,0.0,81.593560,81.593560,81.593560,1,False,False,False,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7530275,12850414,1,False,1,2,3,6.3834,35.998531,10.0,21.200064,0.758012,0.758012,0.758012,0.0,0.0,0.0,1.014384,-999.0,-999.0,0.0,0.0,4.412570,4.412570,4.412570,12,False,False,True,12,0,0,1,0,0,0,0,0
1315937,4343516,3,True,1,4,4,10.3973,21.256468,4.0,99.809555,7.535834,7.535834,7.535834,0.0,0.0,0.0,13.630794,-999.0,-999.0,0.0,0.0,59.293954,59.293954,59.293954,12,False,False,True,12,0,0,1,0,0,0,0,0
1811575,931169,1,True,2,3,3,1.9526,11.972782,26.0,7.575649,9.073135,9.073135,9.073135,0.0,0.0,0.0,14.859670,-999.0,-999.0,0.0,0.0,64.639565,64.639565,64.639565,12,False,False,True,12,0,0,1,0,0,0,0,0
1577591,4171930,2,True,2,2,3,11.2013,30.438572,4.0,39.912827,6.276231,6.276231,6.276231,0.0,0.0,0.0,12.657595,-999.0,-999.0,0.0,0.0,55.060536,55.060536,55.060536,12,False,False,True,12,0,0,1,0,0,0,0,0


In [17]:
# Build XGBoost tree model
new_utilityvars1 = pd.merge(new_utilityvars1.drop(['targettripmode', 'tourmode', 'activityid'], axis=1), new_utilityvars1[['targettripmode']], left_index=True, right_index=True, how="outer")
new_utilityvars1

Unnamed: 0,age,gender,autosuf,numhouseholdpersons,income,oduden,oempden,ototint,dempden,sovdrivetime,hovdrivetime,tolldrivetime,tollcostsov,tollcosthov2,tollcosthov3,walkttime,walktotransitutility,drivetotransitutility,parkingcost,parkingwalktime,sovcost,hovcost,tollcost,firststop,laststop,zerototalstops,tourpurpose_0,tourpurpose_1,tourpurpose_2,tourpurpose_3,tourpurpose_4,tourpurpose_5,tourpurpose_6,tourpurpose_7,targettripmode
1387296,6,True,2,4,2,7.5717,36.388775,4.0,53.826412,16.346799,16.346799,16.346799,0.0,0.0,0.0,42.808210,-999.0,-999.0,0.0,0.0,186.215711,186.215711,186.215711,False,False,True,0,0,0,0,1,0,0,0,1
10382234,7,False,2,1,1,13.0945,5.733529,3.0,160.396484,13.076593,12.217715,13.076593,0.0,0.0,0.0,34.379486,-999.0,-999.0,0.0,0.0,149.550759,144.172397,149.550759,False,True,False,0,0,0,1,0,0,0,0,1
13391127,4,False,2,1,3,7.4989,12.729101,10.0,27.067551,13.678353,13.666029,13.678353,0.0,0.0,0.0,48.839710,-999.0,-999.0,0.0,0.0,212.452735,212.667810,212.452735,False,False,False,0,0,0,0,0,0,1,0,1
2068532,2,False,1,5,3,8.5809,96.584641,6.0,30.264742,5.973807,5.973807,5.973807,0.0,0.0,0.0,17.714907,-999.0,-999.0,0.0,0.0,77.059845,77.059845,77.059845,False,False,True,0,0,1,0,0,0,0,0,3
10861969,7,False,2,1,1,4.2448,11.421939,1.0,94.689743,5.519096,5.519096,5.519096,0.0,0.0,0.0,9.530084,-999.0,-999.0,0.0,0.0,41.455863,41.455863,41.455863,False,False,False,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1111184,1,True,1,6,5,4.2689,107.713318,2.0,24.240088,19.503379,19.503379,19.503379,0.0,0.0,0.0,52.896431,-999.0,-999.0,0.0,0.0,230.099469,230.099469,230.099469,True,False,False,0,0,1,0,0,0,0,0,3
7450224,4,True,2,2,2,0.5111,44.011829,1.0,2.223180,6.651095,6.651095,6.651095,0.0,0.0,0.0,14.117167,-999.0,-999.0,0.0,0.0,61.409677,61.409677,61.409677,False,False,False,1,0,0,0,0,0,0,0,8
4577494,6,True,1,2,5,8.2355,27.750124,3.0,32.060444,0.438984,0.438984,0.438984,0.0,0.0,0.0,1.168539,-999.0,-999.0,0.0,0.0,5.083145,5.083145,5.083145,False,False,False,0,0,0,0,0,1,0,0,1
7882379,7,False,2,1,3,0.8648,22.147526,26.0,5.116726,28.081127,28.081127,28.081127,0.0,0.0,0.0,95.469129,-999.0,-999.0,0.0,0.0,415.290702,415.290702,415.290702,False,False,True,0,0,0,0,0,1,0,0,1


In [18]:
df_array1 = new_utilityvars1.values
X1 = df_array1[:,0:34]
Y1 = df_array1[:,34]

In [19]:
seed = 7
test_size = 0.33
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, Y1, test_size=test_size, random_state=seed)

In [20]:
model1 = xgb.XGBClassifier()
model1.fit(X_train1, y_train1, eval_set = [(X_train1, y_train1), (X_test1, y_test1)], verbose = 0)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=None)

In [21]:
y_pred1 = [round(value) for value in model1.predict(X_test1)]
y_test1 = [round(value) for value in y_test1]

In [22]:
new_utilityvars2 = pd.merge(new_utilityvars2.drop(['targettripmode', 'tourmode', 'activityid'], axis=1), new_utilityvars2[['targettripmode']], left_index=True, right_index=True, how="outer")
new_utilityvars2

Unnamed: 0,age,gender,autosuf,numhouseholdpersons,income,oduden,oempden,ototint,dempden,sovdrivetime,hovdrivetime,tolldrivetime,tollcostsov,tollcosthov2,tollcosthov3,walkttime,walktotransitutility,drivetotransitutility,parkingcost,parkingwalktime,sovcost,hovcost,tollcost,firststop,laststop,zerototalstops,tourpurpose_0,tourpurpose_1,tourpurpose_2,tourpurpose_3,tourpurpose_4,tourpurpose_5,tourpurpose_6,tourpurpose_7,targettripmode
389395,6,True,2,1,3,4.7632,27.730175,23.0,24.913393,5.347993,5.347993,5.347993,0.0,0.0,0.0,8.340726,-999.0,-999.0,0.0,0.0,36.282156,36.282156,36.282156,True,False,False,0,0,0,0,1,0,0,0,1
9670948,7,True,1,4,4,9.2246,15.671348,6.0,31.653795,2.035063,2.035063,2.035063,0.0,0.0,0.0,3.962483,-999.0,-999.0,0.0,0.0,17.236800,17.236800,17.236800,False,False,False,0,0,0,0,0,1,0,0,1
12239774,6,True,2,4,2,17.5495,41.898407,13.0,2.250410,14.014216,14.014216,14.014216,0.0,0.0,0.0,45.059798,-999.0,-999.0,0.0,0.0,196.010116,196.010116,196.010116,False,False,True,0,0,0,0,1,0,0,0,1
8333334,7,False,2,2,3,6.9557,33.792797,1.0,3.054741,14.571569,14.571569,14.571569,0.0,0.0,0.0,48.703855,-999.0,-999.0,0.0,0.0,211.861765,211.861765,211.861765,False,False,False,1,0,0,0,0,0,0,0,1
5401543,7,True,1,2,1,11.6963,5.626131,6.0,6.404369,6.707609,6.707609,6.707609,0.0,0.0,0.0,18.757141,-999.0,-999.0,0.0,0.0,81.593560,81.593560,81.593560,False,False,False,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7530275,1,False,1,2,3,6.3834,35.998531,10.0,21.200064,0.758012,0.758012,0.758012,0.0,0.0,0.0,1.014384,-999.0,-999.0,0.0,0.0,4.412570,4.412570,4.412570,False,False,True,0,0,1,0,0,0,0,0,12
1315937,3,True,1,4,4,10.3973,21.256468,4.0,99.809555,7.535834,7.535834,7.535834,0.0,0.0,0.0,13.630794,-999.0,-999.0,0.0,0.0,59.293954,59.293954,59.293954,False,False,True,0,0,1,0,0,0,0,0,12
1811575,1,True,2,3,3,1.9526,11.972782,26.0,7.575649,9.073135,9.073135,9.073135,0.0,0.0,0.0,14.859670,-999.0,-999.0,0.0,0.0,64.639565,64.639565,64.639565,False,False,True,0,0,1,0,0,0,0,0,12
1577591,2,True,2,2,3,11.2013,30.438572,4.0,39.912827,6.276231,6.276231,6.276231,0.0,0.0,0.0,12.657595,-999.0,-999.0,0.0,0.0,55.060536,55.060536,55.060536,False,False,True,0,0,1,0,0,0,0,0,12


In [23]:
df_array2 = new_utilityvars2.values
X2 = df_array2[:,0:34]
Y2 = df_array2[:,34]

In [24]:
seed = 7
test_size = 0.33
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, Y2, test_size=test_size, random_state=seed)

In [25]:
model2 = xgb.XGBClassifier()
model2.fit(X_train2, y_train2, eval_set = [(X_train2, y_train2), (X_test2, y_test2)], verbose = 0)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=None)

In [26]:
y_pred2 = [round(value) for value in model2.predict(X_test2)]
y_test2 = [round(value) for value in y_test2]

In [27]:
# multilabel confusion matrix
metrics.multilabel_confusion_matrix(y_test1, y_pred1)

array([[[130323,  55280],
        [ 20368, 124029]],

       [[329311,    193],
        [   290,    206]],

       [[219220,  28146],
        [ 55659,  26975]],

       [[329724,     54],
        [   206,     16]],

       [[231673,  31559],
        [ 27278,  39490]],

       [[329621,    107],
        [   225,     47]],

       [[296680,   9598],
        [ 13767,   9955]],

       [[327145,    320],
        [  2247,    288]],

       [[324406,    633],
        [  4538,    423]],

       [[329760,      6],
        [   234,      0]],

       [[329431,     21],
        [   545,      3]],

       [[325300,   1489],
        [  2049,   1162]]])

In [28]:
# accuracy
accuracy1 = metrics.accuracy_score(y_test1, y_pred1)
accuracy1

0.6139212121212121

In [29]:
# sensitivity
sensitivity = metrics.recall_score(y_test1, y_pred1, average = 'macro')
sensitivity

0.28524219562271896

In [30]:
# precision
precision = metrics.precision_score(y_test1, y_pred1, average = 'macro')
precision

0.3944717028941404

In [31]:
# F1-score
f1 = (2 * precision * sensitivity) / (precision + sensitivity)
f1

0.33108039982727105

In [32]:
accuracy2 = metrics.accuracy_score(y_test2, y_pred2)
accuracy2

0.6111939393939394

In [33]:
# 13*13 confusion matrix
metrics.confusion_matrix(y_test1, y_pred1)

array([[124029,     89,  10367,      7,   6675,     15,   2825,    164,
           184,      3,      5,     34],
       [   253,    206,      9,     16,      4,      7,      1,      0,
             0,      0,      0,      0],
       [ 31917,     11,  26975,      8,  19659,     24,   3263,     51,
           172,      0,      4,    550],
       [    83,     53,     22,     16,     19,     28,      0,      0,
             0,      1,      0,      0],
       [ 11808,      3,  11969,      7,  39490,     32,   2627,     27,
           159,      0,      3,    643],
       [    55,     34,     41,     16,     78,     47,      1,      0,
             0,      0,      0,      0],
       [  6544,      0,   2972,      0,   3960,      1,   9955,     61,
            43,      0,      0,    186],
       [  1428,      0,    251,      0,    164,      0,    339,    288,
            28,      1,      0,     36],
       [  2606,      3,    786,      0,    683,      0,    417,      4,
           423,      1, 

In [34]:
# normalized confusion matrix

In [37]:
normalizeddf1 = pd.DataFrame(metrics.confusion_matrix(y_test1, y_pred1)/ metrics.confusion_matrix(y_test1, y_pred1).astype(np.float).sum(axis=1))
normalizeddf1.columns = ['pred Drive Alone Free','pred Drive Alone Pay','pred HOV2 Free',
                       'pred HOV2 Pay', 'pred HOV3 Free','pred HOV3 Pay','pred Walk','pred BIKE',
                       'pred Walk to Transit', 'pred Park and Ride','pred Kiss and Ride', 'pred School Bus']
normalizeddf1.index = ['pred Drive Alone Free','pred Drive Alone Pay','pred HOV2 Free',
                       'pred HOV2 Pay', 'pred HOV3 Free','pred HOV3 Pay','pred Walk','pred BIKE',
                       'pred Walk to Transit', 'pred Park and Ride','pred Kiss and Ride', 'pred School Bus']

In [38]:
normalizeddf1

Unnamed: 0,pred Drive Alone Free,pred Drive Alone Pay,pred HOV2 Free,pred HOV2 Pay,pred HOV3 Free,pred HOV3 Pay,pred Walk,pred BIKE,pred Walk to Transit,pred Park and Ride,pred Kiss and Ride,pred School Bus
pred Drive Alone Free,0.858944,0.179435,0.125457,0.031532,0.099973,0.055147,0.119088,0.064694,0.037089,0.012821,0.009124,0.010589
pred Drive Alone Pay,0.001752,0.415323,0.000109,0.072072,6e-05,0.025735,4.2e-05,0.0,0.0,0.0,0.0,0.0
pred HOV2 Free,0.221036,0.022177,0.326439,0.036036,0.294437,0.088235,0.137552,0.020118,0.03467,0.0,0.007299,0.171286
pred HOV2 Pay,0.000575,0.106855,0.000266,0.072072,0.000285,0.102941,0.0,0.0,0.0,0.004274,0.0,0.0
pred HOV3 Free,0.081775,0.006048,0.144844,0.031532,0.591451,0.117647,0.110741,0.010651,0.03205,0.0,0.005474,0.200249
pred HOV3 Pay,0.000381,0.068548,0.000496,0.072072,0.001168,0.172794,4.2e-05,0.0,0.0,0.0,0.0,0.0
pred Walk,0.04532,0.0,0.035966,0.0,0.05931,0.003676,0.419653,0.024063,0.008668,0.0,0.0,0.057926
pred BIKE,0.009889,0.0,0.003037,0.0,0.002456,0.0,0.014291,0.113609,0.005644,0.004274,0.0,0.011211
pred Walk to Transit,0.018047,0.006048,0.009512,0.0,0.010229,0.0,0.017579,0.001578,0.085265,0.004274,0.010949,0.009966
pred Park and Ride,0.001392,0.0,0.000133,0.0,4.5e-05,0.0,0.000169,0.000394,0.002217,0.0,0.005474,0.0


|  | pred Drive Alone Free | pred Drive Alone Pay | pred HOV2 Free | pred HOV2 Pay | pred HOV3 Free | pred HOV3 Pay | pred Walk | pred Bike | pred Walk to Transit | pred Park and Ride | pred Kiss and Ride | pred School Bus |
| :- | :- | :- | :- | :- | :- | :- | :- | :- | :- | :- | :- | :- |
| Drive Alone Free | 0.43333333 | 0.01515152 | 0.14848485 | 0 | 0.06060606 | 0.0030303 | 0.05757576 | 0.08181818 | 0.05757576 | 0.1 | 0 | 0.04242424 |
| Drive Alone Pay | 0.01273885 | 0.70700637 | 0.00318471 | 0.20063694 | 0.00318471 | 0.06687898 | 0 | 0.00318471 | 0 | 0 | 0.00318471 | 0 |
| HOV2 Free | 0.12941176 | 0.00294118 | 0.25 | 0.01176471 | 0.24705882 | 0.00294118 | 0.08235294 | 0.04411765 | 0.05588235 | 0.03823529 | 0.08529412 | 0.05 |
| HOV2 Pay | 0.00890208 | 0.25222552| 0.02670623 | 0.40356083 | 0.01186944 | 0.27002967 | 0.00890208 | 0 | 0.00296736 | 0.00890208 | 0.00593472 | 0 |
| HOV3 Free | 0.05 | 0.00294118 | 0.13823529 | 0.00882353 | 0.47941176 | 0.00882353 | 0.12058824 | 0.02058824 | 0.04705882 | 0.02647059 | 0.05882353 | 0.03823529 |
| HOV3 Pay | 0 | 0.09763314 | 0.01183432 | 0.28698225 | 0.04142012 | 0.5443787 | 0.00591716 | 0 | 0.00295858 | 0.00295858 | 0.00591716 | 0 |
| Walk | 0.01807229 | 0 | 0.06927711 | 0.00301205 | 0.0813253 | 0 | 0.63253012 | 0.09638554 | 0.03313253 | 0.01204819 | 0.03313253 | 0.02108434 |
| Bike | 0.06031746 | 0.0031746 | 0.03809524 | 0 | 0.05079365 | 0 | 0.11746032 | 0.52380952 | 0.04761905 | 0.06666667 | 0.07301587 | 0.01904762 |
| Walk to Transit | 0.07917889 | 0.00293255 | 0.04398827 | 0.00293255 | 0.05278592 | 0 | 0.07038123 | 0.07038123 | 0.32258065 | 0.17302053 | 0.16129032 | 0.02052786 |
| Park and Ride | 0.09567901 | 0.02777778 | 0.0154321 | 0.0154321 | 0.01234568 | 0.00925926 | 0.00617284 | 0.03703704 | 0.08333333 | 0.5 | 0.19753086 | 0 |
| Kiss and Ride | 0.06918239 | 0.00943396 | 0.05345912 | 0.00314465 | 0.05660377 | 0.00628931 | 0.02201258 | 0.02830189 | 0.14465409 | 0.21069182 | 0.36477987 | 0.03144654 |
| School Bus | 0 | 0 | 0.00302115 | 0 | 0.00906344 | 0 | 0.01812689 | 0 | 0 | 0 | 0.00906344 | 0.96072508 |

In [39]:
normalizeddf2 = pd.DataFrame(metrics.confusion_matrix(y_test2, y_pred2)/ metrics.confusion_matrix(y_test2, y_pred2).astype(np.float).sum(axis=1))
normalizeddf2.columns = ['pred Drive Alone Free','pred Drive Alone Pay','pred HOV2 Free',
                       'pred HOV2 Pay', 'pred HOV3 Free','pred HOV3 Pay','pred Walk','pred BIKE',
                       'pred Walk to Transit', 'pred Park and Ride','pred Kiss and Ride', 'pred School Bus']
normalizeddf2.index = ['pred Drive Alone Free','pred Drive Alone Pay','pred HOV2 Free',
                       'pred HOV2 Pay', 'pred HOV3 Free','pred HOV3 Pay','pred Walk','pred BIKE',
                       'pred Walk to Transit', 'pred Park and Ride','pred Kiss and Ride', 'pred School Bus']

In [40]:
normalizeddf2

Unnamed: 0,pred Drive Alone Free,pred Drive Alone Pay,pred HOV2 Free,pred HOV2 Pay,pred HOV3 Free,pred HOV3 Pay,pred Walk,pred BIKE,pred Walk to Transit,pred Park and Ride,pred Kiss and Ride,pred School Bus
pred Drive Alone Free,0.855755,0.162264,0.123194,0.035398,0.110417,0.037879,0.117284,0.057767,0.044746,0.032787,0.010239,0.013673
pred Drive Alone Pay,0.002187,0.320755,7.2e-05,0.097345,0.0,0.060606,0.0,0.0,0.0,0.0,0.0,0.0
pred HOV2 Free,0.221724,0.018868,0.319429,0.044248,0.307848,0.075758,0.139457,0.018735,0.044347,0.008197,0.010239,0.181479
pred HOV2 Pay,0.000623,0.060377,0.000409,0.079646,0.000452,0.075758,0.0,0.0,0.0,0.0,0.0,0.000622
pred HOV3 Free,0.080455,0.0,0.139206,0.017699,0.597117,0.037879,0.111805,0.009368,0.03356,0.0,0.003413,0.214419
pred HOV3 Pay,0.000484,0.05283,0.000289,0.061947,0.001387,0.136364,0.0,0.0,0.0,0.0,0.0,0.0
pred Walk,0.044802,0.0,0.034645,0.0,0.060803,0.007576,0.415889,0.02498,0.006392,0.0,0.0,0.04972
pred BIKE,0.009758,0.0,0.002837,0.0,0.002805,0.0,0.016009,0.110851,0.005194,0.0,0.0,0.014295
pred Walk to Transit,0.01838,0.0,0.008319,0.0,0.011431,0.0,0.018064,0.0,0.087495,0.008197,0.017065,0.008701
pred Park and Ride,0.001426,0.0,0.00012,0.0,0.000121,0.0,0.000342,0.0,0.002397,0.0,0.0,0.0


In [53]:
# check the distribution of prediction result
pd.Series(y_test1).value_counts()/len(y_test1)

1     0.437567
3     0.250406
5     0.202327
7     0.071885
9     0.015033
12    0.009730
8     0.007682
11    0.001661
2     0.001503
6     0.000824
10    0.000709
4     0.000673
dtype: float64

In [54]:
pd.Series(y_test2).value_counts()/len(y_test2)

1     0.437891
3     0.252079
5     0.200945
7     0.070794
9     0.015170
12    0.009752
8     0.007764
11    0.001776
2     0.001606
6     0.000800
10    0.000739
4     0.000685
dtype: float64

In [41]:
# metrics.confusion_matrix(y_test, y_pred)[11] / sum(metrics.confusion_matrix(y_test, y_pred)[11])

In [42]:
# check trips.csv

In [43]:
trip_mg = pd.merge(utilityvars, trips, on=["activityid"])[['targettripmode', 'modechoice']]
trip_mg

Unnamed: 0,targettripmode,modechoice
0,1,DRIVE_ALONE_FREE
1,1,DRIVE_ALONE_FREE
2,1,DRIVE_ALONE_FREE
3,1,DRIVE_ALONE_FREE
4,1,DRIVE_ALONE_FREE
...,...,...
14047024,5,SHARED_3_HOV
14047025,3,SHARED_3_HOV
14047026,3,SHARED_3_HOV
14047027,3,SHARED_3_HOV


In [44]:
trip_mg["targettripmode"].replace({1: "DRIVE_ALONE_FREE", 2: "DRIVE_ALONE_PAY", 3: "SHARED_2_HOV", 4: "SHARED_2_PAY", 5: "SHARED_3_HOV", 6: "SHARED_3_PAY", 7: "WALK", 8: "BIKE", 9: "WALK_SET", 10: "PNR_SET", 11: "KNR_SET", 12: "SCH_BUS"}, inplace=True)
trip_mg

Unnamed: 0,targettripmode,modechoice
0,DRIVE_ALONE_FREE,DRIVE_ALONE_FREE
1,DRIVE_ALONE_FREE,DRIVE_ALONE_FREE
2,DRIVE_ALONE_FREE,DRIVE_ALONE_FREE
3,DRIVE_ALONE_FREE,DRIVE_ALONE_FREE
4,DRIVE_ALONE_FREE,DRIVE_ALONE_FREE
...,...,...
14047024,SHARED_3_HOV,SHARED_3_HOV
14047025,SHARED_2_HOV,SHARED_3_HOV
14047026,SHARED_2_HOV,SHARED_3_HOV
14047027,SHARED_2_HOV,SHARED_3_HOV


In [45]:
metrics.accuracy_score(trip_mg['targettripmode'], trip_mg['modechoice'])

0.4792990745587554

In [46]:
metrics.recall_score(trip_mg['targettripmode'], trip_mg['modechoice'], average = 'macro')

0.19459360823852725

In [47]:
metrics.precision_score(trip_mg['targettripmode'], trip_mg['modechoice'], average = 'macro')

0.19574958766844489

In [48]:
metrics.confusion_matrix(trip_mg['targettripmode'], trip_mg['modechoice'])

array([[  10170,   42876,       8,      97,     122,    1324,   15767,
             21,   12382,      34,   22051,    2307],
       [  21915, 4152601,    5193,    6747,    6837,    3568,  896572,
           4914,  620743,    5673,  326343,  104119],
       [      4,   13181,    1514,      36,      38,       0,    1986,
           1466,     838,    1188,      98,     145],
       [     29,   10215,      14,     334,     237,     215,    6232,
             49,    4156,      25,     720,    2089],
       [     30,    6027,      11,      56,     130,       0,    1737,
             27,     912,      24,     154,     966],
       [   1255,    2520,       0,      20,       0,   25297,   53723,
              4,   46013,      13,    7946,     465],
       [   7758, 1049745,    1296,    2554,    1824,   76525, 1242758,
           3813,  877007,    3858,  214324,   41679],
       [      0,    3390,     395,       9,      26,      81,    2315,
            851,    1369,     873,     193,      96],


In [None]:
# age subset
age0 = new_utilityvars[new_utilityvars['age'] == 0]
age1 = new_utilityvars[new_utilityvars['age'] == 1]
age2 = new_utilityvars[new_utilityvars['age'] == 2]
age3 = new_utilityvars[new_utilityvars['age'] == 3]
age4 = new_utilityvars[new_utilityvars['age'] == 4]
age5 = new_utilityvars[new_utilityvars['age'] == 5]
age6 = new_utilityvars[new_utilityvars['age'] == 6]
age7 = new_utilityvars[new_utilityvars['age'] == 7]

In [None]:
df_array = age0.values
X = df_array[:,0:34]
Y = df_array[:,34]

In [None]:
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

In [None]:
model = xgb.XGBClassifier()
model.fit(X_train, y_train, eval_set = [(X_train, y_train), (X_test, y_test)], verbose = 0)

In [None]:
y_pred = [round(value) for value in model.predict(X_test)]
y_test = [round(value) for value in y_test]

In [None]:
accuracy = metrics.accuracy_score(y_test, y_pred)
accuracy

In [None]:
sensitivity = metrics.recall_score(y_test, y_pred, average = 'macro')
sensitivity

In [None]:
df_array = age1.values
X = df_array[:,0:34]
Y = df_array[:,34]

In [None]:
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

In [None]:
model = xgb.XGBClassifier()
model.fit(X_train, y_train, eval_set = [(X_train, y_train), (X_test, y_test)], verbose = 0)

In [None]:
y_pred = [round(value) for value in model.predict(X_test)]
y_test = [round(value) for value in y_test]

In [None]:
accuracy = metrics.accuracy_score(y_test, y_pred)
accuracy

Age could be an important feature.