In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
import xgboost as xgb

In [2]:
# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.options.display.max_columns = None

In [3]:
# Load the trip records (activityid, personid, modechoice) and preview them.
trips = pd.read_csv(filepath_or_buffer="trips.csv")
trips

Unnamed: 0,activityid,personid,modechoice
0,1,3,WALK
1,2,3,WALK
2,3,3,WALK
3,4,3,WALK
4,5,3,WALK
...,...,...,...
14047024,17147588,4113994,DRIVE_ALONE_FREE
14047025,17147589,4113994,DRIVE_ALONE_FREE
14047026,17147590,4113994,DRIVE_ALONE_FREE
14047027,17147592,4113995,SHARED_3_HOV


In [4]:
# Count missing values in every column of trips.
trips.isna().sum()

activityid    0
personid      0
modechoice    0
dtype: int64

In [5]:
# Number of trips per recorded mode, most frequent first.
trips.modechoice.value_counts()

DRIVE_ALONE_FREE    6156506
SHARED_2_HOV        3507285
SHARED_3_HOV        2814408
WALK                1022834
WALK_SET             214366
SCH_BUS              194123
BIKE                  69645
SHARED_3_PAY          17088
SHARED_2_PAY          15450
KNR_SET               13633
PNR_SET               11926
DRIVE_ALONE_PAY        9765
Name: modechoice, dtype: int64

In [75]:
# Load the per-activity utility variables (features plus targettripmode) and preview them.
utilityvars = pd.read_csv(filepath_or_buffer="utilityvars.csv")
utilityvars

Unnamed: 0,activityid,age,gender,autosuf,numhouseholdpersons,income,oduden,oempden,ototint,dempden,sovdrivetime,hovdrivetime,tolldrivetime,tollcostsov,tollcosthov2,tollcosthov3,walkttime,walktotransitutility,drivetotransitutility,parkingcost,parkingwalktime,sovcost,hovcost,tollcost,tourpurpose,tourmode,firststop,laststop,zerototalstops,targettripmode
0,12150818,4,True,2,2,4,9.633200,11.594058,4.0,67.659134,5.053318,5.053318,5.053318,0.0,0.0,0.0,10.677524,-999.0,-999.0,0.0,0.0,46.447229,46.447229,46.447229,0,5,True,False,False,1
1,3018709,6,False,2,2,1,8.304700,70.117912,12.0,32.273338,31.226185,31.226185,31.226185,0.0,0.0,0.0,113.844883,-999.0,-999.0,0.0,0.0,495.225231,495.225231,495.225231,0,1,False,False,False,1
2,8809200,6,False,2,6,3,10.189500,47.538746,3.0,13.567157,30.145569,30.145569,30.145569,0.0,0.0,0.0,106.238420,-999.0,-999.0,0.0,0.0,462.137117,462.137117,462.137117,0,1,False,False,True,1
3,11332534,4,False,2,1,2,13.393200,21.983759,4.0,51.544109,24.318232,24.318232,24.318232,0.0,0.0,0.0,78.687684,-999.0,-999.0,0.0,0.0,342.291418,342.291418,342.291418,0,1,False,False,False,1
4,6130767,6,False,2,3,2,13.274000,6.568953,3.0,8.480003,25.267041,25.267041,25.267041,0.0,0.0,0.0,71.307144,-999.0,-999.0,0.0,0.0,310.186071,310.186071,310.186071,0,1,False,False,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14047024,10138525,2,True,2,4,2,35.460899,286.328949,14.0,12.385718,56.546418,56.546418,56.546418,0.0,0.0,0.0,193.717741,-999.0,-999.0,0.0,0.0,842.672156,842.672156,842.672156,5,5,False,False,False,5
14047025,16270294,7,False,2,4,3,17.207199,19.868286,13.0,1.423494,1.403090,1.403090,1.403090,0.0,0.0,0.0,3.069109,-999.0,-999.0,0.0,0.0,13.350623,13.350623,13.350623,5,3,False,False,False,3
14047026,16270295,7,False,2,4,3,16.574200,1.423494,13.0,88.151642,12.287907,12.287907,12.287907,0.0,0.0,0.0,31.580930,-999.0,-999.0,0.0,0.0,137.377041,137.377041,137.377041,5,3,False,False,False,3
14047027,10138526,2,True,2,4,2,22.916100,12.385718,14.0,8.472905,1.173692,1.173692,1.173692,0.0,0.0,0.0,1.252347,-999.0,-999.0,0.0,0.0,5.447711,5.447711,5.447711,5,5,False,False,False,3


In [76]:
# Count missing values in every column of utilityvars.
utilityvars.isna().sum()

activityid               0
age                      0
gender                   0
autosuf                  0
numhouseholdpersons      0
income                   0
oduden                   0
oempden                  0
ototint                  0
dempden                  0
sovdrivetime             0
hovdrivetime             0
tolldrivetime            0
tollcostsov              0
tollcosthov2             0
tollcosthov3             0
walkttime                0
walktotransitutility     0
drivetotransitutility    0
parkingcost              0
parkingwalktime          0
sovcost                  0
hovcost                  0
tollcost                 0
tourpurpose              0
tourmode                 0
firststop                0
laststop                 0
zerototalstops           0
targettripmode           0
dtype: int64

In [8]:
# Frequency of each distinct parking cost value, most frequent first.
utilityvars.parkingcost.value_counts()

0.000000      13216142
125.000000       95773
943.000000       81421
412.000000       37549
137.604996       21844
                ...   
381.760692           1
89.240509            1
152.255060           1
2.973118             1
285.949867           1
Name: parkingcost, Length: 59411, dtype: int64

In [77]:
# Draw a fixed-size subsample for each of the 12 target trip modes.
# Rows are drawn without replacement with probability proportional to `walkttime`
# (pandas `weights=`), and the fixed seed keeps the draw reproducible.
SAMPLES_PER_MODE = 1000
SAMPLE_WEIGHT_COLUMN = 'walkttime'
SAMPLE_SEED = 1


def sample_trip_mode(frame, mode, n=SAMPLES_PER_MODE):
    """Return `n` weighted-random rows of `frame` whose `targettripmode` equals `mode`.

    Raises ValueError (from pandas) when fewer than `n` rows with a non-zero
    weight are available for that mode.
    """
    rows_for_mode = frame[frame['targettripmode'] == mode]
    return rows_for_mode.sample(n=n, weights=SAMPLE_WEIGHT_COLUMN, random_state=SAMPLE_SEED)


# The individual names mode1 ... mode12 are kept because a later cell concatenates them.
(mode1, mode2, mode3, mode4, mode5, mode6,
 mode7, mode8, mode9, mode10, mode11, mode12) = (
    sample_trip_mode(utilityvars, mode) for mode in range(1, 13)
)

In [78]:
# Stack the twelve per-mode subsamples row-wise into one frame (original row labels are kept).
per_mode_subsamples = [
    mode1, mode2, mode3, mode4, mode5, mode6,
    mode7, mode8, mode9, mode10, mode11, mode12,
]
combined_subsamples = pd.concat(per_mode_subsamples, axis=0)

In [79]:
# One-hot encode the categorical `tourpurpose` column; all other columns
# (including the two-valued indicator variables) are passed through unchanged.
new_utilityvars = pd.get_dummies(
    data=combined_subsamples,
    prefix=["tourpurpose"],
    columns=["tourpurpose"],
)
new_utilityvars

Unnamed: 0,activityid,age,gender,autosuf,numhouseholdpersons,income,oduden,oempden,ototint,dempden,sovdrivetime,hovdrivetime,tolldrivetime,tollcostsov,tollcosthov2,tollcosthov3,walkttime,walktotransitutility,drivetotransitutility,parkingcost,parkingwalktime,sovcost,hovcost,tollcost,tourmode,firststop,laststop,zerototalstops,targettripmode,tourpurpose_0,tourpurpose_1,tourpurpose_2,tourpurpose_3,tourpurpose_4,tourpurpose_5,tourpurpose_6,tourpurpose_7
5420299,5933728,7,True,2,2,2,15.0496,6.528248,3.0,28.221581,25.183311,25.183311,25.183311,0.0,0.0,0.0,74.541791,-999.0,-999.0,0.0,0.0,324.256782,324.256782,324.256782,1,False,False,True,1,0,0,0,0,1,0,0,0
10137151,14562853,3,False,2,5,4,3.4107,230.852676,22.0,27.689655,20.561739,20.561739,20.561739,0.0,0.0,0.0,68.671031,-999.0,-999.0,0.0,0.0,298.718978,298.718978,298.718978,1,False,False,True,1,1,0,0,0,0,0,0,0
459,13897599,6,False,1,5,5,7.3797,4.521993,1.0,137.408112,32.240913,32.231399,32.240913,0.0,0.0,0.0,125.903913,-999.0,-999.0,0.0,0.0,547.682008,547.825087,547.682008,1,False,False,False,1,1,0,0,0,0,0,0,0
3523186,5757716,7,False,1,2,4,6.2555,31.995995,1.0,6.055996,9.218620,9.218620,9.218620,0.0,0.0,0.0,30.619256,-999.0,-999.0,0.0,0.0,133.193759,133.193759,133.193759,1,False,False,False,1,0,0,0,1,0,0,0,0
1389833,9001967,7,False,2,1,2,13.1061,80.675232,3.0,24.416708,14.847696,14.847696,14.847696,0.0,0.0,0.0,51.645009,-999.0,-999.0,0.0,0.0,224.655784,224.655784,224.655784,1,False,False,False,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4649022,5565944,1,False,2,5,5,3.6818,46.511749,1.0,90.145737,5.535691,5.535691,5.535691,0.0,0.0,0.0,12.468397,-999.0,-999.0,0.0,0.0,54.237525,54.237525,54.237525,12,False,False,True,12,0,0,1,0,0,0,0,0
1014748,11693791,1,True,1,4,4,7.7022,19.270676,12.0,42.764721,2.246100,2.246100,2.246100,0.0,0.0,0.0,5.529111,-999.0,-999.0,0.0,0.0,24.051633,24.051633,24.051633,12,False,False,True,12,0,0,1,0,0,0,0,0
409363,512651,3,False,2,5,1,5.6995,14.991196,10.0,46.152000,19.167122,19.167122,19.167122,0.0,0.0,0.0,69.843901,-999.0,-999.0,0.0,0.0,303.820963,303.820963,303.820963,12,False,False,True,12,0,0,1,0,0,0,0,0
1561740,101043,1,True,1,3,2,6.4687,20.669909,8.0,19.647785,15.057140,15.057140,15.057140,0.0,0.0,0.0,34.377707,-999.0,-999.0,0.0,0.0,149.543022,149.543022,149.543022,12,False,False,True,12,0,0,1,0,0,0,0,0


In [80]:
# Prepare the modelling frame: drop `activityid` and `tourmode`, and move the
# label `targettripmode` to the last column.
# `errors='ignore'` keeps this cell re-runnable: on a second run `tourmode` and
# `activityid` are already gone, which would otherwise raise a KeyError.
feature_columns = new_utilityvars.drop(
    columns=['targettripmode', 'tourmode', 'activityid'], errors='ignore'
)
# Attach the label directly rather than self-merging on the index; an index merge
# would multiply rows if row labels were ever duplicated.
new_utilityvars = feature_columns.assign(targettripmode=new_utilityvars['targettripmode'])
new_utilityvars

Unnamed: 0,age,gender,autosuf,numhouseholdpersons,income,oduden,oempden,ototint,dempden,sovdrivetime,hovdrivetime,tolldrivetime,tollcostsov,tollcosthov2,tollcosthov3,walkttime,walktotransitutility,drivetotransitutility,parkingcost,parkingwalktime,sovcost,hovcost,tollcost,firststop,laststop,zerototalstops,tourpurpose_0,tourpurpose_1,tourpurpose_2,tourpurpose_3,tourpurpose_4,tourpurpose_5,tourpurpose_6,tourpurpose_7,targettripmode
5420299,7,True,2,2,2,15.0496,6.528248,3.0,28.221581,25.183311,25.183311,25.183311,0.0,0.0,0.0,74.541791,-999.0,-999.0,0.0,0.0,324.256782,324.256782,324.256782,False,False,True,0,0,0,0,1,0,0,0,1
10137151,3,False,2,5,4,3.4107,230.852676,22.0,27.689655,20.561739,20.561739,20.561739,0.0,0.0,0.0,68.671031,-999.0,-999.0,0.0,0.0,298.718978,298.718978,298.718978,False,False,True,1,0,0,0,0,0,0,0,1
459,6,False,1,5,5,7.3797,4.521993,1.0,137.408112,32.240913,32.231399,32.240913,0.0,0.0,0.0,125.903913,-999.0,-999.0,0.0,0.0,547.682008,547.825087,547.682008,False,False,False,1,0,0,0,0,0,0,0,1
3523186,7,False,1,2,4,6.2555,31.995995,1.0,6.055996,9.218620,9.218620,9.218620,0.0,0.0,0.0,30.619256,-999.0,-999.0,0.0,0.0,133.193759,133.193759,133.193759,False,False,False,0,0,0,1,0,0,0,0,1
1389833,7,False,2,1,2,13.1061,80.675232,3.0,24.416708,14.847696,14.847696,14.847696,0.0,0.0,0.0,51.645009,-999.0,-999.0,0.0,0.0,224.655784,224.655784,224.655784,False,False,False,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4649022,1,False,2,5,5,3.6818,46.511749,1.0,90.145737,5.535691,5.535691,5.535691,0.0,0.0,0.0,12.468397,-999.0,-999.0,0.0,0.0,54.237525,54.237525,54.237525,False,False,True,0,0,1,0,0,0,0,0,12
1014748,1,True,1,4,4,7.7022,19.270676,12.0,42.764721,2.246100,2.246100,2.246100,0.0,0.0,0.0,5.529111,-999.0,-999.0,0.0,0.0,24.051633,24.051633,24.051633,False,False,True,0,0,1,0,0,0,0,0,12
409363,3,False,2,5,1,5.6995,14.991196,10.0,46.152000,19.167122,19.167122,19.167122,0.0,0.0,0.0,69.843901,-999.0,-999.0,0.0,0.0,303.820963,303.820963,303.820963,False,False,True,0,0,1,0,0,0,0,0,12
1561740,1,True,1,3,2,6.4687,20.669909,8.0,19.647785,15.057140,15.057140,15.057140,0.0,0.0,0.0,34.377707,-999.0,-999.0,0.0,0.0,149.543022,149.543022,149.543022,False,False,True,0,0,1,0,0,0,0,0,12


In [81]:
# Split the modelling frame into a feature matrix X and a label vector Y.
# The label position is derived from the frame rather than hard-coded, so the
# split stays correct if one-hot encoding yields a different number of columns.
assert new_utilityvars.columns[-1] == 'targettripmode', "label must be the last column"
df_array = new_utilityvars.values
n_features = df_array.shape[1] - 1
X = df_array[:, :n_features]
Y = df_array[:, n_features]

In [82]:
# Hold out 33% of the rows for testing; the fixed seed makes the split reproducible.
seed, test_size = 7, 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

In [83]:
# Fit an XGBoost classifier with default hyper-parameters, evaluating on both
# the training and the held-out split during fitting (no per-round logging).
model = xgb.XGBClassifier()
evaluation_sets = [(X_train, y_train), (X_test, y_test)]
model.fit(X_train, y_train, eval_set=evaluation_sets, verbose=0)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=None)

In [84]:
# Round the predicted and the true labels to integers, as plain Python lists.
y_pred = list(map(round, model.predict(X_test)))
y_test = list(map(round, y_test))

In [85]:
# One 2x2 (one-vs-rest) confusion matrix per class.
metrics.multilabel_confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[[3463,  167],
        [ 179,  151]],

       [[3500,  146],
        [  84,  230]],

       [[3464,  156],
        [ 251,   89]],

       [[3452,  171],
        [ 202,  135]],

       [[3408,  212],
        [ 176,  164]],

       [[3501,  121],
        [ 159,  179]],

       [[3459,  169],
        [ 111,  221]],

       [[3501,  144],
        [ 142,  173]],

       [[3464,  155],
        [ 234,  107]],

       [[3434,  202],
        [ 164,  160]],

       [[3424,  218],
        [ 209,  109]],

       [[3564,   65],
        [  15,  316]]])

In [86]:
# Accuracy: share of held-out trips whose predicted mode equals the true mode.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.5136363636363637

In [87]:
# Sensitivity (recall), macro-averaged: unweighted mean of the per-class recalls.
sensitivity = metrics.recall_score(y_true=y_test, y_pred=y_pred, average='macro')
sensitivity

0.5153571648544141

In [88]:
# Precision, macro-averaged: unweighted mean of the per-class precisions.
precision = metrics.precision_score(y_true=y_test, y_pred=y_pred, average='macro')
precision

0.5041122581933385

In [89]:
# F1-score, computed as the harmonic mean of the macro-averaged precision and
# the macro-averaged recall (this is not an average of per-class F1 scores).
f1 = 2 * precision * sensitivity / (precision + sensitivity)
f1

0.5096726949871575

In [46]:
# 12x12 confusion matrix (rows: true mode, columns: predicted mode)
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[143,   5,  49,   0,  20,   1,  19,  27,  19,  33,  14,   0],
       [  4, 222,   1,  63,   1,  21,   0,   1,   0,   0,   1,   0],
       [ 44,   1,  85,   4,  84,   1,  28,  15,  19,  13,  29,  17],
       [  3,  85,   9, 136,   4,  91,   3,   0,   1,   3,   2,   0],
       [ 17,   1,  47,   3, 163,   3,  41,   7,  16,   9,  20,  13],
       [  0,  33,   4,  97,  14, 184,   2,   0,   1,   1,   2,   0],
       [  6,   0,  23,   1,  27,   0, 210,  32,  11,   4,  11,   7],
       [ 19,   1,  12,   0,  16,   0,  37, 165,  15,  21,  23,   6],
       [ 27,   1,  15,   1,  18,   0,  24,  24, 110,  59,  55,   7],
       [ 31,   9,   5,   5,   4,   3,   2,  12,  27, 162,  64,   0],
       [ 22,   3,  17,   1,  18,   2,   7,   9,  46,  67, 116,  10],
       [  0,   0,   1,   0,   3,   0,   6,   0,   0,   0,   3, 318]])

In [47]:
# normalized confusion matrix

|  | pred Drive Alone Free | pred Drive Alone Pay | pred HOV2 Free | pred HOV2 Pay | pred HOV3 Free | pred HOV3 Pay | pred Walk | pred Bike | pred Walk to Transit | pred Park and Ride | pred Kiss and Ride | pred School Bus |
| :- | :- | :- | :- | :- | :- | :- | :- | :- | :- | :- | :- | :- |
| Drive Alone Free | 0.43333333 | 0.01515152 | 0.14848485 | 0 | 0.06060606 | 0.0030303 | 0.05757576 | 0.08181818 | 0.05757576 | 0.1 | 0.04242424 | 0 |
| Drive Alone Pay | 0.01273885 | 0.70700637 | 0.00318471 | 0.20063694 | 0.00318471 | 0.06687898 | 0 | 0.00318471 | 0 | 0 | 0.00318471 | 0 |
| HOV2 Free | 0.12941176 | 0.00294118 | 0.25 | 0.01176471 | 0.24705882 | 0.00294118 | 0.08235294 | 0.04411765 | 0.05588235 | 0.03823529 | 0.08529412 | 0.05 |
| HOV2 Pay | 0.00890208 | 0.25222552 | 0.02670623 | 0.40356083 | 0.01186944 | 0.27002967 | 0.00890208 | 0 | 0.00296736 | 0.00890208 | 0.00593472 | 0 |
| HOV3 Free | 0.05 | 0.00294118 | 0.13823529 | 0.00882353 | 0.47941176 | 0.00882353 | 0.12058824 | 0.02058824 | 0.04705882 | 0.02647059 | 0.05882353 | 0.03823529 |
| HOV3 Pay | 0 | 0.09763314 | 0.01183432 | 0.28698225 | 0.04142012 | 0.5443787 | 0.00591716 | 0 | 0.00295858 | 0.00295858 | 0.00591716 | 0 |
| Walk | 0.01807229 | 0 | 0.06927711 | 0.00301205 | 0.0813253 | 0 | 0.63253012 | 0.09638554 | 0.03313253 | 0.01204819 | 0.03313253 | 0.02108434 |
| Bike | 0.06031746 | 0.0031746 | 0.03809524 | 0 | 0.05079365 | 0 | 0.11746032 | 0.52380952 | 0.04761905 | 0.06666667 | 0.07301587 | 0.01904762 |
| Walk to Transit | 0.07917889 | 0.00293255 | 0.04398827 | 0.00293255 | 0.05278592 | 0 | 0.07038123 | 0.07038123 | 0.32258065 | 0.17302053 | 0.16129032 | 0.02052786 |
| Park and Ride | 0.09567901 | 0.02777778 | 0.0154321 | 0.0154321 | 0.01234568 | 0.00925926 | 0.00617284 | 0.03703704 | 0.08333333 | 0.5 | 0.19753086 | 0 |
| Kiss and Ride | 0.06918239 | 0.00943396 | 0.05345912 | 0.00314465 | 0.05660377 | 0.00628931 | 0.02201258 | 0.02830189 | 0.14465409 | 0.21069182 | 0.36477987 | 0.03144654 |
| School Bus | 0 | 0 | 0.00302115 | 0 | 0.00906344 | 0 | 0.01812689 | 0 | 0 | 0 | 0.00906344 | 0.96072508 |

In [60]:
# metrics.confusion_matrix(y_test, y_pred)[11] / sum(metrics.confusion_matrix(y_test, y_pred)[11])

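1. Free modes are mostly confused with other free modes, and pay modes with other pay modes (free-free-free; pay-pay-pay).
2. Walk and Bike: the errors are distributed fairly evenly across the other modes.
3. Transit-related modes (walk to transit - kiss and ride - school bus): Walk to Transit and Kiss and Ride are often confused with each other and with Park and Ride, whereas School Bus is almost always predicted correctly (0.96).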

In [27]:
# check trips.csv

In [28]:
# Pair each activity's numeric target mode (utilityvars) with the mode recorded
# in trips, joining the two tables on activityid (inner join, the pandas default).
trip_mg = utilityvars.merge(trips, on=["activityid"]).loc[:, ['targettripmode', 'modechoice']]
trip_mg

Unnamed: 0,targettripmode,modechoice
0,1,DRIVE_ALONE_FREE
1,1,DRIVE_ALONE_FREE
2,1,DRIVE_ALONE_FREE
3,1,DRIVE_ALONE_FREE
4,1,DRIVE_ALONE_FREE
...,...,...
14047024,5,SHARED_3_HOV
14047025,3,SHARED_3_HOV
14047026,3,SHARED_3_HOV
14047027,3,SHARED_3_HOV


In [62]:
# Numeric targettripmode code -> mode label as spelled in trips' modechoice column.
TRIP_MODE_LABELS = {
    1: "DRIVE_ALONE_FREE",
    2: "DRIVE_ALONE_PAY",
    3: "SHARED_2_HOV",
    4: "SHARED_2_PAY",
    5: "SHARED_3_HOV",
    6: "SHARED_3_PAY",
    7: "WALK",
    8: "BIKE",
    9: "WALK_SET",
    10: "PNR_SET",
    11: "KNR_SET",
    12: "SCH_BUS",
}
# Assign the decoded column back rather than calling replace(..., inplace=True) on
# the column selection: the in-place form acts on an intermediate Series and, under
# pandas copy-on-write, leaves trip_mg unchanged. Re-running this cell is harmless,
# because already-decoded labels match none of the numeric keys.
trip_mg = trip_mg.assign(targettripmode=trip_mg["targettripmode"].replace(TRIP_MODE_LABELS))
trip_mg

Unnamed: 0,targettripmode,modechoice
0,DRIVE_ALONE_FREE,DRIVE_ALONE_FREE
1,DRIVE_ALONE_FREE,DRIVE_ALONE_FREE
2,DRIVE_ALONE_FREE,DRIVE_ALONE_FREE
3,DRIVE_ALONE_FREE,DRIVE_ALONE_FREE
4,DRIVE_ALONE_FREE,DRIVE_ALONE_FREE
...,...,...
14047024,SHARED_3_HOV,SHARED_3_HOV
14047025,SHARED_2_HOV,SHARED_3_HOV
14047026,SHARED_2_HOV,SHARED_3_HOV
14047027,SHARED_2_HOV,SHARED_3_HOV


In [63]:
# Share of trips whose decoded target mode equals the mode recorded in trips.
metrics.accuracy_score(y_true=trip_mg['targettripmode'], y_pred=trip_mg['modechoice'])

0.4792990745587554

In [64]:
# Macro-averaged recall of modechoice against targettripmode (targettripmode is the reference).
metrics.recall_score(y_true=trip_mg['targettripmode'], y_pred=trip_mg['modechoice'], average='macro')

0.19459360823852725

In [65]:
# Macro-averaged precision of modechoice against targettripmode (targettripmode is the reference).
metrics.precision_score(y_true=trip_mg['targettripmode'], y_pred=trip_mg['modechoice'], average='macro')

0.19574958766844489

In [66]:
# Cross-tabulate the two mode columns: rows follow targettripmode, columns follow modechoice.
metrics.confusion_matrix(y_true=trip_mg['targettripmode'], y_pred=trip_mg['modechoice'])

array([[  10170,   42876,       8,      97,     122,    1324,   15767,
             21,   12382,      34,   22051,    2307],
       [  21915, 4152601,    5193,    6747,    6837,    3568,  896572,
           4914,  620743,    5673,  326343,  104119],
       [      4,   13181,    1514,      36,      38,       0,    1986,
           1466,     838,    1188,      98,     145],
       [     29,   10215,      14,     334,     237,     215,    6232,
             49,    4156,      25,     720,    2089],
       [     30,    6027,      11,      56,     130,       0,    1737,
             27,     912,      24,     154,     966],
       [   1255,    2520,       0,      20,       0,   25297,   53723,
              4,   46013,      13,    7946,     465],
       [   7758, 1049745,    1296,    2554,    1824,   76525, 1242758,
           3813,  877007,    3858,  214324,   41679],
       [      0,    3390,     395,       9,      26,      81,    2315,
            851,    1369,     873,     193,      96],


In [90]:
# One subset of the modelling frame per age value, 0 through 7.
age0, age1, age2, age3, age4, age5, age6, age7 = (
    new_utilityvars[new_utilityvars['age'] == age_value] for age_value in range(8)
)

In [98]:
# Feature matrix X and label vector Y for the age0 subset.
# The label position is derived from the frame rather than hard-coded, so the
# split stays correct if the number of feature columns changes.
assert age0.columns[-1] == 'targettripmode', "label must be the last column"
df_array = age0.values
n_features = df_array.shape[1] - 1
X = df_array[:, :n_features]
Y = df_array[:, n_features]

In [99]:
# Same split settings as for the full sample: 33% held out, seed 7.
seed, test_size = 7, 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

In [101]:
# Refit a default XGBoost classifier on the current training split, evaluating on
# both the training and the held-out split during fitting (no per-round logging).
model = xgb.XGBClassifier()
evaluation_sets = [(X_train, y_train), (X_test, y_test)]
model.fit(X_train, y_train, eval_set=evaluation_sets, verbose=0)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=None)

In [102]:
# Round the predicted and the true labels to integers, as plain Python lists.
y_pred = list(map(round, model.predict(X_test)))
y_test = list(map(round, y_test))

In [103]:
# Accuracy on the current held-out split (the age0 subset when cells are run in order).
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.5541125541125541

In [104]:
# Macro-averaged recall on the current held-out split (the age0 subset when cells are run in order).
sensitivity = metrics.recall_score(y_true=y_test, y_pred=y_pred, average='macro')
sensitivity

0.43743448315816735

In [105]:
# Feature matrix X and label vector Y for the age1 subset.
# The label position is derived from the frame rather than hard-coded, so the
# split stays correct if the number of feature columns changes.
assert age1.columns[-1] == 'targettripmode', "label must be the last column"
df_array = age1.values
n_features = df_array.shape[1] - 1
X = df_array[:, :n_features]
Y = df_array[:, n_features]

In [106]:
# Same split settings as before: 33% held out, seed 7.
seed, test_size = 7, 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

In [107]:
# Refit a default XGBoost classifier on the current training split, evaluating on
# both the training and the held-out split during fitting (no per-round logging).
model = xgb.XGBClassifier()
evaluation_sets = [(X_train, y_train), (X_test, y_test)]
model.fit(X_train, y_train, eval_set=evaluation_sets, verbose=0)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=None)

In [108]:
# Round the predicted and the true labels to integers, as plain Python lists.
y_pred = list(map(round, model.predict(X_test)))
y_test = list(map(round, y_test))

In [109]:
# Accuracy on the current held-out split (the age1 subset when cells are run in order).
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.7039274924471299

Accuracy differs noticeably between the age subsets (0.55 for age 0 vs. 0.70 for age 1), so age could be an important feature.