In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
import seaborn as sns

In [2]:
trips = pd.read_csv('data/trips.csv')
trips

Unnamed: 0,activityid,personid,modechoice
0,1,3,WALK
1,2,3,WALK
2,3,3,WALK
3,4,3,WALK
4,5,3,WALK
...,...,...,...
14047024,17147588,4113994,DRIVE_ALONE_FREE
14047025,17147589,4113994,DRIVE_ALONE_FREE
14047026,17147590,4113994,DRIVE_ALONE_FREE
14047027,17147592,4113995,SHARED_3_HOV


In [3]:
utility = pd.read_csv('data/utilityvars.csv')
utility

Unnamed: 0,activityid,age,gender,autosuf,numhouseholdpersons,income,oduden,oempden,ototint,dempden,...,parkingwalktime,sovcost,hovcost,tollcost,tourpurpose,tourmode,firststop,laststop,zerototalstops,targettripmode
0,12150818,4,True,2,2,4,9.633200,11.594058,4.0,67.659134,...,0.0,46.447229,46.447229,46.447229,0,5,True,False,False,1
1,3018709,6,False,2,2,1,8.304700,70.117912,12.0,32.273338,...,0.0,495.225231,495.225231,495.225231,0,1,False,False,False,1
2,8809200,6,False,2,6,3,10.189500,47.538746,3.0,13.567157,...,0.0,462.137117,462.137117,462.137117,0,1,False,False,True,1
3,11332534,4,False,2,1,2,13.393200,21.983759,4.0,51.544109,...,0.0,342.291418,342.291418,342.291418,0,1,False,False,False,1
4,6130767,6,False,2,3,2,13.274000,6.568953,3.0,8.480003,...,0.0,310.186071,310.186071,310.186071,0,1,False,False,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14047024,10138525,2,True,2,4,2,35.460899,286.328949,14.0,12.385718,...,0.0,842.672156,842.672156,842.672156,5,5,False,False,False,5
14047025,16270294,7,False,2,4,3,17.207199,19.868286,13.0,1.423494,...,0.0,13.350623,13.350623,13.350623,5,3,False,False,False,3
14047026,16270295,7,False,2,4,3,16.574200,1.423494,13.0,88.151642,...,0.0,137.377041,137.377041,137.377041,5,3,False,False,False,3
14047027,10138526,2,True,2,4,2,22.916100,12.385718,14.0,8.472905,...,0.0,5.447711,5.447711,5.447711,5,5,False,False,False,3


In [14]:
# grab labels 

utility_simple = utility[['activityid', 'targettripmode']]
utility_simple

Unnamed: 0,activityid,targettripmode
0,12150818,1
1,3018709,1
2,8809200,1
3,11332534,1
4,6130767,1
...,...,...
14047024,10138525,5
14047025,16270294,3
14047026,16270295,3
14047027,10138526,3


In [15]:
# combine trips and utility dataframes 

util_trips = utility_simple.join(trips.set_index('activityid'), on='activityid')
util_trips

Unnamed: 0,activityid,targettripmode,personid,modechoice
0,12150818,1,2382494,DRIVE_ALONE_FREE
1,3018709,1,583761,DRIVE_ALONE_FREE
2,8809200,1,1724631,DRIVE_ALONE_FREE
3,11332534,1,2219806,DRIVE_ALONE_FREE
4,6130767,1,1198716,DRIVE_ALONE_FREE
...,...,...,...,...
14047024,10138525,5,1984999,SHARED_3_HOV
14047025,16270294,3,3197700,SHARED_3_HOV
14047026,16270295,3,3197700,SHARED_3_HOV
14047027,10138526,3,1984999,SHARED_3_HOV


In [16]:
util_trips['modechoice'].unique()

array(['DRIVE_ALONE_FREE', 'SHARED_3_HOV', 'WALK_SET', 'SHARED_2_HOV',
       'PNR_SET', 'WALK', 'SCH_BUS', 'SHARED_3_PAY', 'KNR_SET', 'BIKE',
       'DRIVE_ALONE_PAY', 'SHARED_2_PAY'], dtype=object)

In [17]:
util_trips['modechoice'] = np.where(util_trips['modechoice'] == 'DRIVE_ALONE_FREE', 1, \
                                np.where(util_trips['modechoice'] == 'DRIVE_ALONE_PAY',2,\
                                np.where(util_trips['modechoice'] == 'SHARED_2_HOV', 3,\
                                np.where(util_trips['modechoice'] == 'SHARED_2_PAY', 4,\
                                np.where(util_trips['modechoice'] == 'SHARED_3_HOV', 5,\
                                np.where(util_trips['modechoice'] == 'SHARED_3_PAY', 6,\
                                np.where(util_trips['modechoice'] == 'WALK', 7,\
                                np.where(util_trips['modechoice'] == 'BIKE', 8,\
                                np.where(util_trips['modechoice'] == 'WALK_SET', 9,\
                                np.where(util_trips['modechoice'] == 'PNR_SET', 10,\
                                np.where(util_trips['modechoice'] == 'KNR_SET', 11,12)))))))))))

In [18]:
labels = util_trips['targettripmode']
preds = util_trips['modechoice']
metrics.confusion_matrix(labels, preds)

array([[4152601,    5193,  896572,    4914,  620743,    5673,  326343,
          21915,  104119,    6837,    6747,    3568],
       [  13181,    1514,    1986,    1466,     838,    1188,      98,
              4,     145,      38,      36,       0],
       [1049745,    1296, 1242758,    3813,  877007,    3858,  214324,
           7758,   41679,    1824,    2554,   76525],
       [   3390,     395,    2315,     851,    1369,     873,     193,
              0,      96,      26,       9,      81],
       [ 512652,     840, 1003419,    2908, 1013856,    3665,  179717,
           6381,   28680,     866,    1648,   76531],
       [   2423,     253,    3226,     894,    2710,    1274,     267,
              0,      78,      13,      11,     179],
       [ 270611,     179,  228300,     385,  200394,     353,  260649,
          21071,   10448,     639,    1384,    9361],
       [  42876,       8,   15767,      21,   12382,      34,   22051,
          10170,    2307,     122,      97,    1324],


In [19]:
# accuracy
accuracy = metrics.accuracy_score(labels, preds)
accuracy

0.4792990745587554

In [20]:
# sensitivity
sensitivity = metrics.recall_score(labels, preds, average = 'macro')
sensitivity

0.19459360823852725

In [21]:
# precision
precision = metrics.precision_score(labels, preds, average = 'macro')
precision

0.19574958766844486

In [22]:
# F1-score
f1 = (2 * precision * sensitivity) / (precision + sensitivity)
f1

0.19516988626943932