In [115]:
%matplotlib inline
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder, Imputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import recall_score, precision_score

## Read in data and set up DataFrame.

In [19]:
survey_place = pd.read_csv('caltrans_full_survey/survey_place.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [20]:
columns = ['arr_time', 'dep_time', 'trip_distance_miles', 'air_trip_distance_miles', 'prev_trip_duration_min', 'act_dur', 'act_cnt', 'tract_id', 'county_id', 'state_id', 'mode']
survey_place_less = survey_place[columns]

In [21]:
survey_place_less['arr_time'] = pd.to_datetime(survey_place_less['arr_time'], infer_datetime_format=True, format='%H:%M:%S').dt.hour
survey_place_less['dep_time'] = pd.to_datetime(survey_place_less['dep_time'], infer_datetime_format=True, format='%H:%M:%S').dt.hour

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


## Determine percent missing.
## Drop rows where "mode" is missing.
## Impute missing data.

In [22]:
def get_percentage_missing(series):
   ''' Calculates percentage of NaN values in DataFrame
   :param series: Pandas DataFrame object
   :return: float
   '''
   num = series.isnull().sum()
   den = len(series)
   return round(num/den, 7)

In [23]:
get_percentage_missing(survey_place_less)

arr_time                   0.000002
dep_time                   0.000002
trip_distance_miles        0.241332
air_trip_distance_miles    0.236207
prev_trip_duration_min     0.236207
act_dur                    0.000002
act_cnt                    0.000004
tract_id                   0.000011
county_id                  0.000011
state_id                   0.000002
mode                       0.236207
dtype: float64

In [24]:
survey_place_less.dropna(subset=['mode'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [25]:
get_percentage_missing(survey_place_less)

arr_time                   0.000000
dep_time                   0.000000
trip_distance_miles        0.006709
air_trip_distance_miles    0.000000
prev_trip_duration_min     0.000000
act_dur                    0.000000
act_cnt                    0.000003
tract_id                   0.000011
county_id                  0.000011
state_id                   0.000000
mode                       0.000000
dtype: float64

In [26]:
for col in survey_place_less:
    survey_place_less[col].fillna((survey_place_less[col].mean()), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [27]:
get_percentage_missing(survey_place_less)

arr_time                   0.0
dep_time                   0.0
trip_distance_miles        0.0
air_trip_distance_miles    0.0
prev_trip_duration_min     0.0
act_dur                    0.0
act_cnt                    0.0
tract_id                   0.0
county_id                  0.0
state_id                   0.0
mode                       0.0
dtype: float64

## Train-test split
## Map y to "transit" (1) and "not transit" (0)

In [51]:
y = survey_place_less['mode']

In [52]:
# X = survey_place_less[['arr_time', 'dep_time', 'trip_distance_miles', 'air_trip_distance_miles', 'prev_trip_duration_min', 'act_dur', 'act_cnt', 'tract_id', 'county_id', 'state_id']]
X = survey_place_less[['arr_time', 'dep_time', 'trip_distance_miles', 'air_trip_distance_miles', 'prev_trip_duration_min', 'act_dur', 'act_cnt', 'state_id']]

In [53]:
dict_not_transit = dict((key, 0) for key in range(15))
dict_transit = dict((key, 1) for key in range(15,30))
mode_dict = {**dict_not_transit, **dict_transit}

In [54]:
ymap = y.map(mode_dict)

In [55]:
print(f"{ymap[ymap>0].count()} of {len(ymap)} ({ymap[ymap>0].count()/len(ymap)*100:.2f}%) are public transit.")

12928 of 351745 (3.68%) are public transit.


In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, ymap)
# X_train, X_test, y_train, y_test = train_test_split(X, y)

In [57]:
X_train

Unnamed: 0,arr_time,dep_time,trip_distance_miles,air_trip_distance_miles,prev_trip_duration_min,act_dur,act_cnt,state_id
35448,14.0,2.0,8.951638,4.427572,60.0,749.0,1.0,6.0
356624,11.0,12.0,0.260377,0.190487,10.0,69.0,1.0,6.0
76271,8.0,18.0,3.548570,3.146544,25.0,625.0,1.0,6.0
62112,13.0,13.0,4.065800,1.446258,10.0,5.0,1.0,6.0
214535,15.0,15.0,5.070823,4.411265,1.0,6.0,1.0,6.0
82177,15.0,16.0,1.730520,0.794309,15.0,45.0,1.0,6.0
23784,17.0,2.0,2.246039,0.790249,7.0,562.0,1.0,6.0
79073,11.0,13.0,2.304995,1.621878,10.0,75.0,1.0,6.0
436613,13.0,15.0,3.589579,2.657075,15.0,75.0,1.0,6.0
282045,13.0,14.0,0.287266,0.212414,5.0,25.0,1.0,6.0


In [77]:
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train);

In [78]:
y_pred_gbc = gbc.predict(X_test)

In [79]:
recall_score(y_test, y_pred_gbc)

0.32340025094102887

In [80]:
abc = AdaBoostClassifier()
abc.fit(X_train, y_train);

In [81]:
y_pred_abc = abc.predict(X_test)

In [82]:
recall_score(y_test, y_pred_abc)

0.27885821831869512

In [83]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train);

In [112]:
y_pred_rfc = rfc.predict(X_test)

In [113]:
recall_score(y_test, y_pred_rfc)

0.43789209535759099

In [102]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [103]:
gbc_scaled = GradientBoostingClassifier()
gbc_scaled.fit(X_train_scaled, y_train);

In [117]:
y_pred_gbc_scaled = gbc_scaled.predict(X_test_scaled)

In [119]:
recall_score(y_test, y_pred_gbc_scaled)

0.32465495608531997

In [106]:
abc_scaled = AdaBoostClassifier()
abc_scaled.fit(X_train_scaled, y_train);

In [120]:
y_pred_abc_scaled = abc_scaled.predict(X_test_scaled)

In [121]:
recall_score(y_test, y_pred_abc_scaled)

0.27885821831869512

In [109]:
rfc_scaled = RandomForestClassifier()
rfc_scaled.fit(X_train_scaled, y_train);

In [122]:
y_pred_rfc_scaled = rfc_scaled.predict(X_test_scaled)

In [123]:
recall_score(y_test, y_pred_rfc_scaled)

0.44259723964868258

In [None]:
for imp, feat in sorted(zip(gbc.feature_importances_, survey_place_less.columns), reverse=True):
    print(f"{feat}: {imp:.2f}")