In [34]:
%matplotlib inline
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder, Imputer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import recall_score, precision_score, f1_score, confusion_matrix
from sklearn.pipeline import Pipeline
import pickle
from imblearn.combine import SMOTEENN

## Read in data and set up DataFrame.

In [2]:
survey_place = pd.read_csv('data/caltrans_full_survey/survey_place.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
columns = ['arr_time', 'dep_time', 'trip_distance_miles', 'air_trip_distance_miles', 'prev_trip_duration_min', 'act_dur', 'act_cnt', 'tract_id', 'county_id', 'state_id', 'mode']
survey_place_less = survey_place[columns]

In [4]:
survey_place_less['arr_time'] = pd.to_datetime(survey_place_less['arr_time'], infer_datetime_format=True, format='%H:%M:%S').dt.hour
survey_place_less['dep_time'] = pd.to_datetime(survey_place_less['dep_time'], infer_datetime_format=True, format='%H:%M:%S').dt.hour

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


## Determine percent missing.
## Drop rows where "mode" is missing.
## Impute missing data.

In [5]:
def get_percentage_missing(series):
   ''' Calculates percentage of NaN values in DataFrame
   :param series: Pandas DataFrame object
   :return: float
   '''
   num = series.isnull().sum()
   den = len(series)
   return round(num/den, 7)

In [6]:
get_percentage_missing(survey_place_less)

arr_time                   0.000002
dep_time                   0.000002
trip_distance_miles        0.241332
air_trip_distance_miles    0.236207
prev_trip_duration_min     0.236207
act_dur                    0.000002
act_cnt                    0.000004
tract_id                   0.000011
county_id                  0.000011
state_id                   0.000002
mode                       0.236207
dtype: float64

In [7]:
survey_place_less.dropna(subset=['mode'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [8]:
get_percentage_missing(survey_place_less)

arr_time                   0.000000
dep_time                   0.000000
trip_distance_miles        0.006709
air_trip_distance_miles    0.000000
prev_trip_duration_min     0.000000
act_dur                    0.000000
act_cnt                    0.000003
tract_id                   0.000011
county_id                  0.000011
state_id                   0.000000
mode                       0.000000
dtype: float64

In [9]:
for col in survey_place_less:
    survey_place_less[col].fillna((survey_place_less[col].mean()), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [10]:
get_percentage_missing(survey_place_less)

arr_time                   0.0
dep_time                   0.0
trip_distance_miles        0.0
air_trip_distance_miles    0.0
prev_trip_duration_min     0.0
act_dur                    0.0
act_cnt                    0.0
tract_id                   0.0
county_id                  0.0
state_id                   0.0
mode                       0.0
dtype: float64

## Train-test split
## Map y to "transit" (1) and "not transit" (0)

In [11]:
y = survey_place_less['mode']

In [12]:
# X = survey_place_less[['arr_time', 'dep_time', 'trip_distance_miles', 'air_trip_distance_miles', 'prev_trip_duration_min', 'act_dur', 'act_cnt', 'tract_id', 'county_id', 'state_id']]
X = survey_place_less[['arr_time', 'dep_time', 'trip_distance_miles', 'air_trip_distance_miles', 'prev_trip_duration_min', 'act_dur', 'act_cnt', 'state_id']]

In [13]:
dict_not_transit = dict((key, 0) for key in range(15))
dict_transit = dict((key, 1) for key in range(15,30))
mode_dict = {**dict_not_transit, **dict_transit}

In [14]:
ymap = y.map(mode_dict)

In [15]:
X, ymap = SMOTEENN().fit_sample(X, ymap)

In [16]:
pickle.dump(X, open('X', 'wb'))
pickle.dump(ymap, open('ymap', 'wb'))

In [17]:
print(f"{ymap[ymap>0].count()} of {len(ymap)} ({ymap[ymap>0].count()/len(ymap)*100:.2f}%) are public transit.")

AttributeError: 'numpy.ndarray' object has no attribute 'count'

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, ymap)
# X_train, X_test, y_train, y_test = train_test_split(X, y)

In [26]:
gbc = GradientBoostingClassifier()

In [None]:
gbc.fit(X_train, y_train);

In [None]:
y_pred_gbc = gbc.predict(X_test)

In [27]:
abc = AdaBoostClassifier()

In [None]:
abc.fit(X_train, y_train);

In [None]:
y_pred_abc = abc.predict(X_test)

In [28]:
rfc = RandomForestClassifier()

In [None]:
rfc.fit(X_train, y_train);

In [None]:
y_pred_rfc = rfc.predict(X_test)

In [33]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
gbc_scaled = GradientBoostingClassifier()
gbc_scaled.fit(X_train_scaled, y_train);

In [None]:
y_pred_gbc_scaled = gbc_scaled.predict(X_test_scaled)

In [None]:
abc_scaled = AdaBoostClassifier()
abc_scaled.fit(X_train_scaled, y_train);

In [None]:
y_pred_abc_scaled = abc_scaled.predict(X_test_scaled)

In [None]:
rfc_scaled = RandomForestClassifier()
rfc_scaled.fit(X_train_scaled, y_train);

In [None]:
y_pred_rfc_scaled = rfc_scaled.predict(X_test_scaled)

In [None]:
gbcgrid = GridSearchCV(gbc, param_grid={'n_estimators':[50, 75, 100, 150, 200, 500], 'max_depth':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20], 'learning_rate':[.01, .05, .1, 1, 2]}, scoring='f1', n_jobs=-1, refit=True)

In [None]:
gbcgrid.fit(X_train, y_train)

In [None]:
gbcgrid.best_params_

In [30]:
abcgrid = GridSearchCV(abc, param_grid={'n_estimators':[50, 75, 100, 150, 200, 500], 'learning_rate':[.01, .05, .1, 1, 2]}, scoring='f1', n_jobs=-1, refit=True)

In [31]:
abcgrid.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [10, 50], 'learning_rate': [0.1, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='f1', verbose=0)

In [32]:
abcgrid.best_params_

{'learning_rate': 1, 'n_estimators': 50}

In [None]:
rfcgrid = GridSearchCV(rfc, param_grid={'n_estimators':[50, 75, 100, 150, 200, 500], 'max_depth':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20], 'max_depth':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20], 'class_weight':[None, 'balanced_subsample', 'balanced']}, scoring='f1', n_jobs=-1, refit=True)

In [None]:
rfcgrid.fit(X_train, y_train)

In [None]:
rfcgrid.best_params_

In [None]:
for model in [gbcgrid, adagrid, rfcgrid]:
    print("------------------------------")
    print("------------------------------")
    print("------------------------------")
    print(model)
    y_pred = model.predict(X_test)
    print("\t------------------------------")
    print(f"\tF1 score: {f1_score(y_test, y_pred):.3f}")
    print(f"\tPrecision score: {precision_score(y_test, y_pred):.3f}")
    print(f"\tRecall score: {recall_score(y_test, y_pred):.3f}")
    print("\t------------------------------")
#     print("\tMost important features:\n")
#     for imp, feat in sorted(zip(model.feature_importances_, survey_place_less.columns), reverse=True):
#         print(f"\t{feat}: {imp:.3f}")
# for model in [gbc_scaled, abc_scaled, rfc_scaled]:
#     print("------------------------------")
#     print("------------------------------")
#     print("------------------------------")
#     print(model)
#     y_pred = model.predict(X_test_scaled)
#     print("\t------------------------------")
#     print(f"\tRecall score: {recall_score(y_test, y_pred):.3f}")
#     print("\t------------------------------")
    print("\tMost important features:\n")
    for imp, feat in sorted(zip(model.feature_importances_, survey_place_less.columns), reverse=True):
#         print(f"\t{feat}: {imp:.3f}")

In [None]:
pickle.dump(gbcgrid, open('gbc.pkl', 'wb'))

In [None]:
pickle.dump(abcgrid, open('abc.pkl', 'wb'))

In [None]:
pickle.dump(rfcgrid, open('rfc.pkl', 'wb'))

In [None]:
gbc_scaled = GridSearchCV(gbc, param_grid={'n_estimators':[50, 75, 100, 150, 200, 500], 'max_depth':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20], 'learning_rate':[.01, .05, .1, 1, 2]}, scoring='f1', n_jobs=-1, refit=True)

In [None]:
gbc_scaled.fit(X_train_scaled, y_train)

In [None]:
gbc_scaled.best_params_

In [None]:
abc_scaled = GridSearchCV(abc, param_grid={'n_estimators':[50, 75, 100, 150, 200, 500], 'learning_rate':[.01, .05, .1, 1, 2]}, scoring='f1', n_jobs=-1, refit=True)

In [None]:
abc_scaled.fit(X_train_scaled, y_train)

In [None]:
abc_scaled.best_params_

In [None]:
rfc_scaled = GridSearchCV(rfc, param_grid={'n_estimators':[50, 75, 100, 150, 200, 500], 'max_depth':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20], 'max_depth':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20], 'class_weight':[None, 'balanced_subsample', 'balanced']}, scoring='f1', n_jobs=-1, refit=True)

In [None]:
rfc_scaled.fit(X_train_scaled, y_train)

In [None]:
rfc_scaled.best_params_

In [None]:
for model in [gbc_scaled, abc_scaled, rfc_scaled]:
    print("------------------------------")
    print("------------------------------")
    print("------------------------------")
    print(model)
    y_pred = model.predict(X_test_scaled)
    print("\t------------------------------")
       print(f"\tF1 score: {f1_score(y_test, y_pred):.3f}")
    print(f"\tPrecision score: {precision_score(y_test, y_pred):.3f}")
    print(f"\tRecall score: {recall_score(y_test, y_pred):.3f}")
    print("\t------------------------------")

In [None]:
pickle.dump(gbc_scaled, open('gbc_scaled.pkl', 'wb'))

In [None]:
pickle.dump(abc_scaled, open('abc_scaled.pkl', 'wb'))

In [None]:
pickle.dump(rfc_scaled, open('rfc_scaled.pkl', 'wb'))