In [67]:
import pandas as pd
import numpy as np

from sklearn.metrics import classification_report
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier

block_meta = pd.read_csv('block_meta.csv')

data = pd.read_hdf('model_data_6H.h5')
data.drop('CaseID', axis = 1, inplace = True)

data['has_poop'] = data['poop'].groupby(data['block_fips']).transform('sum')
data = data[data['has_poop']>0]

block_neigh = pd.read_csv('block_neigh.csv')
block_neigh = block_neigh[block_neigh['nbrhood_name'].notnull()]
block_neigh = block_neigh.loc[:,[
    
    'block_fips',
    'nbrhood_name'
]]

In [68]:
data = data.merge(block_neigh, on = 'block_fips')

data['Opened_rnd'] = pd.to_datetime(data['Opened_rnd'], format="%Y-%m-%d %H:%M:%S")
data['day_of_week'] = data['Opened_rnd'].dt.dayofweek
data['month'] = data['Opened_rnd'].dt.month
data['hour'] = data['Opened_rnd'].dt.hour

data['winter'] = np.where(data['month'].isin([1, 2, 3]), 1, 0)
data['spring'] = np.where(data['month'].isin([4, 5, 6]), 1, 0)
data['summer'] = np.where(data['month'].isin([7, 8, 9]), 1, 0)
data['fall'] = np.where(data['month'].isin([10, 11, 12]), 1, 0)

In [69]:
neigh_dummies = pd.get_dummies(data['nbrhood_name'])
data = pd.concat((data,neigh_dummies), axis=1)
del neigh_dummies

month_dummies = pd.get_dummies(data['month'], prefix='month')
data = pd.concat((data,month_dummies), axis=1)
del month_dummies

hour_dummies = pd.get_dummies(data['hour'], prefix='hour')
data = pd.concat((data,hour_dummies), axis=1)
del hour_dummies


In [70]:
#data.groupby('nbrhood_name').mean().sort_values('poop', ascending = False)

data = data.merge(block_meta, on='block_fips')

In [78]:
data_small = data[data['nbrhood_name']=='Tenderloin']

In [98]:
X = data_small[[
    
 'winter',
 'spring',
 'summer',
 'fall',
 'hour_0',
 'hour_6',
 'hour_12',
 'hour_18',
 'UNIVPROX',
 'SIGNALIZED',
 'PKGMETERS',
 'MAXPCTSLPE',
 'MODEL6_VOL',
 'HH_PEDMODE',
 'PCOL_04_09',
 'PCOL_RATE',
 'HH_income'
]]

y = data_small['poop']
#del data

In [165]:
from sklearn.preprocessing import StandardScaler

X_imp = Imputer().fit_transform(X)
X_scale = StandardScaler().fit_transform(X_imp)

In [166]:
X_train, X_test, y_train, y_test = train_test_split(X_imp, y,
                                                    stratify=y,
                                                    test_size=0.25,
                                                    random_state=32)

In [167]:
from sklearn.exceptions import UndefinedMetricWarning
import warnings
warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)

from sklearn.linear_model import LogisticRegression

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.svm import SVC

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import Imputer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import StratifiedKFold

scoring = {'AUC': 'roc_auc',
           #'F1': make_scorer(f1_score)
          #'Accuracy': make_scorer(accuracy_score),
          'Precision': make_scorer(precision_score),
          'Recall': make_scorer(recall_score)
          }

gs_dt = GridSearchCV(DecisionTreeClassifier(random_state=42),
                     param_grid={'criterion': ['gini', 'entropy'],
                                 'max_depth': range(1,6)},
                     scoring=scoring,
                     cv=StratifiedKFold(),
                     refit='AUC')
gs_dt.fit(X_scale, y)
dt_results = gs_dt.cv_results_

gs_lr = GridSearchCV(LogisticRegression(),
                     param_grid = {'penalty':['l1','l2'],'C':[0.00001,0.0001,0.001,0.01,0.1,1,10]}, 
                     scoring=scoring,
                     refit='AUC')
gs_lr.fit(X_scale, y)
lr_results = gs_lr.cv_results_


#GBT = GradientBoostingClassifier(learning_rate=0.01, n_estimators=150, max_depth=5)
#GBT = GBT.fit(X_train, y_train)

RDF = RandomForestClassifier(class_weight='balanced',
                             max_depth = 3,
                             n_estimators = 50,
                             criterion = 'entropy')

RDF = RDF.fit(X_train, y_train)

In [168]:
print('Decision Tree\n\nBest Params')
print(gs_dt.best_params_)
print('Recall')
print(np.mean(dt_results['mean_test_Recall']))
print('Precision')
print(np.mean(dt_results['mean_test_Precision']))

print('\nLogisticRegression')
print(gs_lr.best_params_)
print('Recall')
print(np.mean(lr_results['mean_test_Recall']))
print('Precision')
print(np.mean(lr_results['mean_test_Precision']))

Decision Tree

Best Params
{'criterion': 'entropy', 'max_depth': 2}
Recall
0.040413108073138373
Precision
0.013817618464533717

LogisticRegression
{'C': 10, 'penalty': 'l1'}
Recall
0.055398068982651635
Precision
0.049255725028090766


In [116]:
print(classification_report(y_train, RDF.predict(X_train)))
print(classification_report(y_test, RDF.predict(X_test)))

             precision    recall  f1-score   support

          0       0.99      0.77      0.87     67808
          1       0.14      0.88      0.25      2965

avg / total       0.96      0.78      0.84     70773

             precision    recall  f1-score   support

          0       0.99      0.77      0.87     22603
          1       0.14      0.86      0.24       988

avg / total       0.96      0.77      0.84     23591



In [117]:
pd.DataFrame({'feat':X.columns, 'importance':RDF.feature_importances_}).sort_values('importance', ascending = False)

Unnamed: 0,feat,importance
6,hour_12,0.462709
5,hour_6,0.213235
4,hour_0,0.183739
7,hour_18,0.063795
16,HH_income,0.035662
12,MODEL6_VOL,0.009154
13,HH_PEDMODE,0.008351
10,PKGMETERS,0.008259
11,MAXPCTSLPE,0.007237
15,PCOL_RATE,0.003573


In [97]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 94364 entries, 1230104 to 3279178
Data columns (total 18 columns):
winter        94364 non-null int64
spring        94364 non-null int64
summer        94364 non-null int64
fall          94364 non-null int64
hour_0        94364 non-null uint8
hour_6        94364 non-null uint8
hour_12       94364 non-null uint8
hour_18       94364 non-null uint8
TOTEMP2       0 non-null float64
UNIVPROX      76674 non-null float64
SIGNALIZED    76674 non-null float64
PKGMETERS     76674 non-null float64
MAXPCTSLPE    76674 non-null float64
MODEL6_VOL    76674 non-null float64
HH_PEDMODE    76674 non-null float64
PCOL_04_09    76674 non-null float64
PCOL_RATE     76674 non-null float64
HH_income     94364 non-null object
dtypes: float64(9), int64(4), object(1), uint8(4)
memory usage: 13.7+ MB
