In [324]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_validate, GridSearchCV, KFold
from sklearn.metrics import f1_score, recall_score, precision_score, roc_auc_score, log_loss, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN

In [342]:
modeldata = pd.read_pickle('/home/michael/chicagohealthinspections/data/model.pkl')

In [343]:
modeldata.head()

Unnamed: 0,LocId,id,y,rating,inspcount,reinspection,failpct,price,prevfails,breakfast_brunch,pizza,mexican,burgers,complaint,sandwiches
0,1,el-ranchito-restaurant-chicago-5,0,2.0,1,0,0.0,1,0,0,1,1,0,1,0
1,1,el-ranchito-restaurant-chicago-5,0,2.0,2,0,0.0,1,0,0,1,1,0,1,0
2,1,el-ranchito-restaurant-chicago-5,0,2.0,3,0,0.0,1,0,0,1,1,0,1,0
3,1,el-ranchito-restaurant-chicago-5,1,2.0,4,0,0.0,1,0,0,1,1,0,1,0
4,1,el-ranchito-restaurant-chicago-5,0,2.0,5,1,0.2,1,1,0,1,1,0,1,0


In [187]:
def assmodel(features, target, model, score):
    '''
    Assesses model performance via Cross validation and also shows the confusion matrix for the true fit
    '''
    
    model.fit(features,target)
    logresult = cross_validate(model, X_resampled, y_resampled, 
                               scoring = score, cv=10, n_jobs=-1,return_train_score=True)
    print('train score {}\ntests score {}'.format(logresult['train_score'].mean(),logresult['test_score'].mean()))
    print(confusion_matrix(target,model.predict(features)))

In [270]:
def smotetrain(features, target, model, score, cv = 10, random_state = 42):
    '''
    Cross Validation of Models with oversampling
    We need to oversample only the train set to prevent information leak
    '''
    folds = KFold(n_splits = cv, random_state = random_state)
    
    ros = SMOTE(random_state=42)
    
    train_score = []
    test_score = []
    
    for train_index, test_index in folds.split(features):
        X_train, X_test = features.loc[train_index], features.loc[test_index]
        y_train, y_test = target.loc[train_index], target.loc[test_index]
        
        X_resampled, y_resampled = ros.fit_sample(X_train, y_train) 
        
        model.fit(X_resampled,y_resampled)
        
        trscore = f1_score(y_resampled,model.predict(X_resampled))
        train_score.append(trscore)
        
        tescore = f1_score(y_test,model.predict(X_test))
        test_score.append(tescore)
    
    mean_train = sum(train_score)/len(train_score)
    mean_test = sum(test_score)/len(test_score)
    
    print('train score {}\ntests score {}'.format(mean_train,mean_test))
    #print(train_score)
    #print(test_score)

In [218]:
X_train = modeldata.copy(deep=True)
y = X_train.pop('y')

model = KNeighborsClassifier(n_neighbors = 10)
assmodel(X_resampled, y_resampled, model, 'f1_micro')

train score 0.6218993690102512
tests score 0.5848663822207203
[[44554 13714]
 [30276 27992]]


In [220]:
X_train = modeldata.copy(deep=True)
y = X_train.pop('y')

model = KNeighborsClassifier(n_neighbors = 10, n_jobs=-1)
smotetrain(X_train, y, model, 'f1_micro')

train score 0.5824437129911683
tests score 0.2532233833118129


In [196]:
model = LogisticRegression(class_weight='balanced')

assmodel(X_train, y, model, 'f1_micro')
assmodel(X_resampled, y_resampled, model, 'f1_micro')

train score 0.5441789968826574
tests score 0.5266003560375776
[[31196 27072]
 [ 6461  7555]]
train score 0.5441789968826574
tests score 0.5266003560375776
[[30903 27365]
 [26021 32247]]


In [217]:
X_train = modeldata.copy(deep=True)
y = X_train.pop('y')

model = LogisticRegression()
smotetrain(X_train, y, model, 'f1_micro')

train score 0.5496312380777392
tests score 0.3083016353370237


In [223]:
model = GaussianNB()
assmodel(X_resampled, y_resampled, model, 'f1_micro')

train score 0.5337358827448779
tests score 0.519684177041768
[[23411 34857]
 [19695 38573]]


In [229]:
X_train = modeldata.copy(deep=True)
y = X_train.pop('y')

model = GaussianNB()
smotetrain(X_train, y, model, 'f1_micro')

train score 0.5882557595769611
tests score 0.3082941545326364


In [185]:
model = GaussianNB()
assmodel(X_resampled[:,0:4], y_resampled, model, 'f1_micro')

train score 0.5333220928533879
tests score 0.5037407010265258
[[32901 25367]
 [30209 28059]]


In [199]:
model = GradientBoostingClassifier(n_estimators = 100,
                                   learning_rate = 0.1,
                                   max_features = 2,
                                   max_depth = 4, 
                                   random_state=42)

assmodel(X_resampled, y_resampled, model, 'f1_micro')

train score 0.6408653945558928
tests score 0.6213097362556528
[[37787 20481]
 [21053 37215]]


In [202]:
model = GradientBoostingClassifier(n_estimators = 100,
                                   learning_rate = 0.1,
                                   max_features = 2,
                                   max_depth = 5, 
                                   random_state=42)

assmodel(X_resampled[:,0:4], y_resampled, model, 'f1_micro')

train score 0.6520989258847065
tests score 0.6326369703967544
[[42753 15515]
 [29058 29210]]


In [228]:
X_train = modeldata.copy(deep=True)
y = X_train.pop('y')

model = GradientBoostingClassifier(n_estimators = 100,
                                   learning_rate = 0.1,
                                   max_features = 2,
                                   max_depth = 5, 
                                   random_state=42)

smotetrain(X_train, y, model, 'f1_micro')

train score 0.6504718654243878
tests score 0.29478804195833624


In [235]:
X_train = modeldata.copy(deep=True)
y = X_train.pop('y')

model = GradientBoostingClassifier(n_estimators = 100,
                                   learning_rate = 0.1,
                                   max_features = 1,
                                   max_depth = 4, 
                                   random_state=42)

smotetrain(X_train, y, model, 'f1_micro')

train score 0.62807971669144
tests score 0.30619993097517273


In [236]:
X_train = modeldata.copy(deep=True)
y = X_train.pop('y')

model = GradientBoostingClassifier(n_estimators = 100,
                                   learning_rate = 0.1,
                                   max_features = 1,
                                   max_depth = 2, 
                                   random_state=42)

smotetrain(X_train, y, model, 'f1_micro')

train score 0.5967791179120897
tests score 0.3141904618947153


In [240]:
X_train = modeldata.copy(deep=True)
y = X_train.pop('y')

model = GradientBoostingClassifier(n_estimators = 100,
                                   learning_rate = 0.01,
                                   max_features = 2,
                                   max_depth = 2, 
                                   random_state=42)

smotetrain(X_train, y, model, 'f1_micro')

train score 0.6030917911230198
tests score 0.3170235389658157


In [248]:
X_train = modeldata.copy(deep=True)
y = X_train.pop('y')

model = GradientBoostingClassifier(n_estimators = 100,
                                   learning_rate = 0.01,
                                   max_features = 4,
                                   max_depth = 2, 
                                   random_state=42)

smotetrain(X_train, y, model, 'f1_micro')

train score 0.6120810144928813
tests score 0.31905792100076413


In [254]:
X_train = modeldata.copy(deep=True)
y = X_train.pop('y')

model = GradientBoostingClassifier(n_estimators = 100,
                                   learning_rate = 0.001,
                                   max_features = 4,
                                   max_depth = 2, 
                                   random_state=42)

smotetrain(X_train, y, model, 'f1_micro')

train score 0.6196634056081023
tests score 0.32081638951908614


In [257]:
X_train = modeldata.copy(deep=True)
y = X_train.pop('y')

model = GradientBoostingClassifier(n_estimators = 100,
                                   learning_rate = 0.001,
                                   max_features = 5,
                                   max_depth = 1, 
                                   random_state=42)

smotetrain(X_train, y, model, 'f1_micro')

train score 0.6179885153492232
tests score 0.3219503561783922


In [260]:
X_train = modeldata.copy(deep=True)
y = X_train.pop('y')

model = GradientBoostingClassifier(n_estimators = 100,
                                   learning_rate = 0.001,
                                   max_features = 9,
                                   max_depth = 1, 
                                   random_state=42)

smotetrain(X_train, y, model, 'f1_micro')

train score 0.6578781603627368
tests score 0.32968957869266335


In [332]:
X_train = modeldata.drop(['id','LocId'],1).copy(deep=True)
y = X_train.pop('y')

model = GradientBoostingClassifier(n_estimators = 100,
                                   learning_rate = 0.0001,
                                   max_features = 9,
                                   max_depth = 1, 
                                   random_state=42)

smotetrain(X_train, y, model, 'f1_micro')

train score 0.7614194666218782
tests score 0.44351419837033834


In [288]:
X_train = modeldata.copy(deep=True)
y = X_train.pop('y')

model = GradientBoostingClassifier(n_estimators = 100,
                                   learning_rate = 0.0001,
                                   max_features = 9,
                                   max_depth = 1, 
                                   random_state=42)

ros = SMOTE(random_state=42)
X_resampled, y_resampled = ros.fit_sample(X_train, y) 


model.fit(X_resampled, y_resampled)

print(f1_score(y,model.predict(X_train)))

print(confusion_matrix(y,model.predict(X_train)))

0.329869956652
[[ 9675 48593]
 [ 1650 12366]]


In [345]:
X_train = modeldata.drop(['id','LocId'],1).copy(deep=True)
y = X_train.pop('y')
#X_train = X_train.iloc[:,0:9]

model = RandomForestClassifier(n_estimators = 100,
                               criterion = 'entropy',
                               max_features = 'sqrt',
                               oob_score = True,
                               n_jobs = -1,
                               random_state = 42,
                               class_weight = 'balanced')

model.fit(X_train,y)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='entropy', max_depth=None, max_features='sqrt',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=True, random_state=42,
            verbose=0, warm_start=False)

In [349]:
prediction = model.predict(X_train)

train_score = f1_score(y,prediction)
#recall_score = recall_score(y,prediction)
#precision_score = precision_score(y,prediction)

print('f1 score {}\nRecall score {}\nPrecision Score {}'.format(train_score,recall_score,precision_score))


print(confusion_matrix(y,prediction))

f1 score 0.474086310138344
Recall score 0.7414383561643836
Precision Score 0.3495929489335935
[[39029 19239]
 [ 3684 10332]]


In [350]:
modeldata['probs']=model.predict_proba(X_train)[:,1]

In [353]:
inspections = pd.read_pickle('/home/michael/chicagohealthinspections/data/inspectdata.pkl')
locations = pd.read_pickle('/home/michael/chicagohealthinspections/data/linkloc.pkl')

In [355]:
dateadded = modeldata.merge(inspections[['LocId','inspcount','Inspection Date']],how='inner')
dateadded = dateadded.merge(locations[['LocId','PRI_NEIGH']],how='inner')
dateadded.head()

Unnamed: 0,LocId,id,y,rating,inspcount,reinspection,failpct,price,prevfails,breakfast_brunch,pizza,mexican,burgers,complaint,sandwiches,probs,Inspection Date,PRI_NEIGH
0,1,el-ranchito-restaurant-chicago-5,0,2.0,1,0,0.0,1,0,0,1,1,0,1,0,0.103502,2010-01-19,Avondale
1,1,el-ranchito-restaurant-chicago-5,0,2.0,2,0,0.0,1,0,0,1,1,0,1,0,0.08,2010-03-15,Avondale
2,1,el-ranchito-restaurant-chicago-5,0,2.0,3,0,0.0,1,0,0,1,1,0,1,0,0.1,2010-07-29,Avondale
3,1,el-ranchito-restaurant-chicago-5,1,2.0,4,0,0.0,1,0,0,1,1,0,1,0,0.72,2011-01-11,Avondale
4,1,el-ranchito-restaurant-chicago-5,0,2.0,5,1,0.2,1,1,0,1,1,0,1,0,0.07,2011-01-18,Avondale


In [368]:
recomendation = dateadded[dateadded['probs']==0]
recomendation = recomendation[recomendation['PRI_NEIGH']=='Logan Square']
len(recomendation)

153

In [369]:
recomendation = recomendation[recomendation['Inspection Date']>'2017-01-01']
len(recomendation)

13

In [370]:
recomendation.sort_values('Inspection Date',ascending=False).head(20)

Unnamed: 0,LocId,id,y,rating,inspcount,reinspection,failpct,price,prevfails,breakfast_brunch,pizza,mexican,burgers,complaint,sandwiches,probs,Inspection Date,PRI_NEIGH
9850,1902,tacos-garcia-chicago,0,4.0,15,1,0.266667,1,4,0,0,1,0,0,0,0.0,2017-12-05,Logan Square
14965,3011,dunkin-donuts-chicago-115,0,3.0,11,0,0.090909,1,1,0,0,0,0,1,0,0.0,2017-11-03,Logan Square
15439,3136,paladar-chicago-2,0,4.0,7,1,0.285714,2,2,0,0,0,0,1,0,0.0,2017-11-01,Logan Square
18596,3874,subway-chicago-67,0,2.5,13,0,0.076923,1,1,0,0,0,0,0,1,0.0,2017-10-16,Logan Square
1376,297,mcdonalds-chicago-5,0,2.5,12,1,0.0,1,0,0,0,0,1,0,0,0.0,2017-08-10,Logan Square
23727,5053,scofflaw-chicago,0,4.5,7,1,0.285714,2,2,0,0,0,0,1,0,0.0,2017-05-12,Logan Square
49486,10052,buona-terra-ristorante-chicago,0,4.5,9,0,0.222222,2,2,0,0,0,0,0,0,0.0,2017-04-18,Logan Square
27659,5825,red-hot-ranch-chicago,0,4.0,9,0,0.0,1,0,0,0,0,0,1,0,0.0,2017-04-17,Logan Square
50037,10178,taqueria-moran-chicago-3,0,4.0,14,1,0.214286,1,3,0,0,1,0,1,0,0.0,2017-04-13,Logan Square
38440,7751,anong-thai-chicago,0,3.5,16,1,0.3125,1,5,0,0,0,0,1,0,0.0,2017-03-10,Logan Square


In [358]:
dateadded[dateadded['id']=='el-cubanito-chicago']

Unnamed: 0,LocId,id,y,rating,inspcount,reinspection,failpct,price,prevfails,breakfast_brunch,pizza,mexican,burgers,complaint,sandwiches,probs,Inspection Date,PRI_NEIGH
55869,11608,el-cubanito-chicago,0,4.5,1,0,0.0,1,0,0,0,0,0,0,1,0.424641,2015-01-27,Logan Square
55870,11608,el-cubanito-chicago,1,4.5,2,0,0.0,1,0,0,0,0,0,0,1,0.495112,2016-03-23,Logan Square
55871,11608,el-cubanito-chicago,0,4.5,3,1,0.333333,1,1,0,0,0,0,0,1,0.0,2017-02-15,Logan Square
55872,17379,el-cubanito-chicago,1,4.5,1,0,0.0,1,0,0,0,0,0,1,1,0.497996,2012-05-31,Logan Square
55873,17379,el-cubanito-chicago,0,4.5,2,1,0.5,1,1,0,0,0,0,1,1,0.00508,2012-06-08,Logan Square
55874,17379,el-cubanito-chicago,0,4.5,3,0,0.333333,1,1,0,0,0,0,1,1,0.596702,2013-01-18,Logan Square
