# Basic modeling for restaurant success/failure based on current attributes

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

from sklearn.model_selection import train_test_split
from sklearn import metrics 
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score


from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Read in data

In [None]:
# Load the per-business table of neighbor and census features
# (path is relative to the notebook's working directory).
# The file excludes restaurants that opened after 2015, since they
# can't meet our definition of success.
rest_df = pd.read_csv('./spatial/data/business_neighbors_census_atts.csv')

In [None]:
# Load Ryan's imputed restaurant attributes; they are merged into rest_df
# in a later cell.
# NOTE(review): read_pickle unpickles arbitrary Python objects -- only load
# this file from a trusted source.
att_df = pd.read_pickle('./data/restaurants_imputed.pkl.bz2')

In [None]:
# Keep only the attribute columns whose names start with 'biz'.
# NOTE(review): 'business_id' does not start with 'biz', so it is not kept as
# a column here; the later merge on 'business_id' presumably relies on it
# being the name of att_df's index -- confirm.
att_df = att_df.filter(regex=r'^biz', axis='columns')

In [None]:
# Left-join the restaurant attributes onto the neighbor/census table by
# business ID. how='left' keeps every row of rest_df; a business with no match
# in att_df gets NaN in the attribute columns.
rest_df = pd.merge(rest_df, att_df, how='left', on='business_id')

# Prep for training, eval

### Split

In [None]:
# Columns that must never be model inputs: the business ID plus the outcome
# statistics themselves. Edit this list to experiment with other variable
# combinations.
non_input_cols = np.array(
    ['business_id', 'is_open', 'successful', 'stars', 'review_count']
)

# Every remaining column of rest_df, in its original order, is an input.
input_cols = rest_df.columns[~rest_df.columns.isin(non_input_cols)]

# Feature matrix and target vector as plain arrays for scikit-learn.
X = rest_df[input_cols].values
y = rest_df['successful'].values

In [None]:
# Reserve 20% of the rows as a final hold-out test set (fixed seed so the
# split is reproducible). Model development uses cross-validation on the
# remaining 80%.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=25,
)

### Functions for cross-validation testing and variable importance

In [None]:
def pipeline_cross_val(pipeline, X, y):
    """Run 5-fold cross-validation on ``pipeline`` and print the F1 scores.

    Scores are computed with ``scoring='f1_weighted'``. The per-fold scores
    and their mean are printed; nothing is returned.

    Parameters
    ----------
    pipeline : estimator
        The estimator (typically a scikit-learn ``Pipeline``) to evaluate.
    X : array-like
        Feature matrix to cross-validate on.
    y : array-like
        Target values aligned with the rows of ``X``.
    """
    # Score on the data that was passed in. The previous version ignored
    # X and y and always used the notebook globals X_train / y_train.
    cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='f1_weighted')
    print('Cross Val Scores: {}'.format(cv_scores))
    print('CV Mean F1: {}'.format(np.mean(cv_scores)))

In [None]:
def var_imp_plot(var_imps, input_col_names, topn=10):
    """Print and plot the ``topn`` features with the largest importance values.

    Features are ranked by the raw (signed) value in ``var_imps``, largest
    first. The ranking is printed with values rounded to 3 decimals, then
    drawn as a horizontal bar chart with the top-ranked feature at the top.

    Parameters
    ----------
    var_imps : array-like, 1-D
        One importance value per feature.
    input_col_names : array-like
        Feature names aligned with ``var_imps`` (e.g. a pandas ``Index``).
    topn : int, default 10
        Number of features to report.

    Notes
    -----
    NOTE(review): because ranking uses signed values, strongly negative
    values (e.g. logistic-regression coefficients) sort last and are never
    shown. Pass absolute values if magnitude is what matters.
    """
    var_imps = np.asarray(var_imps)
    col_names = np.asarray(input_col_names)

    # Descending order, then exactly topn entries. The previous slice
    # [0:topn+1] returned one feature too many.
    top_idx = np.argsort(var_imps)[::-1][:topn]
    top_names = col_names[top_idx]
    top_vals = var_imps[top_idx]

    # Print the feature ranking
    print("Feature ranking:")
    for name, val in zip(top_names, top_vals):
        print('{}: {}'.format(name, round(val, 3)))

    # barh draws the first entry at the bottom, so flip the arrays to put
    # the top-ranked feature at the top of the chart.
    plt.figure()
    plt.title("Feature importances")
    plt.barh(np.flip(top_names), np.flip(top_vals),
             color="r", align="center")
    plt.show()

# Training/CrossVal/VarImp

### LogReg

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression on standardized features, scored with the shared
# cross-validation helper.
scaler = StandardScaler()
lr_c = LogisticRegression(max_iter=1000, solver='lbfgs')
lr_pipeline = Pipeline(steps=[
    ('scale', scaler),
    ('clf', lr_c),
])
pipeline_cross_val(lr_pipeline, X_train, y_train)

In [None]:
# Variable importance: fit on the full training split and use the fitted
# coefficients of the 'clf' step as importance values.
lr_pipeline.fit(X_train, y_train)
lr_varimp = lr_pipeline.named_steps['clf'].coef_
# coef_ is 2-D; the first row is the coefficient vector that gets plotted.
var_imp_plot(lr_varimp[0], input_cols)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 200 trees and a fixed seed; no scaling step is used.
rf_c = RandomForestClassifier(
    n_estimators=200,
    min_samples_leaf=1,
    random_state=25,
)
rf_pipeline = Pipeline(steps=[('clf', rf_c)])
pipeline_cross_val(rf_pipeline, X_train, y_train)

In [None]:
# Fit the forest on the full training split and plot its feature importances.
rf_pipeline.fit(X_train, y_train)
rf_varimp = rf_pipeline.named_steps['clf'].feature_importances_
var_imp_plot(rf_varimp, input_cols)

### SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# RBF-kernel SVM on standardized features.
svm_c = SVC(gamma='auto', kernel='rbf')
# Give this pipeline its own StandardScaler. The previous version reused the
# `scaler` object that is also a step of lr_pipeline (and is fitted in place
# by lr_pipeline.fit). That made the two pipelines share mutable state and
# made this cell depend on the LogReg cell having been run first.
svm_pipeline = Pipeline([('scale', StandardScaler()), ('clf', svm_c)])
pipeline_cross_val(svm_pipeline, X_train, y_train)

### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian Naive Bayes baseline on the unscaled features.
nb_c = GaussianNB()
nb_pipeline = Pipeline(steps=[('clf', nb_c)])
pipeline_cross_val(nb_pipeline, X_train, y_train)