# Basic modeling for restaurant success/failure based on current attributes

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

from pandas import Series, DataFrame
from pylab import rcParams
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics 
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score


from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the neighbor and census features for each business.
# Excludes restaurants that opened after 2015 (can't meet our def of success).
csv_path = './spatial/data/business_neighbors_census_atts.csv'
rest_df = pd.read_csv(csv_path)
rest_df.shape

# Split into inputs/target and train/test sets

In [None]:
# Every column except the business ID and the actual success statistics is a model input.
# Edit the excluded list below to test different variable combinations.
non_input_cols = np.array(['business_id', 'is_open', 'successful', 'stars', 'review_count'])
is_input = ~np.isin(rest_df.columns.values, non_input_cols)
input_cols = rest_df.columns[is_input]
X = rest_df.loc[:, input_cols].values
y = rest_df.loc[:, 'successful'].values

In [None]:
# Keep 20% of the rows aside for the final test; development uses cross validation
# on the remaining training portion.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=25,
)

# Function for cross validation testing

In [None]:
def pipeline_cross_val(pipeline, X, y):
    """Print 5-fold cross-validated weighted-F1 scores for ``pipeline``.

    Args:
        pipeline: Estimator (or sklearn Pipeline) handed to ``cross_val_score``.
        X: Feature matrix to cross-validate on.
        y: Target labels corresponding to the rows of ``X``.

    Returns:
        None. The per-fold scores and their mean are printed.
    """
    # Score on the data passed in by the caller. The previous version ignored
    # X/y and silently used the notebook-level X_train/y_train globals.
    cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='f1_weighted')
    print('Cross Val Scores: {}'.format(cv_scores))
    print('CV Mean F1: {}'.format(np.mean(cv_scores)))

# LogReg

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression on standardized features, evaluated with cross validation
# on the training portion.
scaler = StandardScaler()
lr_c = LogisticRegression(max_iter=1000, solver='lbfgs')
lr_steps = [
    ('scale', scaler),
    ('clf', lr_c),
]
lr_pipeline = Pipeline(steps=lr_steps)
pipeline_cross_val(lr_pipeline, X_train, y_train)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 300 trees; this pipeline has no scaling step.
# random_state is fixed (same seed as the train/test split) so the cross
# validation scores are reproducible from run to run.
rf_c = RandomForestClassifier(n_estimators=300, random_state=25)
rf_pipeline = Pipeline([('clf', rf_c)])
pipeline_cross_val(rf_pipeline, X_train, y_train)

# SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# RBF-kernel SVM on standardized features.
# A fresh StandardScaler is created here instead of reusing the `scaler`
# object from the logistic regression cell, so this cell runs on its own and
# the two pipelines never share one (mutable) scaler instance.
svm_c = SVC(gamma='auto', kernel='rbf')
svm_pipeline = Pipeline([('scale', StandardScaler()), ('clf', svm_c)])
pipeline_cross_val(svm_pipeline, X_train, y_train)

# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian Naive Bayes as the only pipeline step, scored with the same
# cross validation helper as the other models.
nb_c = GaussianNB()
nb_steps = [('clf', nb_c)]
nb_pipeline = Pipeline(steps=nb_steps)
pipeline_cross_val(nb_pipeline, X_train, y_train)