# Basic modeling for restaurant success/failure based on current attributes
We'll create 3 dataframes for modeling: 

1) Only data that is available when restaurant opens (excluding any review data)

2) DF 1 + review counts from first 4 and 8 weeks

3) DF 2 + Word2Vec keyword-embedding data (limits us to the ~900 restaurants that had enough reviews, i.e. >= 10)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

from sklearn.model_selection import train_test_split
from sklearn import metrics 
from sklearn.metrics import classification_report, f1_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score


from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Read in data

In [None]:
# Read in neighbor and census features.
# Excludes restaurants that opened after 2015 (can't meet our def of success)
# The path is relative, so the notebook must run from the folder holding ./data.
rest_df = pd.read_csv('./data/business_neighbors_census.csv')

In [None]:
# Load the per-restaurant outcome data and derive the binary success label.
success_df = pd.read_pickle('./data/features_df_3mo.pi')

# A restaurant is labelled successful only when all four criteria hold:
# age >= 4.5, still open, stars >= 3.5 and at least 20 reviews.
old_enough = success_df['age'] >= 4.5
well_rated = success_df['stars'] >= 3.5
well_reviewed = success_df['review_count'] >= 20
success_df['successful'] = (old_enough & success_df['is_open']
                            & well_rated & well_reviewed)

# Bring over only the columns rest_df does not already have (this includes
# the new `successful` label), plus the join key itself.
cols_to_merge = np.append(
    success_df.columns.difference(rest_df.columns).values, 'business_id')
# No `how` is given, so pandas applies its default join type.
rest_df = rest_df.merge(success_df[cols_to_merge], on='business_id')

# Cast the early review counts to plain Python ints, column by column.
for col in ('num_in_4_weeks', 'num_in_8_weeks'):
    rest_df[col] = rest_df[col].apply(int)

In [None]:
# Merge in Ryan's restaurant attributes
att_df = pd.read_pickle('./data/restaurants_imputed.pkl.bz2')
# Keep only the columns whose names start with 'biz'.
att_df = att_df.loc[:, att_df.columns.str.startswith(r'biz')]
# NOTE(review): 'business_id' does not start with 'biz', so the join key is
# presumably the name of att_df's index rather than a column -- confirm.
# Left join: every row of rest_df is kept whether or not attributes exist.
rest_df = rest_df.merge(att_df, on='business_id', how='left')

In [None]:
# Snapshot the complete feature set first; this copy keeps the review columns.
rest_3mo_df = rest_df.copy()

# The base frame then loses every review-derived column.
review_cols = [
    'first_4_week_review',
    'first_8_week_review',
    'num_in_4_weeks',
    'num_in_8_weeks',
    'average_review_length',
    'review_length_variance',
]
rest_df = rest_df.drop(review_cols, axis='columns')

In [None]:
# Merge in text data.
# The pickled frame is transposed so that its index can be copied into a
# `business_id` column and used as the join key.
words_df = pd.read_pickle('./data/word2vec_keywordEmbeddings.pkl').T
# Prefix every embedding column so it is recognisable after the merge.
words_df.columns = list(map('w2v_{}'.format, words_df.columns))
words_df['business_id'] = words_df.index.values
# Inner join: only restaurants present in both frames survive.
rest_3mo_words_df = pd.merge(rest_3mo_df, words_df,
                             how='inner', on='business_id')

In [None]:
# Our 3 data frames: print the (rows, columns) shape of each modeling frame.
print('Base DF {}'.format(rest_df.shape))
# Label typo fixed ("monghs" -> "months").
print('Info up to 3 months {}'.format(rest_3mo_df.shape))
print('Text Info (3 months, >= 10 reviews) {}'.format(rest_3mo_words_df.shape))

# Prep for training, eval

### Functions for cross-validation testing and variable importance

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [None]:
def pipeline_cross_val(pipeline, X, y, print_results=False):
    """Cross-validate ``pipeline`` on ``X``/``y`` and return the fold scores.

    Scoring is weighted F1 over 5 folds (``cv=5``). When ``print_results``
    is true, the per-fold scores and their mean are printed as well.

    Returns the array of scores produced by ``cross_val_score``.
    """
    fold_scores = cross_val_score(pipeline, X, y,
                                  scoring='f1_weighted', cv=5)
    if not print_results:
        return fold_scores

    print('Cross Val Scores: {}'.format(fold_scores))
    print('CV Mean F1: {}'.format(np.mean(fold_scores)))
    return fold_scores

def pipeline_final_test(pipeline, X_train, y_train, X_test, y_test):
    """Fit ``pipeline`` on the training split and score it on the test split.

    The pipeline is fitted in place. Returns the weighted F1 score of its
    predictions for ``X_test`` measured against ``y_test``.
    """
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    return f1_score(y_test, predictions, average='weighted')

def all_models_cv_test(df_dict, final_test=False):
    """Score four classifiers on every dataset in ``df_dict``.

    Parameters
    ----------
    df_dict : dict
        Maps a dataset label to a DataFrame that holds a ``successful``
        column (the target) alongside the feature columns.
    final_test : bool, default False
        When False, each model is cross-validated on the training split
        via ``pipeline_cross_val``. When True, each model is fitted on the
        training split and scored once on the held-out split via
        ``pipeline_final_test``.

    Returns
    -------
    pandas.DataFrame
        One row per dataset label and one column per model, filled with
        whatever the chosen scoring helper returned.
    """
    model_names = ['RandomForest', 'SVM', 'LogReg', 'NaiveBayes']
    out_df = pd.DataFrame(index=list(df_dict.keys()), columns=model_names)

    # Identifier and outcome columns that must never be model inputs.
    # Can be modified to test different variable combinations.
    non_input_cols = np.array(['business_id', 'is_open', 'successful', 'stars',
                               'review_count', 'age', 'first_review'])

    # One scaler instance is shared by the two pipelines that scale.
    scaler = StandardScaler()
    # (result column, pipeline) pairs, listed in evaluation order.
    models = [
        ('LogReg', Pipeline([
            ('scale', scaler),
            ('clf', LogisticRegression(solver='lbfgs', max_iter=1000)),
        ])),
        ('RandomForest', Pipeline([
            ('clf', RandomForestClassifier(n_estimators=200, random_state=25,
                                           min_samples_leaf=1)),
        ])),
        ('SVM', Pipeline([
            ('scale', scaler),
            ('clf', SVC(gamma='auto', kernel='rbf')),
        ])),
        ('NaiveBayes', Pipeline([
            ('clf', GaussianNB()),
        ])),
    ]

    for df_name, df in df_dict.items():
        # Every column not listed above is used as a feature.
        is_input = ~np.isin(df.columns.values, non_input_cols)
        X = df[df.columns[is_input]].values
        y = df['successful'].values

        # Hold out a test set for the final test; cross-validation during
        # development only ever sees the training part.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=.2, random_state=25)

        for model_name, model in models:
            if final_test:
                out_df.loc[df_name, model_name] = pipeline_final_test(
                    model, X_train, y_train, X_test, y_test)
            else:
                # The score array is wrapped in a list for the cell assignment.
                out_df.loc[df_name, model_name] = [pipeline_cross_val(
                    model, X_train, y_train)]

    return out_df


In [None]:
def var_imp_plot(var_imps, input_col_names, topn=10, title=None):
    """Print and plot the ``topn`` largest feature-importance values.

    Parameters
    ----------
    var_imps : array-like
        One importance value per feature (1-D).
    input_col_names : array-like
        Feature names aligned with ``var_imps``; a pandas Index, NumPy
        array or plain list all work.
    topn : int, default 10
        Number of features to report.
    title : str or None, default None
        Plot title. ``None`` selects "Feature importances"; an empty
        string leaves the plot untitled.

    Returns
    -------
    None
        The ranking is printed and the bar chart is shown.

    Notes
    -----
    Ranking uses the signed values, largest first, so strongly negative
    values (for example negative model coefficients) sort last.
    """
    var_imps = np.asarray(var_imps)
    names = np.asarray(input_col_names)

    # Positions of the topn largest values, in descending order. The slice
    # takes exactly topn entries (it used to take topn + 1).
    indices = np.argsort(var_imps)[::-1][:topn]
    top_names = names[indices]
    top_values = var_imps[indices]

    # Print the feature ranking
    print("Feature ranking:")
    for name, value in zip(top_names, top_values):
        print('{}: {}'.format(name, round(value, 3)))

    # Plot the importances on a single figure. A second plt.figure() call
    # used to discard this sized figure and leave it empty.
    plt.figure(figsize=(8, 8), facecolor='white')
    plt.title("Feature importances" if title is None else title)

    plt.xticks(fontsize=15)
    plt.yticks(fontsize=16)
    plt.xlabel('Feature Importance', fontsize=18)

    # Flip so that the largest value is passed last to barh.
    plt.barh(np.flip(top_names), np.flip(top_values),
             color="r", align="center")
    plt.show()

    return

# Training/CrossVal/VarImp, all at once

In [None]:
# Cross-validate all four models on each of the three frames
# (final_test is left at its default of False).
all_df = all_models_cv_test({'Base':rest_df,
                             'First 3 Months':rest_3mo_df,
                             'First 3 Months + Keywords':rest_3mo_words_df})

In [None]:
# Display the cross-validation results table.
all_df

In [None]:
# Final evaluation: fit each model on the training split and score it once
# on the held-out test split.
final_df = all_models_cv_test({'Base':rest_df, 'First 3 Months':rest_3mo_df, 'First 3 Months + Keywords':rest_3mo_words_df},
                             final_test=True)

In [None]:
# Grouped bar chart of the final-test scores: one group per data frame,
# one bar per model column of final_df.
fig, ax = plt.subplots(facecolor='white', figsize=(10, 7))
# Selecting the rows explicitly fixes the left-to-right order of the groups.
final_df.loc[['Base', 'First 3 Months', 'First 3 Months + Keywords']
            ].plot(kind='bar', ax=ax)
plt.ylabel('F1 Score (Weighted)', fontsize=18)
# Relabel the three groups with display names.
plt.xticks(labels=['Initial Data', 'First 3 Months', '3 Months + Keywords'], 
           ticks=[0,1,2], fontsize=16, rotation=30, ha='right')
# NOTE(review): the x-range runs well past the last group at x=2, presumably
# to leave room for the legend -- confirm.
plt.xlim(-0.5, 3.4)
plt.ylim(0, 1)
plt.legend(fontsize=15)
plt.yticks(fontsize=15)
plt.show()

# Run model by model

In [None]:
# Choose the frame that the model-by-model cells below work on.
target_df = rest_3mo_df.copy()

# Business ID, the actual success statistics and average review length are
# withheld from the inputs. Edit this list to test other variable combinations.
non_input_cols = np.array(['business_id', 'is_open', 'successful', 'stars',
                           'review_count', 'age', 'first_review',
                           'average_review_length'])
is_input = ~np.isin(target_df.columns.values, non_input_cols)
input_cols = target_df.columns[is_input]

X = target_df[input_cols].values
y = target_df['successful'].values

# Hold out a test set for the final test; use cross-validation on the
# training part during development.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=25, test_size=.2)

### LogReg

In [None]:
# Logistic regression on standardised features, cross-validated on the
# training split; per-fold scores and their mean are printed.
scaler = StandardScaler()
lr_c = LogisticRegression(solver='lbfgs', max_iter=1000)
lr_pipeline = Pipeline([('scale', scaler), ('clf', lr_c)])
pipeline_cross_val(lr_pipeline, X_train, y_train, print_results=True)

In [None]:
# Variable importance: refit on the whole training split, then rank the
# coefficients of the fitted classifier step (first row of coef_).
lr_pipeline.fit(X_train, y_train)
lr_varimp = lr_pipeline.named_steps['clf'].coef_
var_imp_plot(var_imps=lr_varimp[0], input_col_names=input_cols, title='')

### Random Forest

In [None]:
# Random forest in a single-step pipeline (no scaling step), cross-validated
# on the training split; random_state is fixed at 25.
rf_c = RandomForestClassifier(n_estimators=200, random_state=25, min_samples_leaf=1)
rf_pipeline = Pipeline([('clf', rf_c)])
pipeline_cross_val(rf_pipeline, X_train, y_train, print_results=True)

In [None]:
# Variable importance: refit on the whole training split, then rank the
# feature importances of the fitted forest.
rf_pipeline.fit(X_train, y_train)
rf_varimp = rf_pipeline.named_steps['clf'].feature_importances_
var_imp_plot(var_imps=rf_varimp, input_col_names=input_cols, title='')

### SVM

In [None]:
# RBF-kernel SVM on standardised features, cross-validated on the training split.
# `scaler` is the StandardScaler created in the LogReg cell, so that cell
# must have been run first.
svm_c = SVC(gamma='auto', kernel='rbf')
svm_pipeline = Pipeline([('scale', scaler), ('clf', svm_c)])
pipeline_cross_val(svm_pipeline, X_train, y_train, print_results=True)

### Naive Bayes

In [None]:
# Gaussian naive Bayes in a single-step pipeline (no scaling step),
# cross-validated on the training split.
nb_c = GaussianNB()
nb_pipeline = Pipeline([('clf', nb_c)])
pipeline_cross_val(nb_pipeline, X_train, y_train, print_results=True)