# Question 2

Can we predict if a business will stay open?

In [10]:
import pandas as pd
import ProcessData as p
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix

In [11]:
# Read in data   
business_file = 'yelp_dataset/yelp_academic_dataset_business.json'
business_df = pd.read_json(business_file, lines=True)

In [12]:
# Check the correlation matrix
business_df.corr().style.format("{:.2}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)

Unnamed: 0,is_open,latitude,longitude,review_count,stars
is_open,1.0,-0.0064,0.0052,0.037,0.052
latitude,-0.0064,1.0,0.56,-0.077,-0.093
longitude,0.0052,0.56,1.0,-0.092,-0.082
review_count,0.037,-0.077,-0.092,1.0,0.032
stars,0.052,-0.093,-0.082,0.032,1.0


There seems to be some correlation between is_open and both latitude and longitude. There doesn't seem to be much correlation between is_open and stars.

In [13]:
business_df.groupby(by=['is_open']).mean()

Unnamed: 0_level_0,latitude,longitude,review_count,stars
is_open,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,38.579515,-97.69424,23.322475,3.514756
1,38.49194,-97.449337,33.528307,3.655406


In [14]:
# Process categories and attributes columns
categories = ['Restaurants', 'Shopping', 'Nightlife', 'Active Life', 'Beauty & Spas', 'Automotive', 'Home Services']
business_df = p.process_business(business_df, categories)

# Drop the columns that were processed
business_df.drop(['attributes', 'categories'], axis='columns', inplace=True)

In [15]:
# Check for remaining missing values
business_df.isnull().mean().sort_values(ascending=False).head(10)

hours                     0.237559
latitude                  0.000032
longitude                 0.000032
Home Services             0.000000
BusinessParking_garage    0.000000
BestNights_monday         0.000000
BestNights_saturday       0.000000
BestNights_sunday         0.000000
BestNights_thursday       0.000000
BestNights_tuesday        0.000000
dtype: float64

In [16]:
business_df.drop(['hours'], axis='columns', inplace=True)

Drop the hours column for now. There are too many missing values and preprocessing would need to be done to make the column useable.

In [17]:
business_df.dropna(subset=['latitude', 'longitude'], axis='rows', inplace=True)

In [18]:
# Drop columns that are essentially unique identifiers
business_df.drop(['address', 'business_id', 'name'], axis='columns', inplace=True)

In [19]:
business_df.drop(['city', 'neighborhood', 'postal_code'], axis='columns', inplace=True)

In [20]:
business_df = pd.get_dummies(business_df, drop_first=True)

In [21]:
business_df.columns

Index(['is_open', 'latitude', 'longitude', 'review_count', 'stars',
       'AcceptsInsurance', 'Ambience_casual', 'Ambience_classy',
       'Ambience_divey', 'Ambience_hipster',
       ...
       'RestaurantsPriceRange2_1', 'RestaurantsPriceRange2_2',
       'RestaurantsPriceRange2_3', 'RestaurantsPriceRange2_4', 'Smoking_no',
       'Smoking_outdoor', 'Smoking_yes', 'WiFi_free', 'WiFi_no', 'WiFi_paid'],
      dtype='object', length=180)

## Predict if a business will stay open

In [22]:
print(business_df.shape[0])
print(business_df.is_open.value_counts())

188586
1    156601
0     31985
Name: is_open, dtype: int64


In [23]:
y = business_df.is_open
X = business_df.drop(['is_open'], axis='columns')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=42)

### Random Forest

In [24]:
# Random Forest Model
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [46]:
print('train accuracy: ', rf_model.score(X_train, y_train_preds))
print('test accuracy: ', rf_model.score(X_test, y_test))

train accuracy:  1.0
test accuracy:  0.835389564479638


The model is overfitting.

### Parameter Estimation

In [35]:
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV

clf = RandomForestClassifier(random_state=42)

# Create the parameters list you wish to tune.
parameters = {'max_depth':[40],'min_samples_leaf':[1,2,5]}

# Make an fbeta_score scoring object.
scorer = make_scorer(f1_score)

# Perform grid search on the classifier using 'scorer' as the scoring method.
grid_obj = GridSearchCV(clf, parameters, scoring=scorer)

# Fit the grid search object to the training data and find the optimal parameters.
grid_fit = grid_obj.fit(X, y)

# Get the estimator.
best_clf = grid_fit.best_estimator_

# Fit the new model.
best_clf.fit(X_train, y_train)

# Make predictions using the new model.
best_train_predictions = best_clf.predict(X_train)
best_test_predictions = best_clf.predict(X_test)

# Calculate the f1_score of the new model.
print('The training F1 Score is', f1_score(best_train_predictions, y_train))
print('The testing F1 Score is', f1_score(best_test_predictions, y_test))

# Plot the new model.
#plot_model(X, y, best_clf)

# Let's also explore what parameters ended up being used in the new model.
best_clf

The training F1 Score is 0.9352217896171944
The testing F1 Score is 0.915460444840739


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=40, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

### Results

In [36]:
confusion_matrix(y_test, best_test_predictions)

array([[ 1842,  7900],
       [  633, 46201]], dtype=int64)

In [38]:
tn, fp, fn, tp = confusion_matrix(y_test, best_test_predictions).ravel()
print(tn, fp, fn, tp)

1842 7900 633 46201


In [37]:
feature_importances = pd.DataFrame(best_clf.feature_importances_,
                                   index = X_train.columns,
                                   columns=['importance']).sort_values('importance',
                                                                       ascending=False)
feature_importances.head(10)

Unnamed: 0,importance
longitude,0.110092
review_count,0.109925
latitude,0.107762
BikeParking,0.059261
stars,0.051044
RestaurantsTableService,0.027935
Restaurants,0.026919
WheelchairAccessible,0.021435
RestaurantsAttire_casual,0.021376
Alcohol_full_bar,0.020939
