In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
data = Path('combine_data.csv')
df = pd.read_csv(data)

In [4]:
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

In [5]:
# Drop the null rows
df = df.dropna()

df = df.drop(columns ='name')

In [6]:
df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,rating,review_count,price,latitude,longitude,stars,michelin,latBin,lonBin
0,4.0,735,4,37.39468,-122.08044,1,1,37.2,-122.2
1,4.5,348,4,37.427853,-122.14362,1,1,37.4,-122.2
2,3.5,886,3,37.42014,-122.21151,1,1,37.4,-122.4
3,4.0,1088,4,37.42897,-122.25178,1,1,37.4,-122.4
4,4.0,1244,4,37.25648,-122.03537,1,1,37.2,-122.2


In [56]:
# Split our preprocessed data into our features and target arrays
y = df['michelin']
X = df.drop(columns='michelin')



In [57]:
X.describe()

Unnamed: 0,rating,review_count,price,latitude,longitude,stars,latBin,lonBin
count,7110.0,7110.0,7110.0,7110.0,7110.0,7110.0,7110.0,7110.0
mean,4.057243,682.871589,2.207454,37.686195,-102.246969,0.032771,37.576259,-102.357665
std,0.449551,952.921199,0.517659,3.091578,20.632329,0.224208,3.085949,20.634838
min,1.0,1.0,0.0,32.691728,-122.88647,0.0,32.6,-123.0
25%,4.0,167.25,2.0,34.088765,-121.91909,0.0,34.0,-122.0
50%,4.0,382.0,2.0,37.781908,-117.160068,0.0,37.6,-117.2
75%,4.5,822.75,2.0,40.710849,-77.045175,0.0,40.6,-77.2
max,5.0,17070.0,4.0,53.512863,-2.946107,3.0,53.4,-3.0


In [58]:
# Check the balance of our target values
y.value_counts()

0    6931
1     179
Name: michelin, dtype: int64

### Naive Random Oversampling

In [59]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)

In [60]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({0: 5198, 1: 5198})

In [61]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [62]:
# Calcupate predicitons
y_pred = model.predict(X_test)

# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

1.0

In [63]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual no_star", "Actual star"], columns=["Predicted no_star", "Predicted star"])
cm_df

Unnamed: 0,Predicted no_star,Predicted star
Actual no_star,1733,0
Actual star,0,45


In [64]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      1.00      1.00      1.00      1.00      1733
          1       1.00      1.00      1.00      1.00      1.00      1.00        45

avg / total       1.00      1.00      1.00      1.00      1.00      1.00      1778



### SMOTE Oversampling

In [65]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
# Instantiate the model
smote = SMOTE(random_state=1)
# Resample the targets
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 5198, 1: 5198})

In [66]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [67]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

1.0

In [68]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[1733,    0],
       [   0,   45]])

In [69]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      1.00      1.00      1.00      1.00      1733
          1       1.00      1.00      1.00      1.00      1.00      1.00        45

avg / total       1.00      1.00      1.00      1.00      1.00      1.00      1778



# Undersampling

In [70]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 134, 1: 134})

In [71]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=78)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=78)

In [72]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

1.0

In [73]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
cm3 = confusion_matrix(y_test, y_pred)
# Create a DataFrame from the confusion matrix.
cm3_df = pd.DataFrame(
    cm3, index=["Actual no_star", "Actual star"], columns=["Predicted no_star", "Predicted star"])
cm3_df

Unnamed: 0,Predicted no_star,Predicted star
Actual no_star,1676,57
Actual star,0,45


In [74]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.97      1.00      0.98      0.98      0.96      1733
          1       0.44      1.00      0.97      0.61      0.98      0.97        45

avg / total       0.99      0.97      1.00      0.97      0.98      0.96      1778



# Combination (Over and Under) Sampling

In [75]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({0: 5896, 1: 6123})

In [76]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [77]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.9835545297172533

In [78]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)


array([[1733,    0],
       [   0,   45]])

In [79]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      1.00      1.00      1.00      1.00      1733
          1       1.00      1.00      1.00      1.00      1.00      1.00        45

avg / total       1.00      1.00      1.00      1.00      1.00      1.00      1778



### Balanced Random Forest Classifier

In [80]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
random_forest = BalancedRandomForestClassifier(n_estimators = 100)
random_forest = random_forest.fit(X_train, y_train)

In [81]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = random_forest.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

1.0

In [82]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[1733,    0],
       [   0,   45]])

In [83]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      1.00      1.00      1.00      1.00      1733
          1       1.00      1.00      1.00      1.00      1.00      1.00        45

avg / total       1.00      1.00      1.00      1.00      1.00      1.00      1778



In [84]:
# List the features sorted in descending order by feature importance
feature_names = X.columns
sorted(zip(random_forest.feature_importances_, feature_names), reverse=True)

[(0.5567463841044398, 'stars'),
 (0.27882369769809634, 'price'),
 (0.04977997555638017, 'latitude'),
 (0.03537507930806864, 'longitude'),
 (0.031749544865314186, 'review_count'),
 (0.020622466495937644, 'latBin'),
 (0.019925754875593435, 'lonBin'),
 (0.006977097096170009, 'rating')]

### Easy Ensemble AdaBoost Classifier

In [85]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
easy = EasyEnsembleClassifier(n_estimators = 100,random_state=1)
easy = easy.fit(X_train, y_train)

In [86]:
# Calculated the balanced accuracy score
y_pred = easy.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

1.0

In [87]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[1733,    0],
       [   0,   45]])

In [88]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      1.00      1.00      1.00      1.00      1733
          1       1.00      1.00      1.00      1.00      1.00      1.00        45

avg / total       1.00      1.00      1.00      1.00      1.00      1.00      1778

