## Setup

In [1]:
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
from shapely.geometry import Point

import sklearn as skl
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler


### Load Data

In [None]:
# Read the raw business records; the path is resolved relative to the notebook's working directory.
business_location_df = pd.read_json("../data/business_location.json")

# Build one point per row in (x=longitude, y=latitude) order using geopandas'
# vectorised constructor instead of a Python-level loop over shapely Points.
business_location_geometry = gpd.points_from_xy(
    business_location_df.longitude, business_location_df.latitude
)
business_location_gdf = gpd.GeoDataFrame(
    business_location_df, crs="EPSG:4326", geometry=business_location_geometry
)

# Drop rows whose longitude is exactly 180 (treated as outliers here).
# NOTE(review): this filter only affects business_location_gdf. business_location_df
# still contains those rows, and the modelling cells further down read
# business_location_df -- confirm whether they should be excluded there as well.
business_location_gdf = business_location_gdf.loc[business_location_gdf["longitude"] != 180]


## Preprocessing

In [None]:
# Encode the franchise label as a numeric flag (FRANCHISE -> 1, INDEPENDENT -> 0).
# Series.map leaves NaN for any label that is not a key of the mapping.
franchise_mapping = {
    'FRANCHISE': 1,
    'INDEPENDENT': 0,
}
business_location_df['franchise_bool'] = business_location_df['franchise'].map(franchise_mapping)

# Take the complete-case rows once so features and target are guaranteed to share an index.
# NOTE(review): dropna() with no `subset` discards a row when ANY column is missing,
# not only the modelling columns -- confirm that is the intended filter.
business_location_complete = business_location_df.dropna()

# Regression design matrix and target.
X = business_location_complete.loc[:, ['latitude', 'longitude', 'franchise_bool']]
y = business_location_complete.loc[:, 'avg_rating']


## Analytics

### Classical ML

In [None]:
# Baseline: ordinary least-squares fit of avg_rating on latitude, longitude and the franchise flag.

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score on the held-out rows only.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

In [None]:
# Degree-2 polynomial regression of avg_rating on latitude, longitude and the franchise flag.
# Reuses X_train / X_test / y_train / y_test from the linear-regression cell, so the two
# models are scored on the same held-out rows.
polynomial_regressor = Pipeline(steps=[
    ('poly_features', PolynomialFeatures(degree=2)),   # expand the inputs to all degree<=2 terms
    ('linear_regression', LinearRegression()),         # least squares on the expanded features
])

# Pipeline.fit returns the pipeline, so fitting and predicting can be chained.
y_poly = polynomial_regressor.fit(X_train, y_train).predict(X_test)

# Same metrics as the linear baseline, for a like-for-like comparison.
mse_poly, r2_poly = mean_squared_error(y_test, y_poly), r2_score(y_test, y_poly)
print(f"Mean Squared Error: {mse_poly}")
print(f"R-squared: {r2_poly}")

In [None]:
# Logistic regression: classify a business as franchise (1) or independent (0)
# from latitude, longitude and avg_rating.

# Take the complete-case rows once so features and target come from the same frame.
franchise_rows = business_location_df.dropna()
Xf = franchise_rows[['latitude', 'longitude', 'avg_rating']]  # 'zip' and 'blockgroup' are left out for now
yf = franchise_rows['franchise_bool']

# 80/20 split with a fixed seed for reproducibility.
Xf_train, Xf_test, yf_train, yf_test = train_test_split(
    Xf, yf, random_state=42, test_size=0.2
)

# Standardise the inputs before the classifier.
# NOTE: `model` rebinds the name used by the linear-regression cell above.
model = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(random_state=42)),
])
model.fit(Xf_train, yf_train)

# Hard class labels for the held-out rows.
yf_logist = model.predict(Xf_test)
print(f"Predicted classes: {yf_logist}")

# Column 1 of predict_proba is the probability of the second entry in model.classes_.
yf_probabilities = model.predict_proba(Xf_test)[:, 1]
print(f"Probabilities of class 1: {yf_probabilities}")

# Summary metrics on the held-out rows.
print(f"Accuracy: {accuracy_score(yf_test, yf_logist)}")
print("Classification Report:")
print(classification_report(yf_test, yf_logist))

# Raw confusion-matrix counts, kept for later inspection.
cm_logist = confusion_matrix(yf_test, yf_logist)

# Plot the confusion matrix; from_estimator re-predicts on Xf_test internally.
disp = ConfusionMatrixDisplay.from_estimator(model, Xf_test, yf_test)
