# Draft for Model (Smoothie King) - dependent on feature selection

@TODO: add filtered feature to preprocess step before model comparison
- Logistic Regression
- RandomForestClassifier
- RandomForestClassifier + PCA
- Hierarchical Clustering
- Hierarchical Clustering + PCA

## Preprocess

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [1]:
import os
import string
import sys
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import seaborn as sns
from sklearn import datasets
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.decomposition import PCA


In [2]:
sys.path.append("../../src/.")
from preprocess_data import data_transform_pipeline

In [3]:
DIR = "../../data/"
SMOOTHIE = "Smoothie King/smoothie_king_"

In [4]:
smoothie_demographic = pd.read_csv(DIR + SMOOTHIE + "demographic_variables.csv")
smoothie_stores = pd.read_csv(DIR + SMOOTHIE + "stores.csv")
smoothie_poi_variables = pd.read_csv(DIR + SMOOTHIE + "poi_variables.csv")
smoothie_sister = pd.read_csv(DIR + SMOOTHIE + "competition_sister_variables.csv")
smoothie_trade_area = pd.read_csv(DIR + SMOOTHIE + "trade_area_variables.csv").rename(columns={"store_num": "store"})

In [5]:
smoothie_merged = smoothie_stores.merge(
    smoothie_demographic, on="store", how="outer"
).merge(
    smoothie_poi_variables, on="store", how="outer"
).merge(
    smoothie_sister, on="store", how="outer"
).merge(
    smoothie_trade_area, on="store", how="outer"
)

In [6]:
train_df, test_df = train_test_split(smoothie_merged, test_size=0.1, random_state=42)
X_train = train_df.drop(columns=["category"])
y_train = train_df["category"]
train_index = train_df["store"]
X_test = test_df.drop(columns=["category"])
y_test = test_df["category"]
test_index = test_df["store"]


In [7]:
drop_features = ["store", "longitude", "latitude", "__store_latitude"]
ordinal_features_oth = [
    "market_size",
    "store_density",
]
ordering_ordinal_oth = [
    ["Very Large Metro (1)", "Large Metro (2)", "Large City (3)", "Medium City (4)", "Small City (5)", "Small Town (6)"],
    ["Rural", "Exurban", "Suburban", "Light Suburban", "Light Urban", "Urban", "Super Urban"],
]
categorical_features = ["cbsa_name", "dma_name", "state_name"]

numeric_features = list(set(smoothie_merged.select_dtypes(include=np.number).columns.tolist()) - {"longitude", "latitude"})

In [8]:
transformed_train, transformed_test = data_transform_pipeline(
    X_train, 
    X_test, 
    train_index,
    test_index,
    drop_features + categorical_features, 
    ordinal_features_oth, 
    ordering_ordinal_oth, 
    [], 
    numeric_features
)

In [9]:
transformed_train

Unnamed: 0_level_0,dtpop_retired_disabled_3mi,hh_6pers_10mi,hh_type_female_p_5mi,total_comp_adjacent,hh_5pers_10mi,hh_type_female_nochild_10mi,banks_2mi,places_of_worship_10mi,hh_3vehicle_p_2mi,emp_construction_p_5mi,...,inrix_breakfast_ew,dtpop_students_1mi,other_p_ta,hispanic_p_3mi,millenial_ta,emp_healthcare_social_assistance_p_ta,hh_1pers_p_ta,spend_lunch_1mi,market_size,store_density
store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SK 1662,-0.579424,-0.359384,-1.128491,0.0,-0.257819,-0.600428,-0.852148,-0.496453,2.413640,1.914418,...,-0.127435,-0.260262,0.261417,-0.493691,1.628835,0.094609,-1.302303,-0.218520,1.0,0.0
SK 1855,1.238938,0.840169,-0.654563,0.0,0.953383,1.716003,-0.135743,1.219795,-0.815435,-0.832126,...,1.871402,-0.284324,-0.034546,-0.770779,0.725838,0.462556,1.035923,0.360451,0.0,3.0
SK 1814,-0.434547,0.550037,-0.857675,0.0,0.534410,-0.087818,-0.692947,0.069129,1.292340,-0.751213,...,-0.881252,-0.224830,-0.004950,0.004458,-1.042001,2.213988,-1.668389,-0.078498,0.0,1.0
SK 1174,-0.086866,0.802056,0.164655,0.0,0.664104,0.012040,-0.613346,-0.058723,0.791238,3.460754,...,0.418524,3.531476,-0.315711,1.231560,0.183949,-1.134903,-0.907875,0.262951,0.0,3.0
SK 0973,-0.056329,-0.392756,-0.805769,0.0,-0.278668,-0.401469,-0.135743,-0.596135,0.845168,-0.611863,...,0.591827,-0.327424,-0.789252,-0.521096,0.105596,-0.156728,-1.049586,-0.185362,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SK 0848,0.161860,0.287258,-0.027173,0.0,0.366118,0.476507,2.411472,0.498191,-1.210923,0.718705,...,-0.084375,-0.090506,-0.278716,-0.544846,-1.005663,1.122598,2.166065,0.675746,0.0,2.0
SK 0731,-0.321257,-0.709148,0.683719,0.0,-0.770148,-0.433541,0.262259,-0.596135,0.476644,-0.472513,...,-0.901985,-0.189663,-0.559880,-0.789657,0.469656,-0.335607,0.473804,-0.536380,3.0,1.0
SK 1554,-0.343752,0.033051,-0.408572,0.0,0.244378,-0.508812,0.182659,-0.390271,1.009206,-0.162347,...,1.734780,0.387295,0.305811,0.402124,0.017703,-0.502033,-1.062576,0.165580,0.0,1.0
SK 1070,-0.300394,-0.523324,1.719590,0.0,-0.575230,-0.240076,0.023458,-0.340431,0.332830,-1.007438,...,-0.405465,-0.258411,-1.551356,-0.788439,-1.231639,0.802201,0.373425,-0.278733,2.0,1.0


## Logistic Regression (One vs Rest)

In [13]:
class_weight = {
    "HOME": 0.29,
    "OTHER": 0.14,
    "SHOPPING": 0.24,
    "TRAVEL": 0.15,
    "WORK": 0.27
}

In [22]:
pipe_lr = make_pipeline(
    LogisticRegression(
        penalty="l1", 
        random_state=42, 
        solver="saga", 
        max_iter=2000, 
        multi_class="ovr", 
        class_weight=class_weight,
        n_jobs=-1
    )
)

---
Transformed

In [21]:
pipe_lr.fit(transformed_train, y_train)
print(pipe_lr.score(transformed_train, y_train))
print(pipe_lr.score(transformed_test, y_test))

0.6550279329608939
0.475


---
Transformed + reduced

In [16]:
features = [
    'market_size', 
    'com0811_p_ta', 
    'nces_public_schools_nearest_dist',
    'transitstop_nearest_dist',
    'osm_highway_exits_count_2mi',
    'dmm_nearest_dist',
    'dmm_gla_1mi',
    'ipeds_postsecondary_schools_total_enrollment_2mi',
    'hrsa_hospitals_3mi',
    'nces_private_schools_nearest_dist',
    'pop_seasonal_ta',
    'osm_nearest_exit_dist',
    'com0002_p_ta',
    'genz_p_ta',
    'centerxy_gla_effective_5mi',
    'military_installations_5mi',
    'places_of_worship_3mi',
    'hrsa_hospitals_nearest_dist',
    'inrix_ew_distance',
    'age85pl_p_ta',
    'dmm_count_2mi',
    'genx_p_ta',
    'dtpop_students_p_ta',
]

In [18]:
reduced_train = transformed_train[features]
reduced_test = transformed_test[features]
reduced_train

Unnamed: 0_level_0,market_size,com0811_p_ta,nces_public_schools_nearest_dist,transitstop_nearest_dist,osm_highway_exits_count_2mi,dmm_nearest_dist,dmm_gla_1mi,ipeds_postsecondary_schools_total_enrollment_2mi,hrsa_hospitals_3mi,nces_private_schools_nearest_dist,...,genz_p_ta,centerxy_gla_effective_5mi,military_installations_5mi,places_of_worship_3mi,hrsa_hospitals_nearest_dist,inrix_ew_distance,age85pl_p_ta,dmm_count_2mi,genx_p_ta,dtpop_students_p_ta
store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SK 1662,1.0,0.267111,1.338132,-0.346193,-0.772777,0.572698,-0.741297,-0.391589,-0.752078,-0.484490,...,0.816515,-0.867218,-0.219922,-0.518870,1.538579,0.189643,-0.879793,-1.061480,1.002332,0.589739
SK 1855,0.0,0.525438,0.255703,-0.373329,-0.772777,0.184308,-0.741297,-0.376878,-0.070879,-0.753225,...,-0.848056,-0.080284,-0.219922,0.348790,-0.890793,-0.544881,0.148171,-0.074441,1.045927,-0.755550
SK 1814,0.0,-1.110633,0.568984,-0.032137,0.020070,-0.479654,-0.288608,-0.391589,-0.411478,-0.424224,...,0.743548,0.762966,-0.219922,-0.413344,-1.120461,1.900470,-0.798023,-0.567961,0.664466,0.139076
SK 1174,0.0,-1.275023,-0.729726,-0.155706,-0.772777,0.367383,-0.741297,2.564449,-0.752078,-0.159688,...,-0.145744,0.308539,-0.219922,-0.260917,0.439084,-0.316340,-0.611121,-1.061480,-0.112987,2.698979
SK 0973,0.0,-0.498477,0.072358,-0.450916,-0.277248,-0.310077,-0.130037,-0.391589,-0.752078,-0.107871,...,-0.273437,-0.646202,1.010268,-0.436794,0.871340,1.179956,0.276667,-0.074441,-0.879542,-0.494199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SK 0848,0.0,0.210748,-0.021750,-0.448298,2.002190,-0.226667,-0.153204,1.418769,0.610320,-0.433074,...,-0.775089,0.121555,1.010268,0.677094,-0.985037,-0.578225,-0.541032,0.912597,1.823380,0.802506
SK 0731,3.0,0.527003,-0.525588,-0.451547,-0.475459,-0.510320,0.341832,0.119614,-0.411478,0.625096,...,-0.086458,-0.005545,-0.219922,-0.225742,-0.531229,0.685826,-0.622802,1.899636,0.751657,-0.100496
SK 1554,0.0,-1.611631,-0.068760,0.059641,-0.079036,-0.156793,-0.741297,-0.391589,-0.411478,-0.362380,...,0.586211,0.465162,-0.219922,-0.178841,-0.410919,0.043424,-0.751297,0.419078,1.158549,2.129367
SK 1070,2.0,-0.570495,-0.506042,-0.452651,0.020070,-0.496332,-0.187736,-0.375534,1.972717,-0.381624,...,-0.303080,-0.164587,-0.219922,-0.002964,-0.352415,-0.085082,0.206578,-0.074441,-0.156583,-0.515978


In [28]:
pipe_lr = make_pipeline(
    LogisticRegression(
        penalty="l1", 
        random_state=42, 
        solver="saga", 
        max_iter=2000, 
        multi_class="ovr", 
        # class_weight=class_weight,
        n_jobs=-1
    )
)

In [29]:
pipe_lr.fit(reduced_train, y_train)
print(pipe_lr.score(reduced_train, y_train))
print(pipe_lr.score(reduced_test, y_test))

0.4958100558659218
0.4375


## PCA

In [None]:
n_components = 70
pca = PCA(n_components=n_components, whiten=True, random_state=42)
pca.fit(transformed_X)

In [None]:
df = pd.DataFrame(
    data=np.cumsum(pca.explained_variance_ratio_),
    columns=["cummulative variance_explained (%)"],
    index=range(1, n_components + 1),
)
df.index.name = "n_components"

In [None]:
plt.figure(figsize=(8, 6))
plt.xticks(range(1, 71, 5))
plt.xlabel("number of components")
plt.ylabel("cumulative explained variance ratio")
plt.plot(range(1, 71), np.cumsum(pca.explained_variance_ratio_))
plt.grid()
plt.show()

In [None]:
feature_names_out = pca.get_feature_names_out(column_names)
feature_names_out

In [None]:
transformed_features = pd.DataFrame(abs(pca.components_), columns=column_names, index=feature_names_out)
transformed_features.head()

### Try 1: Filter and Count Important Features (of heavy weights)

Filter the values in the PCA component matrix, and count the occurrence of the features in the filtered matrix.

In [None]:
pca0_df = transformed_features.iloc[0].sort_values(axis=0, ascending=True)
pca0_df.filter(pca0_df > 0.01)

In [None]:
W = abs(pca.components_)
features = transformed_features.columns
long_results = []
for i in range(W.shape[0]):
    array = W[i]
    heavy_idx = np.where(array > 0.01)
    long_results += list(features[heavy_idx])

In [None]:
count = pd.Series(long_results).value_counts()
count[:20]

## Hierachical Clustering

In [None]:
from scipy.cluster.hierarchy import (
    average,
    complete,
    dendrogram,
    fcluster,
    single,
    ward,
)

In [None]:
linkage_array = complete(transformed_X)
hier_labels = fcluster(linkage_array, 5, criterion="maxclust") 
# plot_dendrogram_clusters(X, linkage_array, hier_labels, linkage_type='complete', title="maxclust 3")

In [None]:
hier_labels

## PCA + Hierarchical Clustering

In [None]:
from scipy.cluster.hierarchy import (
    average,
    complete,
    dendrogram,
    fcluster,
    single,
    ward,
)

X = pca.transform(transformed_X)

In [None]:
linkage_array = complete(X)
hier_labels = fcluster(linkage_array, 5, criterion="maxclust") 
# plot_dendrogram_clusters(X, linkage_array, hier_labels, linkage_type='complete', title="maxclust 3")

In [None]:
hier_labels

In [None]:
y_train

## Random Forest Classifier

In [None]:
rf_pipe = make_pipeline(preprocessor, RandomForestClassifier(max_depth=50))
scores = cross_validate(rf_pipe, X_train, y_train, return_train_score=True)
pd.DataFrame(scores)

In [None]:

# from sklearn.model_selection import RandomizedSearchCV

# param_grid = {
#     "randomforestclassifier__max_depth": range(5, 30),
# }

# random_search = RandomizedSearchCV(
#     rf_pipe_best, param_distributions=param_grid, n_jobs=-1, n_iter=20, cv=5, random_state=42
# )
# random_search.fit(X_train, y_train)
# pd.DataFrame(random_search.cv_results_)[
#     [
#         "mean_test_score",
#         "param_randomforestclassifier__max_depth",
#         "mean_fit_time",
#         "rank_test_score",
#     ]
# ].set_index("rank_test_score").sort_index().T

In [None]:
# from random search above, best max depth was 23
rf_pipe_best = make_pipeline(preprocessor, RandomForestClassifier(max_depth=23))
scores = cross_validate(rf_pipe_best, X_train, y_train, return_train_score=True)
pd.DataFrame(scores)

In [None]:
rf_best_model = rf_pipe_best.named_steps["randomforestclassifier"]

In [None]:
rf_best_model.fit(transformed_X, y_train)

In [None]:
importance = rf_best_model.feature_importances_
for i,v in enumerate(importance):
    if v > 0.002:
        print(f'Feature: {column_names[i]}, Score: {v}')
        
# plot feature importance
plt.bar([x for x in range(len(importance))], importance)
plt.show()

## PCA + Random Forest Classifier

In [None]:
from sklearn.decomposition import PCA


pca_rf_pipe = make_pipeline(preprocessor, PCA(n_components=60, whiten=True, random_state=0), RandomForestClassifier(max_depth=20))
scores = cross_validate(pca_rf_pipe, X_train, y_train, return_train_score=True)
pd.DataFrame(scores)

In [None]:
# from sklearn.model_selection import RandomizedSearchCV

# param_grid = {
#     "pca__n_components": range(10, 60),
#     "randomforestclassifier__max_depth": range(12, 30),
# }

# print("Grid size: %d" % (np.prod(list(map(len, param_grid.values())))))
# param_grid

In [None]:
# random_search = RandomizedSearchCV(
#     pca_rf_pipe, param_distributions=param_grid, n_jobs=-1, n_iter=20, cv=5, random_state=42
# )
# random_search.fit(X_train, y_train)

In [None]:
# pd.DataFrame(random_search.cv_results_)[
#     [
#         "mean_test_score",
#         "param_pca__n_components",
#         "param_randomforestclassifier__max_depth",
#         "mean_fit_time",
#         "rank_test_score",
#     ]
# ].set_index("rank_test_score").sort_index().T

In [None]:
# from above random search, best param is n_components=13 and max_depth=28
pca_rf_best_pipe = make_pipeline(
    preprocessor, 
    PCA(n_components=13, whiten=True, random_state=0), 
    RandomForestClassifier(max_depth=28)
)
pca_rf_best_pipe