# Draft for Model + PCA (Smoothie King)

RandomForestClassifier
RandomForestClassifier + PCA
Hierarchical Clustering
Hierarchical Clustering + PCA

In [None]:
import warnings
# Suppress every FutureWarning for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
import os
import string
import sys
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import seaborn as sns
from sklearn import datasets
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.model_selection import (
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.decomposition import PCA

# Restore all pandas options to their default values.
pd.reset_option('all')

In [None]:
# Relative path of the data directory.
DIR = "../../data/"
# Sub-folder plus file-name prefix that is prepended to each Smoothie King CSV name.
SMOOTHIE = "Smoothie King/smoothie_king_"

In [None]:
def _read_smoothie_csv(file_name):
    """Read the CSV located at ``DIR + SMOOTHIE + file_name`` into a DataFrame."""
    return pd.read_csv(DIR + SMOOTHIE + file_name)


smoothie_demographic = _read_smoothie_csv("demographic_variables.csv")
smoothie_stores = _read_smoothie_csv("stores.csv")
smoothie_poi_variables = _read_smoothie_csv("poi_variables.csv")
smoothie_sister = _read_smoothie_csv("competition_sister_variables.csv")
# The trade-area table calls its key "store_num"; rename it to "store" so it
# shares a key name with the other tables.
smoothie_trade_area = _read_smoothie_csv("trade_area_variables.csv").rename(
    columns={"store_num": "store"}
)

In [None]:
# Outer-join every variable table onto the store list, one table at a time,
# always matching on the "store" key.
smoothie_merged = smoothie_stores
for variable_table in (
    smoothie_demographic,
    smoothie_poi_variables,
    smoothie_sister,
    smoothie_trade_area,
):
    smoothie_merged = smoothie_merged.merge(variable_table, on="store", how="outer")

# Persist the combined table next to the source files.
merged_path = DIR + SMOOTHIE + "merged.csv"
smoothie_merged.to_csv(merged_path)

In [None]:
# Name of the label column that is separated from the predictors.
target_column = "category"

# Seeded 90/10 split of the merged table.
train_df, test_df = train_test_split(
    smoothie_merged,
    test_size=0.1,
    random_state=42,
)

X_train, y_train = train_df.drop(columns=[target_column]), train_df[target_column]
X_test, y_test = test_df.drop(columns=[target_column]), test_df[target_column]

In [None]:
# Columns handed to the preprocessor's "drop" step.
drop_features = ["store", "longitude", "latitude", "__store_latitude"]

# Ordered categorical columns and, in the same order, their level orderings.
ordinal_features_oth = [
    "market_size",
    "store_density",
]
ordering_ordinal_oth = [
    ["Very Large Metro (1)", "Large Metro (2)", "Large City (3)", "Medium City (4)", "Small City (5)", "Small Town (6)"],
    ["Rural", "Exurban", "Suburban", "Light Suburban", "Light Urban", "Urban", "Super Urban"],
]

# Unordered categorical columns (one-hot encoded).
categorical_features = ["cbsa_name", "dma_name", "state_name"]

# Numeric predictors.  Every column in drop_features is excluded (not only
# longitude/latitude), otherwise a numeric "store" or "__store_latitude"
# column would be dropped *and* fed to the scaler as a feature.  The
# "category" target is excluded as well because it is absent from X_train.
# Iterating the columns instead of building a set keeps the DataFrame's
# column order, so the transformed matrix has the same layout on every run.
excluded_from_numeric = set(drop_features) | {"category"}
numeric_features = [
    column
    for column in smoothie_merged.select_dtypes(include=np.number).columns
    if column not in excluded_from_numeric
]

In [None]:
# Numeric columns: fill gaps with the median, then standardise.
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Ordered categoricals: fill gaps with the mode, then map each level to its
# position in ordering_ordinal_oth.
ordinal_transformer_oth = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder(categories=ordering_ordinal_oth),
)

# OneHotEncoder's `sparse` argument was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4.  Prefer the new spelling and fall back
# to the old one so the notebook runs on both sides of the rename.
try:
    dense_one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    dense_one_hot = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Unordered categoricals: label gaps as "missing", then one-hot encode.
# Categories unseen during fit are encoded as all zeros.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    dense_one_hot,
)

# Output column order follows the order of the entries below; the "drop"
# entry contributes no columns.
preprocessor = make_column_transformer(
    ("drop", drop_features),
    (numeric_transformer, numeric_features),
    (ordinal_transformer_oth, ordinal_features_oth),
    (categorical_transformer, categorical_features),
)

In [None]:
# Fit the imputers, scaler and encoders on the training rows only.
preprocessor.fit(X_train)

In [None]:
# Labels for the preprocessor output, in the order the transformer blocks are
# concatenated: numeric, then ordinal, then one-hot columns.
fitted_ordinal = preprocessor.named_transformers_['pipeline-2']['ordinalencoder']
fitted_one_hot = preprocessor.named_transformers_['pipeline-3']['onehotencoder']

# Both encoders sit behind a SimpleImputer and are therefore fitted on
# unnamed arrays; without input names they would report "x0", "x1",
# "x0_<category>", ...  Passing the original column names gives labels such
# as "market_size" and "state_name_<category>" that can be read downstream.
column_names = (
    list(numeric_features)
    + fitted_ordinal.get_feature_names_out(ordinal_features_oth).tolist()
    + fitted_one_hot.get_feature_names_out(categorical_features).tolist()
)
len(column_names)

In [None]:
# Display the fitted one-hot encoder from the categorical pipeline.
preprocessor.named_transformers_['pipeline-3']['onehotencoder']

In [None]:
# Apply the fitted preprocessor to the training rows.
transformed_X = preprocessor.transform(X_train)
# The equivalent transform of the test split is currently disabled.
# X_test_processed = preprocessor.transform(X_test)

In [None]:
# Wrap the transformed training matrix in a DataFrame labelled with column_names.
transformed_X_df = pd.DataFrame(transformed_X, columns=column_names)
transformed_X_df

In [None]:
# Number of principal components to keep.
n_components = 70
# whiten=True rescales the projected components to unit variance;
# random_state seeds the solver when a randomized SVD is used.
pca = PCA(n_components=n_components, whiten=True, random_state=42)
pca.fit(transformed_X)

In [None]:
# Running total of the explained-variance ratio, indexed by how many
# components (1..n_components) are included.
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
df = pd.DataFrame(
    {"cummulative variance_explained (%)": cumulative_ratio},
    index=pd.RangeIndex(1, n_components + 1, name="n_components"),
)

In [None]:
# Cumulative explained-variance ratio after 1, 2, ... components.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Take the x-axis length from the fitted PCA instead of a hard-coded 71, so
# the plot stays correct when the number of components is changed.
component_counts = range(1, len(cumulative_variance) + 1)

plt.figure(figsize=(8, 6))
plt.xticks(component_counts[::5])  # one tick every five components
plt.xlabel("number of components")
plt.ylabel("cumulative explained variance ratio")
plt.plot(component_counts, cumulative_variance)
plt.grid()
plt.show()

In [None]:
# Names of the PCA output columns, one per principal component.
feature_names_out = pca.get_feature_names_out(column_names)
feature_names_out

In [None]:
# Absolute PCA loadings: one row per principal component, one column per
# preprocessed input feature.
component_loadings = np.abs(pca.components_)
transformed_features = pd.DataFrame(
    component_loadings,
    index=feature_names_out,
    columns=column_names,
)
transformed_features.head()

### Try 1: Filter and Count Important Features (Those with Heavy Weights)

Keep only the entries of the absolute PCA component (loading) matrix that exceed 0.01, then count how many principal components each feature still appears in after filtering.

In [None]:
# Absolute loadings of the first principal component, smallest first.
pca0_df = transformed_features.iloc[0].sort_values(axis=0, ascending=True)
# Series.filter() selects by index *label*; handing it a boolean Series
# matches no labels and yields an empty result.  Boolean indexing is what
# actually keeps the features whose loading exceeds 0.01.
pca0_df[pca0_df > 0.01]

In [None]:
# Absolute loading matrix (components x features) and the feature labels.
W = abs(pca.components_)
features = transformed_features.columns

# For each component in turn, collect the names of the features whose
# absolute loading exceeds 0.01; a feature appears once per component in
# which it is "heavy".
long_results = [
    feature_name
    for component_weights in W
    for feature_name in features[component_weights > 0.01]
]

In [None]:
# Number of components in which each feature was heavy, most frequent first.
heavy_feature_names = pd.Series(long_results)
count = heavy_feature_names.value_counts()
count.head(20)

## Hierarchical Clustering

In [None]:
from scipy.cluster.hierarchy import (
    average,
    complete,
    dendrogram,
    fcluster,
    single,
    ward,
)

In [None]:
# Complete-linkage clustering of the preprocessed training matrix.
linkage_array = complete(transformed_X)
# Cut the tree so that at most 5 flat clusters are formed.
hier_labels = fcluster(linkage_array, 5, criterion="maxclust") 
# NOTE(review): the disabled plot call below still carries the title
# "maxclust 3" although 5 clusters are requested above.
# plot_dendrogram_clusters(X, linkage_array, hier_labels, linkage_type='complete', title="maxclust 3")

In [None]:
# Display the flat cluster label assigned to each clustered row.
hier_labels

## PCA + Hierarchical Clustering

In [None]:
from scipy.cluster.hierarchy import (
    average,
    complete,
    dendrogram,
    fcluster,
    single,
    ward,
)

# Project the preprocessed training matrix onto the fitted principal components.
X = pca.transform(transformed_X)

In [None]:
# Complete-linkage clustering of the PCA-projected training matrix.
linkage_array = complete(X)
# Cut the tree so that at most 5 flat clusters are formed.
hier_labels = fcluster(linkage_array, 5, criterion="maxclust") 
# NOTE(review): the disabled plot call below still carries the title
# "maxclust 3" although 5 clusters are requested above.
# plot_dendrogram_clusters(X, linkage_array, hier_labels, linkage_type='complete', title="maxclust 3")

In [None]:
# Display the flat cluster label assigned to each clustered row.
hier_labels

In [None]:
# Display the training labels.
y_train

## Random Forest Classifier

In [None]:
# Baseline model: the preprocessor followed by a random forest.
# random_state is fixed so the forest's bootstrap samples and feature
# subsets, and therefore the cross-validation scores, are reproducible
# between runs; 42 matches the seed used for the train/test split.
lr_pipe = make_pipeline(
    preprocessor,
    RandomForestClassifier(max_depth=50, random_state=42),
)
# Default cross-validation, reporting train scores alongside test scores.
scores = cross_validate(lr_pipe, X_train, y_train, return_train_score=True)
pd.DataFrame(scores)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate tree depths for the forest step of lr_pipe.
param_grid = {"randomforestclassifier__max_depth": range(5, 30)}

# Sample 20 candidates and score each with 5-fold CV on all available cores.
random_search = RandomizedSearchCV(
    lr_pipe,
    param_distributions=param_grid,
    n_iter=20,
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

# One column per candidate, ordered by rank (best first).
report_columns = [
    "mean_test_score",
    "param_randomforestclassifier__max_depth",
    "mean_fit_time",
    "rank_test_score",
]
search_report = pd.DataFrame(random_search.cv_results_)[report_columns]
search_report.set_index("rank_test_score").sort_index().T

## PCA + Random Forest Classifier

In [None]:
from sklearn.decomposition import PCA


# Preprocess, project onto 60 whitened principal components, then classify
# with a random forest.  The forest is seeded so that its bootstrap samples
# and feature subsets, and therefore the cross-validation scores, are
# reproducible between runs.
lr_pca_pipe = make_pipeline(
    preprocessor,
    PCA(n_components=60, whiten=True, random_state=0),
    RandomForestClassifier(max_depth=20, random_state=42),
)
# Default cross-validation, reporting train scores alongside test scores.
scores = cross_validate(lr_pca_pipe, X_train, y_train, return_train_score=True)
pd.DataFrame(scores)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space: number of principal components and forest depth.
param_grid = {
    "pca__n_components": range(10, 60),
    "randomforestclassifier__max_depth": range(12, 30),
}

# Size of the full grid = product of the number of candidates per parameter.
grid_size = np.prod([len(candidates) for candidates in param_grid.values()])
print("Grid size: %d" % grid_size)
param_grid

In [None]:
# Randomized search over param_grid for the PCA + random-forest pipeline:
# 20 sampled candidates, 5-fold cross-validation, all available cores.
random_search = RandomizedSearchCV(
    lr_pca_pipe, param_distributions=param_grid, n_jobs=-1, n_iter=20, cv=5, random_state=42
)
random_search.fit(X_train, y_train)

In [None]:
# Summarise the search: one column per candidate, ordered by rank (best first).
report_columns = [
    "mean_test_score",
    "param_pca__n_components",
    "param_randomforestclassifier__max_depth",
    "mean_fit_time",
    "rank_test_score",
]
search_report = pd.DataFrame(random_search.cv_results_)[report_columns]
search_report.set_index("rank_test_score").sort_index().T