In [99]:
import os
import string
import sys
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import seaborn as sns
from sklearn import datasets
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.decomposition import PCA

pd.reset_option('all')

  pd.reset_option('all')
  pd.reset_option('all')
: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.

  pd.reset_option('all')


In [100]:
DIR = "../../data/"
SMOOTHIE = "Smoothie King/"

In [101]:
smoothie_demographic = pd.read_csv(DIR + SMOOTHIE + "processed_demographic.csv")
smoothie_stores = pd.read_csv(DIR + SMOOTHIE + "smoothie_king_stores.csv")
smoothie_poi_variables = pd.read_csv(DIR + SMOOTHIE + "processed_poi.csv")
# smoothie_sister = pd.read_csv(DIR + SMOOTHIE + "competition_sister_variables.csv")
smoothie_trade_area = pd.read_csv(DIR + SMOOTHIE + "processed_trade_area.csv").rename(columns={"store_num": "store"})

In [102]:
smoothie_merged = smoothie_stores.merge(
    smoothie_demographic, on="store", how="outer"
).merge(
    smoothie_poi_variables, on="store", how="outer"
).merge(
    smoothie_trade_area, on="store", how="outer"
)
smoothie_merged

Unnamed: 0,store,longitude,latitude,category,cbsa_name,dma_name,state_name,market_size,store_density,age0018_p_10mi,...,popgrfy_ta,popgrpy_ta,poverty_inpoverty_p_ta,spend_breakfastbrunch_ta,spend_dinner_ta,spend_foodbev_ta,spend_lunch_ta,wealth_hhavg_ta,wealth_hhtotal_ta,white_p_ta
0,SK 1504,-97.650392,30.519353,SHOPPING,"Austin-Round Rock, TX","Austin, TX",Texas,Large Metro (2),Light Suburban,0.2805,...,8.3789,3.9235,0.0611,7069439.0,40790484.0,230383651.0,23166216.0,240573.0,25223.0,0.4897
1,SK 0057,-88.171150,30.672501,SHOPPING,"Mobile, AL","Mobile et al, AL-FL",Alabama,Medium City (4),Light Suburban,0.2264,...,0.6017,0.3932,0.1830,4724526.0,25460067.0,160135521.0,14653701.0,217054.0,22216.0,0.5129
2,SK 1415,-90.535722,38.784250,HOME,"St. Louis, MO-IL","St. Louis, MO",Missouri,Very Large Metro (1),Light Suburban,0.2129,...,2.5003,0.7142,0.0639,4501211.0,24794631.0,151609187.0,14129014.0,245860.0,19907.0,0.8459
3,SK 1231,-80.134700,26.100737,TRAVEL,"Miami-Fort Lauderdale et al, FL","Miami-Ft. Lauderdale, FL",Florida,Very Large Metro (1),Suburban,0.2174,...,7.6482,12.7188,0.0831,4038906.0,23214366.0,146417939.0,12859709.0,255812.0,22124.0,0.7020
4,SK 1535,-96.856651,32.996408,WORK,"Dallas-Fort Worth-Arlington, TX","Dallas-Ft. Worth, TX",Texas,Very Large Metro (1),Light Suburban,0.2620,...,2.1871,-5.7794,0.0800,5919218.0,32751951.0,207616741.0,18696840.0,216763.0,29251.0,0.4593
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791,SK 1536,-96.872596,32.647809,HOME,"Dallas-Fort Worth-Arlington, TX","Dallas-Ft. Worth, TX",Texas,Very Large Metro (1),Exurban,0.2664,...,2.0540,-0.2994,0.1833,7614977.0,40037412.0,264008408.0,23332500.0,199437.0,29430.0,0.1490
792,SK 1886,-105.077634,40.564695,OTHER,"Fort Collins, CO","Denver, CO",Colorado,Medium City (4),Light Suburban,0.2224,...,4.7691,2.3442,0.2202,5118356.0,28458285.0,197532602.0,16637181.0,216659.0,25712.0,0.7874
793,SK 0162,-95.478001,30.316531,SHOPPING,"Houston-The Woodlands et al, TX","Houston, TX",Texas,Very Large Metro (1),Exurban,0.2412,...,9.9822,8.1691,0.1202,4677280.0,25445615.0,161088758.0,14627532.0,216855.0,19407.0,0.5041
794,SK 1449,-78.968258,35.064994,SHOPPING,"Fayetteville, NC","Raleigh et al, NC",North Carolina,Medium City (4),Exurban,0.2614,...,-0.3816,-1.0761,0.1809,10318478.0,54577513.0,349542567.0,31683602.0,208448.0,41681.0,0.3933


In [103]:
train_df, test_df = train_test_split(smoothie_merged, test_size=0.1, random_state=42)
X_train = train_df.drop(columns=["category"])
y_train = train_df["category"]
X_test = test_df.drop(columns=["category"])
y_test = test_df["category"]

In [104]:
drop_features = ['store',
 'longitude',
 'latitude',
 'cbsa_name',
 'dma_name',
 'state_name',
 ]

ordinal_features_oth = [
    "market_size",
    "store_density",
]
ordering_ordinal_oth = [
    ["Very Large Metro (1)", "Large Metro (2)", "Large City (3)", "Medium City (4)", "Small City (5)", "Small Town (6)"],
    ["Rural", "Exurban", "Suburban", "Light Suburban", "Light Urban", "Urban", "Super Urban"],
]
numeric_features = list(set(smoothie_merged.select_dtypes(include=np.number).columns.tolist()) - {"longitude", "latitude"})

In [105]:
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="median"), 
    StandardScaler()
)

ordinal_transformer_oth = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder(categories=ordering_ordinal_oth),
)

preprocessor = make_column_transformer(
    ("drop", drop_features),
    (numeric_transformer, numeric_features),
    (ordinal_transformer_oth, ordinal_features_oth),
)

In [106]:
preprocessor.fit(X_train)

In [107]:
column_names = (
    numeric_features
    + preprocessor.named_transformers_['pipeline-2']['ordinalencoder'].get_feature_names_out().tolist()
)
len(column_names)

920

In [108]:
transformed_X = preprocessor.transform(X_train)

In [109]:
transformed_X_df = pd.DataFrame(transformed_X, columns=column_names)
transformed_X_df

Unnamed: 0,hhinc100pl_p_1mi,emp_healthcare_social_assistance_p_1mi,hh_2vehicle_p_2mi,hh_0vehicle_p_2mi,edu_lt9_p_2mi,popgr10cn_3mi,hh_type_nonfam_p_2mi,disposable_inc_avg_10mi,inrix_dinner_ew,com0508_p_10mi,...,millenial_p_2mi,occ_military_p_1mi,gdp_5mi,spend_foodbev_3mi,medhhinc_ta,wealth_hhtotal_5mi,emp_healthcare_social_assistance_p_3mi,dtpop_students_prek_8th_p_1mi,x0,x1
0,1.810539,-0.768309,1.608135,-0.914694,-0.547754,1.033125,-1.044549,1.296042,0.546785,-0.096007,...,-1.840850,-0.073391,-0.286680,-0.134294,1.278336,-0.458407,-0.554679,0.548309,1.0,0.0
1,0.096773,1.057724,0.690787,-0.219625,-0.646834,-0.437044,0.380553,-0.476296,1.066532,0.650561,...,-1.267476,-0.073391,0.556189,0.819625,0.475989,0.921805,0.571455,-0.013335,0.0,3.0
2,2.018148,2.469029,1.548529,-0.821887,-0.468490,0.326839,-0.688791,1.097305,-0.980738,-1.159351,...,-1.090686,-0.073391,-0.152813,-0.157540,2.114088,-0.287045,1.745926,0.665319,0.0,1.0
3,-0.420928,-0.456304,0.484347,-0.442758,0.753497,-0.251449,-0.721885,-0.307706,0.964912,-1.777495,...,0.481315,-0.073391,-0.008175,0.004681,-0.067282,-0.206596,-1.104682,-0.738793,0.0,3.0
4,1.147380,-0.328282,0.987362,-0.142615,-0.418950,-0.365053,-0.550211,2.071362,1.080124,-0.674768,...,-0.703658,-0.073391,-0.170803,0.017035,1.640565,-0.300007,0.142949,-0.808998,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
711,1.495819,1.850385,-0.415556,0.260210,-0.745914,-0.271398,1.609160,-0.050970,-0.062282,1.061515,...,0.777559,-0.073391,0.694504,0.547845,0.965395,0.753635,-0.254203,-0.902606,0.0,2.0
712,0.041895,1.587443,0.344782,-0.596779,-0.924258,0.084731,-0.945268,-0.995101,-1.011806,0.977612,...,-0.522090,-0.073391,-0.390762,-0.384720,-0.337504,-0.474421,0.628937,-0.949409,3.0,1.0
713,0.429344,-1.092579,1.057145,-0.644170,1.800443,0.542454,-0.477818,2.415281,1.782396,-2.537761,...,-0.545980,-0.073391,-0.073063,-0.028478,1.003457,-0.185065,-0.915251,0.735524,0.0,1.0
714,-0.250344,-1.155440,-0.043383,-0.114970,-0.577478,-0.464431,-0.239956,-0.868585,-0.369081,1.302951,...,0.371419,-0.073391,-0.396751,-0.428644,-0.541813,-0.414887,1.333097,-0.317559,2.0,1.0


In [111]:
# from sklearn.feature_selection import RFECV

# rfecv = RFECV(LogisticRegression(random_state=42, multi_class="ovr", n_jobs=-1))

# pipe_rf_rfecv = make_pipeline(
#     preprocessor, rfecv, RandomForestClassifier(random_state=42)
# )
# pipe_rf_rfecv.fit(X_train, y_train);

In [125]:
print(pipe_rf_rfecv.score(X_train, y_train))
print(pipe_rf_rfecv.score(X_test, y_test))

1.0
0.4


In [126]:
pipe_rf_rfecv.named_steps["rfecv"].n_features_

20

In [129]:
pipe_rf_rfecv.named_steps["rfecv"].ranking_

array([788, 163, 748, 217,  67, 598, 211, 684, 806,  84, 822, 261, 800,
       784, 567, 558,  93, 448, 787, 500, 280, 588, 774, 326, 634, 483,
       368, 868,  73,   6,  92,   1, 152,  79, 123, 320, 325,   7,  72,
       197, 522, 241, 383, 481, 752, 283, 354, 689, 757, 183, 875,  69,
       203, 480,   1, 371, 119, 769,  83, 351, 890, 330, 305, 307, 294,
       194, 213, 341,  16, 215, 861,  76, 489, 223, 105, 887, 246, 815,
       858, 593, 603, 564, 628, 812,  98, 234, 736, 450, 873, 382, 631,
       156, 609,  85, 350, 807, 452, 584, 569, 252, 378,  29, 175, 712,
       373, 107, 715, 332, 517, 544, 714, 389, 866,  11, 160, 694,  46,
       614, 124, 797,  74, 199, 369, 471, 605, 154, 437, 430, 497, 279,
        34, 132, 898, 560, 463, 501, 358, 226, 375, 343,  70,  27, 314,
       162, 392, 402, 364,  86, 777, 359, 707, 401, 649, 461, 580, 182,
       726,  39,  28,  18, 377, 658, 511, 164, 398, 340, 434, 857, 655,
       424, 737, 794, 874,  81, 620, 136,  95, 641, 116,   1, 30

In [133]:
selected_features = pipe_rf_rfecv.named_steps['rfecv'].support_
print("Selected Features:")
for feature, selected in zip(X_train.columns, selected_features):
    if selected:
        print(feature)

Selected Features:
avg_faminc_3mi
com0205_p_1mi
edu_lt9_p_1mi
emp_accommodation_foodserv_p_5mi
emp_unknown_5mi
genx_p_5mi
gq_college_p_2mi
gq_other_p_1mi
hh_3pers_p_1mi
hh_type_nonfam_p_1mi
hu_ownerocc_5mi
occ_military_p_10mi
pop_dens_1mi
popgrfy_10mi
spend_lunch_3mi
centerxy_full_gla_5mi
inrix_night_ns
places_of_worship_2mi
avg_faminc_ta
mortgage_avgrisk_ta
