In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from lightgbm.sklearn import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier

from sklearn.feature_selection import RFE

In [3]:
DIR = "../../data/"
SMOOTHIE = "Smoothie King/"

smoothie_demographic = pd.read_csv(DIR + SMOOTHIE + "processed_demographic.csv")
smoothie_stores = pd.read_csv(DIR + SMOOTHIE + "smoothie_king_stores.csv")
smoothie_poi_variables = pd.read_csv(DIR + SMOOTHIE + "processed_poi.csv")
# smoothie_sister = pd.read_csv(DIR + SMOOTHIE + "competition_sister_variables.csv")
smoothie_trade_area = pd.read_csv(DIR + SMOOTHIE + "processed_trade_area.csv").rename(columns={"store_num": "store"})

In [4]:
smoothie_merged = smoothie_stores.merge(
    smoothie_demographic, on="store", how="outer"
).merge(
    smoothie_poi_variables, on="store", how="outer"
).merge(
    smoothie_trade_area, on="store", how="outer"
)
smoothie_merged

Unnamed: 0,store,longitude,latitude,category,cbsa_name,dma_name,state_name,market_size,store_density,age0018_p_10mi,...,popgrfy_ta,popgrpy_ta,poverty_inpoverty_p_ta,spend_breakfastbrunch_ta,spend_dinner_ta,spend_foodbev_ta,spend_lunch_ta,wealth_hhavg_ta,wealth_hhtotal_ta,white_p_ta
0,SK 1504,-97.650392,30.519353,SHOPPING,"Austin-Round Rock, TX","Austin, TX",Texas,Large Metro (2),Light Suburban,0.2805,...,8.3789,3.9235,0.0611,7069439.0,40790484.0,230383651.0,23166216.0,240573.0,25223.0,0.4897
1,SK 0057,-88.171150,30.672501,SHOPPING,"Mobile, AL","Mobile et al, AL-FL",Alabama,Medium City (4),Light Suburban,0.2264,...,0.6017,0.3932,0.1830,4724526.0,25460067.0,160135521.0,14653701.0,217054.0,22216.0,0.5129
2,SK 1415,-90.535722,38.784250,HOME,"St. Louis, MO-IL","St. Louis, MO",Missouri,Very Large Metro (1),Light Suburban,0.2129,...,2.5003,0.7142,0.0639,4501211.0,24794631.0,151609187.0,14129014.0,245860.0,19907.0,0.8459
3,SK 1231,-80.134700,26.100737,TRAVEL,"Miami-Fort Lauderdale et al, FL","Miami-Ft. Lauderdale, FL",Florida,Very Large Metro (1),Suburban,0.2174,...,7.6482,12.7188,0.0831,4038906.0,23214366.0,146417939.0,12859709.0,255812.0,22124.0,0.7020
4,SK 1535,-96.856651,32.996408,WORK,"Dallas-Fort Worth-Arlington, TX","Dallas-Ft. Worth, TX",Texas,Very Large Metro (1),Light Suburban,0.2620,...,2.1871,-5.7794,0.0800,5919218.0,32751951.0,207616741.0,18696840.0,216763.0,29251.0,0.4593
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791,SK 1536,-96.872596,32.647809,HOME,"Dallas-Fort Worth-Arlington, TX","Dallas-Ft. Worth, TX",Texas,Very Large Metro (1),Exurban,0.2664,...,2.0540,-0.2994,0.1833,7614977.0,40037412.0,264008408.0,23332500.0,199437.0,29430.0,0.1490
792,SK 1886,-105.077634,40.564695,OTHER,"Fort Collins, CO","Denver, CO",Colorado,Medium City (4),Light Suburban,0.2224,...,4.7691,2.3442,0.2202,5118356.0,28458285.0,197532602.0,16637181.0,216659.0,25712.0,0.7874
793,SK 0162,-95.478001,30.316531,SHOPPING,"Houston-The Woodlands et al, TX","Houston, TX",Texas,Very Large Metro (1),Exurban,0.2412,...,9.9822,8.1691,0.1202,4677280.0,25445615.0,161088758.0,14627532.0,216855.0,19407.0,0.5041
794,SK 1449,-78.968258,35.064994,SHOPPING,"Fayetteville, NC","Raleigh et al, NC",North Carolina,Medium City (4),Exurban,0.2614,...,-0.3816,-1.0761,0.1809,10318478.0,54577513.0,349542567.0,31683602.0,208448.0,41681.0,0.3933


In [5]:
train_df, test_df = train_test_split(smoothie_merged, test_size=0.1, random_state=42)
X_train = train_df.drop(columns=["category"])
y_train = train_df["category"]
X_test = test_df.drop(columns=["category"])
y_test = test_df["category"]

In [6]:
drop_features = ['store',
 'longitude',
 'latitude',
 'cbsa_name',
 'dma_name',
 'state_name',
 ]

ordinal_features_oth = [
    "market_size",
    "store_density",
]
ordering_ordinal_oth = [
    ["Very Large Metro (1)", "Large Metro (2)", "Large City (3)", "Medium City (4)", "Small City (5)", "Small Town (6)"],
    ["Rural", "Exurban", "Suburban", "Light Suburban", "Light Urban", "Urban", "Super Urban"],
]
numeric_features = list(set(smoothie_merged.select_dtypes(include=np.number).columns.tolist()) - {"longitude", "latitude"})

In [7]:
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="median"), 
    StandardScaler()
)

ordinal_transformer_oth = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder(categories=ordering_ordinal_oth),
)

preprocessor = make_column_transformer(
    ("drop", drop_features),
    (numeric_transformer, numeric_features),
    (ordinal_transformer_oth, ordinal_features_oth),
)

In [8]:
preprocessor.fit(X_train)

In [9]:
column_names = (
    numeric_features
    + preprocessor.named_transformers_['pipeline-2']['ordinalencoder'].get_feature_names_out().tolist()
)
len(column_names)

920

In [10]:
transformed_X = preprocessor.transform(X_train)

In [11]:
transformed_X_df = pd.DataFrame(transformed_X, columns=column_names)
transformed_X_df

Unnamed: 0,spend_breakfastbrunch_ta,emp_accommodation_foodserv_p_3mi,hh_7pers_p_ta,hh_type_male_nochild_p_ta,hh_1vehicle_p_5mi,age0018_p_1mi,hh_type_1pers_p_10mi,hu_ownerocc_1mi,military_installations_2mi,medsalcy_3mi,...,emp_manfacturing_p_3mi,spend_lunch_3mi,inrix_dinner_ns,dtpop_students_9th_12th_p_1mi,hh_4vehicle_p_5mi,hh_type_male_child_p_1mi,emp_retail_trade_p_5mi,hh_type_fam_p_10mi,x0,x1
0,3.006608,-0.578451,-0.255387,-1.411746,-2.293129,-0.445402,-1.025068,-0.285635,-0.12555,0.479451,...,0.366232,-0.101906,0.488229,2.459765,1.252747,1.189833,-0.180202,0.623062,1.0,0.0
1,1.082548,-0.527091,-0.993526,-0.277228,0.427386,-0.169020,0.926991,1.572752,-0.12555,1.109466,...,-0.013909,0.699866,1.207545,-0.411251,-0.563990,-0.350107,-0.694044,-0.642321,0.0,3.0
2,-0.690315,-0.116207,1.053132,-1.075928,-1.771367,-0.042033,-2.235070,0.105572,-0.12555,0.197425,...,-0.287299,-0.132011,1.333107,0.999073,0.851812,0.416935,-1.089913,2.168023,0.0,1.0
3,0.045323,-0.013486,0.516304,0.811909,-1.020539,0.874261,-0.935003,0.332584,-0.12555,1.169469,...,0.353214,0.031376,-0.865277,-0.814200,1.046014,1.154701,-0.796288,0.979815,0.0,3.0
4,1.011306,-0.146167,-0.344859,0.258264,-1.070029,-1.184911,-0.696135,-0.336635,-0.12555,0.302638,...,-0.594537,0.007517,-0.165114,-0.411251,1.183836,-0.309120,-0.961452,0.735184,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
711,-0.973485,0.003634,-1.317860,-1.257451,1.277194,0.702456,1.056214,1.756228,-0.12555,0.746686,...,-0.521633,0.527092,-0.354165,-1.116412,-1.453564,-0.976622,-1.441213,-1.067513,0.0,2.0
712,0.224869,0.495839,-0.937607,0.212884,0.259121,-1.003146,0.578479,-0.512025,-0.12555,-1.196983,...,-0.607556,-0.385537,-0.902875,-0.663094,0.275468,-0.437936,1.128000,-0.206936,3.0,1.0
713,0.266487,-0.069126,1.008396,0.076742,-0.880554,1.616260,-1.959002,0.103706,-0.12555,0.805167,...,-0.081606,-0.016699,-0.956788,0.847967,0.306791,0.434501,0.278586,1.956883,0.0,1.0
714,-1.391657,-0.920854,-0.031709,-0.141086,1.183871,0.500771,0.453171,-0.129525,-0.12555,-0.154384,...,-1.091846,-0.409191,-0.480791,-0.461619,-0.620371,0.171014,-1.328482,-0.205480,2.0,1.0


In [None]:
def evaluate_models(rfe_estimator, rfe_num_features_start,  rfe_num_features_end, random_state=42):
	for i in range(rfe_num_features_start, rfe_num_features_end+1):
		rfe = RFE(estimator=rfe_estimator, n_features_to_select=i)
		pipe_rf_rfe = make_pipeline(preprocessor, 
			                        rfe, 
				                    RandomForestClassifier(random_state=random_state))
		
		pipe_rf_rfe.fit(X_train, y_train)
		
		print('---- RFE with ', i, ' features selected')
		print(pipe_rf_rfe.score(X_train, y_train))
		print(pipe_rf_rfe.score(X_test, y_test))	