In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier

In [2]:
trade_area = pd.read_csv("../../data/Smoothie King/smoothie_king_trade_area_variables.csv")
poi = pd.read_csv('../../data/Smoothie King/smoothie_king_poi_variables.csv')
stores = pd.read_csv('../../data/Smoothie King/smoothie_king_stores.csv')
merged = stores.merge(trade_area, left_on="store", right_on="store_num").merge(poi)
merged = merged.drop(columns=["store_num", "country_code"])
merged

Unnamed: 0,store,longitude,latitude,category,cbsa_name,dma_name,state_name,market_size,store_density,medhhinc_dma,...,nces_public_schools_total_enrollment_10mi,nces_public_schools_10mi,osm_nearest_exit_dist,osm_highway_exits_count_1mi,osm_highway_exits_count_2mi,osm_highway_exits_count_3mi,osm_highway_exits_count_5mi,osm_highway_exits_count_10mi,transitstop_nearest_dist,transitstops
0,SK 1504,-97.650392,30.519353,SHOPPING,"Austin-Round Rock, TX","Austin, TX",Texas,Large Metro (2),Light Suburban,85075.0,...,110757.0,174,1.875541,0,2,17,45,123,14.818824,0
1,SK 0057,-88.171150,30.672501,SHOPPING,"Mobile, AL","Mobile et al, AL-FL",Alabama,Medium City (4),Light Suburban,58200.0,...,45635.0,78,2.617072,0,0,3,14,58,49.572856,0
2,SK 1415,-90.535722,38.784250,HOME,"St. Louis, MO-IL","St. Louis, MO",Missouri,Very Large Metro (1),Light Suburban,66049.0,...,55532.0,96,0.194937,2,8,25,45,162,3.518308,0
3,SK 1231,-80.134700,26.100737,TRAVEL,"Miami-Fort Lauderdale et al, FL","Miami-Ft. Lauderdale, FL",Florida,Very Large Metro (1),Suburban,62779.0,...,121834.0,184,1.393043,0,6,31,61,114,0.021790,25
4,SK 1535,-96.856651,32.996408,WORK,"Dallas-Fort Worth-Arlington, TX","Dallas-Ft. Worth, TX",Texas,Very Large Metro (1),Light Suburban,76139.0,...,179702.0,280,0.711949,3,14,23,72,366,0.055289,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791,SK 1536,-96.872596,32.647809,HOME,"Dallas-Fort Worth-Arlington, TX","Dallas-Ft. Worth, TX",Texas,Very Large Metro (1),Exurban,76139.0,...,136369.0,254,0.261721,14,23,36,81,258,0.012345,24
792,SK 1886,-105.077634,40.564695,OTHER,"Fort Collins, CO","Denver, CO",Colorado,Medium City (4),Light Suburban,82668.0,...,36249.0,67,4.012518,0,0,0,10,17,0.189059,13
793,SK 0162,-95.478001,30.316531,SHOPPING,"Houston-The Woodlands et al, TX","Houston, TX",Texas,Very Large Metro (1),Exurban,72903.0,...,48840.0,61,0.397305,4,11,15,23,38,0.699036,0
794,SK 1449,-78.968258,35.064994,SHOPPING,"Fayetteville, NC","Raleigh et al, NC",North Carolina,Medium City (4),Exurban,66003.0,...,41121.0,75,0.706073,5,7,12,46,109,4.771075,0


In [3]:
print(len(merged["dma_name"].unique()))
len(merged["state_name"].unique())
len(merged["cbsa_name"].unique())

109


171

In [4]:
miss = merged.columns[merged.isnull().any()].tolist()
miss

['inrix_breakfast_ns',
 'inrix_lunch_ns',
 'inrix_afternoon_ns',
 'inrix_dinner_ns',
 'inrix_night_ns',
 'inrix_overnight_ns',
 'inrix_breakfast_ew',
 'inrix_lunch_ew',
 'inrix_afternoon_ew',
 'inrix_dinner_ew',
 'inrix_night_ew',
 'inrix_overnight_ew']

In [5]:
merged = merged.drop(columns=["store", "longitude", "latitude", "state_name", "cbsa_name", "dma_name"])
# merged = merged.drop(columns=miss)
merged = merged.dropna()

In [6]:
merged

Unnamed: 0,category,market_size,store_density,medhhinc_dma,medhhinc_1mi,age0018_ta,age0018_p_ta,age65pl_ta,age65pl_p_ta,age85pl_ta,...,nces_public_schools_total_enrollment_10mi,nces_public_schools_10mi,osm_nearest_exit_dist,osm_highway_exits_count_1mi,osm_highway_exits_count_2mi,osm_highway_exits_count_3mi,osm_highway_exits_count_5mi,osm_highway_exits_count_10mi,transitstop_nearest_dist,transitstops
0,SHOPPING,Large Metro (2),Light Suburban,85075.0,83846.0,21456.0,0.2907,6709.0,0.0909,372.0,...,110757.0,174,1.875541,0,2,17,45,123,14.818824,0
1,SHOPPING,Medium City (4),Light Suburban,58200.0,50388.0,12717.0,0.2416,8094.0,0.1537,704.0,...,45635.0,78,2.617072,0,0,3,14,58,49.572856,0
2,HOME,Very Large Metro (1),Light Suburban,66049.0,59999.0,10127.0,0.2198,8873.0,0.1926,677.0,...,55532.0,96,0.194937,2,8,25,45,162,3.518308,0
3,TRAVEL,Very Large Metro (1),Suburban,62779.0,97439.0,8623.0,0.2060,8821.0,0.2108,775.0,...,121834.0,184,1.393043,0,6,31,61,114,0.021790,25
4,WORK,Very Large Metro (1),Light Suburban,76139.0,56156.0,17652.0,0.2866,7902.0,0.1283,348.0,...,179702.0,280,0.711949,3,14,23,72,366,0.055289,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791,HOME,Very Large Metro (1),Exurban,76139.0,32999.0,22186.0,0.2745,12357.0,0.1529,900.0,...,136369.0,254,0.261721,14,23,36,81,258,0.012345,24
792,OTHER,Medium City (4),Light Suburban,82668.0,55848.0,13060.0,0.2083,6761.0,0.1078,629.0,...,36249.0,67,4.012518,0,0,0,10,17,0.189059,13
793,SHOPPING,Very Large Metro (1),Exurban,72903.0,51204.0,14791.0,0.2730,7410.0,0.1367,602.0,...,48840.0,61,0.397305,4,11,15,23,38,0.699036,0
794,SHOPPING,Medium City (4),Exurban,66003.0,50154.0,27431.0,0.2624,14186.0,0.1357,1158.0,...,41121.0,75,0.706073,5,7,12,46,109,4.771075,0


In [7]:
train_df, test_df = train_test_split(merged, test_size=0.1, random_state=42)
X_train = train_df.drop(columns=["category"])
y_train = train_df["category"]
X_test = test_df.drop(columns=["category"])
y_test = test_df["category"]

In [8]:
# categorical_features = ["store_density"]
ordinal_features = ["market_size", "store_density"]
numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()

In [9]:
market_levels = [
    "Small Town (6)",
    "Small City (5)",
    "Medium City (4)",
    "Large City (3)",
    "Large Metro (2)",
    "Very Large Metro (1)"
]
density_levels = [
    "Rural",
    "Exurban",
    "Suburban",
    "Light Suburban",
    "Light Urban",
    "Urban",
    "Super Urban"
]

In [10]:
ordinal_transformer = OrdinalEncoder(categories=[market_levels, density_levels], dtype=int)

preprocessor = make_column_transformer(
    (make_pipeline(SimpleImputer(strategy="constant", fill_value=0), StandardScaler()), numeric_features),
    (ordinal_transformer, ordinal_features),
)

In [11]:
X_transformed = preprocessor.fit_transform(X_train)
pd.DataFrame(X_transformed)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,327,328,329,330,331,332,333,334,335,336
0,0.505518,1.603831,0.240326,2.003826,-0.922216,-1.281884,-0.701716,-0.794687,-0.432801,-0.435210,...,0.323607,-0.657455,-0.775485,-0.300743,-0.110049,0.583815,-0.365336,-0.562677,5.0,1.0
1,0.505518,-0.993985,0.204838,0.795630,-0.509750,-0.885713,-0.735201,-0.935561,0.197289,0.270470,...,-0.467842,1.081276,0.334982,-0.078099,-0.431555,-0.400870,-0.379246,-0.562677,5.0,1.0
2,-1.467147,-2.192245,-1.298315,-0.531816,-1.320784,-0.324649,-0.786309,-0.090313,-0.696760,-0.643879,...,1.901325,-0.657455,-0.775485,-0.857354,-0.994191,-1.019566,1.059455,-0.562677,0.0,1.0
3,0.144807,0.340139,0.711398,1.272632,-0.341735,-0.997069,-0.380969,-0.653812,0.011464,-0.084267,...,0.096915,-0.657455,-0.674534,-0.523388,-0.270802,-0.470582,-0.385665,-0.562677,3.0,1.0
4,-0.274878,-0.246878,-0.073236,-0.829942,1.065911,1.140114,1.862501,1.881933,-0.434474,-0.533853,...,-0.134381,-0.657455,0.234031,0.033223,0.238250,0.182970,-0.447077,1.379677,5.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
702,0.689684,2.302042,-0.616581,0.425326,-0.898143,-0.545220,-0.846229,-0.794687,1.222654,2.700513,...,-0.533424,0.501699,0.032127,-0.078099,-0.217218,-0.505438,-0.009270,-0.562677,5.0,1.0
703,2.097560,1.529740,-0.202550,-1.410503,0.745517,0.568343,0.857962,0.707977,0.770259,0.746614,...,-0.487917,1.950641,1.041644,1.313429,1.711821,1.986774,-0.446378,0.454747,5.0,2.0
704,-0.183700,-1.028735,-0.513033,-0.585164,-0.407005,-0.157614,-0.113092,0.132738,-0.552016,-0.575587,...,-0.534673,0.211910,-0.169776,-0.523388,-0.726269,-0.923712,1.649894,-0.562677,1.0,1.0
705,-0.095571,0.530611,-0.670867,0.927434,-1.206624,-1.035615,-1.010127,-1.029478,0.213299,1.074793,...,-0.023855,-0.657455,0.234031,0.311529,0.559756,0.453105,-0.445582,0.454747,5.0,1.0


In [12]:
transformed_col_names = (
    numeric_features + ordinal_features 
)

In [13]:
pipe_lr = make_pipeline(
    preprocessor,
    LogisticRegression(penalty="l1", random_state=42, solver="saga", max_iter=10000, multi_class="ovr", n_jobs=-1)
)

In [14]:
pipe_lr.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=0,
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                              

In [15]:
pipe_lr.score(X_test, y_test)

0.5569620253164557

In [16]:
pipe_lr.named_steps["logisticregression"].classes_

array(['HOME', 'OTHER', 'SHOPPING', 'TRAVEL', 'WORK'], dtype=object)

In [17]:
l1_coefs = pipe_lr.named_steps["logisticregression"].coef_.flatten()
cut = len(l1_coefs) // 5
class_1_coefs = l1_coefs[:cut]
class_2_coefs = l1_coefs[cut : 2*cut]
class_3_coefs = l1_coefs[2*cut: 3*cut]
class_4_coefs = l1_coefs[3*cut: 4*cut]
class_5_coefs = l1_coefs[4*cut:]
coef_df = pd.DataFrame(class_1_coefs, index=transformed_col_names, columns=["HOME"])
coef_df["OTHER"] = class_2_coefs
coef_df["SHOPPING"] = class_3_coefs
coef_df["TRAVEL"] = class_4_coefs
coef_df["WORK"] = class_5_coefs

In [18]:
coef_df = coef_df.T
coef_df

Unnamed: 0,medhhinc_dma,medhhinc_1mi,age0018_ta,age0018_p_ta,age65pl_ta,age65pl_p_ta,age85pl_ta,age85pl_p_ta,asian_ta,asian_p_ta,...,osm_nearest_exit_dist,osm_highway_exits_count_1mi,osm_highway_exits_count_2mi,osm_highway_exits_count_3mi,osm_highway_exits_count_5mi,osm_highway_exits_count_10mi,transitstop_nearest_dist,transitstops,market_size,store_density
HOME,0.331258,0.08221,0.0,0.005941,0.0,0.0,0.0,-0.473097,0.011747,0.063118,...,-0.050555,-0.136803,-0.428542,0.534565,0.199659,0.282943,0.0,0.149874,0.222763,0.010482
OTHER,0.49088,0.0,0.0,0.0,0.0,0.0,0.0,-0.206755,0.0,-0.062484,...,0.0,0.014266,0.0,0.761755,-0.091557,-0.546432,0.0,0.0,-0.314183,-0.106215
SHOPPING,-0.200092,-0.499653,0.0,-0.095399,0.0,-0.022762,0.0,0.320345,0.0,0.0,...,0.194139,0.225097,0.0,-0.475584,0.0,-0.240277,-0.205222,-0.026768,0.0,-0.19942
TRAVEL,-0.768188,0.367565,0.0,0.0,0.0,0.0,0.0,0.450181,0.0,-0.157047,...,-0.198113,0.0,-0.089401,-0.022409,0.0,0.0,0.118026,-0.01428,-0.931882,-0.330767
WORK,-0.269141,0.0,0.0,0.019698,0.0,0.0,0.166582,0.0,0.176617,0.0,...,-0.075034,0.0,0.186941,0.0,-0.72324,0.527089,0.094288,-0.023374,-0.357298,0.109801


In [19]:
coef_df.loc[:, (coef_df != 0).any(axis=0)]

Unnamed: 0,medhhinc_dma,medhhinc_1mi,age0018_p_ta,age65pl_p_ta,age85pl_ta,age85pl_p_ta,asian_ta,asian_p_ta,avg_faminc_ta,black_ta,...,osm_nearest_exit_dist,osm_highway_exits_count_1mi,osm_highway_exits_count_2mi,osm_highway_exits_count_3mi,osm_highway_exits_count_5mi,osm_highway_exits_count_10mi,transitstop_nearest_dist,transitstops,market_size,store_density
HOME,0.331258,0.08221,0.005941,0.0,0.0,-0.473097,0.011747,0.063118,0.587553,0.0,...,-0.050555,-0.136803,-0.428542,0.534565,0.199659,0.282943,0.0,0.149874,0.222763,0.010482
OTHER,0.49088,0.0,0.0,0.0,0.0,-0.206755,0.0,-0.062484,-0.029083,0.027424,...,0.0,0.014266,0.0,0.761755,-0.091557,-0.546432,0.0,0.0,-0.314183,-0.106215
SHOPPING,-0.200092,-0.499653,-0.095399,-0.022762,0.0,0.320345,0.0,0.0,0.0,0.0,...,0.194139,0.225097,0.0,-0.475584,0.0,-0.240277,-0.205222,-0.026768,0.0,-0.19942
TRAVEL,-0.768188,0.367565,0.0,0.0,0.0,0.450181,0.0,-0.157047,0.0,0.0,...,-0.198113,0.0,-0.089401,-0.022409,0.0,0.0,0.118026,-0.01428,-0.931882,-0.330767
WORK,-0.269141,0.0,0.019698,0.0,0.166582,0.0,0.176617,0.0,0.0,0.0,...,-0.075034,0.0,0.186941,0.0,-0.72324,0.527089,0.094288,-0.023374,-0.357298,0.109801


In [20]:
test = {}
test["a"] = {}
test["a"]["b"] = 1
test

{'a': {'b': 1}}

In [21]:
test["a"]["c"] = 2
test

{'a': {'b': 1, 'c': 2}}

In [22]:
corr_lookup_dict = {}

In [23]:
correlation = merged.corr(numeric_only=True)
np.fill_diagonal(correlation.values, 0)
correlation

Unnamed: 0,medhhinc_dma,medhhinc_1mi,age0018_ta,age0018_p_ta,age65pl_ta,age65pl_p_ta,age85pl_ta,age85pl_p_ta,asian_ta,asian_p_ta,...,nces_public_schools_total_enrollment_10mi,nces_public_schools_10mi,osm_nearest_exit_dist,osm_highway_exits_count_1mi,osm_highway_exits_count_2mi,osm_highway_exits_count_3mi,osm_highway_exits_count_5mi,osm_highway_exits_count_10mi,transitstop_nearest_dist,transitstops
medhhinc_dma,0.000000,0.468225,0.141289,0.120148,0.002420,-0.136161,-0.098578,-0.164483,0.329617,0.375141,...,0.506345,0.456820,-0.129300,0.204231,0.265740,0.314022,0.354534,0.455907,-0.285368,0.139828
medhhinc_1mi,0.468225,0.000000,-0.069198,-0.116188,-0.035734,0.042921,-0.149919,-0.130362,0.291665,0.372483,...,0.314323,0.258558,-0.056682,0.016767,0.040006,0.106176,0.169803,0.274635,-0.149391,-0.020870
age0018_ta,0.141289,-0.069198,0.000000,0.443674,0.581173,-0.362739,0.259813,-0.246947,0.426871,0.099530,...,0.236656,0.188982,-0.130943,0.007358,-0.019274,-0.015192,0.028922,0.095139,0.091815,-0.035237
age0018_p_ta,0.120148,-0.116188,0.443674,0.000000,-0.284231,-0.694311,-0.404678,-0.583136,0.184920,0.172957,...,0.301427,0.240890,-0.157486,0.269016,0.265154,0.257897,0.291167,0.297797,0.070619,0.049050
age65pl_ta,0.002420,-0.035734,0.581173,-0.284231,0.000000,0.450778,0.852359,0.481670,0.158785,-0.096836,...,0.036236,0.053295,-0.023522,-0.145392,-0.175974,-0.171670,-0.132094,-0.058580,-0.005251,0.005296
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
osm_highway_exits_count_3mi,0.314022,0.106176,-0.015192,0.257897,-0.171670,-0.197973,-0.125117,-0.118893,0.068559,0.187094,...,0.492029,0.563006,-0.312757,0.616974,0.891392,0.000000,0.878979,0.722643,-0.112636,0.393119
osm_highway_exits_count_5mi,0.354534,0.169803,0.028922,0.291167,-0.132094,-0.195817,-0.104515,-0.125770,0.111681,0.252422,...,0.613555,0.681480,-0.287153,0.494451,0.741262,0.878979,0.000000,0.886562,-0.146500,0.456619
osm_highway_exits_count_10mi,0.455907,0.274635,0.095139,0.297797,-0.058580,-0.167501,-0.085455,-0.138358,0.194858,0.318529,...,0.763752,0.813088,-0.244804,0.415742,0.617028,0.722643,0.886562,0.000000,-0.219602,0.443250
transitstop_nearest_dist,-0.285368,-0.149391,0.091815,0.070619,-0.005251,-0.071259,-0.015513,-0.059460,-0.100966,-0.147335,...,-0.247805,-0.211407,0.067825,-0.080592,-0.086858,-0.112636,-0.146500,-0.219602,0.000000,-0.248129


In [24]:
high_corr = correlation.loc[:, (correlation > 0.98).any(axis=0)]
high_corr[high_corr["age0018_ta"] > 0.98]

Unnamed: 0,age0018_ta,age65pl_ta,disposable_inc_avg_ta,dtpop_children_at_home_ta,dtpop_children_at_home_p_ta,dtpop_retired_disabled_ta,dtpop_students_ta,dtpop_students_p_ta,dtpop_students_post_secondary_ta,dtpop_students_post_secondary_p_ta,...,inrix_afternoon_ns,inrix_dinner_ns,inrix_lunch_ew,inrix_afternoon_ew,inrix_dinner_ew,inrix_night_ew,maxtemp,avgmax60,avgmax70,avgmax80
dtpop_children_at_home_ta,0.995578,0.579325,-0.100979,0.0,0.418571,0.565941,0.325845,0.098113,0.261605,0.06184,...,0.090713,0.107133,0.100167,0.102906,0.124267,0.143754,-0.010629,-0.043634,-0.031876,-0.006459
dtpop_students_prek_8th_ta,0.998609,0.572295,-0.099491,0.994163,0.428996,0.556887,0.270173,0.046828,0.205392,0.010637,...,0.095952,0.113068,0.101744,0.104261,0.125678,0.145854,0.01632,-0.016945,-0.005277,0.018999


In [25]:
for col_name in high_corr.columns.tolist():
    corr_lookup_dict[col_name] = {}
    for index, row in high_corr[high_corr[col_name] > 0.98].iterrows():
        corr_lookup_dict[col_name][index] = row[col_name]

In [26]:
for k, v in corr_lookup_dict.items():
    print(k)
    print("Is correlated to")
    print(v)
    print("-------------------")

age0018_ta
Is correlated to
{'dtpop_children_at_home_ta': 0.9955778504230745, 'dtpop_students_prek_8th_ta': 0.9986086078244143}
-------------------
age65pl_ta
Is correlated to
{'dtpop_retired_disabled_ta': 0.9913033707678078}
-------------------
disposable_inc_avg_ta
Is correlated to
{'hhinc100pl_p_ta': 0.9881559914224929, 'medhhinc_ta': 0.9810098098001289}
-------------------
dtpop_children_at_home_ta
Is correlated to
{'age0018_ta': 0.9955778504230745, 'dtpop_students_prek_8th_ta': 0.9941629662332689}
-------------------
dtpop_children_at_home_p_ta
Is correlated to
{'dtpop_students_prek_8th_p_ta': 0.9917654997339937}
-------------------
dtpop_retired_disabled_ta
Is correlated to
{'age65pl_ta': 0.9913033707678078}
-------------------
dtpop_students_ta
Is correlated to
{'dtpop_students_post_secondary_ta': 0.9977337256166244}
-------------------
dtpop_students_p_ta
Is correlated to
{'dtpop_students_post_secondary_p_ta': 0.9964813292369007}
-------------------
dtpop_students_post_secondar