In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from lightgbm.sklearn import LGBMClassifier

In [2]:
trade_area = pd.read_csv("../../data/Smoothie King/smoothie_king_trade_area_variables.csv")
poi = pd.read_csv('../../data/Smoothie King/smoothie_king_poi_variables.csv')
stores = pd.read_csv('../../data/Smoothie King/smoothie_king_stores.csv')
merged = stores.merge(trade_area, left_on="store", right_on="store_num").merge(poi)
merged = merged.drop(columns=["store_num", "country_code"])
merged

Unnamed: 0,store,longitude,latitude,category,cbsa_name,dma_name,state_name,market_size,store_density,medhhinc_dma,...,nces_public_schools_total_enrollment_10mi,nces_public_schools_10mi,osm_nearest_exit_dist,osm_highway_exits_count_1mi,osm_highway_exits_count_2mi,osm_highway_exits_count_3mi,osm_highway_exits_count_5mi,osm_highway_exits_count_10mi,transitstop_nearest_dist,transitstops
0,SK 1504,-97.650392,30.519353,SHOPPING,"Austin-Round Rock, TX","Austin, TX",Texas,Large Metro (2),Light Suburban,85075.0,...,110757.0,174,1.875541,0,2,17,45,123,14.818824,0
1,SK 0057,-88.171150,30.672501,SHOPPING,"Mobile, AL","Mobile et al, AL-FL",Alabama,Medium City (4),Light Suburban,58200.0,...,45635.0,78,2.617072,0,0,3,14,58,49.572856,0
2,SK 1415,-90.535722,38.784250,HOME,"St. Louis, MO-IL","St. Louis, MO",Missouri,Very Large Metro (1),Light Suburban,66049.0,...,55532.0,96,0.194937,2,8,25,45,162,3.518308,0
3,SK 1231,-80.134700,26.100737,TRAVEL,"Miami-Fort Lauderdale et al, FL","Miami-Ft. Lauderdale, FL",Florida,Very Large Metro (1),Suburban,62779.0,...,121834.0,184,1.393043,0,6,31,61,114,0.021790,25
4,SK 1535,-96.856651,32.996408,WORK,"Dallas-Fort Worth-Arlington, TX","Dallas-Ft. Worth, TX",Texas,Very Large Metro (1),Light Suburban,76139.0,...,179702.0,280,0.711949,3,14,23,72,366,0.055289,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791,SK 1536,-96.872596,32.647809,HOME,"Dallas-Fort Worth-Arlington, TX","Dallas-Ft. Worth, TX",Texas,Very Large Metro (1),Exurban,76139.0,...,136369.0,254,0.261721,14,23,36,81,258,0.012345,24
792,SK 1886,-105.077634,40.564695,OTHER,"Fort Collins, CO","Denver, CO",Colorado,Medium City (4),Light Suburban,82668.0,...,36249.0,67,4.012518,0,0,0,10,17,0.189059,13
793,SK 0162,-95.478001,30.316531,SHOPPING,"Houston-The Woodlands et al, TX","Houston, TX",Texas,Very Large Metro (1),Exurban,72903.0,...,48840.0,61,0.397305,4,11,15,23,38,0.699036,0
794,SK 1449,-78.968258,35.064994,SHOPPING,"Fayetteville, NC","Raleigh et al, NC",North Carolina,Medium City (4),Exurban,66003.0,...,41121.0,75,0.706073,5,7,12,46,109,4.771075,0


In [3]:
to_remove = [col for col in merged.columns.tolist() if "_p_" in col]
merged = merged.drop(columns=to_remove)
merged

Unnamed: 0,store,longitude,latitude,category,cbsa_name,dma_name,state_name,market_size,store_density,medhhinc_dma,...,nces_public_schools_total_enrollment_10mi,nces_public_schools_10mi,osm_nearest_exit_dist,osm_highway_exits_count_1mi,osm_highway_exits_count_2mi,osm_highway_exits_count_3mi,osm_highway_exits_count_5mi,osm_highway_exits_count_10mi,transitstop_nearest_dist,transitstops
0,SK 1504,-97.650392,30.519353,SHOPPING,"Austin-Round Rock, TX","Austin, TX",Texas,Large Metro (2),Light Suburban,85075.0,...,110757.0,174,1.875541,0,2,17,45,123,14.818824,0
1,SK 0057,-88.171150,30.672501,SHOPPING,"Mobile, AL","Mobile et al, AL-FL",Alabama,Medium City (4),Light Suburban,58200.0,...,45635.0,78,2.617072,0,0,3,14,58,49.572856,0
2,SK 1415,-90.535722,38.784250,HOME,"St. Louis, MO-IL","St. Louis, MO",Missouri,Very Large Metro (1),Light Suburban,66049.0,...,55532.0,96,0.194937,2,8,25,45,162,3.518308,0
3,SK 1231,-80.134700,26.100737,TRAVEL,"Miami-Fort Lauderdale et al, FL","Miami-Ft. Lauderdale, FL",Florida,Very Large Metro (1),Suburban,62779.0,...,121834.0,184,1.393043,0,6,31,61,114,0.021790,25
4,SK 1535,-96.856651,32.996408,WORK,"Dallas-Fort Worth-Arlington, TX","Dallas-Ft. Worth, TX",Texas,Very Large Metro (1),Light Suburban,76139.0,...,179702.0,280,0.711949,3,14,23,72,366,0.055289,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791,SK 1536,-96.872596,32.647809,HOME,"Dallas-Fort Worth-Arlington, TX","Dallas-Ft. Worth, TX",Texas,Very Large Metro (1),Exurban,76139.0,...,136369.0,254,0.261721,14,23,36,81,258,0.012345,24
792,SK 1886,-105.077634,40.564695,OTHER,"Fort Collins, CO","Denver, CO",Colorado,Medium City (4),Light Suburban,82668.0,...,36249.0,67,4.012518,0,0,0,10,17,0.189059,13
793,SK 0162,-95.478001,30.316531,SHOPPING,"Houston-The Woodlands et al, TX","Houston, TX",Texas,Very Large Metro (1),Exurban,72903.0,...,48840.0,61,0.397305,4,11,15,23,38,0.699036,0
794,SK 1449,-78.968258,35.064994,SHOPPING,"Fayetteville, NC","Raleigh et al, NC",North Carolina,Medium City (4),Exurban,66003.0,...,41121.0,75,0.706073,5,7,12,46,109,4.771075,0


In [4]:
print(len(merged["dma_name"].unique()))
len(merged["state_name"].unique())
len(merged["cbsa_name"].unique())

109


171

In [5]:
miss = merged.columns[merged.isnull().any()].tolist()
miss

['inrix_breakfast_ns',
 'inrix_lunch_ns',
 'inrix_afternoon_ns',
 'inrix_dinner_ns',
 'inrix_night_ns',
 'inrix_overnight_ns',
 'inrix_breakfast_ew',
 'inrix_lunch_ew',
 'inrix_afternoon_ew',
 'inrix_dinner_ew',
 'inrix_night_ew',
 'inrix_overnight_ew']

In [6]:
merged = merged.drop(columns=["store", "longitude", "latitude", "state_name", "cbsa_name", "dma_name"])
# merged = merged.drop(columns=miss)
merged = merged.dropna()

In [7]:
merged

Unnamed: 0,category,market_size,store_density,medhhinc_dma,medhhinc_1mi,age0018_ta,age65pl_ta,age85pl_ta,asian_ta,avg_faminc_ta,...,nces_public_schools_total_enrollment_10mi,nces_public_schools_10mi,osm_nearest_exit_dist,osm_highway_exits_count_1mi,osm_highway_exits_count_2mi,osm_highway_exits_count_3mi,osm_highway_exits_count_5mi,osm_highway_exits_count_10mi,transitstop_nearest_dist,transitstops
0,SHOPPING,Large Metro (2),Light Suburban,85075.0,83846.0,21456.0,6709.0,372.0,4092.9205,108837.0,...,110757.0,174,1.875541,0,2,17,45,123,14.818824,0
1,SHOPPING,Medium City (4),Light Suburban,58200.0,50388.0,12717.0,8094.0,704.0,1723.0000,80752.0,...,45635.0,78,2.617072,0,0,3,14,58,49.572856,0
2,HOME,Very Large Metro (1),Light Suburban,66049.0,59999.0,10127.0,8873.0,677.0,1082.0001,106893.0,...,55532.0,96,0.194937,2,8,25,45,162,3.518308,0
3,TRAVEL,Very Large Metro (1),Suburban,62779.0,97439.0,8623.0,8821.0,775.0,850.0585,134373.0,...,121834.0,184,1.393043,0,6,31,61,114,0.021790,25
4,WORK,Very Large Metro (1),Light Suburban,76139.0,56156.0,17652.0,7902.0,348.0,4425.0000,102367.0,...,179702.0,280,0.711949,3,14,23,72,366,0.055289,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791,HOME,Very Large Metro (1),Exurban,76139.0,32999.0,22186.0,12357.0,900.0,689.0000,66363.0,...,136369.0,254,0.261721,14,23,36,81,258,0.012345,24
792,OTHER,Medium City (4),Light Suburban,82668.0,55848.0,13060.0,6761.0,629.0,1907.5900,95226.0,...,36249.0,67,4.012518,0,0,0,10,17,0.189059,13
793,SHOPPING,Very Large Metro (1),Exurban,72903.0,51204.0,14791.0,7410.0,602.0,929.9680,89271.0,...,48840.0,61,0.397305,4,11,15,23,38,0.699036,0
794,SHOPPING,Medium City (4),Exurban,66003.0,50154.0,27431.0,14186.0,1158.0,3537.1850,67117.0,...,41121.0,75,0.706073,5,7,12,46,109,4.771075,0


In [8]:
train_df, test_df = train_test_split(merged, test_size=0.1, random_state=42)
X_train = train_df.drop(columns=["category"])
y_train = train_df["category"]
X_test = test_df.drop(columns=["category"])
y_test = test_df["category"]

In [9]:
# categorical_features = ["store_density"]
ordinal_features = ["market_size", "store_density"]
numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()

In [10]:
market_levels = [
    "Small Town (6)",
    "Small City (5)",
    "Medium City (4)",
    "Large City (3)",
    "Large Metro (2)",
    "Very Large Metro (1)"
]
density_levels = [
    "Rural",
    "Exurban",
    "Suburban",
    "Light Suburban",
    "Light Urban",
    "Urban",
    "Super Urban"
]

In [11]:
ordinal_transformer = OrdinalEncoder(categories=[market_levels, density_levels], dtype=int)

preprocessor = make_column_transformer(
    (make_pipeline(SimpleImputer(strategy="constant", fill_value=0), StandardScaler()), numeric_features),
    (ordinal_transformer, ordinal_features),
)

In [12]:
X_transformed = preprocessor.fit_transform(X_train)
pd.DataFrame(X_transformed)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,230,231,232,233,234,235,236,237,238,239
0,0.505518,1.603831,0.240326,-0.922216,-0.701716,-0.432801,-0.408111,-0.359667,1.994195,-0.707100,...,0.323607,-0.657455,-0.775485,-0.300743,-0.110049,0.583815,-0.365336,-0.562677,5.0,1.0
1,0.505518,-0.993985,0.204838,-0.509750,-0.735201,0.197289,-1.214297,-1.005594,2.327336,-0.183032,...,-0.467842,1.081276,0.334982,-0.078099,-0.431555,-0.400870,-0.379246,-0.562677,5.0,1.0
2,-1.467147,-2.192245,-1.298315,-1.320784,-0.786309,-0.696760,-1.154591,-0.602471,0.056524,-1.478467,...,1.901325,-0.657455,-0.775485,-0.857354,-0.994191,-1.019566,1.059455,-0.562677,0.0,1.0
3,0.144807,0.340139,0.711398,-0.341735,-0.380969,0.011464,1.227123,0.922776,-0.773621,-0.068242,...,0.096915,-0.657455,-0.674534,-0.523388,-0.270802,-0.470582,-0.385665,-0.562677,3.0,1.0
4,-0.274878,-0.246878,-0.073236,1.065911,1.862501,-0.434474,0.038023,-0.057034,-0.813717,0.600470,...,-0.134381,-0.657455,0.234031,0.033223,0.238250,0.182970,-0.447077,1.379677,5.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
702,0.689684,2.302042,-0.616581,-0.898143,-0.846229,1.222654,1.309654,1.396476,-0.269640,-0.729441,...,-0.533424,0.501699,0.032127,-0.078099,-0.217218,-0.505438,-0.009270,-0.562677,5.0,1.0
703,2.097560,1.529740,-0.202550,0.745517,0.857962,0.770259,1.637960,1.713950,-0.442362,0.620885,...,-0.487917,1.950641,1.041644,1.313429,1.711821,1.986774,-0.446378,0.454747,5.0,2.0
704,-0.183700,-1.028735,-0.513033,-0.407005,-0.113092,-0.552016,-1.105428,-1.161848,-0.737739,-0.701707,...,-0.534673,0.211910,-0.169776,-0.523388,-0.726269,-0.923712,1.649894,-0.562677,1.0,1.0
705,-0.095571,0.530611,-0.670867,-1.206624,-1.010127,0.213299,-0.003261,-0.288191,0.472808,-1.073042,...,-0.023855,-0.657455,0.234031,0.311529,0.559756,0.453105,-0.445582,0.454747,5.0,1.0


In [13]:
transformed_col_names = (
    numeric_features + ordinal_features 
)

In [14]:
pipe_lr = make_pipeline(
    preprocessor,
    LogisticRegression(penalty="l1", random_state=42, solver="saga", max_iter=10000, multi_class="ovr", n_jobs=-1)
)

In [15]:
pipe_lr.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=0,
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                              

In [16]:
pipe_lr.score(X_test, y_test)

0.5822784810126582

In [17]:
pipe_lr.named_steps["logisticregression"].classes_

array(['HOME', 'OTHER', 'SHOPPING', 'TRAVEL', 'WORK'], dtype=object)

In [18]:
l1_coefs = pipe_lr.named_steps["logisticregression"].coef_.flatten()
cut = len(l1_coefs) // 5
class_1_coefs = l1_coefs[:cut]
class_2_coefs = l1_coefs[cut : 2*cut]
class_3_coefs = l1_coefs[2*cut: 3*cut]
class_4_coefs = l1_coefs[3*cut: 4*cut]
class_5_coefs = l1_coefs[4*cut:]
coef_df = pd.DataFrame(class_1_coefs, index=transformed_col_names, columns=["HOME"])
coef_df["OTHER"] = class_2_coefs
coef_df["SHOPPING"] = class_3_coefs
coef_df["TRAVEL"] = class_4_coefs
coef_df["WORK"] = class_5_coefs

In [19]:
coef_df = coef_df.T
coef_df

Unnamed: 0,medhhinc_dma,medhhinc_1mi,age0018_ta,age65pl_ta,age85pl_ta,asian_ta,avg_faminc_ta,avghhinc_ta,black_ta,boomer_ta,...,osm_nearest_exit_dist,osm_highway_exits_count_1mi,osm_highway_exits_count_2mi,osm_highway_exits_count_3mi,osm_highway_exits_count_5mi,osm_highway_exits_count_10mi,transitstop_nearest_dist,transitstops,market_size,store_density
HOME,0.352843,0.0,0.0,0.0,-0.256616,0.01225,0.630703,0.0,0.0,0.0,...,0.0,-0.08561,-0.27267,0.306834,0.196773,0.289835,0.007386,0.174323,0.146229,-0.00337
OTHER,0.389974,-0.035917,0.0,0.0,-0.029862,-0.074451,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.618226,-0.226862,-0.09803,0.018059,-0.056722,-0.025868,-0.23156
SHOPPING,-0.30707,-0.233854,0.0,0.0,0.0,0.0,0.0,0.055341,0.089223,0.0,...,0.061614,0.213135,0.0,-0.540263,0.0,-0.101108,-0.209023,-0.00792,-0.010699,-0.157062
TRAVEL,-0.844225,0.220076,0.0,0.0,0.14702,0.0,0.0,0.0,0.0,-0.279221,...,-0.061179,0.083385,0.0,-0.134969,0.0,0.0,0.132853,-0.047425,-0.739764,-0.226735
WORK,-0.282618,0.105568,0.0,0.033359,0.302182,0.097578,0.0,0.0,0.0,0.0,...,-0.111425,0.0,0.127954,0.0,-0.493668,0.268313,0.085477,-0.086402,-0.33329,0.171889


In [20]:
coef_df.loc[:, (coef_df != 0).any(axis=0)]

Unnamed: 0,medhhinc_dma,medhhinc_1mi,age65pl_ta,age85pl_ta,asian_ta,avg_faminc_ta,avghhinc_ta,black_ta,boomer_ta,crime_total_index_ta,...,osm_nearest_exit_dist,osm_highway_exits_count_1mi,osm_highway_exits_count_2mi,osm_highway_exits_count_3mi,osm_highway_exits_count_5mi,osm_highway_exits_count_10mi,transitstop_nearest_dist,transitstops,market_size,store_density
HOME,0.352843,0.0,0.0,-0.256616,0.01225,0.630703,0.0,0.0,0.0,-0.060179,...,0.0,-0.08561,-0.27267,0.306834,0.196773,0.289835,0.007386,0.174323,0.146229,-0.00337
OTHER,0.389974,-0.035917,0.0,-0.029862,-0.074451,0.0,0.0,0.0,0.0,-0.214381,...,0.0,0.0,0.0,0.618226,-0.226862,-0.09803,0.018059,-0.056722,-0.025868,-0.23156
SHOPPING,-0.30707,-0.233854,0.0,0.0,0.0,0.0,0.055341,0.089223,0.0,0.0,...,0.061614,0.213135,0.0,-0.540263,0.0,-0.101108,-0.209023,-0.00792,-0.010699,-0.157062
TRAVEL,-0.844225,0.220076,0.0,0.14702,0.0,0.0,0.0,0.0,-0.279221,-0.120072,...,-0.061179,0.083385,0.0,-0.134969,0.0,0.0,0.132853,-0.047425,-0.739764,-0.226735
WORK,-0.282618,0.105568,0.033359,0.302182,0.097578,0.0,0.0,0.0,0.0,0.162578,...,-0.111425,0.0,0.127954,0.0,-0.493668,0.268313,0.085477,-0.086402,-0.33329,0.171889


In [21]:
pd.DataFrame(coef_df.iloc[0].sort_values(ascending=False))

Unnamed: 0,HOME
daypop_dens_ta,0.899271
dmm_count_3mi,0.777235
edu_highsch_ta,0.712002
avg_faminc_ta,0.630703
edu_assocdeg_ta,0.607530
...,...
nces_public_schools_5mi,-0.619110
empcy_ta,-0.637264
dmm_count_10mi,-0.761975
places_of_worship_1mi,-0.876044


In [22]:
pd.DataFrame(coef_df.iloc[1].sort_values(ascending=False))

Unnamed: 0,OTHER
pop_dens_ta,0.988463
ipeds_postsecondary_schools_5mi,0.673073
military_installations_10mi,0.657784
edu_profdeg_ta,0.626600
osm_highway_exits_count_3mi,0.618226
...,...
inrix_overnight_ns,-0.502167
medsalcy_ta,-0.583552
daypop_dens_ta,-0.674126
avgmax40,-0.682490


In [23]:
pd.DataFrame(coef_df.iloc[2].sort_values(ascending=False))

Unnamed: 0,SHOPPING
edu_bach_ta,0.928182
dmm_count_10mi,0.796677
nces_public_schools_3mi,0.741049
centerxy_count_3mi,0.709635
military_installations_1mi,0.650980
...,...
osm_highway_exits_count_3mi,-0.540263
ipeds_postsecondary_schools_3mi,-0.760976
centerxy_gla_effective_10mi,-0.767415
medhhinc_ta,-0.792925


In [24]:
pd.DataFrame(coef_df.iloc[3].sort_values(ascending=False))

Unnamed: 0,TRAVEL
poverty_inpoverty_ta,0.799486
nces_private_schools_total_enrollment_2mi,0.740614
inrix_overnight_ns,0.583950
ipeds_postsecondary_schools_total_enrollment_2mi,0.509659
dtpop_work_at_home_ta,0.492546
...,...
hrsa_hospitals_3mi,-0.633541
nces_private_schools_total_enrollment_1mi,-0.697471
market_size,-0.739764
nces_public_schools_total_enrollment_3mi,-0.745074


In [25]:
pd.DataFrame(coef_df.iloc[4].sort_values(ascending=False))

Unnamed: 0,WORK
centerxy_gla_effective_10mi,0.972059
inrix_aadt_ew,0.891723
hu_renterocc_ta,0.850971
nces_private_schools_total_enrollment_2mi,0.774173
empcy_ta,0.743842
...,...
dmm_count_5mi,-0.686333
hispanic_ta,-0.721721
inrix_night_ew,-0.780386
hh_type_nonfam_ta,-0.935267


In [26]:
pipe_lgbm = make_pipeline(
    preprocessor,
    LGBMClassifier(random_state=42, learning_rate=0.1, reg_alpha=0.1, reg_lambda=0.5, objective="multiclass")
)

In [27]:
pipe_lgbm.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=0,
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                              

In [28]:
pipe_lgbm.score(X_test, y_test)

0.6075949367088608

In [29]:
pipe_lgbm.named_steps["lgbmclassifier"].feature_importances_

array([ 98,  56,   4,  42,  76,  62,  31,  25,  85,  24,  84,  28,  20,
        48,   7,  49,  37,  23,  12,  39,   4,  63,  48,  34,  59,  25,
        10,  20,  11, 104,  57,  64,  20,  91,  36,   0,  85, 160,  21,
         2,  36,  56,  22,  23,  33,  34,  27,  31,  35,  17,  57,   0,
        11,  21,  19,  53,  20,  58,  40,  22,  49,  34,  58,   9,  50,
        44,  23,  40,  29,  58,  40,  20,  56,  21,  74,  22,  80,  85,
         0,  59,  40,  32,  53, 101, 137, 118,   7, 103,  27,   2,  28,
         5,   7,  11,   1,  55,   0,  65,  45,   5,  40,  12,  53,  28,
        37,   9,  20,   7,  66,   5, 187,  44,  12,  35,  19,  24,  20,
        34,  39,   9,  13,  59,  50,  68,  67,  48,  61,  40,  70,  51,
        10,  17,  82,  42,  13,  43,   5,  50,   3,  23,  44,  15,  35,
        35,   7,  36,  39,  50, 121,  45,   9,   1,  91,   2,  90,  20,
        69,  19,  60,  35,  89, 125,  50,  29,  24,  22,  49,  46,  74,
       131,  53,  48,  17,  26,  43,  58, 102,  50,   0,  40,   

In [30]:
data = {
    "Importance": pipe_lgbm.named_steps["lgbmclassifier"].feature_importances_,
}
pd.DataFrame(data=data, index=transformed_col_names,).sort_values(
    by="Importance", ascending=False
)[:10]

Unnamed: 0,Importance
centerxy_full_nearest_dist,187
empcy_ta,160
nces_public_schools_nearest_dist,141
pop_seasonal_ta,137
inrix_ew_distance,131
inrix_ns_distance,125
transitstop_nearest_dist,123
hrsa_hospitals_nearest_dist,121
pop_transient_ta,118
edu_doctorate_ta,104


In [31]:
results = {}
results["Logistic Regression"] = [pipe_lr.score(X_train, y_train), pipe_lr.score(X_test, y_test)]
results["Light GBM"] = [pipe_lgbm.score(X_train, y_train), pipe_lgbm.score(X_test, y_test)]
pd.DataFrame(results, index=["Train score", "Test score"]).T

Unnamed: 0,Train score,Test score
Logistic Regression,0.732673,0.582278
Light GBM,1.0,0.607595


In [32]:
corr_lookup_dict = {}

In [33]:
correlation = merged.corr(numeric_only=True)
np.fill_diagonal(correlation.values, 0)
correlation

Unnamed: 0,medhhinc_dma,medhhinc_1mi,age0018_ta,age65pl_ta,age85pl_ta,asian_ta,avg_faminc_ta,avghhinc_ta,black_ta,boomer_ta,...,nces_public_schools_total_enrollment_10mi,nces_public_schools_10mi,osm_nearest_exit_dist,osm_highway_exits_count_1mi,osm_highway_exits_count_2mi,osm_highway_exits_count_3mi,osm_highway_exits_count_5mi,osm_highway_exits_count_10mi,transitstop_nearest_dist,transitstops
medhhinc_dma,0.000000,0.468225,0.141289,0.002420,-0.098578,0.329617,0.475630,0.445989,-0.092994,0.125045,...,0.506345,0.456820,-0.129300,0.204231,0.265740,0.314022,0.354534,0.455907,-0.285368,0.139828
medhhinc_1mi,0.468225,0.000000,-0.069198,-0.035734,-0.149919,0.291665,0.842937,0.853511,-0.324604,0.099117,...,0.314323,0.258558,-0.056682,0.016767,0.040006,0.106176,0.169803,0.274635,-0.149391,-0.020870
age0018_ta,0.141289,-0.069198,0.000000,0.581173,0.259813,0.426871,-0.158444,-0.154798,0.348076,0.760155,...,0.236656,0.188982,-0.130943,0.007358,-0.019274,-0.015192,0.028922,0.095139,0.091815,-0.035237
age65pl_ta,0.002420,-0.035734,0.581173,0.000000,0.852359,0.158785,-0.068107,-0.075765,0.116876,0.915272,...,0.036236,0.053295,-0.023522,-0.145392,-0.175974,-0.171670,-0.132094,-0.058580,-0.005251,0.005296
age85pl_ta,-0.098578,-0.149919,0.259813,0.852359,0.000000,-0.065736,-0.152277,-0.152390,-0.024004,0.602907,...,-0.037718,0.018255,0.012690,-0.129381,-0.125793,-0.125117,-0.104515,-0.085455,-0.015513,0.128380
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
osm_highway_exits_count_3mi,0.314022,0.106176,-0.015192,-0.171670,-0.125117,0.068559,0.137125,0.133724,-0.008472,-0.167879,...,0.492029,0.563006,-0.312757,0.616974,0.891392,0.000000,0.878979,0.722643,-0.112636,0.393119
osm_highway_exits_count_5mi,0.354534,0.169803,0.028922,-0.132094,-0.104515,0.111681,0.206151,0.224165,-0.002133,-0.120350,...,0.613555,0.681480,-0.287153,0.494451,0.741262,0.878979,0.000000,0.886562,-0.146500,0.456619
osm_highway_exits_count_10mi,0.455907,0.274635,0.095139,-0.058580,-0.085455,0.194858,0.291563,0.315672,-0.012067,-0.011764,...,0.763752,0.813088,-0.244804,0.415742,0.617028,0.722643,0.886562,0.000000,-0.219602,0.443250
transitstop_nearest_dist,-0.285368,-0.149391,0.091815,-0.005251,-0.015513,-0.100966,-0.182707,-0.174324,-0.022088,-0.017782,...,-0.247805,-0.211407,0.067825,-0.080592,-0.086858,-0.112636,-0.146500,-0.219602,0.000000,-0.248129


In [34]:
high_corr = correlation.loc[:, (correlation > 0.98).any(axis=0)]
high_corr[high_corr["age0018_ta"] > 0.98]

Unnamed: 0,age0018_ta,age65pl_ta,disposable_inc_avg_ta,dtpop_children_at_home_ta,dtpop_retired_disabled_ta,dtpop_students_ta,dtpop_students_post_secondary_ta,dtpop_students_prek_8th_ta,edu_bachplus_ta,edu_bachplus_female_ta,...,inrix_afternoon_ns,inrix_dinner_ns,inrix_lunch_ew,inrix_afternoon_ew,inrix_dinner_ew,inrix_night_ew,maxtemp,avgmax60,avgmax70,avgmax80
dtpop_children_at_home_ta,0.995578,0.579325,-0.100979,0.0,0.565941,0.325845,0.261605,0.994163,0.591673,0.622934,...,0.090713,0.107133,0.100167,0.102906,0.124267,0.143754,-0.010629,-0.043634,-0.031876,-0.006459
dtpop_students_prek_8th_ta,0.998609,0.572295,-0.099491,0.994163,0.556887,0.270173,0.205392,0.0,0.582796,0.6147,...,0.095952,0.113068,0.101744,0.104261,0.125678,0.145854,0.01632,-0.016945,-0.005277,0.018999


In [35]:
for col_name in high_corr.columns.tolist():
    corr_lookup_dict[col_name] = {}
    for index, row in high_corr[high_corr[col_name] > 0.98].iterrows():
        corr_lookup_dict[col_name][index] = row[col_name]

In [36]:
for k, v in corr_lookup_dict.items():
    print(k)
    print("Is correlated to")
    print(v)
    print("-------------------")

age0018_ta
Is correlated to
{'dtpop_children_at_home_ta': 0.9955778504230745, 'dtpop_students_prek_8th_ta': 0.9986086078244143}
-------------------
age65pl_ta
Is correlated to
{'dtpop_retired_disabled_ta': 0.9913033707678078}
-------------------
disposable_inc_avg_ta
Is correlated to
{'medhhinc_ta': 0.9810098098001289}
-------------------
dtpop_children_at_home_ta
Is correlated to
{'age0018_ta': 0.9955778504230745, 'dtpop_students_prek_8th_ta': 0.9941629662332689}
-------------------
dtpop_retired_disabled_ta
Is correlated to
{'age65pl_ta': 0.9913033707678078}
-------------------
dtpop_students_ta
Is correlated to
{'dtpop_students_post_secondary_ta': 0.9977337256166244}
-------------------
dtpop_students_post_secondary_ta
Is correlated to
{'dtpop_students_ta': 0.9977337256166244}
-------------------
dtpop_students_prek_8th_ta
Is correlated to
{'age0018_ta': 0.9986086078244143, 'dtpop_children_at_home_ta': 0.9941629662332689}
-------------------
edu_bachplus_ta
Is correlated to
{'edu_ba