In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from lightgbm.sklearn import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from sklearn.feature_selection import SelectFromModel

In [2]:
trade_area = pd.read_csv("../../data/Smoothie King/smoothie_king_trade_area_variables.csv")
poi = pd.read_csv('../../data/Smoothie King/processed_poi.csv')
stores = pd.read_csv('../../data/Smoothie King/smoothie_king_stores.csv')
trade_area = pd.read_csv("../../data/Smoothie King/processed_trade_area.csv")
merged = stores.merge(trade_area, left_on="store", right_on="store_num").merge(poi)
merged = merged.drop(columns=["store_num", "country_code"])
merged

Unnamed: 0,store,longitude,latitude,category,cbsa_name,dma_name,state_name,market_size,store_density,age0018_p_ta,...,osm_highway_exits_count_3mi,osm_highway_exits_count_5mi,osm_nearest_exit_dist,places_of_worship_10mi,places_of_worship_1mi,places_of_worship_2mi,places_of_worship_3mi,places_of_worship_5mi,transitstop_nearest_dist,transitstops
0,SK 1504,-97.650392,30.519353,SHOPPING,"Austin-Round Rock, TX","Austin, TX",Texas,Large Metro (2),Light Suburban,0.2907,...,17,45,1.875541,314,5,30,76,128,14.818824,0
1,SK 0057,-88.171150,30.672501,SHOPPING,"Mobile, AL","Mobile et al, AL-FL",Alabama,Medium City (4),Light Suburban,0.2416,...,3,14,2.617072,305,12,82,109,154,49.572856,0
2,SK 1415,-90.535722,38.784250,HOME,"St. Louis, MO-IL","St. Louis, MO",Missouri,Very Large Metro (1),Light Suburban,0.2198,...,25,45,0.194937,277,6,23,57,78,3.518308,0
3,SK 1231,-80.134700,26.100737,TRAVEL,"Miami-Fort Lauderdale et al, FL","Miami-Ft. Lauderdale, FL",Florida,Very Large Metro (1),Suburban,0.2060,...,31,61,1.393043,1202,10,50,116,393,0.021790,25
4,SK 1535,-96.856651,32.996408,WORK,"Dallas-Fort Worth-Arlington, TX","Dallas-Ft. Worth, TX",Texas,Very Large Metro (1),Light Suburban,0.2866,...,23,72,0.711949,903,12,58,117,273,0.055289,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791,SK 1536,-96.872596,32.647809,HOME,"Dallas-Fort Worth-Arlington, TX","Dallas-Ft. Worth, TX",Texas,Very Large Metro (1),Exurban,0.2745,...,36,81,0.261721,866,8,40,115,323,0.012345,24
792,SK 1886,-105.077634,40.564695,OTHER,"Fort Collins, CO","Denver, CO",Colorado,Medium City (4),Light Suburban,0.2083,...,0,10,4.012518,175,20,71,112,128,0.189059,13
793,SK 0162,-95.478001,30.316531,SHOPPING,"Houston-The Woodlands et al, TX","Houston, TX",Texas,Very Large Metro (1),Exurban,0.2730,...,15,23,0.397305,140,20,42,53,68,0.699036,0
794,SK 1449,-78.968258,35.064994,SHOPPING,"Fayetteville, NC","Raleigh et al, NC",North Carolina,Medium City (4),Exurban,0.2624,...,12,46,0.706073,293,8,43,85,169,4.771075,0


In [3]:
merged = merged.drop(columns=["store", "longitude", "latitude", "state_name", "cbsa_name", "dma_name"])
merged = merged.dropna()

In [4]:
le = LabelEncoder()
merged["category"] = le.fit_transform(merged["category"])
merged

Unnamed: 0,category,market_size,store_density,age0018_p_ta,age65pl_p_ta,age85pl_p_ta,asian_p_ta,avg_faminc_ta,avghhinc_ta,black_p_ta,...,osm_highway_exits_count_3mi,osm_highway_exits_count_5mi,osm_nearest_exit_dist,places_of_worship_10mi,places_of_worship_1mi,places_of_worship_2mi,places_of_worship_3mi,places_of_worship_5mi,transitstop_nearest_dist,transitstops
0,2,Large Metro (2),Light Suburban,0.2907,0.0909,0.0050,0.0555,108837.0,106093.0,0.0985,...,17,45,1.875541,314,5,30,76,128,14.818824,0
1,2,Medium City (4),Light Suburban,0.2416,0.1537,0.0134,0.0327,80752.0,70536.0,0.3890,...,3,14,2.617072,305,12,82,109,154,49.572856,0
2,0,Very Large Metro (1),Light Suburban,0.2198,0.1926,0.0147,0.0235,106893.0,97112.0,0.0552,...,25,45,0.194937,277,6,23,57,78,3.518308,0
3,3,Very Large Metro (1),Suburban,0.2060,0.2108,0.0185,0.0203,134373.0,149467.0,0.0998,...,31,61,1.393043,1202,10,50,116,393,0.021790,25
4,4,Very Large Metro (1),Light Suburban,0.2866,0.1283,0.0057,0.0719,102367.0,96066.0,0.1935,...,23,72,0.711949,903,12,58,117,273,0.055289,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791,0,Very Large Metro (1),Exurban,0.2745,0.1529,0.0111,0.0085,66363.0,59465.0,0.5114,...,36,81,0.261721,866,8,40,115,323,0.012345,24
792,1,Medium City (4),Light Suburban,0.2083,0.1078,0.0100,0.0304,95226.0,73773.0,0.0159,...,0,10,4.012518,175,20,71,112,128,0.189059,13
793,2,Very Large Metro (1),Exurban,0.2730,0.1367,0.0111,0.0172,89271.0,86552.0,0.1078,...,15,23,0.397305,140,20,42,53,68,0.699036,0
794,2,Medium City (4),Exurban,0.2624,0.1357,0.0111,0.0338,67117.0,62256.0,0.3939,...,12,46,0.706073,293,8,43,85,169,4.771075,0


In [5]:
le.classes_

array(['HOME', 'OTHER', 'SHOPPING', 'TRAVEL', 'WORK'], dtype=object)

In [6]:
train_df, test_df = train_test_split(merged, test_size=0.1, random_state=42)
X_train = train_df.drop(columns=["category"])
y_train = train_df["category"]
X_test = test_df.drop(columns=["category"])
y_test = test_df["category"]

In [7]:
X_train

Unnamed: 0,market_size,store_density,age0018_p_ta,age65pl_p_ta,age85pl_p_ta,asian_p_ta,avg_faminc_ta,avghhinc_ta,black_p_ta,boomer_p_ta,...,osm_highway_exits_count_3mi,osm_highway_exits_count_5mi,osm_nearest_exit_dist,places_of_worship_10mi,places_of_worship_1mi,places_of_worship_2mi,places_of_worship_3mi,places_of_worship_5mi,transitstop_nearest_dist,transitstops
347,Very Large Metro (1),Exurban,0.2969,0.1014,0.0071,0.0261,93118.0,89967.0,0.6010,0.1739,...,10,33,2.615653,746,4,13,53,162,1.748239,0
111,Very Large Metro (1),Exurban,0.2584,0.1199,0.0059,0.0633,72243.0,67727.0,0.5925,0.2033,...,14,21,0.388134,472,7,38,63,149,1.455125,0
54,Small Town (6),Exurban,0.2161,0.1461,0.0131,0.0151,73789.0,81607.0,0.4227,0.1953,...,0,0,7.056117,37,14,23,27,30,31.770532,0
747,Large City (3),Exurban,0.2736,0.1147,0.0083,0.0446,135460.0,134123.0,0.0187,0.1858,...,6,27,1.977634,153,4,14,23,48,1.319867,0
194,Very Large Metro (1),Light Suburban,0.2066,0.2145,0.0299,0.0209,104670.0,100387.0,0.0129,0.2525,...,16,46,1.326653,623,9,42,91,237,0.025833,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,Very Large Metro (1),Exurban,0.2466,0.1358,0.0071,0.1914,137597.0,150433.0,0.1700,0.2253,...,14,29,0.203553,477,7,15,26,102,9.251025,0
107,Very Large Metro (1),Suburban,0.1881,0.1878,0.0199,0.0884,146098.0,161364.0,0.0730,0.2473,...,39,101,0.331632,1753,22,77,176,410,0.040567,11
275,Small City (5),Exurban,0.2144,0.1539,0.0150,0.0187,75062.0,62347.0,0.0340,0.1862,...,6,10,0.200039,48,5,15,26,40,44.211875,0
443,Very Large Metro (1),Exurban,0.2626,0.1129,0.0051,0.1057,103601.0,92428.0,0.4126,0.1956,...,21,58,1.637729,511,8,32,55,130,0.057342,11


In [8]:
ordinal_features = ["market_size", "store_density"]
numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
# drop_features = ["store", "longitude", "latitude", "state_name", "cbsa_name", "dma_name"]

In [9]:
market_levels = [
    "Small Town (6)",
    "Small City (5)",
    "Medium City (4)",
    "Large City (3)",
    "Large Metro (2)",
    "Very Large Metro (1)"
]
density_levels = [
    "Rural",
    "Exurban",
    "Suburban",
    "Light Suburban",
    "Light Urban",
    "Urban",
    "Super Urban"
]

In [10]:
all_features = numeric_features + ordinal_features

In [11]:
ordinal_transformer = OrdinalEncoder(categories=[market_levels, density_levels], dtype=int)

preprocessor = make_column_transformer(
    # ("drop", drop_features),
    (StandardScaler(), numeric_features),
    (ordinal_transformer, ordinal_features),
)

In [12]:
class_weight = {
    "HOME": 0.24,
    "OTHER": 0.16,
    "SHOPPING": 0.22,
    "TRAVEL": 0.15,
    "WORK": 0.23
}
encoded_class_weights = {i: class_weight[label] for i, label in enumerate(le.classes_)}
class_weight = encoded_class_weights

## Build Logistic Regression Models

In [13]:
pipe_lr = make_pipeline(
    preprocessor,
    LogisticRegression(C=0.15, penalty="l2", random_state=42, solver="saga", max_iter=10000, multi_class="ovr", n_jobs=-1, class_weight=class_weight)
)
pipe_lr.fit(X_train, y_train);

In [14]:
pipe_lr.score(X_train, y_train)

0.652050919377652

In [15]:
pipe_lr.score(X_test, y_test)

0.6582278481012658

In [16]:
pipe_lr_multi = make_pipeline(
    preprocessor,
    LogisticRegression(C=0.1, penalty="l2", random_state=42, solver="saga", max_iter=10000, multi_class="multinomial", n_jobs=-1, 
                       class_weight=class_weight)
)
pipe_lr_multi.fit(X_train, y_train);

In [17]:
pipe_lr_multi.score(X_train, y_train)

0.6633663366336634

In [18]:
pipe_lr_multi.score(X_test, y_test)

0.6329113924050633

In [19]:
prediction_result = pd.DataFrame({
    "True label": y_test,
    "Logistic Regression (OVR)": pipe_lr.predict(X_test),
    "Logistic Regression (multinomial)": pipe_lr_multi.predict(X_test)
})

## Build Random Forest Models

### Basic RF model

In [20]:
# pipe_rf = make_pipeline(
#     preprocessor,
#     RandomForestClassifier(n_jobs=-1, random_state=42)
# )
# rf_param_grid = {
#     "randomforestclassifier__n_estimators": [25, 50, 100, 150],
#     "randomforestclassifier__max_depth": [5, 10, 20, 30],
#     "randomforestclassifier__max_leaf_nodes": [30, 50, 70],
#     "randomforestclassifier__min_samples_leaf": [10, 20, 30, 40, 50],
#     "randomforestclassifier__min_samples_split": [10, 20, 30],
#     "randomforestclassifier__class_weight": [None, "balanced", class_weight]
# }
# rf_grid_search = RandomizedSearchCV(
#     pipe_rf, rf_param_grid, n_iter=30, cv=5, n_jobs=-1, return_train_score=True
# )
# rf_grid_search.fit(X_train, y_train);

In [21]:
# pd.DataFrame(rf_grid_search.cv_results_)[
#     [
#         "param_randomforestclassifier__n_estimators",
#         "param_randomforestclassifier__max_depth",
#         "param_randomforestclassifier__max_leaf_nodes",
#         "param_randomforestclassifier__min_samples_leaf",
#         "param_randomforestclassifier__min_samples_split",
#         "param_randomforestclassifier__class_weight",
#         "mean_fit_time",
#         "rank_test_score",
#         "mean_test_score",
#     ]
# ].set_index("rank_test_score").sort_index().head(10)

In [22]:
rf_tuned = make_pipeline(
    preprocessor,
    RandomForestClassifier(n_estimators=100, max_depth=30, max_leaf_nodes=30, class_weight=class_weight, 
                           min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
)
rf_tuned.fit(X_train, y_train);

In [23]:
rf_tuned.score(X_train, y_train)

0.6888260254596889

In [24]:
rf_tuned.score(X_test, y_test)

0.6708860759493671

In [25]:
prediction_result["Random Forest"] = rf_tuned.predict(X_test)

In [26]:
# Takes long time to run

# rf_ovr_test = make_pipeline(
#     preprocessor,
#     SelectFromModel(
#         LogisticRegression(C=0.15, penalty="l2", random_state=42, solver="saga", max_iter=10000, multi_class="ovr", 
#                            n_jobs=-1, class_weight=class_weight)
#     ),
#     RandomForestClassifier(n_estimators=100, max_depth=30, max_leaf_nodes=30, class_weight="balanced", 
#                            min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
# )
# rf_param_grid = {
#     "randomforestclassifier__n_estimators": [25, 50, 100, 150],
#     # "randomforestclassifier__max_features": ["sqrt", "log2", None],
#     "randomforestclassifier__max_features": [5, 10, 15, 20, 30],
#     "randomforestclassifier__max_depth": [5, 10, 20, 30],
#     "randomforestclassifier__max_leaf_nodes": [30, 50, 70],
#     "randomforestclassifier__min_samples_leaf": [10, 20, 30, 40, 50],
#     "randomforestclassifier__min_samples_split": [10, 20, 30, 40],
#     "randomforestclassifier__class_weight": [None, "balanced", class_weight]
# }
# rf_ovr_search = RandomizedSearchCV(
#     rf_ovr_test, rf_param_grid, n_iter=30, cv=5, n_jobs=-1, random_state=42
# )

In [27]:
# rf_ovr_search.fit(X_train, y_train);

In [28]:
# pd.DataFrame(rf_ovr_search.cv_results_)[
#     [
#         "param_randomforestclassifier__n_estimators",
#         "param_randomforestclassifier__max_features",
#         "param_randomforestclassifier__max_depth",
#         "param_randomforestclassifier__max_leaf_nodes",
#         "param_randomforestclassifier__min_samples_leaf",
#         "param_randomforestclassifier__min_samples_split",
#         "param_randomforestclassifier__class_weight",
#         "mean_fit_time",
#         "rank_test_score",
#         "mean_test_score",
#     ]
# ].set_index("rank_test_score").sort_values(by="mean_test_score", ascending=False)

In [29]:
# rf_ovr_search.best_params_

In [30]:
# rf_ovr_search.score(X_train, y_train)

In [31]:
# rf_ovr_search.score(X_test, y_test)

### RandomForest with regularization using LogisticRegression

In [32]:
pipe_lr_rf = make_pipeline(
    preprocessor,
    SelectFromModel(
        LogisticRegression(C=0.15, penalty="l2", random_state=42, solver="saga", max_iter=10000, multi_class="ovr", 
                           n_jobs=-1, class_weight=class_weight)
    ),
    RandomForestClassifier(n_estimators=100, max_depth=20, max_leaf_nodes=70, class_weight=class_weight, 
                           min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
)
pipe_lr_rf.fit(X_train, y_train);

In [33]:
pipe_lr_rf.score(X_train, y_train)

0.6775106082036775

In [34]:
pipe_lr_rf.score(X_test, y_test)

0.6708860759493671

In [35]:
prediction_result["Random Forest (L2 reg with LR)"] = pipe_lr_rf.predict(X_test)

In [36]:
# pipe_lr_rf_test = make_pipeline(
#     preprocessor,
#     SelectFromModel(
#         LogisticRegression(C=0.15, penalty="l2", random_state=42, solver="saga", max_iter=10000, multi_class="ovr", 
#                            n_jobs=-1, class_weight=class_weight)
#     ),
#     RandomForestClassifier(n_estimators=100, max_depth=20, max_leaf_nodes=70, class_weight=class_weight, 
#                            min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
# )

In [37]:
lr_rf_param = {
    "selectfrommodel__estimator__C": [0.1, 0.15, 0.2, 0.3, 0.4],
    "selectfrommodel__estimator__penalty": ["l1", "l2"],
    "selectfrommodel__estimator__multi_class": ["ovr", "multinomial"],
    "selectfrommodel__estimator__class_weight": [None, "balanced", class_weight]
}
lr_rf_search = RandomizedSearchCV(
    pipe_lr_rf, lr_rf_param, n_iter=30, cv=5, n_jobs=-1, random_state=42
)
lr_rf_search.fit(X_train, y_train);

In [38]:
pd.DataFrame(lr_rf_search.cv_results_)[
    [
        "param_selectfrommodel__estimator__C",
        "param_selectfrommodel__estimator__penalty",
        "param_selectfrommodel__estimator__multi_class",
        "param_selectfrommodel__estimator__class_weight",
        "mean_fit_time",
        "rank_test_score",
        "mean_test_score",
    ]
].set_index("rank_test_score").sort_index().head(10)

Unnamed: 0_level_0,param_selectfrommodel__estimator__C,param_selectfrommodel__estimator__penalty,param_selectfrommodel__estimator__multi_class,param_selectfrommodel__estimator__class_weight,mean_fit_time,mean_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.1,l1,ovr,,21.797817,0.451184
2,0.4,l1,multinomial,"{0: 0.24, 1: 0.16, 2: 0.22, 3: 0.15, 4: 0.23}",31.096654,0.446898
3,0.1,l2,ovr,balanced,25.706812,0.44555
4,0.2,l1,multinomial,"{0: 0.24, 1: 0.16, 2: 0.22, 3: 0.15, 4: 0.23}",34.847072,0.44552
5,0.3,l2,ovr,"{0: 0.24, 1: 0.16, 2: 0.22, 3: 0.15, 4: 0.23}",30.079334,0.444141
6,0.15,l2,ovr,,21.831814,0.442723
6,0.1,l1,multinomial,balanced,19.767326,0.442723
8,0.4,l2,ovr,"{0: 0.24, 1: 0.16, 2: 0.22, 3: 0.15, 4: 0.23}",17.63543,0.441325
9,0.2,l1,multinomial,balanced,17.977526,0.441285
10,0.2,l2,ovr,,17.475453,0.441265


In [39]:
lr_rf_search.score(X_train, y_train)

0.6817538896746818

In [40]:
lr_rf_search.score(X_test, y_test)

0.6708860759493671

In [41]:
lr_rf_l1 = make_pipeline(
    preprocessor,
    SelectFromModel(
        LogisticRegression(C=0.1, penalty="l1", random_state=42, solver="saga", max_iter=10000, multi_class="ovr", 
                           n_jobs=-1, class_weight=None)
    ),
    RandomForestClassifier(n_estimators=100, max_depth=20, max_leaf_nodes=70, class_weight=class_weight, 
                           min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
)
lr_rf_l1.fit(X_train, y_train);

In [42]:
print(lr_rf_l1.score(X_train, y_train))
print(lr_rf_l1.score(X_test, y_test))

0.6817538896746818
0.6708860759493671


In [43]:
prediction_result["Random Forest (L1 reg with LR)"] = lr_rf_l1.predict(X_test)

In [44]:
pipe_lr_rf_rank2 = make_pipeline(
    preprocessor,
    SelectFromModel(
        LogisticRegression(C=0.4, penalty="l1", random_state=42, solver="saga", max_iter=10000, multi_class="multinomial", 
                           n_jobs=-1, class_weight=class_weight)
    ),
    RandomForestClassifier(n_estimators=100, max_depth=20, max_leaf_nodes=70, class_weight=class_weight, 
                           min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
)
pipe_lr_rf_rank2.fit(X_train, y_train);

In [45]:
print(pipe_lr_rf_rank2.score(X_train, y_train))
print(pipe_lr_rf_rank2.score(X_test, y_test))

0.669024045261669
0.6582278481012658


**Second best set of parameters do not lead to better result. Just use the best set of parameters.**

### Random Forest with OVR and regularization using LogisticRegression

In [46]:
lr_rf_ovr = make_pipeline(
    preprocessor,
    SelectFromModel(
        LogisticRegression(C=0.15, penalty="l2", random_state=42, solver="saga", max_iter=10000, multi_class="ovr", 
                           n_jobs=-1, class_weight=class_weight)
    ),
    OneVsRestClassifier(
        RandomForestClassifier(n_estimators=100, max_depth=30, max_leaf_nodes=30, class_weight=None, 
                               min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
    )
)

In [47]:
lr_rf_ovr_search = RandomizedSearchCV(
    lr_rf_ovr, lr_rf_param, n_iter=30, cv=5, n_jobs=-1, random_state=42
)
lr_rf_ovr_search.fit(X_train, y_train);

In [48]:
pd.DataFrame(lr_rf_ovr_search.cv_results_)[
    [
        "param_selectfrommodel__estimator__C",
        "param_selectfrommodel__estimator__penalty",
        "param_selectfrommodel__estimator__multi_class",
        "param_selectfrommodel__estimator__class_weight",
        "mean_fit_time",
        "rank_test_score",
        "mean_test_score",
    ]
].set_index("rank_test_score").sort_values(by="mean_test_score", ascending=False).head(10)

Unnamed: 0_level_0,param_selectfrommodel__estimator__C,param_selectfrommodel__estimator__penalty,param_selectfrommodel__estimator__multi_class,param_selectfrommodel__estimator__class_weight,mean_fit_time,mean_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.4,l1,multinomial,"{0: 0.24, 1: 0.16, 2: 0.22, 3: 0.15, 4: 0.23}",34.917008,0.466717
2,0.4,l2,ovr,"{0: 0.24, 1: 0.16, 2: 0.22, 3: 0.15, 4: 0.23}",19.832661,0.465328
3,0.1,l2,ovr,"{0: 0.24, 1: 0.16, 2: 0.22, 3: 0.15, 4: 0.23}",26.926346,0.46384
4,0.1,l2,multinomial,,19.519118,0.462501
5,0.3,l1,multinomial,"{0: 0.24, 1: 0.16, 2: 0.22, 3: 0.15, 4: 0.23}",26.154251,0.461073
6,0.4,l1,ovr,"{0: 0.24, 1: 0.16, 2: 0.22, 3: 0.15, 4: 0.23}",29.544698,0.461053
7,0.2,l2,ovr,,18.837667,0.459664
7,0.15,l1,ovr,,22.428232,0.459664
9,0.1,l1,ovr,,25.143753,0.459654
10,0.2,l1,multinomial,balanced,18.674835,0.458236


In [49]:
lr_rf_ovr_search.score(X_train, y_train)

0.7736916548797736

In [50]:
lr_rf_ovr_search.score(X_test, y_test)

0.6455696202531646

In [51]:
lr_rf_ovr = make_pipeline(
    preprocessor,
    SelectFromModel(
        LogisticRegression(C=0.15, penalty="l1", random_state=42, solver="saga", max_iter=10000, multi_class="multinomial", 
                           n_jobs=-1, class_weight=None)
    ),
    OneVsRestClassifier(
        RandomForestClassifier(n_estimators=100, max_depth=30, max_leaf_nodes=30, class_weight=None, 
                               min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
    )
)
lr_rf_ovr.fit(X_train, y_train);

In [52]:
print(lr_rf_ovr.score(X_train, y_train))
print(lr_rf_ovr.score(X_test, y_test))

0.7835926449787836
0.6708860759493671


In [53]:
prediction_result["Random Forest (OVR with L1 reg using LR)"] = lr_rf_ovr.predict(X_test)

In [54]:
rf_ovr = make_pipeline(
    preprocessor,
    OneVsRestClassifier(
        RandomForestClassifier(n_estimators=100, max_depth=30, max_leaf_nodes=30, class_weight="balanced", 
                           min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
    )
)
rf_ovr.fit(X_train, y_train);

In [55]:
rf_ovr.score(X_train, y_train)

0.8642149929278642

In [56]:
rf_ovr.score(X_test, y_test)

0.6582278481012658

In [57]:
# prediction_result["rf_ovr"] = rf_ovr.predict(X_test)

## LGBM Model - Poor score, won't use

In [58]:
# pipe_lgbm_ovr = make_pipeline(
#     preprocessor,
#     OneVsRestClassifier(
#         LGBMClassifier(random_state=42, n_jobs=-1)
#     )
# )
# param_lgbm = {
#     "onevsrestclassifier__estimator__n_estimators": [50, 100, 150, 200],
#     "onevsrestclassifier__estimator__boosting_type": ["gbdt", "dart"],
#     "onevsrestclassifier__estimator__learning_rate": [0.1, 0.3, 0.5, 0.7],
#     "onevsrestclassifier__estimator__max_depth": [5, 10, 15, 20],
#     "onevsrestclassifier__estimator__reg_alpha": [0.1, 0.3, 0.5, 0.7],
#     "onevsrestclassifier__estimator__reg_lambda": [0.1, 0.3, 0.5, 0.7],
#     # "onevsrestclassifier__estimator__min_child_samples": [20, 30, 40, 50],
#     "onevsrestclassifier__estimator__num_leaves": [5, 10, 15, 20],
#     "onevsrestclassifier__estimator__min_data_in_leaf": [30, 40, 50, 60],
#     "onevsrestclassifier__estimator__class_weight": [None, "balanced"]
# }
# lgbm_search = RandomizedSearchCV(
#     pipe_lgbm_ovr, param_lgbm, n_iter=30, cv=5, n_jobs=-1, random_state=42
# )

In [59]:
# lgbm_search.fit(X_train, y_train);

In [60]:
# pd.DataFrame(lgbm_search.cv_results_)[
#     [
#         "param_onevsrestclassifier__estimator__n_estimators",
#         "param_onevsrestclassifier__estimator__boosting_type",
#         "param_onevsrestclassifier__estimator__learning_rate",
#         "param_onevsrestclassifier__estimator__max_depth",
#         "param_onevsrestclassifier__estimator__reg_alpha",
#         "param_onevsrestclassifier__estimator__reg_lambda",
#         "param_onevsrestclassifier__estimator__class_weight",
#         # "param_onevsrestclassifier__estimator__min_child_samples",
#         "param_onevsrestclassifier__estimator__num_leaves",
#         "param_onevsrestclassifier__estimator__min_data_in_leaf",
#         "mean_fit_time",
#         "rank_test_score",
#         "mean_test_score",
#     ]
# ].set_index("rank_test_score").sort_index().head(20)

In [61]:
# lgbm_search.score(X_train, y_train)

In [62]:
# lgbm_search.score(X_test, y_test)

In [63]:
# lgbm_test = make_pipeline(
#     preprocessor,
#     OneVsRestClassifier(
#         LGBMClassifier(random_state=42, n_jobs=-1, n_estimators=150, boosting_type="gbdt", learning_rate=0.1, max_depth=10,
#                       reg_alpha=0.7, reg_lambda=0.7, num_leaves=20, min_data_in_leaf=30, class_weight=None)
#     )
# )
# lgbm_test.fit(X_train, y_train);

In [64]:
# lgbm_test.score(X_train, y_train)

In [65]:
# lgbm_test.score(X_test, y_test)

In [66]:
# lgbm_search.best_params_
# score=0.62025

## XGB Model - poor score, won't use

In [67]:
# pipe_xgb = make_pipeline(
#     preprocessor,
#     OneVsRestClassifier(
#         XGBClassifier(random_state=42, verbosity=0)
#     )
# )
# param_xgb = {
#     "onevsrestclassifier__estimator__booster": ["gbtree", "dart"],
#     "onevsrestclassifier__estimator__learning_rate": [0.1, 0.3, 0.5, 0.7],
#     "onevsrestclassifier__estimator__max_depth": [3, 5, 7, 9],
#     "onevsrestclassifier__estimator__reg_alpha": [0.1, 0.3, 0.5, 0.7],
#     "onevsrestclassifier__estimator__reg_lambda": [0.1, 0.3, 0.5, 0.7],
#     "onevsrestclassifier__estimator__subsample": [0.3, 0.5, 0.7, 1],
#     "onevsrestclassifier__estimator__min_child_weight": [1, 4, 7, 10]
# }
# xgb_search = RandomizedSearchCV(
#     pipe_xgb, param_xgb, n_iter=30, cv=5, n_jobs=-1, random_state=42
# )

In [68]:
# xgb_search.fit(X_train, y_train);

In [69]:
# pd.DataFrame(xgb_search.cv_results_)[
#     [
#         "param_onevsrestclassifier__estimator__booster",
#         "param_onevsrestclassifier__estimator__learning_rate",
#         "param_onevsrestclassifier__estimator__max_depth",
#         "param_onevsrestclassifier__estimator__reg_alpha",
#         "param_onevsrestclassifier__estimator__reg_lambda",
#         "param_onevsrestclassifier__estimator__subsample",
#         "param_onevsrestclassifier__estimator__min_child_weight",
#         "mean_fit_time",
#         "rank_test_score",
#         "mean_test_score",
#     ]
# ].set_index("rank_test_score").sort_index().head(20)

In [70]:
# xgb_search.score(X_train, y_train)

In [71]:
# xgb_search.score(X_test, y_test)

In [72]:
# xgb_test = make_pipeline(
#     preprocessor,
#     OneVsRestClassifier(
#         XGBClassifier(random_state=42, verbosity=0, booster="gbtree", learning_rate=0.1, max_depth=5, reg_alpha=0.5, reg_lambda=0.1,
#                      subsample=0.7, min_child_weight=1)
#     )
# )
# xgb_test.fit(X_train, y_train);

In [73]:
# xgb_test.score(X_train, y_train)

In [74]:
# xgb_test.score(X_test, y_test)
# XGBClassifier(random_state=42, verbosity=0, booster="gbtree", learning_rate=0.1, max_depth=5, reg_alpha=0.5, reg_lambda=0.1,
#                     subsample=0.7)

In [75]:
# lr_xgb_test = make_pipeline(
#     preprocessor,
#     SelectFromModel(
#         LogisticRegression(C=0.15, penalty="l2", random_state=42, solver="saga", max_iter=10000, multi_class="ovr", 
#                            n_jobs=-1, class_weight=class_weight)
#     ),
#     OneVsRestClassifier(
#         XGBClassifier(random_state=42, verbosity=0, booster="gbtree", learning_rate=0.1, max_depth=5, reg_alpha=0.5, reg_lambda=0.1,
#                      subsample=0.7)
#     )
# )
# lr_xgb_test.fit(X_train, y_train);

In [76]:
# lr_xgb_test.score(X_train, y_train)

In [77]:
# lr_xgb_test.score(X_test, y_test)

In [78]:
# prediction_result["xgb"] = xgb_test.predict(X_test)

In [79]:
prediction_result.head(20)

Unnamed: 0,True label,Logistic Regression (OVR),Logistic Regression (multinomial),Random Forest,Random Forest (L2 reg with LR),Random Forest (L1 reg with LR),Random Forest (OVR with L1 reg using LR)
763,2,2,2,0,0,0,0
39,0,0,0,0,0,0,0
214,0,0,0,0,0,0,0
202,0,2,4,0,2,0,2
239,4,4,4,4,4,4,4
218,0,0,0,0,0,0,0
556,4,4,4,4,4,4,1
610,2,2,2,2,2,2,2
305,4,4,4,4,4,4,4
139,2,2,2,2,2,2,2


In [80]:
prediction_result.iloc[20:50]

Unnamed: 0,True label,Logistic Regression (OVR),Logistic Regression (multinomial),Random Forest,Random Forest (L2 reg with LR),Random Forest (L1 reg with LR),Random Forest (OVR with L1 reg using LR)
758,2,0,0,0,0,0,0
454,2,2,2,2,2,2,2
291,0,0,0,0,0,0,0
342,1,4,4,4,4,4,4
23,2,2,2,2,2,2,2
251,0,0,0,0,0,2,0
608,2,2,2,2,2,2,2
385,0,0,0,0,0,0,0
622,3,1,1,4,4,4,4
437,4,4,4,4,4,4,4


In [81]:
prediction_result.iloc[50:]

Unnamed: 0,True label,Logistic Regression (OVR),Logistic Regression (multinomial),Random Forest,Random Forest (L2 reg with LR),Random Forest (L1 reg with LR),Random Forest (OVR with L1 reg using LR)
630,4,4,4,4,4,4,4
791,0,2,2,2,0,4,0
658,0,0,0,4,0,0,0
49,4,4,4,4,4,4,4
78,0,0,0,0,2,0,2
235,2,2,2,2,2,2,2
441,2,2,2,2,2,2,2
66,4,4,1,0,0,2,0
265,1,2,2,2,2,2,2
336,0,0,0,0,0,0,0


## Try two different combinations of ensembling models

### 1: RF + RF (OVR, L1) + RF(L1)

In [82]:
classifiers_1 = {
    "rf": rf_tuned,
    "lr_rf_ovr": lr_rf_ovr,
    "lr_rf_l1": lr_rf_l1
}
# classifiers_1 = {
#     "lr_rf_l2": pipe_lr_rf,
#     "lr_rf_ovr": lr_rf_ovr,
#     "lr_rf_l1": lr_rf_l1
# }
classifiers_2 = {
    "rf": rf_tuned,
    "lr_rf": pipe_lr_rf,
    "lr_rf_l1": lr_rf_l1
}

In [83]:
# classifiers = {
#     "lr": pipe_lr,
#     "lr_multi": pipe_lr_multi,
#     "rf": rf_tuned,
#     "lr_rf": pipe_lr_rf,
#     "rf_ovr": rf_ovr,
#     "lr_rf_ovr": lr_rf_ovr,
#     "lr_rf_l1": lr_rf_l1,
# }

In [84]:
averaging_model = VotingClassifier(
    list(classifiers_1.items()), voting="soft"
)
averaging_model.fit(X_train, y_train);

In [85]:
averaging_model.score(X_train, y_train)

0.7171145685997171

In [86]:
averaging_model.score(X_test, y_test)

0.6962025316455697

In [87]:
averaging_model_hard = VotingClassifier(
    list(classifiers_1.items()), voting="hard"
)
averaging_model_hard.fit(X_train, y_train);

In [88]:
averaging_model_hard.score(X_train, y_train)

0.7072135785007072

In [89]:
averaging_model_hard.score(X_test, y_test)

0.7088607594936709

In [90]:
stacking_model = StackingClassifier(list(classifiers_1.items()))
stacking_model.fit(X_train, y_train);

In [91]:
stacking_model.score(X_train, y_train)

0.7312588401697313

In [92]:
stacking_model.score(X_test, y_test)

0.6329113924050633

In [93]:
stack_rf = StackingClassifier(list(classifiers_1.items()), RandomForestClassifier(n_jobs=-1, random_state=42))
stack_rf.fit(X_train, y_train);

In [94]:
stack_rf.score(X_train, y_train)

0.6803394625176803

In [95]:
stack_rf.score(X_test, y_test)

0.6329113924050633

### 2: RF + RF (L2) + RF(L1)

In [96]:
averaging_model_2 = VotingClassifier(
    list(classifiers_2.items()), voting="soft"
)
averaging_model_2.fit(X_train, y_train);

In [97]:
averaging_model_2.score(X_train, y_train)

0.6902404526166902

In [98]:
averaging_model_2.score(X_test, y_test)

0.6708860759493671

In [99]:
averaging_model_hard_2 = VotingClassifier(
    list(classifiers_2.items()), voting="hard"
)
averaging_model_hard_2.fit(X_train, y_train);

In [100]:
averaging_model_hard_2.score(X_train, y_train)

0.6845827439886846

In [101]:
averaging_model_hard_2.score(X_test, y_test)

0.6962025316455697

In [102]:
stacking_model_2 = StackingClassifier(list(classifiers_1.items()))
stacking_model_2.fit(X_train, y_train);

In [103]:
stacking_model_2.score(X_train, y_train)

0.7312588401697313

In [104]:
stacking_model_2.score(X_test, y_test)

0.6329113924050633

In [105]:
stack_rf_2 = StackingClassifier(list(classifiers_1.items()), RandomForestClassifier(n_jobs=-1, random_state=42))
stack_rf_2.fit(X_train, y_train);

In [106]:
stack_rf_2.score(X_train, y_train)

0.6803394625176803

In [107]:
stack_rf_2.score(X_test, y_test)

0.6329113924050633

## Examine features used by best model

In [108]:
rf_tuned_feature_importance = pd.DataFrame(
    rf_tuned.named_steps['randomforestclassifier'].feature_importances_,
    index=all_features
).sort_values(by=0, ascending=False)
rf_tuned_feature_importance.head(10)

Unnamed: 0,0
emp_p_ta,0.043217
dtpop_children_at_home_p_ta,0.024727
centerxy_full_gla_1mi,0.021405
dtpop_students_9th_12th_p_ta,0.020858
dtpop_students_prek_8th_p_ta,0.01606
centerxy_full_0p5_intersect_gla,0.015414
hrsa_number_of_certified_beds_2mi,0.014702
dmm_gla_1mi,0.014668
dtpop_retired_disabled_p_ta,0.014322
dtpop_work_at_home_p_ta,0.014148


In [109]:
len(lr_rf_l1.named_steps['randomforestclassifier'].feature_importances_)

103

In [110]:
lr_rf_l1_selected = lr_rf_l1.named_steps['selectfrommodel'].get_support()
lr_rf_l1_selected_features = [feat for (feat, is_selected) in zip(all_features, lr_rf_l1_selected) if is_selected]
lr_rf_l1_feature_importance = pd.DataFrame(
    lr_rf_l1.named_steps['randomforestclassifier'].feature_importances_,
    index=lr_rf_l1_selected_features
).sort_values(by=0, ascending=False)
lr_rf_l1_feature_importance.head(10)

Unnamed: 0,0
emp_p_ta,0.063927
centerxy_full_0p5_intersect_gla,0.031018
dtpop_work_at_home_p_ta,0.028581
centerxy_full_0p5_intersect_count,0.028162
dtpop_retired_disabled_p_ta,0.025666
dtpop_unemployed_p_ta,0.025513
crime_total_index_ta,0.024608
ipeds_postsecondary_schools_total_enrollment_3mi,0.023571
centerxy_full_gla_1mi,0.023262
hrsa_number_of_certified_beds_2mi,0.02176


In [111]:
for i in range(len(lr_rf_ovr.named_steps['onevsrestclassifier'].estimators_)):
    print(len(lr_rf_ovr.named_steps['onevsrestclassifier'].estimators_[i].feature_importances_))

130
130
130
130
130


In [112]:
lr_rf_ovr_selected = lr_rf_ovr.named_steps['selectfrommodel'].get_support()
lr_rf_ovr_selected_features = [feat for (feat, is_selected) in zip(all_features, lr_rf_ovr_selected) if is_selected]
lr_rf_ovr_feature_importance = pd.DataFrame({
    "HOME": lr_rf_ovr.named_steps['onevsrestclassifier'].estimators_[0].feature_importances_,
    "OTHER": lr_rf_ovr.named_steps['onevsrestclassifier'].estimators_[1].feature_importances_,
    "SHOPPING": lr_rf_ovr.named_steps['onevsrestclassifier'].estimators_[2].feature_importances_,
    "TRAVEL": lr_rf_ovr.named_steps['onevsrestclassifier'].estimators_[3].feature_importances_,
    "WORK": lr_rf_ovr.named_steps['onevsrestclassifier'].estimators_[4].feature_importances_
}, index=lr_rf_ovr_selected_features)
lr_rf_ovr_feature_importance

Unnamed: 0,HOME,OTHER,SHOPPING,TRAVEL,WORK
age0018_p_ta,0.009073,0.006190,0.005119,0.048678,0.003732
age85pl_p_ta,0.004893,0.003982,0.002039,0.002418,0.002362
black_p_ta,0.009974,0.001615,0.016388,0.000484,0.006240
com0002_p_ta,0.025359,0.003578,0.003573,0.005240,0.022623
com0205_p_ta,0.014398,0.002996,0.009221,0.004981,0.007603
...,...,...,...,...,...
osm_nearest_exit_dist,0.004254,0.009724,0.005691,0.006389,0.001636
transitstop_nearest_dist,0.005128,0.009136,0.004347,0.011484,0.014152
transitstops,0.002257,0.002423,0.001532,0.000930,0.002694
market_size,0.005334,0.001004,0.001061,0.012318,0.003200


In [113]:
lr_rf_ovr_feature_importance[["HOME"]].sort_values(by="HOME", ascending=False).head(10)

Unnamed: 0,HOME
emp_p_ta,0.068548
dtpop_retired_disabled_p_ta,0.035051
pop_transient_ta,0.032379
dtpop_students_9th_12th_p_ta,0.027404
dtpop_work_at_home_p_ta,0.027245
dtpop_unemployed_p_ta,0.026026
hh_0vehicle_p_ta,0.025461
com0002_p_ta,0.025359
emp_educ_services_p_ta,0.02391
hh_3pers_p_ta,0.023129


In [114]:
lr_rf_ovr_feature_importance[["OTHER"]].sort_values(by="OTHER", ascending=False).head(10)

Unnamed: 0,OTHER
centerxy_full_0p5_intersect_gla,0.046731
hrsa_number_of_certified_beds_3mi,0.040955
ipeds_postsecondary_schools_total_enrollment_1mi,0.039672
ipeds_postsecondary_schools_3mi,0.030282
centerxy_full_nearest_dist,0.02431
edu_bachplus_p_ta,0.021456
hrsa_number_of_certified_beds_5mi,0.019978
edu_assocdeg_p_ta,0.01937
hrsa_number_of_certified_beds_2mi,0.019365
hrsa_hospitals_nearest_dist,0.017351


In [115]:
lr_rf_ovr_feature_importance[["SHOPPING"]].sort_values(by="SHOPPING", ascending=False).head(10)

Unnamed: 0,SHOPPING
centerxy_full_0p5_intersect_gla,0.062258
centerxy_full_gla_1mi,0.061865
dmm_count_1mi,0.050824
dmm_nearest_dist,0.026498
centerxy_full_0p5_intersect_count,0.026283
centerxy_full_nearest_dist,0.022373
centerxy_full_gla_3mi,0.020586
emp_retail_trade_p_ta,0.018372
centerxy_gla_effective_2mi,0.017442
black_p_ta,0.016388


In [116]:
lr_rf_ovr_feature_importance[["TRAVEL"]].sort_values(by="TRAVEL", ascending=False).head(10)

Unnamed: 0,TRAVEL
genx_p_ta,0.064772
hh_type_nonfam_p_ta,0.048883
age0018_p_ta,0.048678
poverty_inpoverty_p_ta,0.042562
nces_public_schools_nearest_dist,0.0377
hh_2pers_p_ta,0.033705
ipeds_postsecondary_schools_total_enrollment_2mi,0.030281
nces_public_schools_total_enrollment_10mi,0.029622
pop_seasonal_ta,0.027556
hu_vacant_ta,0.025927


In [117]:
lr_rf_ovr_feature_importance[["WORK"]].sort_values(by="WORK", ascending=False).head(10)

Unnamed: 0,WORK
emp_p_ta,0.051941
dtpop_work_at_home_p_ta,0.041628
dtpop_students_9th_12th_p_ta,0.028215
com0002_p_ta,0.022623
crime_total_index_ta,0.021865
poverty_inpoverty_p_ta,0.020929
centerxy_gla_effective_3mi,0.0184
emp_wholesale_trade_p_ta,0.018323
centerxy_full_nearest_dist,0.017915
emp_manfacturing_p_ta,0.015794


In [118]:
pipe_lr.named_steps['logisticregression'].coef_

array([[-0.03191046,  0.03078444, -0.00909258, ...,  0.14915485,
         0.13086501,  0.00093062],
       [ 0.00593491, -0.01439243, -0.05805205, ..., -0.05059905,
        -0.12200323, -0.0316698 ],
       [ 0.04248838, -0.05698709,  0.02447002, ..., -0.04239244,
        -0.14330206, -0.11479628],
       [-0.0743549 ,  0.00643303,  0.07205439, ...,  0.02261271,
        -0.70338488, -0.31951041],
       [ 0.02580378,  0.027293  , -0.02909045, ..., -0.08837481,
        -0.32305743, -0.02375686]])

In [119]:
lr_feature_importance = pd.DataFrame({
    "HOME": pipe_lr.named_steps['logisticregression'].coef_[0],
    "OTHER": pipe_lr.named_steps['logisticregression'].coef_[1],
    "SHOPPING": pipe_lr.named_steps['logisticregression'].coef_[2],
    "TRAVEL": pipe_lr.named_steps['logisticregression'].coef_[3],
    "WORK": pipe_lr.named_steps['logisticregression'].coef_[4]
}, index=all_features)
lr_feature_importance

Unnamed: 0,HOME,OTHER,SHOPPING,TRAVEL,WORK
age0018_p_ta,-0.031910,0.005935,0.042488,-0.074355,0.025804
age65pl_p_ta,0.030784,-0.014392,-0.056987,0.006433,0.027293
age85pl_p_ta,-0.009093,-0.058052,0.024470,0.072054,-0.029090
asian_p_ta,0.022446,-0.017614,-0.037982,-0.023955,0.044936
avg_faminc_ta,0.077683,-0.016418,-0.021589,-0.000093,-0.007304
...,...,...,...,...,...
places_of_worship_5mi,0.009047,-0.008627,0.006277,0.020766,-0.030915
transitstop_nearest_dist,-0.058536,0.037945,-0.143986,-0.006538,0.064718
transitstops,0.149155,-0.050599,-0.042392,0.022613,-0.088375
market_size,0.130865,-0.122003,-0.143302,-0.703385,-0.323057


In [120]:
lr_feature_importance[["HOME"]].sort_values(by="HOME", ascending=False)

Unnamed: 0,HOME
dtpop_retired_disabled_p_ta,0.221850
daypop_dens_ta,0.200181
hispanic_p_ta,0.194779
popgr10cn_ta,0.185273
dtpop_homemakers_p_ta,0.174490
...,...
com0205_p_ta,-0.179286
emp_p_ta,-0.186813
dmm_gla_10mi,-0.197070
dmm_count_1mi,-0.197642


In [121]:
lr_feature_importance[["OTHER"]].sort_values(by="OTHER", ascending=False)

Unnamed: 0,OTHER
ipeds_postsecondary_schools_total_enrollment_1mi,0.221405
ipeds_postsecondary_schools_5mi,0.159264
emp_arts_entertainment_p_ta,0.149586
ipeds_postsecondary_schools_3mi,0.147262
emp_transportation_storage_p_ta,0.142373
...,...
emp_construction_p_ta,-0.128463
centerxy_full_0p5_intersect_count,-0.181912
inrix_ns_distance,-0.185453
centerxy_full_0p5_intersect_gla,-0.207997


In [122]:
lr_feature_importance[["SHOPPING"]].sort_values(by="SHOPPING", ascending=False)

Unnamed: 0,SHOPPING
military_installations_1mi,0.229583
centerxy_full_0p5_intersect_gla,0.225647
nces_public_schools_1mi,0.160738
centerxy_full_count_3mi,0.148351
emp_other_p_ta,0.147714
...,...
osm_highway_exits_count_3mi,-0.155625
nces_private_schools_nearest_dist,-0.156659
hrsa_number_of_certified_beds_5mi,-0.158291
emp_accommodation_foodserv_p_ta,-0.164461


In [123]:
lr_feature_importance[["TRAVEL"]].sort_values(by="TRAVEL", ascending=False)

Unnamed: 0,TRAVEL
ipeds_postsecondary_schools_total_enrollment_2mi,0.161269
nces_public_schools_nearest_dist,0.150863
hh_type_nonfam_p_ta,0.131939
ipeds_postsecondary_schools_total_enrollment_1mi,0.120142
hu_vacant_ta,0.114098
...,...
age0018_p_ta,-0.074355
emp_wholesale_trade_p_ta,-0.076478
nces_public_schools_total_enrollment_1mi,-0.082145
store_density,-0.319510


In [124]:
lr_feature_importance[["WORK"]].sort_values(by="WORK", ascending=False)

Unnamed: 0,WORK
military_installations_2mi,0.208692
inrix_aadt_ew,0.200650
nces_public_schools_total_enrollment_10mi,0.184855
centerxy_gla_effective_10mi,0.174292
emp_p_ta,0.165026
...,...
nces_private_schools_total_enrollment_10mi,-0.185801
dtpop_unemployed_p_ta,-0.191755
centerxy_full_0p5_intersect_count,-0.196593
military_installations_10mi,-0.252598
