In [462]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from lightgbm.sklearn import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.svm import SVC
# from catboost import CatBoostClassifier
from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import Lasso, Ridge

import warnings
warnings.filterwarnings("ignore")

## Load Data

In [8]:
trade_area = pd.read_csv("../../data/Smoothie King/smoothie_king_trade_area_variables.csv")
stores = pd.read_csv('../../data/Smoothie King/smoothie_king_stores.csv')

processed_demographic = pd.read_csv('../../data/Smoothie King/processed_demographic.csv')
processed_poi = pd.read_csv('../../data/Smoothie King/processed_poi.csv')
processed_trade_area = pd.read_csv("../../data/Smoothie King/processed_trade_area.csv")

merged = stores.merge(processed_trade_area, left_on="store", right_on="store_num").merge(processed_poi)
merged = merged.drop(columns=["store_num", "country_code"])
merged = merged.merge(processed_demographic, on='store')
merged

Unnamed: 0,store,longitude,latitude,category,cbsa_name,dma_name,state_name,market_size,store_density,age0018_p_ta,...,wealth_hhtotal_10mi,wealth_hhtotal_1mi,wealth_hhtotal_2mi,wealth_hhtotal_3mi,wealth_hhtotal_5mi,white_p_10mi,white_p_1mi,white_p_2mi,white_p_3mi,white_p_5mi
0,SK 1504,-97.650392,30.519353,SHOPPING,"Austin-Round Rock, TX","Austin, TX",Texas,Large Metro (2),Light Suburban,0.2907,...,241941.0,4785.0,19470.0,36419.0,79913.0,0.5118,0.4940,0.4745,0.4848,0.4933
1,SK 0057,-88.171150,30.672501,SHOPPING,"Mobile, AL","Mobile et al, AL-FL",Alabama,Medium City (4),Light Suburban,0.2416,...,120338.0,3357.0,16102.0,31227.0,60385.0,0.5205,0.7086,0.6128,0.6007,0.5423
2,SK 1415,-90.535722,38.784250,HOME,"St. Louis, MO-IL","St. Louis, MO",Missouri,Very Large Metro (1),Light Suburban,0.2198,...,169964.0,6216.0,19220.0,37834.0,62074.0,0.8088,0.8246,0.8307,0.8399,0.8622
3,SK 1231,-80.134700,26.100737,TRAVEL,"Miami-Fort Lauderdale et al, FL","Miami-Ft. Lauderdale, FL",Florida,Very Large Metro (1),Suburban,0.2060,...,422361.0,5996.0,28419.0,47104.0,104492.0,0.3940,0.6687,0.6887,0.5651,0.4456
4,SK 1535,-96.856651,32.996408,WORK,"Dallas-Fort Worth-Arlington, TX","Dallas-Ft. Worth, TX",Texas,Very Large Metro (1),Light Suburban,0.2866,...,479049.0,15491.0,37406.0,64597.0,154192.0,0.4906,0.3683,0.4413,0.4877,0.4822
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791,SK 1536,-96.872596,32.647809,HOME,"Dallas-Fort Worth-Arlington, TX","Dallas-Ft. Worth, TX",Texas,Very Large Metro (1),Exurban,0.2745,...,231740.0,4972.0,13427.0,31174.0,80875.0,0.1615,0.0518,0.1095,0.1357,0.1291
792,SK 1886,-105.077634,40.564695,OTHER,"Fort Collins, CO","Denver, CO",Colorado,Medium City (4),Light Suburban,0.2083,...,102970.0,5387.0,25743.0,47572.0,70570.0,0.8203,0.8247,0.8233,0.8060,0.8004
793,SK 0162,-95.478001,30.316531,SHOPPING,"Houston-The Woodlands et al, TX","Houston, TX",Texas,Very Large Metro (1),Exurban,0.2730,...,103313.0,3105.0,12138.0,20657.0,35348.0,0.6788,0.6116,0.4522,0.4567,0.5694
794,SK 1449,-78.968258,35.064994,SHOPPING,"Fayetteville, NC","Raleigh et al, NC",North Carolina,Medium City (4),Exurban,0.2624,...,119859.0,3274.0,14426.0,31451.0,63792.0,0.4147,0.3955,0.3864,0.3822,0.3765


In [9]:
merged = merged.drop(columns=["store", "longitude", "latitude", "state_name", "cbsa_name", "dma_name"])
merged = merged.dropna()

In [10]:
le = LabelEncoder()
merged["category"] = le.fit_transform(merged["category"])
merged

Unnamed: 0,category,market_size,store_density,age0018_p_ta,age65pl_p_ta,age85pl_p_ta,asian_p_ta,avg_faminc_ta,avghhinc_ta,black_p_ta,...,wealth_hhtotal_10mi,wealth_hhtotal_1mi,wealth_hhtotal_2mi,wealth_hhtotal_3mi,wealth_hhtotal_5mi,white_p_10mi,white_p_1mi,white_p_2mi,white_p_3mi,white_p_5mi
0,2,Large Metro (2),Light Suburban,0.2907,0.0909,0.0050,0.0555,108837.0,106093.0,0.0985,...,241941.0,4785.0,19470.0,36419.0,79913.0,0.5118,0.4940,0.4745,0.4848,0.4933
1,2,Medium City (4),Light Suburban,0.2416,0.1537,0.0134,0.0327,80752.0,70536.0,0.3890,...,120338.0,3357.0,16102.0,31227.0,60385.0,0.5205,0.7086,0.6128,0.6007,0.5423
2,0,Very Large Metro (1),Light Suburban,0.2198,0.1926,0.0147,0.0235,106893.0,97112.0,0.0552,...,169964.0,6216.0,19220.0,37834.0,62074.0,0.8088,0.8246,0.8307,0.8399,0.8622
3,3,Very Large Metro (1),Suburban,0.2060,0.2108,0.0185,0.0203,134373.0,149467.0,0.0998,...,422361.0,5996.0,28419.0,47104.0,104492.0,0.3940,0.6687,0.6887,0.5651,0.4456
4,4,Very Large Metro (1),Light Suburban,0.2866,0.1283,0.0057,0.0719,102367.0,96066.0,0.1935,...,479049.0,15491.0,37406.0,64597.0,154192.0,0.4906,0.3683,0.4413,0.4877,0.4822
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791,0,Very Large Metro (1),Exurban,0.2745,0.1529,0.0111,0.0085,66363.0,59465.0,0.5114,...,231740.0,4972.0,13427.0,31174.0,80875.0,0.1615,0.0518,0.1095,0.1357,0.1291
792,1,Medium City (4),Light Suburban,0.2083,0.1078,0.0100,0.0304,95226.0,73773.0,0.0159,...,102970.0,5387.0,25743.0,47572.0,70570.0,0.8203,0.8247,0.8233,0.8060,0.8004
793,2,Very Large Metro (1),Exurban,0.2730,0.1367,0.0111,0.0172,89271.0,86552.0,0.1078,...,103313.0,3105.0,12138.0,20657.0,35348.0,0.6788,0.6116,0.4522,0.4567,0.5694
794,2,Medium City (4),Exurban,0.2624,0.1357,0.0111,0.0338,67117.0,62256.0,0.3939,...,119859.0,3274.0,14426.0,31451.0,63792.0,0.4147,0.3955,0.3864,0.3822,0.3765


In [11]:
le.classes_

array(['HOME', 'OTHER', 'SHOPPING', 'TRAVEL', 'WORK'], dtype=object)

In [12]:
train_df, test_df = train_test_split(merged, test_size=0.1, random_state=42)
X_train = train_df.drop(columns=["category"])
y_train = train_df["category"]
X_test = test_df.drop(columns=["category"])
y_test = test_df["category"]

In [13]:
X_train

Unnamed: 0,market_size,store_density,age0018_p_ta,age65pl_p_ta,age85pl_p_ta,asian_p_ta,avg_faminc_ta,avghhinc_ta,black_p_ta,boomer_p_ta,...,wealth_hhtotal_10mi,wealth_hhtotal_1mi,wealth_hhtotal_2mi,wealth_hhtotal_3mi,wealth_hhtotal_5mi,white_p_10mi,white_p_1mi,white_p_2mi,white_p_3mi,white_p_5mi
347,Very Large Metro (1),Exurban,0.2969,0.1014,0.0071,0.0261,93118.0,89967.0,0.6010,0.1739,...,339976.0,2897.0,7695.0,20339.0,69606.0,0.3556,0.3138,0.3472,0.2909,0.3012
111,Very Large Metro (1),Exurban,0.2584,0.1199,0.0059,0.0633,72243.0,67727.0,0.5925,0.2033,...,165612.0,4039.0,11630.0,21781.0,48837.0,0.1917,0.1104,0.1591,0.1876,0.2289
54,Small Town (6),Exurban,0.2161,0.1461,0.0131,0.0151,73789.0,81607.0,0.4227,0.1953,...,17023.0,2130.0,5055.0,7792.0,11036.0,0.5330,0.6126,0.4865,0.4629,0.4879
747,Large City (3),Exurban,0.2736,0.1147,0.0083,0.0446,135460.0,134123.0,0.0187,0.1858,...,111942.0,6034.0,15331.0,24097.0,40865.0,0.8548,0.8571,0.8752,0.8696,0.8640
194,Very Large Metro (1),Light Suburban,0.2066,0.2145,0.0299,0.0209,104670.0,100387.0,0.0129,0.2525,...,383657.0,7770.0,21480.0,52006.0,157618.0,0.1545,0.1730,0.1775,0.1671,0.1539
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,Very Large Metro (1),Exurban,0.2466,0.1358,0.0071,0.1914,137597.0,150433.0,0.1700,0.2253,...,198818.0,2850.0,12780.0,19454.0,51903.0,0.2879,0.4312,0.4339,0.4043,0.3858
107,Very Large Metro (1),Suburban,0.1881,0.1878,0.0199,0.0884,146098.0,161364.0,0.0730,0.2473,...,618729.0,6372.0,23557.0,52008.0,144620.0,0.4821,0.7458,0.6984,0.4822,0.5032
275,Small City (5),Exurban,0.2144,0.1539,0.0150,0.0187,75062.0,62347.0,0.0340,0.1862,...,32520.0,1077.0,4642.0,9652.0,20486.0,0.8959,0.8952,0.8633,0.8600,0.8638
443,Very Large Metro (1),Exurban,0.2626,0.1129,0.0051,0.1057,103601.0,92428.0,0.4126,0.1956,...,230960.0,3283.0,12629.0,31658.0,66989.0,0.4286,0.3996,0.3771,0.3867,0.3569


In [14]:
pd.DataFrame(y_train)

Unnamed: 0,category
347,2
111,4
54,0
747,0
194,0
...,...
72,0
107,4
275,4
443,2


In [15]:
X_test

Unnamed: 0,market_size,store_density,age0018_p_ta,age65pl_p_ta,age85pl_p_ta,asian_p_ta,avg_faminc_ta,avghhinc_ta,black_p_ta,boomer_p_ta,...,wealth_hhtotal_10mi,wealth_hhtotal_1mi,wealth_hhtotal_2mi,wealth_hhtotal_3mi,wealth_hhtotal_5mi,white_p_10mi,white_p_1mi,white_p_2mi,white_p_3mi,white_p_5mi
763,Very Large Metro (1),Light Suburban,0.2089,0.1888,0.0190,0.0351,89178.0,79052.0,0.2004,0.2642,...,425785.0,4181.0,16756.0,38863.0,99277.0,0.5366,0.8026,0.7641,0.7298,0.5924
39,Very Large Metro (1),Exurban,0.2318,0.1285,0.0069,0.0300,101364.0,102495.0,0.3710,0.2285,...,153826.0,1858.0,6439.0,15598.0,44653.0,0.4448,0.4813,0.4673,0.4774,0.4642
214,Very Large Metro (1),Exurban,0.1728,0.2239,0.0169,0.0574,156673.0,164913.0,0.0116,0.3197,...,226771.0,3057.0,10208.0,17393.0,40455.0,0.7572,0.8839,0.8756,0.8771,0.8767
202,Large City (3),Light Suburban,0.1699,0.3051,0.0486,0.0133,107283.0,105748.0,0.0360,0.2926,...,145103.0,3694.0,13301.0,34356.0,71441.0,0.7879,0.8643,0.8544,0.8352,0.7615
239,Medium City (4),Rural,0.2394,0.1695,0.0134,0.0072,75991.0,69866.0,0.0684,0.2257,...,46769.0,2601.0,7086.0,10005.0,12789.0,0.7522,0.8381,0.8516,0.8566,0.8610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
716,Very Large Metro (1),Light Suburban,0.2643,0.1401,0.0118,0.0486,121663.0,144570.0,0.0486,0.2040,...,632997.0,6324.0,20117.0,54582.0,206752.0,0.3119,0.5108,0.4629,0.4882,0.4662
333,Medium City (4),Rural,0.2272,0.1991,0.0223,0.0867,108180.0,97932.0,0.1168,0.2347,...,71079.0,2013.0,5666.0,9563.0,29802.0,0.6993,0.6805,0.6978,0.7122,0.7136
238,Large City (3),Rural,0.2066,0.1813,0.0128,0.0171,98571.0,97421.0,0.0775,0.2471,...,69141.0,746.0,4253.0,7670.0,19110.0,0.7505,0.7251,0.7878,0.7899,0.7923
615,Very Large Metro (1),Light Suburban,0.2367,0.1503,0.0147,0.0091,67353.0,64782.0,0.2797,0.2080,...,507286.0,8939.0,29623.0,52751.0,151766.0,0.1624,0.0578,0.0581,0.0684,0.0786


In [17]:
pd.DataFrame(y_test)

Unnamed: 0,category
763,2
39,0
214,0
202,0
239,4
...,...
716,2
333,2
238,0
615,0


In [18]:
ordinal_features = ["market_size", "store_density"]
numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
# drop_features = ["store", "longitude", "latitude", "state_name", "cbsa_name", "dma_name"]

In [19]:
market_levels = [
    "Small Town (6)",
    "Small City (5)",
    "Medium City (4)",
    "Large City (3)",
    "Large Metro (2)",
    "Very Large Metro (1)"
]
density_levels = [
    "Rural",
    "Exurban",
    "Suburban",
    "Light Suburban",
    "Light Urban",
    "Urban",
    "Super Urban"
]

In [20]:
ordinal_transformer = OrdinalEncoder(categories=[market_levels, density_levels], dtype=int)

preprocessor = make_column_transformer(
    # ("drop", drop_features),
    (StandardScaler(), numeric_features),
    (ordinal_transformer, ordinal_features),
)

In [21]:
class_weight = {
    "HOME": 0.24,
    "OTHER": 0.16,
    "SHOPPING": 0.22,
    "TRAVEL": 0.15,
    "WORK": 0.23
}
encoded_class_weights = {i: class_weight[label] for i, label in enumerate(le.classes_)}
class_weight = encoded_class_weights

## Build Logistic Regression Models

In [514]:
pipe_lr = make_pipeline(
    preprocessor,
    LogisticRegression(C=0.1, max_iter=1000, multi_class='ovr', n_jobs=-1,
    penalty='l2', random_state=42, solver='liblinear', class_weight=class_weight)
)
pipe_lr.fit(X_train, y_train);

In [515]:
pipe_lr.score(X_train, y_train)

0.8132956152758133

In [516]:
pipe_lr.score(X_test, y_test)

0.5316455696202531

In [518]:
pipe_lr_multi = make_pipeline(
    preprocessor,
    LogisticRegression(C=0.1, max_iter=1000, multi_class='multinomial', n_jobs=-1,
    penalty='l2', random_state=42, solver='saga', class_weight=class_weight)
    # LogisticRegression(C=0.1, penalty="l2", random_state=42, solver="saga", max_iter=10000, multi_class="multinomial", n_jobs=-1, 
    #                    class_weight=class_weight)
)
pipe_lr_multi.fit(X_train, y_train);

In [519]:
pipe_lr_multi.score(X_train, y_train)

0.7666195190947667

In [520]:
pipe_lr_multi.score(X_test, y_test)

0.5316455696202531

In [28]:
prediction_result = pd.DataFrame({
    "True label": y_test,
    "Logistic Regression (OVR)": pipe_lr.predict(X_test),
    "Logistic Regression (multinomial)": pipe_lr_multi.predict(X_test)
})

## Build LASSO/RidgeModel

In [465]:
pipe_lasso = make_pipeline(
    preprocessor,
    Lasso(random_state=42)
)
param_param = {
    "lasso__alpha": [0.00001, 0.00002, 0.00003, 0.00004],
    # "lasso__fit_intercept": [False, True],
    "lasso__max_iter": np.arange(9000, 11000, 500),
    "lasso__tol": 10.0**np.arange(-6, -1, 1),
    # "lasso__selection": ['cyclic', 'random']
}
lasso_search = RandomizedSearchCV(
    pipe_lasso, param_param, n_iter=50, cv=5, random_state=42
)

In [466]:
lasso_search.fit(X_train, y_train);

In [467]:
lasso_search.best_estimator_.named_steps['lasso']

In [482]:
lasso_rf_ovr = make_pipeline(
    preprocessor,
    SelectFromModel(
        # Lasso(alpha=0.00002, fit_intercept=False, precompute=False, 
        # tol=0.00001, random_state=42, selection='cyclic', max_iter=10000)
        Lasso(alpha=0.00002, fit_intercept=False, precompute=False, 
        tol=0.0005, random_state=42, selection='cyclic', max_iter=10000)
        # lasso_search.best_estimator_.named_steps['lasso']
    ),
    OneVsRestClassifier(
        # RandomForestClassifier(n_estimators=400, max_depth=20, max_leaf_nodes=30, class_weight=None, 
        #                        min_samples_leaf=10, min_samples_split=10, n_jobs=-1, random_state=42)
        rf_grid_search_best_est
    )
)
lasso_rf_ovr.fit(X_train, y_train)

In [484]:
print("Training set score: {:.2f}".format(lasso_rf_ovr.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lasso_rf_ovr.score(X_test, y_test)))

Training set score: 0.69
Test set score: 0.66


In [487]:
ridge_rf_ovr = make_pipeline(
    preprocessor,
    SelectFromModel(
        Ridge(alpha=0.0009, fit_intercept=False, solver='auto', #positive=True,
        tol=0.001, random_state=42, max_iter=20000)
    ),
    OneVsRestClassifier(
        # RandomForestClassifier(n_estimators=100, max_depth=30, max_leaf_nodes=30, class_weight=None, 
        #                        min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
        rf_grid_search_best_est
    )
)
ridge_rf_ovr.fit(X_train, y_train);

In [488]:
print("Training set score: {:.2f}".format(ridge_rf_ovr.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge_rf_ovr.score(X_test, y_test)))

Training set score: 0.67
Test set score: 0.65


## Build Random Forest Models

### Basic RF model

In [29]:
pipe_rf = make_pipeline(
    preprocessor,
    RandomForestClassifier(n_jobs=-1, random_state=42)
)
rf_param_grid = {
    "randomforestclassifier__n_estimators": [25, 50, 100, 150],
    "randomforestclassifier__max_depth": [5, 10, 20, 30],
    "randomforestclassifier__max_leaf_nodes": [30, 50, 70],
    "randomforestclassifier__min_samples_leaf": [10, 20, 30, 40, 50],
    "randomforestclassifier__min_samples_split": [10, 20, 30],
    "randomforestclassifier__class_weight": [None, "balanced", class_weight]
}
rf_grid_search = RandomizedSearchCV(
    pipe_rf, rf_param_grid, n_iter=30, cv=5, n_jobs=-1, return_train_score=True
)
rf_grid_search.fit(X_train, y_train);

In [32]:
rf_grid_search_best_est = rf_grid_search.best_estimator_.named_steps['randomforestclassifier']
rf_grid_search_best_est

In [None]:
# pd.DataFrame(rf_grid_search.cv_results_)[
#     [
#         "param_randomforestclassifier__n_estimators",
#         "param_randomforestclassifier__max_depth",
#         "param_randomforestclassifier__max_leaf_nodes",
#         "param_randomforestclassifier__min_samples_leaf",
#         "param_randomforestclassifier__min_samples_split",
#         "param_randomforestclassifier__class_weight",
#         "mean_fit_time",
#         "rank_test_score",
#         "mean_test_score",
#     ]
# ].set_index("rank_test_score").sort_index().head(10)

In [35]:
rf_tuned_base = make_pipeline(
    preprocessor,
    rf_grid_search_best_est
)
rf_tuned_base.fit(X_train, y_train);

In [36]:
rf_tuned_base.score(X_train, y_train)

0.6265912305516266

In [37]:
rf_tuned_base.score(X_test, y_test)

0.6075949367088608

In [43]:
prediction_result["Random Forest(Base)"] = rf_tuned_base.predict(X_test)

In [38]:
rf_tuned = make_pipeline(
    preprocessor,
    RandomForestClassifier(n_estimators=100, max_depth=30, max_leaf_nodes=30, class_weight=class_weight, 
                           min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
)
rf_tuned.fit(X_train, y_train);

In [39]:
rf_tuned.score(X_train, y_train)

0.7157001414427157

In [40]:
rf_tuned.score(X_test, y_test)

0.5443037974683544

In [41]:
prediction_result["Random Forest"] = rf_tuned.predict(X_test)

In [44]:
prediction_result

Unnamed: 0,True label,Logistic Regression (OVR),Logistic Regression (multinomial),Random Forest,Random Forest(Base)
763,2,0,0,0,0
39,0,0,0,0,0
214,0,0,0,0,0
202,0,4,4,0,0
239,4,4,4,0,2
...,...,...,...,...,...
716,2,2,4,4,4
333,2,0,0,2,2
238,0,2,2,0,0
615,0,0,0,0,0


In [45]:
# Takes long time to run

rf_ovr_test = make_pipeline(
    preprocessor,
    SelectFromModel(
        LogisticRegression(C=0.15, penalty="l2", random_state=42, solver="saga", max_iter=10000, multi_class="ovr", 
                           n_jobs=-1, class_weight=class_weight)
    ),
    RandomForestClassifier(n_estimators=100, max_depth=30, max_leaf_nodes=30, class_weight="balanced", 
                           min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
)
rf_param_grid = {
    "randomforestclassifier__n_estimators": [25, 50, 100, 150],
    # "randomforestclassifier__max_features": ["sqrt", "log2", None],
    "randomforestclassifier__max_features": [5, 10, 15, 20, 30],
    "randomforestclassifier__max_depth": [5, 10, 20, 30],
    "randomforestclassifier__max_leaf_nodes": [30, 50, 70],
    "randomforestclassifier__min_samples_leaf": [10, 20, 30, 40, 50],
    "randomforestclassifier__min_samples_split": [10, 20, 30, 40],
    "randomforestclassifier__class_weight": [None, "balanced", class_weight]
}
rf_ovr_search = RandomizedSearchCV(
    rf_ovr_test, rf_param_grid, n_iter=30, cv=5, n_jobs=-1, random_state=42
)

In [46]:
rf_ovr_search.fit(X_train, y_train);

In [None]:
# pd.DataFrame(rf_ovr_search.cv_results_)[
#     [
#         "param_randomforestclassifier__n_estimators",
#         "param_randomforestclassifier__max_features",
#         "param_randomforestclassifier__max_depth",
#         "param_randomforestclassifier__max_leaf_nodes",
#         "param_randomforestclassifier__min_samples_leaf",
#         "param_randomforestclassifier__min_samples_split",
#         "param_randomforestclassifier__class_weight",
#         "mean_fit_time",
#         "rank_test_score",
#         "mean_test_score",
#     ]
# ].set_index("rank_test_score").sort_values(by="mean_test_score", ascending=False)

In [47]:
rf_ovr_search.best_params_

{'randomforestclassifier__n_estimators': 100,
 'randomforestclassifier__min_samples_split': 10,
 'randomforestclassifier__min_samples_leaf': 10,
 'randomforestclassifier__max_leaf_nodes': 70,
 'randomforestclassifier__max_features': 30,
 'randomforestclassifier__max_depth': 20,
 'randomforestclassifier__class_weight': 'balanced'}

In [48]:
rf_ovr_search.score(X_train, y_train)

0.8557284299858557

In [49]:
rf_ovr_search.score(X_test, y_test)

0.620253164556962

### RandomForest with regularization using LogisticRegression

In [50]:
pipe_lr_rf = make_pipeline(
    preprocessor,
    SelectFromModel(
        LogisticRegression(C=0.15, penalty="l2", random_state=42, solver="saga", max_iter=10000, multi_class="ovr", 
                           n_jobs=-1, class_weight=class_weight)
    ),
    RandomForestClassifier(n_estimators=100, max_depth=20, max_leaf_nodes=70, class_weight=class_weight, 
                           min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
)
pipe_lr_rf.fit(X_train, y_train);

In [51]:
pipe_lr_rf.score(X_train, y_train)

0.7171145685997171

In [52]:
pipe_lr_rf.score(X_test, y_test)

0.569620253164557

In [53]:
prediction_result["Random Forest (L2 reg with LR)"] = pipe_lr_rf.predict(X_test)

In [None]:
# pipe_lr_rf_test = make_pipeline(
#     preprocessor,
#     SelectFromModel(
#         LogisticRegression(C=0.15, penalty="l2", random_state=42, solver="saga", max_iter=10000, multi_class="ovr", 
#                            n_jobs=-1, class_weight=class_weight)
#     ),
#     RandomForestClassifier(n_estimators=100, max_depth=20, max_leaf_nodes=70, class_weight=class_weight, 
#                            min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
# )

In [54]:
lr_rf_param = {
    "selectfrommodel__estimator__C": [0.1, 0.15, 0.2, 0.3, 0.4],
    "selectfrommodel__estimator__penalty": ["l1", "l2"],
    "selectfrommodel__estimator__multi_class": ["ovr", "multinomial"],
    "selectfrommodel__estimator__class_weight": [None, "balanced", class_weight]
}
lr_rf_search = RandomizedSearchCV(
    pipe_lr_rf, lr_rf_param, n_iter=30, cv=5, n_jobs=-1, random_state=42
)
lr_rf_search.fit(X_train, y_train);

In [55]:
pd.DataFrame(lr_rf_search.cv_results_)[
    [
        "param_selectfrommodel__estimator__C",
        "param_selectfrommodel__estimator__penalty",
        "param_selectfrommodel__estimator__multi_class",
        "param_selectfrommodel__estimator__class_weight",
        "mean_fit_time",
        "rank_test_score",
        "mean_test_score",
    ]
].set_index("rank_test_score").sort_index().head(10)

Unnamed: 0_level_0,param_selectfrommodel__estimator__C,param_selectfrommodel__estimator__penalty,param_selectfrommodel__estimator__multi_class,param_selectfrommodel__estimator__class_weight,mean_fit_time,mean_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.2,l1,multinomial,"{0: 0.24, 1: 0.16, 2: 0.22, 3: 0.15, 4: 0.23}",42.314941,0.469573
2,0.1,l2,ovr,"{0: 0.24, 1: 0.16, 2: 0.22, 3: 0.15, 4: 0.23}",14.685809,0.469553
3,0.4,l1,ovr,"{0: 0.24, 1: 0.16, 2: 0.22, 3: 0.15, 4: 0.23}",34.24278,0.466717
4,0.15,l2,multinomial,balanced,18.643862,0.461123
5,0.2,l2,multinomial,balanced,20.258965,0.459704
6,0.15,l2,ovr,,22.785199,0.459674
7,0.15,l1,ovr,,52.136794,0.459654
8,0.4,l1,multinomial,"{0: 0.24, 1: 0.16, 2: 0.22, 3: 0.15, 4: 0.23}",45.146192,0.458246
9,0.1,l1,ovr,,67.790766,0.458216
10,0.3,l2,ovr,"{0: 0.24, 1: 0.16, 2: 0.22, 3: 0.15, 4: 0.23}",29.568501,0.456887


In [56]:
lr_rf_search.score(X_train, y_train)

0.6676096181046676

In [57]:
lr_rf_search.score(X_test, y_test)

0.5949367088607594

In [58]:
lr_rf_l1 = make_pipeline(
    preprocessor,
    SelectFromModel(
        LogisticRegression(C=0.1, penalty="l1", random_state=42, solver="saga", max_iter=10000, multi_class="ovr", 
                           n_jobs=-1, class_weight=None)
    ),
    RandomForestClassifier(n_estimators=100, max_depth=20, max_leaf_nodes=70, class_weight=class_weight, 
                           min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
)
lr_rf_l1.fit(X_train, y_train);

In [59]:
print(lr_rf_l1.score(X_train, y_train))
print(lr_rf_l1.score(X_test, y_test))

0.693069306930693
0.5949367088607594


In [60]:
prediction_result["Random Forest (L1 reg with LR)"] = lr_rf_l1.predict(X_test)

In [61]:
pipe_lr_rf_rank2 = make_pipeline(
    preprocessor,
    SelectFromModel(
        LogisticRegression(C=0.4, penalty="l1", random_state=42, solver="saga", max_iter=10000, multi_class="multinomial", 
                           n_jobs=-1, class_weight=class_weight)
    ),
    RandomForestClassifier(n_estimators=100, max_depth=20, max_leaf_nodes=70, class_weight=class_weight, 
                           min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
)
pipe_lr_rf_rank2.fit(X_train, y_train);

In [62]:
print(pipe_lr_rf_rank2.score(X_train, y_train))
print(pipe_lr_rf_rank2.score(X_test, y_test))

0.6973125884016973
0.5569620253164557


**Second best set of parameters do not lead to better result. Just use the best set of parameters.**

### Random Forest with OVR and regularization using LogisticRegression

In [63]:
lr_rf_ovr = make_pipeline(
    preprocessor,
    SelectFromModel(
        LogisticRegression(C=0.15, penalty="l2", random_state=42, solver="saga", max_iter=10000, multi_class="ovr", 
                           n_jobs=-1, class_weight=class_weight)
    ),
    OneVsRestClassifier(
        RandomForestClassifier(n_estimators=100, max_depth=30, max_leaf_nodes=30, class_weight=None, 
                               min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
    )
)

In [64]:
lr_rf_ovr_search = RandomizedSearchCV(
    lr_rf_ovr, lr_rf_param, n_iter=30, cv=5, n_jobs=-1, random_state=42
)
lr_rf_ovr_search.fit(X_train, y_train);

In [65]:
pd.DataFrame(lr_rf_ovr_search.cv_results_)[
    [
        "param_selectfrommodel__estimator__C",
        "param_selectfrommodel__estimator__penalty",
        "param_selectfrommodel__estimator__multi_class",
        "param_selectfrommodel__estimator__class_weight",
        "mean_fit_time",
        "rank_test_score",
        "mean_test_score",
    ]
].set_index("rank_test_score").sort_values(by="mean_test_score", ascending=False).head(10)

Unnamed: 0_level_0,param_selectfrommodel__estimator__C,param_selectfrommodel__estimator__penalty,param_selectfrommodel__estimator__multi_class,param_selectfrommodel__estimator__class_weight,mean_fit_time,mean_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.3,l2,ovr,"{0: 0.24, 1: 0.16, 2: 0.22, 3: 0.15, 4: 0.23}",29.951961,0.497872
2,0.4,l1,multinomial,"{0: 0.24, 1: 0.16, 2: 0.22, 3: 0.15, 4: 0.23}",44.621645,0.497852
3,0.4,l2,ovr,"{0: 0.24, 1: 0.16, 2: 0.22, 3: 0.15, 4: 0.23}",20.464391,0.495045
4,0.3,l1,multinomial,"{0: 0.24, 1: 0.16, 2: 0.22, 3: 0.15, 4: 0.23}",33.234358,0.492179
5,0.1,l1,ovr,balanced,27.67488,0.485176
6,0.4,l1,ovr,"{0: 0.24, 1: 0.16, 2: 0.22, 3: 0.15, 4: 0.23}",34.347968,0.482339
7,0.2,l2,ovr,,17.754153,0.482309
8,0.2,l2,ovr,"{0: 0.24, 1: 0.16, 2: 0.22, 3: 0.15, 4: 0.23}",25.093369,0.482299
9,0.1,l2,ovr,"{0: 0.24, 1: 0.16, 2: 0.22, 3: 0.15, 4: 0.23}",15.177449,0.482279
10,0.1,l2,multinomial,,22.485967,0.479533


In [66]:
lr_rf_ovr_search.score(X_train, y_train)

0.8132956152758133

In [67]:
lr_rf_ovr_search.score(X_test, y_test)

0.6075949367088608

In [68]:
lr_rf_ovr = make_pipeline(
    preprocessor,
    SelectFromModel(
        LogisticRegression(C=0.15, penalty="l1", random_state=42, solver="saga", max_iter=10000, multi_class="multinomial", 
                           n_jobs=-1, class_weight=None)
    ),
    OneVsRestClassifier(
        RandomForestClassifier(n_estimators=100, max_depth=30, max_leaf_nodes=30, class_weight=None, 
                               min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
    )
)
lr_rf_ovr.fit(X_train, y_train);

In [69]:
print(lr_rf_ovr.score(X_train, y_train))
print(lr_rf_ovr.score(X_test, y_test))

0.8203677510608204
0.5949367088607594


In [70]:
prediction_result["Random Forest (OVR with L1 reg using LR)"] = lr_rf_ovr.predict(X_test)

In [71]:
rf_ovr = make_pipeline(
    preprocessor,
    OneVsRestClassifier(
        RandomForestClassifier(n_estimators=100, max_depth=30, max_leaf_nodes=30, class_weight="balanced", 
                           min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
    )
)
rf_ovr.fit(X_train, y_train);

In [72]:
rf_ovr.score(X_train, y_train)

0.8925035360678925

In [73]:
rf_ovr.score(X_test, y_test)

0.6075949367088608

In [74]:
prediction_result["rf_ovr"] = rf_ovr.predict(X_test)

## LGBM Model - Poor score, won't use

In [75]:
pipe_lgbm_ovr = make_pipeline(
    preprocessor,
    OneVsRestClassifier(
        LGBMClassifier(random_state=42, n_jobs=-1)
    )
)
param_lgbm = {
    "onevsrestclassifier__estimator__n_estimators": [50, 100, 150, 200],
    "onevsrestclassifier__estimator__boosting_type": ["gbdt", "dart"],
    "onevsrestclassifier__estimator__learning_rate": [0.1, 0.3, 0.5, 0.7],
    "onevsrestclassifier__estimator__max_depth": [5, 10, 15, 20],
    "onevsrestclassifier__estimator__reg_alpha": [0.1, 0.3, 0.5, 0.7],
    "onevsrestclassifier__estimator__reg_lambda": [0.1, 0.3, 0.5, 0.7],
    # "onevsrestclassifier__estimator__min_child_samples": [20, 30, 40, 50],
    "onevsrestclassifier__estimator__num_leaves": [5, 10, 15, 20],
    "onevsrestclassifier__estimator__min_data_in_leaf": [30, 40, 50, 60],
    "onevsrestclassifier__estimator__class_weight": [None, "balanced"]
}
lgbm_search = RandomizedSearchCV(
    pipe_lgbm_ovr, param_lgbm, n_iter=30, cv=5, n_jobs=-1, random_state=42
)

In [76]:
lgbm_search.fit(X_train, y_train);







In [None]:
# pd.DataFrame(lgbm_search.cv_results_)[
#     [
#         "param_onevsrestclassifier__estimator__n_estimators",
#         "param_onevsrestclassifier__estimator__boosting_type",
#         "param_onevsrestclassifier__estimator__learning_rate",
#         "param_onevsrestclassifier__estimator__max_depth",
#         "param_onevsrestclassifier__estimator__reg_alpha",
#         "param_onevsrestclassifier__estimator__reg_lambda",
#         "param_onevsrestclassifier__estimator__class_weight",
#         # "param_onevsrestclassifier__estimator__min_child_samples",
#         "param_onevsrestclassifier__estimator__num_leaves",
#         "param_onevsrestclassifier__estimator__min_data_in_leaf",
#         "mean_fit_time",
#         "rank_test_score",
#         "mean_test_score",
#     ]
# ].set_index("rank_test_score").sort_index().head(20)

In [77]:
lgbm_search.score(X_train, y_train)

1.0

In [78]:
lgbm_search.score(X_test, y_test)

0.5569620253164557

In [79]:
lgbm_test = make_pipeline(
    preprocessor,
    OneVsRestClassifier(
        LGBMClassifier(random_state=42, n_jobs=-1, n_estimators=150, boosting_type="gbdt", learning_rate=0.1, max_depth=10,
                      reg_alpha=0.7, reg_lambda=0.7, num_leaves=20, min_data_in_leaf=30, class_weight=None)
    )
)
lgbm_test.fit(X_train, y_train);



In [80]:
lgbm_test.score(X_train, y_train)

1.0

In [81]:
lgbm_test.score(X_test, y_test)

0.5822784810126582

In [82]:
lgbm_search.best_params_
# score=0.62025

{'onevsrestclassifier__estimator__reg_lambda': 0.5,
 'onevsrestclassifier__estimator__reg_alpha': 0.1,
 'onevsrestclassifier__estimator__num_leaves': 10,
 'onevsrestclassifier__estimator__n_estimators': 200,
 'onevsrestclassifier__estimator__min_data_in_leaf': 40,
 'onevsrestclassifier__estimator__max_depth': 5,
 'onevsrestclassifier__estimator__learning_rate': 0.1,
 'onevsrestclassifier__estimator__class_weight': 'balanced',
 'onevsrestclassifier__estimator__boosting_type': 'gbdt'}

## XGB Model - poor score, won't use

In [83]:
pipe_xgb = make_pipeline(
    preprocessor,
    OneVsRestClassifier(
        XGBClassifier(random_state=42, verbosity=0)
    )
)
param_xgb = {
    "onevsrestclassifier__estimator__booster": ["gbtree", "dart"],
    "onevsrestclassifier__estimator__learning_rate": [0.1, 0.3, 0.5, 0.7],
    "onevsrestclassifier__estimator__max_depth": [3, 5, 7, 9],
    "onevsrestclassifier__estimator__reg_alpha": [0.1, 0.3, 0.5, 0.7],
    "onevsrestclassifier__estimator__reg_lambda": [0.1, 0.3, 0.5, 0.7],
    "onevsrestclassifier__estimator__subsample": [0.3, 0.5, 0.7, 1],
    "onevsrestclassifier__estimator__min_child_weight": [1, 4, 7, 10]
}
xgb_search = RandomizedSearchCV(
    pipe_xgb, param_xgb, n_iter=30, cv=5, n_jobs=-1, random_state=42
)

In [84]:
xgb_search.fit(X_train, y_train);



In [None]:
# pd.DataFrame(xgb_search.cv_results_)[
#     [
#         "param_onevsrestclassifier__estimator__booster",
#         "param_onevsrestclassifier__estimator__learning_rate",
#         "param_onevsrestclassifier__estimator__max_depth",
#         "param_onevsrestclassifier__estimator__reg_alpha",
#         "param_onevsrestclassifier__estimator__reg_lambda",
#         "param_onevsrestclassifier__estimator__subsample",
#         "param_onevsrestclassifier__estimator__min_child_weight",
#         "mean_fit_time",
#         "rank_test_score",
#         "mean_test_score",
#     ]
# ].set_index("rank_test_score").sort_index().head(20)

In [85]:
xgb_search.score(X_train, y_train)

1.0

In [86]:
xgb_search.score(X_test, y_test)

0.569620253164557

In [87]:
xgb_test = make_pipeline(
    preprocessor,
    OneVsRestClassifier(
        XGBClassifier(random_state=42, verbosity=0, booster="gbtree", learning_rate=0.1, max_depth=5, reg_alpha=0.5, reg_lambda=0.1,
                     subsample=0.7, min_child_weight=1)
    )
)
xgb_test.fit(X_train, y_train);

In [88]:
xgb_test.score(X_train, y_train)

1.0

In [89]:
xgb_test.score(X_test, y_test)
XGBClassifier(random_state=42, verbosity=0, booster="gbtree", learning_rate=0.1, max_depth=5, reg_alpha=0.5, reg_lambda=0.1,
                    subsample=0.7)

In [90]:
lr_xgb_test = make_pipeline(
    preprocessor,
    SelectFromModel(
        LogisticRegression(C=0.15, penalty="l2", random_state=42, solver="saga", max_iter=10000, multi_class="ovr", 
                           n_jobs=-1, class_weight=class_weight)
    ),
    OneVsRestClassifier(
        XGBClassifier(random_state=42, verbosity=0, booster="gbtree", learning_rate=0.1, max_depth=5, reg_alpha=0.5, reg_lambda=0.1,
                     subsample=0.7)
    )
)
lr_xgb_test.fit(X_train, y_train);

In [91]:
lr_xgb_test.score(X_train, y_train)

1.0

In [92]:
lr_xgb_test.score(X_test, y_test)

0.6075949367088608

In [93]:
prediction_result["xgb"] = xgb_test.predict(X_test)

In [None]:
prediction_result.head(20)

In [None]:
prediction_result.iloc[20:50]

In [None]:
prediction_result.iloc[50:]

## Try two different combinations of ensembling models

In [525]:
classifiers_1 = {
    "rf": rf_tuned,
    "lr_rf_ovr": lr_rf_ovr,
    "lr_rf_l1": lr_rf_l1
}
classifiers_2 = {
    "rf": rf_tuned,
    "lr_rf": pipe_lr_rf,
    "lr_rf_l1": lr_rf_l1
}

classifiers_3 = {
    "lgbm": pipe_lgbm_ovr,
    "xgboost": pipe_xgb,
    "rf": rf_tuned,
    "lr_rf_l1": lr_rf_l1,
    "lr_rf_ovr": lr_rf_ovr,
    "lasso_rf": lasso_rf_ovr,
    "ridge_rf": ridge_rf_ovr
}

### 1: RF + RF (OVR, L1) + RF(L1)

In [None]:
# classifiers = {
#     "lr": pipe_lr,
#     "lr_multi": pipe_lr_multi,
#     "rf": rf_tuned,
#     "lr_rf": pipe_lr_rf,
#     "rf_ovr": rf_ovr,
#     "lr_rf_ovr": lr_rf_ovr,
#     "lr_rf_l1": lr_rf_l1,
# }

In [95]:
averaging_model = VotingClassifier(
    list(classifiers_1.items()), voting="soft"
)
averaging_model.fit(X_train, y_train);

In [96]:
averaging_model.score(X_train, y_train)

0.7425742574257426

In [97]:
averaging_model.score(X_test, y_test)

0.5949367088607594

In [98]:
averaging_model_hard = VotingClassifier(
    list(classifiers_1.items()), voting="hard"
)
averaging_model_hard.fit(X_train, y_train);

In [99]:
averaging_model_hard.score(X_train, y_train)

0.7256011315417256

In [100]:
averaging_model_hard.score(X_test, y_test)

0.5822784810126582

In [101]:
stacking_model = StackingClassifier(list(classifiers_1.items()))
stacking_model.fit(X_train, y_train);

In [102]:
stacking_model.score(X_train, y_train)

0.7666195190947667

In [103]:
stacking_model.score(X_test, y_test)

0.5949367088607594

In [104]:
stack_rf = StackingClassifier(list(classifiers_1.items()), RandomForestClassifier(n_jobs=-1, random_state=42))
stack_rf.fit(X_train, y_train);

In [105]:
stack_rf.score(X_train, y_train)

0.7128712871287128

In [106]:
stack_rf.score(X_test, y_test)

0.6329113924050633

### 2: RF + RF (L2) + RF(L1)

In [107]:
averaging_model_2 = VotingClassifier(
    list(classifiers_2.items()), voting="soft"
)
averaging_model_2.fit(X_train, y_train);

In [108]:
averaging_model_2.score(X_train, y_train)

0.7114568599717115

In [109]:
averaging_model_2.score(X_test, y_test)

0.5949367088607594

In [110]:
averaging_model_hard_2 = VotingClassifier(
    list(classifiers_2.items()), voting="hard"
)
averaging_model_hard_2.fit(X_train, y_train);

In [111]:
averaging_model_hard_2.score(X_train, y_train)

0.7157001414427157

In [112]:
averaging_model_hard_2.score(X_test, y_test)

0.569620253164557

In [529]:
stacking_model_2 = StackingClassifier(list(classifiers_2.items()))
stacking_model_2.fit(X_train, y_train);

In [None]:
stacking_model_2.score(X_train, y_train)

0.7666195190947667

In [None]:
stacking_model_2.score(X_test, y_test)

0.5949367088607594

In [None]:
stack_rf_2 = StackingClassifier(list(classifiers_2.items()), RandomForestClassifier(n_jobs=-1, random_state=42))
stack_rf_2.fit(X_train, y_train);

In [None]:
stack_rf_2.score(X_train, y_train)

0.7128712871287128

In [None]:
stack_rf_2.score(X_test, y_test)

0.6329113924050633

### 3: RF + RF (L2) + RF(L1)

In [526]:
averaging_model_3 = VotingClassifier(
    list(classifiers_3.items()), voting="soft"
)
averaging_model_3.fit(X_train, y_train);

In [527]:
averaging_model_3.score(X_train, y_train)

0.9957567185289957

In [528]:
averaging_model_3.score(X_test, y_test)

0.5949367088607594

In [None]:
averaging_model_hard_3 = VotingClassifier(
    list(classifiers_3.items()), voting="hard"
)
averaging_model_hard_3.fit(X_train, y_train);

In [None]:
averaging_model_hard_3.score(X_train, y_train)

0.7157001414427157

In [None]:
averaging_model_hard_3.score(X_test, y_test)

0.569620253164557

In [None]:
stacking_model_3 = StackingClassifier(list(classifiers_3.items()))
stacking_model_3.fit(X_train, y_train);

In [None]:
stacking_model_3.score(X_train, y_train)

0.7666195190947667

In [None]:
stacking_model_3.score(X_test, y_test)

0.5949367088607594

In [None]:
stack_rf_3 = StackingClassifier(list(classifiers_3.items()), RandomForestClassifier(n_jobs=-1, random_state=42))
stack_rf_3.fit(X_train, y_train);

In [None]:
stack_rf_3.score(X_train, y_train)

0.7128712871287128

In [None]:
stack_rf_3.score(X_test, y_test)

0.6329113924050633