In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from lightgbm.sklearn import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.svm import SVC
# from catboost import CatBoostClassifier
from sklearn.feature_selection import SelectFromModel

from sklearn.feature_selection import RFE

import warnings
warnings.filterwarnings("ignore")

## Load and Pre-process Data

In [2]:
DIR = "../../data/"
SMOOTHIE = "Smoothie King/"

smoothie_demographic = pd.read_csv(DIR + SMOOTHIE + "processed_demographic.csv")
smoothie_stores = pd.read_csv(DIR + SMOOTHIE + "smoothie_king_stores.csv")
smoothie_poi_variables = pd.read_csv(DIR + SMOOTHIE + "processed_poi.csv")
# smoothie_sister = pd.read_csv(DIR + SMOOTHIE + "competition_sister_variables.csv")
smoothie_trade_area = pd.read_csv(DIR + SMOOTHIE + "processed_trade_area.csv").rename(columns={"store_num": "store"})

In [3]:
smoothie_merged = smoothie_stores.merge(
    smoothie_demographic, on="store", how="outer"
).merge(
    smoothie_poi_variables, on="store", how="outer"
).merge(
    smoothie_trade_area, on="store", how="outer"
)
smoothie_merged

Unnamed: 0,store,longitude,latitude,category,cbsa_name,dma_name,state_name,market_size,store_density,age0018_p_10mi,...,popgrfy_ta,popgrpy_ta,poverty_inpoverty_p_ta,spend_breakfastbrunch_ta,spend_dinner_ta,spend_foodbev_ta,spend_lunch_ta,wealth_hhavg_ta,wealth_hhtotal_ta,white_p_ta
0,SK 1504,-97.650392,30.519353,SHOPPING,"Austin-Round Rock, TX","Austin, TX",Texas,Large Metro (2),Light Suburban,0.2805,...,8.3789,3.9235,0.0611,7069439.0,40790484.0,230383651.0,23166216.0,240573.0,25223.0,0.4897
1,SK 0057,-88.171150,30.672501,SHOPPING,"Mobile, AL","Mobile et al, AL-FL",Alabama,Medium City (4),Light Suburban,0.2264,...,0.6017,0.3932,0.1830,4724526.0,25460067.0,160135521.0,14653701.0,217054.0,22216.0,0.5129
2,SK 1415,-90.535722,38.784250,HOME,"St. Louis, MO-IL","St. Louis, MO",Missouri,Very Large Metro (1),Light Suburban,0.2129,...,2.5003,0.7142,0.0639,4501211.0,24794631.0,151609187.0,14129014.0,245860.0,19907.0,0.8459
3,SK 1231,-80.134700,26.100737,TRAVEL,"Miami-Fort Lauderdale et al, FL","Miami-Ft. Lauderdale, FL",Florida,Very Large Metro (1),Suburban,0.2174,...,7.6482,12.7188,0.0831,4038906.0,23214366.0,146417939.0,12859709.0,255812.0,22124.0,0.7020
4,SK 1535,-96.856651,32.996408,WORK,"Dallas-Fort Worth-Arlington, TX","Dallas-Ft. Worth, TX",Texas,Very Large Metro (1),Light Suburban,0.2620,...,2.1871,-5.7794,0.0800,5919218.0,32751951.0,207616741.0,18696840.0,216763.0,29251.0,0.4593
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791,SK 1536,-96.872596,32.647809,HOME,"Dallas-Fort Worth-Arlington, TX","Dallas-Ft. Worth, TX",Texas,Very Large Metro (1),Exurban,0.2664,...,2.0540,-0.2994,0.1833,7614977.0,40037412.0,264008408.0,23332500.0,199437.0,29430.0,0.1490
792,SK 1886,-105.077634,40.564695,OTHER,"Fort Collins, CO","Denver, CO",Colorado,Medium City (4),Light Suburban,0.2224,...,4.7691,2.3442,0.2202,5118356.0,28458285.0,197532602.0,16637181.0,216659.0,25712.0,0.7874
793,SK 0162,-95.478001,30.316531,SHOPPING,"Houston-The Woodlands et al, TX","Houston, TX",Texas,Very Large Metro (1),Exurban,0.2412,...,9.9822,8.1691,0.1202,4677280.0,25445615.0,161088758.0,14627532.0,216855.0,19407.0,0.5041
794,SK 1449,-78.968258,35.064994,SHOPPING,"Fayetteville, NC","Raleigh et al, NC",North Carolina,Medium City (4),Exurban,0.2614,...,-0.3816,-1.0761,0.1809,10318478.0,54577513.0,349542567.0,31683602.0,208448.0,41681.0,0.3933


In [4]:
# smoothie_merged = smoothie_merged.drop(columns=["store", "longitude", "latitude", "state_name", "cbsa_name", "dma_name"])
smoothie_merged = smoothie_merged.dropna()

In [5]:
# le = LabelEncoder()
# smoothie_merged["category"] = le.fit_transform(smoothie_merged["category"])
# smoothie_merged

In [6]:
# le.classes_

In [7]:
train_df, test_df = train_test_split(smoothie_merged, test_size=0.1, random_state=42)
X_train = train_df.drop(columns=["category"])
y_train = train_df["category"]
X_test = test_df.drop(columns=["category"])
y_test = test_df["category"]

In [8]:
X_train

Unnamed: 0,store,longitude,latitude,cbsa_name,dma_name,state_name,market_size,store_density,age0018_p_10mi,age0018_p_1mi,...,popgrfy_ta,popgrpy_ta,poverty_inpoverty_p_ta,spend_breakfastbrunch_ta,spend_dinner_ta,spend_foodbev_ta,spend_lunch_ta,wealth_hhavg_ta,wealth_hhtotal_ta,white_p_ta
347,SK 1190,-84.526876,33.805263,"Atlanta-Sandy Springs et al, GA","Atlanta, GA",Georgia,Very Large Metro (1),Exurban,0.2577,0.3140,...,6.7371,6.9005,0.1391,4605139.0,25550739.0,163539069.0,14579649.0,224903.0,18071.0,0.1949
111,SK 1691,-84.275254,33.550809,"Atlanta-Sandy Springs et al, GA","Atlanta, GA",Georgia,Very Large Metro (1),Exurban,0.2497,0.3095,...,7.1401,4.8396,0.1086,5137151.0,27922973.0,181373630.0,16115685.0,215428.0,19927.0,0.1810
54,SK 1429,-83.227129,33.082301,"Milledgeville, GA","Macon, GA",Georgia,Small Town (6),Exurban,0.2166,0.1758,...,3.0957,3.0749,0.2600,1812957.0,9820364.0,63494913.0,5663733.0,208187.0,8019.0,0.5244
747,SK 1469,-93.851958,41.601759,"Des Moines-West Des Moines, IA","Des Moines-Ames, IA",Iowa,Large City (3),Exurban,0.2410,0.3099,...,21.3935,25.0273,0.0412,6043103.0,36108285.0,208268660.0,20041021.0,274621.0,23326.0,0.8814
194,SK 0855,-80.366428,25.701421,"Miami-Fort Lauderdale et al, FL","Miami-Ft. Lauderdale, FL",Florida,Very Large Metro (1),Light Suburban,0.2104,0.2140,...,1.4475,-1.0841,0.1262,6195800.0,34382343.0,221889135.0,19469562.0,244502.0,21025.0,0.1740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,SK 1814,-95.687153,29.564517,"Houston-The Woodlands et al, TX","Houston, TX",Texas,Very Large Metro (1),Exurban,0.2529,0.2331,...,8.1334,4.1470,0.0255,3646656.0,21596143.0,128772917.0,12003993.0,283207.0,11713.0,0.4456
107,SK 0977,-73.617434,40.748259,"New York-Newark et al, NY-NJ-PA","New York, NY",New York,Very Large Metro (1),Suburban,0.1947,0.1929,...,0.4340,1.5922,0.0582,8059413.0,47541302.0,295471375.0,26218584.0,267475.0,21469.0,0.6142
275,SK 0328,-85.510046,36.136788,"Cookeville, TN","Nashville, TN",Tennessee,Small City (5),Exurban,0.2099,0.1997,...,6.3171,6.0445,0.2057,3478916.0,18420003.0,126691568.0,10743594.0,210374.0,18548.0,0.8522
443,SK 1296,-80.769022,35.343219,"Charlotte-Concord et al, NC-SC","Charlotte, NC",North Carolina,Very Large Metro (1),Exurban,0.2504,0.2434,...,4.9382,2.5854,0.0547,3199585.0,18046193.0,113663688.0,10245497.0,246130.0,13113.0,0.3680


In [9]:
drop_features = ['store',
 'longitude',
 'latitude',
 'cbsa_name',
 'dma_name',
 'state_name',
 ]

passthrough_features = []

ordinal_features_oth = [
    "market_size",
    "store_density",
]
ordering_ordinal_oth = [
    ["Very Large Metro (1)", "Large Metro (2)", "Large City (3)", "Medium City (4)", "Small City (5)", "Small Town (6)"],
    ["Rural", "Exurban", "Suburban", "Light Suburban", "Light Urban", "Urban", "Super Urban"],
]
numeric_features = list(set(smoothie_merged.select_dtypes(include=np.number).columns.tolist()) - {"longitude", "latitude"})

In [10]:
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="median"), 
    StandardScaler()
)

ordinal_transformer_oth = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder(categories=ordering_ordinal_oth),
)

preprocessor = make_column_transformer(
    ("drop", drop_features),
    (numeric_transformer, numeric_features),
    (ordinal_transformer_oth, ordinal_features_oth),
    ("passthrough", passthrough_features),
)

In [11]:
preprocessor.fit(X_train)

In [12]:
column_names = (
    numeric_features
    + preprocessor.named_transformers_['pipeline-2']['ordinalencoder'].get_feature_names_out().tolist()
)
len(column_names)

920

In [14]:
transformed_X_train_df = pd.DataFrame(transformed_X_train, columns=column_names)
transformed_X_train_df

Unnamed: 0,centerxy_gla_effective_1mi,hh_type_married_nochild_p_2mi,nces_public_schools_total_enrollment_3mi,avg_faminc_2mi,spend_dinner_1mi,popgrpy_10mi,nces_public_schools_5mi,hh_1vehicle_p_ta,gdp_2mi,occ_wc_p_1mi,...,centerxy_full_nearest_dist,emp_healthcare_social_assistance_p_5mi,osm_highway_exits_count_10mi,hh_type_1pers_p_ta,dtpop_homemakers_p_1mi,hh_type_married_child_p_1mi,other_p_3mi,dtpop_retired_disabled_p_5mi,x0,x1
0,-0.949821,0.671728,-0.431942,0.902602,-0.165700,0.789475,-0.295846,0.084775,-0.252765,-1.562731,...,-0.489329,-1.501125,0.583815,0.115159,1.153947,0.586192,0.023718,-0.451164,0.0,1.0
1,-0.114618,-1.074427,-0.518683,-1.397672,-0.081386,0.483260,-0.269440,0.483772,-0.340190,-0.910139,...,-0.312662,0.272961,-0.400870,-0.220595,1.887683,-0.638359,0.490590,0.000390,0.0,1.0
2,-0.949821,-1.469286,-0.803457,-1.982054,-0.570830,-0.125940,-0.929593,0.044530,-0.459763,1.098128,...,1.366486,0.433270,-1.019566,0.034438,1.061442,-1.804599,-0.598779,-0.760356,5.0,1.0
3,-0.949821,-0.473363,-0.009924,0.915251,0.457653,1.571314,-0.612719,-1.421524,-0.090197,0.527583,...,1.400746,1.610651,-0.470582,-0.842967,0.975243,1.634686,-0.256406,-0.711419,2.0,1.0
4,-0.711603,0.531334,1.385890,0.092909,1.138838,-0.309339,1.552583,-0.067005,0.125862,1.643427,...,-0.449073,1.332782,0.182970,-0.797342,-0.403927,-0.159528,-1.540305,1.381745,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
702,-0.059190,1.388324,-0.476598,1.175011,-0.064515,0.695684,-0.190221,-1.864215,-0.139247,1.505840,...,-0.377028,4.856017,-0.505438,-1.647840,1.540788,0.687116,0.327185,0.116059,0.0,1.0
703,3.280809,0.717064,1.824508,1.801803,1.459273,-0.849589,1.684614,0.089374,1.039953,-0.759929,...,-0.522059,0.385177,1.986774,-0.068511,-1.040953,0.604134,-0.256406,0.238401,0.0,2.0
704,0.806572,0.034104,-0.859804,-0.859812,-0.763328,0.627269,-0.744750,0.259551,-0.418865,-0.970728,...,-0.236067,1.589276,-0.923712,0.553861,-0.504842,-0.230176,-0.590998,-0.264314,4.0,1.0
705,-0.949821,0.082364,0.385323,0.071224,-0.099705,0.831676,-0.190221,0.773533,-0.157249,0.493502,...,-0.564760,0.009342,0.453105,-0.129345,-0.107490,0.798133,0.334966,-1.332028,0.0,1.0


In [16]:
class_weight = {
    "HOME": 0.24,
    "OTHER": 0.16,
    "SHOPPING": 0.22,
    "TRAVEL": 0.15,
    "WORK": 0.23
}
# encoded_class_weights = {i: class_weight[label] for i, label in enumerate(le.classes_)}
# class_weight = encoded_class_weights

## Build Logistic Regression Models

In [17]:
pipe_lr = make_pipeline(
    preprocessor,
    LogisticRegression(C=0.15, penalty="l2", random_state=42, solver="saga", max_iter=10000, multi_class="ovr", n_jobs=-1, class_weight=class_weight)
)
pipe_lr.fit(X_train, y_train);

In [18]:
pipe_lr.score(X_train, y_train)

0.7538896746817539

In [19]:
pipe_lr.score(X_test, y_test)

0.5316455696202531

In [20]:
pipe_lr_multi = make_pipeline(
    preprocessor,
    LogisticRegression(C=0.1, penalty="l2", random_state=42, solver="saga", max_iter=10000, multi_class="multinomial", n_jobs=-1, 
                       class_weight=class_weight)
)
pipe_lr_multi.fit(X_train, y_train);

In [21]:
pipe_lr_multi.score(X_train, y_train)

0.7807637906647807

In [22]:
pipe_lr_multi.score(X_test, y_test)

0.5189873417721519

In [23]:
prediction_result = pd.DataFrame({
    "True label": y_test,
    "Logistic Regression (OVR)": pipe_lr.predict(X_test),
    "Logistic Regression (multinomial)": pipe_lr_multi.predict(X_test)
})

In [25]:
prediction_result

Unnamed: 0,True label,Logistic Regression (OVR),Logistic Regression (multinomial)
763,SHOPPING,HOME,HOME
39,HOME,HOME,HOME
214,HOME,HOME,HOME
202,HOME,WORK,WORK
239,WORK,WORK,WORK
...,...,...,...
716,SHOPPING,WORK,SHOPPING
333,SHOPPING,HOME,HOME
238,HOME,WORK,SHOPPING
615,HOME,HOME,HOME


## Build Random Forest Models

### Basic RF model

In [None]:
pipe_rf = make_pipeline(
    preprocessor,
    RandomForestClassifier(n_jobs=-1, random_state=42)
)
rf_param_grid = {
    "randomforestclassifier__n_estimators": [25, 50, 100, 150],
    "randomforestclassifier__max_depth": [5, 10, 20, 30],
    "randomforestclassifier__max_leaf_nodes": [30, 50, 70],
    "randomforestclassifier__min_samples_leaf": [10, 20, 30, 40, 50],
    "randomforestclassifier__min_samples_split": [10, 20, 30],
    "randomforestclassifier__class_weight": [None, "balanced", class_weight]
}
rf_grid_search = RandomizedSearchCV(
    pipe_rf, rf_param_grid, n_iter=30, cv=5, n_jobs=-1, return_train_score=True
)
rf_grid_search.fit(X_train, y_train);

In [None]:
pd.DataFrame(rf_grid_search.cv_results_)[
    [
        "param_randomforestclassifier__n_estimators",
        "param_randomforestclassifier__max_depth",
        "param_randomforestclassifier__max_leaf_nodes",
        "param_randomforestclassifier__min_samples_leaf",
        "param_randomforestclassifier__min_samples_split",
        "param_randomforestclassifier__class_weight",
        "mean_fit_time",
        "rank_test_score",
        "mean_test_score",
    ]
].set_index("rank_test_score").sort_index().head(10)

In [None]:
rf_tuned = make_pipeline(
    preprocessor,
    RandomForestClassifier(n_estimators=100, max_depth=30, max_leaf_nodes=30, class_weight=class_weight, 
                           min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
)
rf_tuned.fit(X_train, y_train);

In [None]:
rf_tuned.score(X_train, y_train)

In [None]:
rf_tuned.score(X_test, y_test)

In [None]:
prediction_result["Random Forest"] = rf_tuned.predict(X_test)

In [None]:
# Takes long time to run

# rf_ovr_test = make_pipeline(
#     preprocessor,
#     SelectFromModel(
#         LogisticRegression(C=0.15, penalty="l2", random_state=42, solver="saga", max_iter=10000, multi_class="ovr", 
#                            n_jobs=-1, class_weight=class_weight)
#     ),
#     RandomForestClassifier(n_estimators=100, max_depth=30, max_leaf_nodes=30, class_weight="balanced", 
#                            min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
# )
# rf_param_grid = {
#     "randomforestclassifier__n_estimators": [25, 50, 100, 150],
#     # "randomforestclassifier__max_features": ["sqrt", "log2", None],
#     "randomforestclassifier__max_features": [5, 10, 15, 20, 30],
#     "randomforestclassifier__max_depth": [5, 10, 20, 30],
#     "randomforestclassifier__max_leaf_nodes": [30, 50, 70],
#     "randomforestclassifier__min_samples_leaf": [10, 20, 30, 40, 50],
#     "randomforestclassifier__min_samples_split": [10, 20, 30, 40],
#     "randomforestclassifier__class_weight": [None, "balanced", class_weight]
# }
# rf_ovr_search = RandomizedSearchCV(
#     rf_ovr_test, rf_param_grid, n_iter=30, cv=5, n_jobs=-1, random_state=42
# )

In [None]:
# rf_ovr_search.fit(X_train, y_train);

In [None]:
# pd.DataFrame(rf_ovr_search.cv_results_)[
#     [
#         "param_randomforestclassifier__n_estimators",
#         "param_randomforestclassifier__max_features",
#         "param_randomforestclassifier__max_depth",
#         "param_randomforestclassifier__max_leaf_nodes",
#         "param_randomforestclassifier__min_samples_leaf",
#         "param_randomforestclassifier__min_samples_split",
#         "param_randomforestclassifier__class_weight",
#         "mean_fit_time",
#         "rank_test_score",
#         "mean_test_score",
#     ]
# ].set_index("rank_test_score").sort_values(by="mean_test_score", ascending=False)

In [None]:
# rf_ovr_search.best_params_

In [None]:
# rf_ovr_search.score(X_train, y_train)

In [None]:
# rf_ovr_search.score(X_test, y_test)

### RandomForest with regularization using LogisticRegression

In [None]:
pipe_lr_rf = make_pipeline(
    preprocessor,
    SelectFromModel(
        LogisticRegression(C=0.15, penalty="l2", random_state=42, solver="saga", max_iter=10000, multi_class="ovr", 
                           n_jobs=-1, class_weight=class_weight)
    ),
    RandomForestClassifier(n_estimators=100, max_depth=20, max_leaf_nodes=70, class_weight=class_weight, 
                           min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
)
pipe_lr_rf.fit(X_train, y_train);

In [None]:
pipe_lr_rf.score(X_train, y_train)

In [None]:
pipe_lr_rf.score(X_test, y_test)

In [None]:
prediction_result["Random Forest (L2 reg with LR)"] = pipe_lr_rf.predict(X_test)

In [None]:
# pipe_lr_rf_test = make_pipeline(
#     preprocessor,
#     SelectFromModel(
#         LogisticRegression(C=0.15, penalty="l2", random_state=42, solver="saga", max_iter=10000, multi_class="ovr", 
#                            n_jobs=-1, class_weight=class_weight)
#     ),
#     RandomForestClassifier(n_estimators=100, max_depth=20, max_leaf_nodes=70, class_weight=class_weight, 
#                            min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
# )

In [None]:
lr_rf_param = {
    "selectfrommodel__estimator__C": [0.1, 0.15, 0.2, 0.3, 0.4],
    "selectfrommodel__estimator__penalty": ["l1", "l2"],
    "selectfrommodel__estimator__multi_class": ["ovr", "multinomial"],
    "selectfrommodel__estimator__class_weight": [None, "balanced", class_weight]
}
lr_rf_search = RandomizedSearchCV(
    pipe_lr_rf, lr_rf_param, n_iter=30, cv=5, n_jobs=-1, random_state=42
)
lr_rf_search.fit(X_train, y_train);

In [None]:
pd.DataFrame(lr_rf_search.cv_results_)[
    [
        "param_selectfrommodel__estimator__C",
        "param_selectfrommodel__estimator__penalty",
        "param_selectfrommodel__estimator__multi_class",
        "param_selectfrommodel__estimator__class_weight",
        "mean_fit_time",
        "rank_test_score",
        "mean_test_score",
    ]
].set_index("rank_test_score").sort_index().head(10)

In [None]:
lr_rf_search.score(X_train, y_train)

In [None]:
lr_rf_search.score(X_test, y_test)

In [None]:
lr_rf_l1 = make_pipeline(
    preprocessor,
    SelectFromModel(
        LogisticRegression(C=0.1, penalty="l1", random_state=42, solver="saga", max_iter=10000, multi_class="ovr", 
                           n_jobs=-1, class_weight=None)
    ),
    RandomForestClassifier(n_estimators=100, max_depth=20, max_leaf_nodes=70, class_weight=class_weight, 
                           min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
)
lr_rf_l1.fit(X_train, y_train);

In [None]:
print(lr_rf_l1.score(X_train, y_train))
print(lr_rf_l1.score(X_test, y_test))

In [None]:
prediction_result["Random Forest (L1 reg with LR)"] = lr_rf_l1.predict(X_test)

In [None]:
pipe_lr_rf_rank2 = make_pipeline(
    preprocessor,
    SelectFromModel(
        LogisticRegression(C=0.4, penalty="l1", random_state=42, solver="saga", max_iter=10000, multi_class="multinomial", 
                           n_jobs=-1, class_weight=class_weight)
    ),
    RandomForestClassifier(n_estimators=100, max_depth=20, max_leaf_nodes=70, class_weight=class_weight, 
                           min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
)
pipe_lr_rf_rank2.fit(X_train, y_train);

In [None]:
print(pipe_lr_rf_rank2.score(X_train, y_train))
print(pipe_lr_rf_rank2.score(X_test, y_test))

**Second best set of parameters do not lead to better result. Just use the best set of parameters.**

### Random Forest with OVR and regularization using LogisticRegression

In [None]:
lr_rf_ovr = make_pipeline(
    preprocessor,
    SelectFromModel(
        LogisticRegression(C=0.15, penalty="l2", random_state=42, solver="saga", max_iter=10000, multi_class="ovr", 
                           n_jobs=-1, class_weight=class_weight)
    ),
    OneVsRestClassifier(
        RandomForestClassifier(n_estimators=100, max_depth=30, max_leaf_nodes=30, class_weight=None, 
                               min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
    )
)

In [None]:
lr_rf_ovr_search = RandomizedSearchCV(
    lr_rf_ovr, lr_rf_param, n_iter=30, cv=5, n_jobs=-1, random_state=42
)
lr_rf_ovr_search.fit(X_train, y_train);

In [None]:
pd.DataFrame(lr_rf_ovr_search.cv_results_)[
    [
        "param_selectfrommodel__estimator__C",
        "param_selectfrommodel__estimator__penalty",
        "param_selectfrommodel__estimator__multi_class",
        "param_selectfrommodel__estimator__class_weight",
        "mean_fit_time",
        "rank_test_score",
        "mean_test_score",
    ]
].set_index("rank_test_score").sort_values(by="mean_test_score", ascending=False).head(10)

In [None]:
lr_rf_ovr_search.score(X_train, y_train)

In [None]:
lr_rf_ovr_search.score(X_test, y_test)

In [None]:
lr_rf_ovr = make_pipeline(
    preprocessor,
    SelectFromModel(
        LogisticRegression(C=0.15, penalty="l1", random_state=42, solver="saga", max_iter=10000, multi_class="multinomial", 
                           n_jobs=-1, class_weight=None)
    ),
    OneVsRestClassifier(
        RandomForestClassifier(n_estimators=100, max_depth=30, max_leaf_nodes=30, class_weight=None, 
                               min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
    )
)
lr_rf_ovr.fit(X_train, y_train);

In [None]:
print(lr_rf_ovr.score(X_train, y_train))
print(lr_rf_ovr.score(X_test, y_test))

In [None]:
prediction_result["Random Forest (OVR with L1 reg using LR)"] = lr_rf_ovr.predict(X_test)

In [None]:
rf_ovr = make_pipeline(
    preprocessor,
    OneVsRestClassifier(
        RandomForestClassifier(n_estimators=100, max_depth=30, max_leaf_nodes=30, class_weight="balanced", 
                           min_samples_leaf=10, min_samples_split=30, n_jobs=-1, random_state=42)
    )
)
rf_ovr.fit(X_train, y_train);

In [None]:
rf_ovr.score(X_train, y_train)

In [None]:
rf_ovr.score(X_test, y_test)

In [None]:
# prediction_result["rf_ovr"] = rf_ovr.predict(X_test)

## LGBM Model - Poor score, won't use

In [None]:
# pipe_lgbm_ovr = make_pipeline(
#     preprocessor,
#     OneVsRestClassifier(
#         LGBMClassifier(random_state=42, n_jobs=-1)
#     )
# )
# param_lgbm = {
#     "onevsrestclassifier__estimator__n_estimators": [50, 100, 150, 200],
#     "onevsrestclassifier__estimator__boosting_type": ["gbdt", "dart"],
#     "onevsrestclassifier__estimator__learning_rate": [0.1, 0.3, 0.5, 0.7],
#     "onevsrestclassifier__estimator__max_depth": [5, 10, 15, 20],
#     "onevsrestclassifier__estimator__reg_alpha": [0.1, 0.3, 0.5, 0.7],
#     "onevsrestclassifier__estimator__reg_lambda": [0.1, 0.3, 0.5, 0.7],
#     # "onevsrestclassifier__estimator__min_child_samples": [20, 30, 40, 50],
#     "onevsrestclassifier__estimator__num_leaves": [5, 10, 15, 20],
#     "onevsrestclassifier__estimator__min_data_in_leaf": [30, 40, 50, 60],
#     "onevsrestclassifier__estimator__class_weight": [None, "balanced"]
# }
# lgbm_search = RandomizedSearchCV(
#     pipe_lgbm_ovr, param_lgbm, n_iter=30, cv=5, n_jobs=-1, random_state=42
# )

In [None]:
# lgbm_search.fit(X_train, y_train);

In [None]:
# pd.DataFrame(lgbm_search.cv_results_)[
#     [
#         "param_onevsrestclassifier__estimator__n_estimators",
#         "param_onevsrestclassifier__estimator__boosting_type",
#         "param_onevsrestclassifier__estimator__learning_rate",
#         "param_onevsrestclassifier__estimator__max_depth",
#         "param_onevsrestclassifier__estimator__reg_alpha",
#         "param_onevsrestclassifier__estimator__reg_lambda",
#         "param_onevsrestclassifier__estimator__class_weight",
#         # "param_onevsrestclassifier__estimator__min_child_samples",
#         "param_onevsrestclassifier__estimator__num_leaves",
#         "param_onevsrestclassifier__estimator__min_data_in_leaf",
#         "mean_fit_time",
#         "rank_test_score",
#         "mean_test_score",
#     ]
# ].set_index("rank_test_score").sort_index().head(20)

In [None]:
# lgbm_search.score(X_train, y_train)

In [None]:
# lgbm_search.score(X_test, y_test)

In [None]:
# lgbm_test = make_pipeline(
#     preprocessor,
#     OneVsRestClassifier(
#         LGBMClassifier(random_state=42, n_jobs=-1, n_estimators=150, boosting_type="gbdt", learning_rate=0.1, max_depth=10,
#                       reg_alpha=0.7, reg_lambda=0.7, num_leaves=20, min_data_in_leaf=30, class_weight=None)
#     )
# )
# lgbm_test.fit(X_train, y_train);

In [None]:
# lgbm_test.score(X_train, y_train)

In [None]:
# lgbm_test.score(X_test, y_test)

In [None]:
# lgbm_search.best_params_
# score=0.62025

## XGB Model - poor score, won't use

In [None]:
# pipe_xgb = make_pipeline(
#     preprocessor,
#     OneVsRestClassifier(
#         XGBClassifier(random_state=42, verbosity=0)
#     )
# )
# param_xgb = {
#     "onevsrestclassifier__estimator__booster": ["gbtree", "dart"],
#     "onevsrestclassifier__estimator__learning_rate": [0.1, 0.3, 0.5, 0.7],
#     "onevsrestclassifier__estimator__max_depth": [3, 5, 7, 9],
#     "onevsrestclassifier__estimator__reg_alpha": [0.1, 0.3, 0.5, 0.7],
#     "onevsrestclassifier__estimator__reg_lambda": [0.1, 0.3, 0.5, 0.7],
#     "onevsrestclassifier__estimator__subsample": [0.3, 0.5, 0.7, 1],
#     "onevsrestclassifier__estimator__min_child_weight": [1, 4, 7, 10]
# }
# xgb_search = RandomizedSearchCV(
#     pipe_xgb, param_xgb, n_iter=30, cv=5, n_jobs=-1, random_state=42
# )

In [None]:
# xgb_search.fit(X_train, y_train);

In [None]:
# pd.DataFrame(xgb_search.cv_results_)[
#     [
#         "param_onevsrestclassifier__estimator__booster",
#         "param_onevsrestclassifier__estimator__learning_rate",
#         "param_onevsrestclassifier__estimator__max_depth",
#         "param_onevsrestclassifier__estimator__reg_alpha",
#         "param_onevsrestclassifier__estimator__reg_lambda",
#         "param_onevsrestclassifier__estimator__subsample",
#         "param_onevsrestclassifier__estimator__min_child_weight",
#         "mean_fit_time",
#         "rank_test_score",
#         "mean_test_score",
#     ]
# ].set_index("rank_test_score").sort_index().head(20)

In [None]:
# xgb_search.score(X_train, y_train)

In [None]:
# xgb_search.score(X_test, y_test)

In [None]:
# xgb_test = make_pipeline(
#     preprocessor,
#     OneVsRestClassifier(
#         XGBClassifier(random_state=42, verbosity=0, booster="gbtree", learning_rate=0.1, max_depth=5, reg_alpha=0.5, reg_lambda=0.1,
#                      subsample=0.7, min_child_weight=1)
#     )
# )
# xgb_test.fit(X_train, y_train);

In [None]:
# xgb_test.score(X_train, y_train)

In [None]:
# xgb_test.score(X_test, y_test)
# XGBClassifier(random_state=42, verbosity=0, booster="gbtree", learning_rate=0.1, max_depth=5, reg_alpha=0.5, reg_lambda=0.1,
#                     subsample=0.7)

In [None]:
# lr_xgb_test = make_pipeline(
#     preprocessor,
#     SelectFromModel(
#         LogisticRegression(C=0.15, penalty="l2", random_state=42, solver="saga", max_iter=10000, multi_class="ovr", 
#                            n_jobs=-1, class_weight=class_weight)
#     ),
#     OneVsRestClassifier(
#         XGBClassifier(random_state=42, verbosity=0, booster="gbtree", learning_rate=0.1, max_depth=5, reg_alpha=0.5, reg_lambda=0.1,
#                      subsample=0.7)
#     )
# )
# lr_xgb_test.fit(X_train, y_train);

In [None]:
# lr_xgb_test.score(X_train, y_train)

In [None]:
# lr_xgb_test.score(X_test, y_test)

In [None]:
# prediction_result["xgb"] = xgb_test.predict(X_test)

In [None]:
prediction_result.head(20)

In [None]:
prediction_result.iloc[20:50]

In [None]:
prediction_result.iloc[50:]

## Try two different combinations of ensembling models

### 1: RF + RF (OVR, L1) + RF(L1)

In [None]:
classifiers_1 = {
    "rf": rf_tuned,
    "lr_rf_ovr": lr_rf_ovr,
    "lr_rf_l1": lr_rf_l1
}
classifiers_2 = {
    "rf": rf_tuned,
    "lr_rf": pipe_lr_rf,
    "lr_rf_l1": lr_rf_l1
}

In [None]:
# classifiers = {
#     "lr": pipe_lr,
#     "lr_multi": pipe_lr_multi,
#     "rf": rf_tuned,
#     "lr_rf": pipe_lr_rf,
#     "rf_ovr": rf_ovr,
#     "lr_rf_ovr": lr_rf_ovr,
#     "lr_rf_l1": lr_rf_l1,
# }

In [None]:
averaging_model = VotingClassifier(
    list(classifiers_1.items()), voting="soft"
)
averaging_model.fit(X_train, y_train);

In [None]:
averaging_model.score(X_train, y_train)

In [None]:
averaging_model.score(X_test, y_test)

In [None]:
averaging_model_hard = VotingClassifier(
    list(classifiers_1.items()), voting="hard"
)
averaging_model_hard.fit(X_train, y_train);

In [None]:
averaging_model_hard.score(X_train, y_train)

In [None]:
averaging_model_hard.score(X_test, y_test)

In [None]:
stacking_model = StackingClassifier(list(classifiers_1.items()))
stacking_model.fit(X_train, y_train);

In [None]:
stacking_model.score(X_train, y_train)

In [None]:
stacking_model.score(X_test, y_test)

In [None]:
stack_rf = StackingClassifier(list(classifiers_1.items()), RandomForestClassifier(n_jobs=-1, random_state=42))
stack_rf.fit(X_train, y_train);

In [None]:
stack_rf.score(X_train, y_train)

In [None]:
stack_rf.score(X_test, y_test)

### 2: RF + RF (L2) + RF(L1)

In [None]:
averaging_model_2 = VotingClassifier(
    list(classifiers_2.items()), voting="soft"
)
averaging_model_2.fit(X_train, y_train);

In [None]:
averaging_model_2.score(X_train, y_train)

In [None]:
averaging_model_2.score(X_test, y_test)

In [None]:
averaging_model_hard_2 = VotingClassifier(
    list(classifiers_2.items()), voting="hard"
)
averaging_model_hard_2.fit(X_train, y_train);

In [None]:
averaging_model_hard_2.score(X_train, y_train)

In [None]:
averaging_model_hard_2.score(X_test, y_test)

In [None]:
stacking_model_2 = StackingClassifier(list(classifiers_1.items()))
stacking_model_2.fit(X_train, y_train);

In [None]:
stacking_model_2.score(X_train, y_train)

In [None]:
stacking_model_2.score(X_test, y_test)

In [None]:
stack_rf_2 = StackingClassifier(list(classifiers_1.items()), RandomForestClassifier(n_jobs=-1, random_state=42))
stack_rf_2.fit(X_train, y_train);

In [None]:
stack_rf_2.score(X_train, y_train)

In [None]:
stack_rf_2.score(X_test, y_test)