In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from lightgbm.sklearn import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier

In [2]:
trade_area = pd.read_csv("../../data/Smoothie King/smoothie_king_trade_area_variables.csv")
poi = pd.read_csv('../../data/Smoothie King/processed_poi.csv')
stores = pd.read_csv('../../data/Smoothie King/smoothie_king_stores.csv')
demographic = pd.read_csv("../../data/Smoothie King/processed_demographic.csv")
to_remove = [col for col in trade_area.columns.tolist() if "_p_" in col]
trade_area = trade_area.drop(columns=to_remove)
# merged = stores.merge(trade_area, left_on="store", right_on="store_num").merge(poi, on="store").merge(demographic, on="store")
merged = stores.merge(trade_area, left_on="store", right_on="store_num").merge(poi)
merged = merged.drop(columns=["store_num", "country_code"])
merged

Unnamed: 0,store,longitude,latitude,category,cbsa_name,dma_name,state_name,market_size,store_density,medhhinc_dma,...,osm_highway_exits_count_3mi,osm_highway_exits_count_5mi,osm_nearest_exit_dist,places_of_worship_10mi,places_of_worship_1mi,places_of_worship_2mi,places_of_worship_3mi,places_of_worship_5mi,transitstop_nearest_dist,transitstops
0,SK 1504,-97.650392,30.519353,SHOPPING,"Austin-Round Rock, TX","Austin, TX",Texas,Large Metro (2),Light Suburban,85075.0,...,17,45,1.875541,314,5,30,76,128,14.818824,0
1,SK 0057,-88.171150,30.672501,SHOPPING,"Mobile, AL","Mobile et al, AL-FL",Alabama,Medium City (4),Light Suburban,58200.0,...,3,14,2.617072,305,12,82,109,154,49.572856,0
2,SK 1415,-90.535722,38.784250,HOME,"St. Louis, MO-IL","St. Louis, MO",Missouri,Very Large Metro (1),Light Suburban,66049.0,...,25,45,0.194937,277,6,23,57,78,3.518308,0
3,SK 1231,-80.134700,26.100737,TRAVEL,"Miami-Fort Lauderdale et al, FL","Miami-Ft. Lauderdale, FL",Florida,Very Large Metro (1),Suburban,62779.0,...,31,61,1.393043,1202,10,50,116,393,0.021790,25
4,SK 1535,-96.856651,32.996408,WORK,"Dallas-Fort Worth-Arlington, TX","Dallas-Ft. Worth, TX",Texas,Very Large Metro (1),Light Suburban,76139.0,...,23,72,0.711949,903,12,58,117,273,0.055289,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791,SK 1536,-96.872596,32.647809,HOME,"Dallas-Fort Worth-Arlington, TX","Dallas-Ft. Worth, TX",Texas,Very Large Metro (1),Exurban,76139.0,...,36,81,0.261721,866,8,40,115,323,0.012345,24
792,SK 1886,-105.077634,40.564695,OTHER,"Fort Collins, CO","Denver, CO",Colorado,Medium City (4),Light Suburban,82668.0,...,0,10,4.012518,175,20,71,112,128,0.189059,13
793,SK 0162,-95.478001,30.316531,SHOPPING,"Houston-The Woodlands et al, TX","Houston, TX",Texas,Very Large Metro (1),Exurban,72903.0,...,15,23,0.397305,140,20,42,53,68,0.699036,0
794,SK 1449,-78.968258,35.064994,SHOPPING,"Fayetteville, NC","Raleigh et al, NC",North Carolina,Medium City (4),Exurban,66003.0,...,12,46,0.706073,293,8,43,85,169,4.771075,0


In [3]:
# to_remove = [col for col in merged.columns.tolist() if "_p_" in col]
# merged = merged.drop(columns=to_remove)
# merged

In [4]:
print(len(merged["dma_name"].unique()))
len(merged["state_name"].unique())
len(merged["cbsa_name"].unique())

109


171

In [5]:
miss = merged.columns[merged.isnull().any()].tolist()
miss

['inrix_afternoon_ew',
 'inrix_afternoon_ns',
 'inrix_breakfast_ew',
 'inrix_breakfast_ns',
 'inrix_dinner_ew',
 'inrix_dinner_ns',
 'inrix_lunch_ew',
 'inrix_lunch_ns',
 'inrix_night_ew',
 'inrix_night_ns',
 'inrix_overnight_ew',
 'inrix_overnight_ns']

In [6]:
# for missing_col in miss:
#     inrix_col = missing_col[:-2]
#     if missing_col[-2:] == "ns":
#         merged[missing_col] = merged[missing_col].fillna(merged[inrix_col + "ew"])
#     else:
#         merged[missing_col] = merged[missing_col].fillna(merged[inrix_col + "ns"])

### Dropping the 10 rows with missing values seem to lead to the best results

In [7]:
merged = merged.drop(columns=["store", "longitude", "latitude", "state_name", "cbsa_name", "dma_name"])
# merged = merged.drop(columns=miss)
merged = merged.dropna()

In [8]:
merged

Unnamed: 0,category,market_size,store_density,medhhinc_dma,medhhinc_1mi,age0018_ta,age65pl_ta,age85pl_ta,asian_ta,avg_faminc_ta,...,osm_highway_exits_count_3mi,osm_highway_exits_count_5mi,osm_nearest_exit_dist,places_of_worship_10mi,places_of_worship_1mi,places_of_worship_2mi,places_of_worship_3mi,places_of_worship_5mi,transitstop_nearest_dist,transitstops
0,SHOPPING,Large Metro (2),Light Suburban,85075.0,83846.0,21456.0,6709.0,372.0,4092.9205,108837.0,...,17,45,1.875541,314,5,30,76,128,14.818824,0
1,SHOPPING,Medium City (4),Light Suburban,58200.0,50388.0,12717.0,8094.0,704.0,1723.0000,80752.0,...,3,14,2.617072,305,12,82,109,154,49.572856,0
2,HOME,Very Large Metro (1),Light Suburban,66049.0,59999.0,10127.0,8873.0,677.0,1082.0001,106893.0,...,25,45,0.194937,277,6,23,57,78,3.518308,0
3,TRAVEL,Very Large Metro (1),Suburban,62779.0,97439.0,8623.0,8821.0,775.0,850.0585,134373.0,...,31,61,1.393043,1202,10,50,116,393,0.021790,25
4,WORK,Very Large Metro (1),Light Suburban,76139.0,56156.0,17652.0,7902.0,348.0,4425.0000,102367.0,...,23,72,0.711949,903,12,58,117,273,0.055289,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791,HOME,Very Large Metro (1),Exurban,76139.0,32999.0,22186.0,12357.0,900.0,689.0000,66363.0,...,36,81,0.261721,866,8,40,115,323,0.012345,24
792,OTHER,Medium City (4),Light Suburban,82668.0,55848.0,13060.0,6761.0,629.0,1907.5900,95226.0,...,0,10,4.012518,175,20,71,112,128,0.189059,13
793,SHOPPING,Very Large Metro (1),Exurban,72903.0,51204.0,14791.0,7410.0,602.0,929.9680,89271.0,...,15,23,0.397305,140,20,42,53,68,0.699036,0
794,SHOPPING,Medium City (4),Exurban,66003.0,50154.0,27431.0,14186.0,1158.0,3537.1850,67117.0,...,12,46,0.706073,293,8,43,85,169,4.771075,0


In [9]:
le = LabelEncoder()
merged["category"] = le.fit_transform(merged["category"])
merged

Unnamed: 0,category,market_size,store_density,medhhinc_dma,medhhinc_1mi,age0018_ta,age65pl_ta,age85pl_ta,asian_ta,avg_faminc_ta,...,osm_highway_exits_count_3mi,osm_highway_exits_count_5mi,osm_nearest_exit_dist,places_of_worship_10mi,places_of_worship_1mi,places_of_worship_2mi,places_of_worship_3mi,places_of_worship_5mi,transitstop_nearest_dist,transitstops
0,2,Large Metro (2),Light Suburban,85075.0,83846.0,21456.0,6709.0,372.0,4092.9205,108837.0,...,17,45,1.875541,314,5,30,76,128,14.818824,0
1,2,Medium City (4),Light Suburban,58200.0,50388.0,12717.0,8094.0,704.0,1723.0000,80752.0,...,3,14,2.617072,305,12,82,109,154,49.572856,0
2,0,Very Large Metro (1),Light Suburban,66049.0,59999.0,10127.0,8873.0,677.0,1082.0001,106893.0,...,25,45,0.194937,277,6,23,57,78,3.518308,0
3,3,Very Large Metro (1),Suburban,62779.0,97439.0,8623.0,8821.0,775.0,850.0585,134373.0,...,31,61,1.393043,1202,10,50,116,393,0.021790,25
4,4,Very Large Metro (1),Light Suburban,76139.0,56156.0,17652.0,7902.0,348.0,4425.0000,102367.0,...,23,72,0.711949,903,12,58,117,273,0.055289,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791,0,Very Large Metro (1),Exurban,76139.0,32999.0,22186.0,12357.0,900.0,689.0000,66363.0,...,36,81,0.261721,866,8,40,115,323,0.012345,24
792,1,Medium City (4),Light Suburban,82668.0,55848.0,13060.0,6761.0,629.0,1907.5900,95226.0,...,0,10,4.012518,175,20,71,112,128,0.189059,13
793,2,Very Large Metro (1),Exurban,72903.0,51204.0,14791.0,7410.0,602.0,929.9680,89271.0,...,15,23,0.397305,140,20,42,53,68,0.699036,0
794,2,Medium City (4),Exurban,66003.0,50154.0,27431.0,14186.0,1158.0,3537.1850,67117.0,...,12,46,0.706073,293,8,43,85,169,4.771075,0


In [10]:
le.classes_

array(['HOME', 'OTHER', 'SHOPPING', 'TRAVEL', 'WORK'], dtype=object)

In [11]:
# X = merged.drop(columns=["category"])
# y = merged["category"]
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.1, random_state=42, stratify=y
# )
train_df, test_df = train_test_split(merged, test_size=0.1, random_state=42)
X_train = train_df.drop(columns=["category"])
y_train = train_df["category"]
X_test = test_df.drop(columns=["category"])
y_test = test_df["category"]

In [12]:
X_train

Unnamed: 0,market_size,store_density,medhhinc_dma,medhhinc_1mi,age0018_ta,age65pl_ta,age85pl_ta,asian_ta,avg_faminc_ta,avghhinc_ta,...,osm_highway_exits_count_3mi,osm_highway_exits_count_5mi,osm_nearest_exit_dist,places_of_worship_10mi,places_of_worship_1mi,places_of_worship_2mi,places_of_worship_3mi,places_of_worship_5mi,transitstop_nearest_dist,transitstops
347,Very Large Metro (1),Exurban,70970.0,118489.0,14380.0,4910.0,345.0,1265.0001,93118.0,89967.0,...,10,33,2.615653,746,4,13,53,162,1.748239,0
111,Very Large Metro (1),Exurban,70970.0,48890.0,14161.0,6572.0,326.0,3468.9999,72243.0,67727.0,...,14,21,0.388134,472,7,38,63,149,1.455125,0
54,Small Town (6),Exurban,50265.0,16787.0,4885.0,3304.0,297.0,341.6977,73789.0,81607.0,...,0,0,7.056117,37,14,23,27,30,31.770532,0
747,Large City (3),Exurban,67184.0,84633.0,17287.0,7249.0,527.0,2818.9999,135460.0,134123.0,...,6,27,1.977634,153,4,14,23,48,1.319867,0
194,Very Large Metro (1),Light Suburban,62779.0,68906.0,12445.0,12921.0,1800.0,1259.1502,104670.0,100387.0,...,16,46,1.326653,623,9,42,91,237,0.025833,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,Very Large Metro (1),Exurban,72903.0,137195.0,9092.0,5007.0,263.0,7055.6338,137597.0,150433.0,...,14,29,0.203553,477,7,15,26,102,9.251025,0
107,Very Large Metro (1),Suburban,87680.0,116504.0,11647.0,11630.0,1230.0,5473.1967,146098.0,161364.0,...,39,101,0.331632,1753,22,77,176,410,0.040567,11
275,Small City (5),Exurban,63736.0,47959.0,9731.0,6986.0,679.0,848.0000,75062.0,62347.0,...,6,10,0.200039,48,5,15,26,40,44.211875,0
443,Very Large Metro (1),Exurban,64661.0,89736.0,8757.0,3764.0,170.0,3524.9999,103601.0,92428.0,...,21,58,1.637729,511,8,32,55,130,0.057342,11


In [13]:
# categorical_features = ["store_density"]
ordinal_features = ["market_size", "store_density"]
numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()

In [14]:
market_levels = [
    "Small Town (6)",
    "Small City (5)",
    "Medium City (4)",
    "Large City (3)",
    "Large Metro (2)",
    "Very Large Metro (1)"
]
density_levels = [
    "Rural",
    "Exurban",
    "Suburban",
    "Light Suburban",
    "Light Urban",
    "Urban",
    "Super Urban"
]

In [15]:
ordinal_transformer = OrdinalEncoder(categories=[market_levels, density_levels], dtype=int)

preprocessor = make_column_transformer(
    (make_pipeline(SimpleImputer(strategy="constant", fill_value=0), StandardScaler()), numeric_features),
    (ordinal_transformer, ordinal_features),
)

In [16]:
X_transformed = preprocessor.fit_transform(X_train)
pd.DataFrame(X_transformed)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,202,203,204,205,206,207,208,209,210,211
0,0.505518,1.603831,0.240326,-0.922216,-0.701716,-0.432801,-0.408111,-0.359667,1.994195,-0.707100,...,0.323607,0.650703,-0.256099,-0.336286,-0.099903,0.075347,-0.365336,-0.562677,5.0,1.0
1,0.505518,-0.993985,0.204838,-0.509750,-0.735201,0.197289,-1.214297,-1.005594,2.327336,-0.183032,...,-0.467842,0.057060,-0.118130,0.132140,0.016747,0.011761,-0.379246,-0.562677,5.0,1.0
2,-1.467147,-2.192245,-1.298315,-1.320784,-0.786309,-0.696760,-1.154591,-0.602471,0.056524,-1.478467,...,1.901325,-0.885402,0.203800,-0.148916,-0.403192,-0.570291,1.059455,-0.562677,0.0,1.0
3,0.144807,0.340139,0.711398,-0.341735,-0.380969,0.011464,1.227123,0.922776,-0.773621,-0.068242,...,0.096915,-0.634079,-0.256099,-0.317549,-0.449852,-0.482250,-0.385665,-0.562677,3.0,1.0
4,-0.274878,-0.246878,-0.073236,1.065911,1.862501,-0.434474,0.038023,-0.057034,-0.813717,0.600470,...,-0.134381,0.384214,-0.026150,0.207088,0.343366,0.442186,-0.447077,1.379677,5.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
702,0.689684,2.302042,-0.616581,-0.898143,-0.846229,1.222654,1.309654,1.396476,-0.269640,-0.729441,...,-0.533424,0.067893,-0.118130,-0.298812,-0.414857,-0.218125,-0.009270,-0.562677,5.0,1.0
703,2.097560,1.529740,-0.202550,0.745517,0.857962,0.770259,1.637960,1.713950,-0.442362,0.620885,...,-0.487917,2.832449,0.571719,0.862884,1.334888,1.288363,-0.446378,0.454747,5.0,2.0
704,-0.183700,-1.028735,-0.513033,-0.407005,-0.113092,-0.552016,-1.105428,-1.161848,-0.737739,-0.701707,...,-0.534673,-0.861569,-0.210110,-0.298812,-0.414857,-0.521379,1.649894,-0.562677,1.0,1.0
705,-0.095571,0.530611,-0.670867,-1.206624,-1.010127,0.213299,-0.003261,-0.288191,0.472808,-1.073042,...,-0.023855,0.141557,-0.072140,0.019718,-0.076573,-0.081172,-0.445582,0.454747,5.0,1.0


In [17]:
transformed_col_names = (
    numeric_features + ordinal_features 
)

## Proportions of target class

HOME: 33.7

SHOPPING: 26.9

WORK: 20.5

OTHER: 13.6

TRAVEL: 5.3

## Trying Logistic Regression

In [18]:
le.classes_

array(['HOME', 'OTHER', 'SHOPPING', 'TRAVEL', 'WORK'], dtype=object)

In [19]:
class_weight = {
    "HOME": 0.29,
    "OTHER": 0.14,
    "SHOPPING": 0.24,
    "TRAVEL": 0.15,
    "WORK": 0.27
}
encoded_class_weights = {i: class_weight[label] for i, label in enumerate(le.classes_)}
class_weight = encoded_class_weights

In [20]:
pipe_lr = make_pipeline(
    preprocessor,
    LogisticRegression(penalty="l1", random_state=42, solver="saga", max_iter=10000, multi_class="ovr", n_jobs=-1, class_weight=class_weight)
)

In [21]:
pipe_lr.fit(X_train, y_train);

In [22]:
pipe_lr.classes_

array([0, 1, 2, 3, 4])

In [23]:
pipe_lr.score(X_train, y_train)

0.5884016973125884

In [24]:
pipe_lr.score(X_test, y_test)

0.5949367088607594

In [25]:
prediction_result = pd.DataFrame({
    "True label": y_test,
    "pipe_lr": pipe_lr.predict(X_test)
})

In [26]:
pipe_lr_lib = make_pipeline(
    preprocessor,
    LogisticRegression(penalty="l1", random_state=42, solver="liblinear", max_iter=10000, multi_class="ovr", class_weight=class_weight)
)
pipe_lr_lib.fit(X_train, y_train);

In [27]:
pipe_lr_lib.score(X_train, y_train)

0.6181046676096181

In [28]:
pipe_lr_lib.score(X_test, y_test)

0.620253164556962

In [29]:
pipe_lr_ela = make_pipeline(
    preprocessor,
    LogisticRegression(penalty="elasticnet", l1_ratio=0.7, random_state=42, solver="saga", max_iter=10000, multi_class="ovr", n_jobs=-1)
)
pipe_lr_ela.fit(X_train, y_train);

In [30]:
pipe_lr_lib.score(X_test, y_test)

0.620253164556962

In [31]:
pipe_lr_multi = make_pipeline(
    preprocessor,
    LogisticRegression(penalty="l1", random_state=42, solver="saga", max_iter=10000, multi_class="multinomial", n_jobs=-1, class_weight=class_weight)
)
pipe_lr_multi.fit(X_train, y_train);

In [32]:
pipe_lr_multi.score(X_test, y_test)

0.6075949367088608

In [33]:
prediction_result["pipe_lr_lib"] = pipe_lr_lib.predict(X_test)
prediction_result["pipe_lr_multi"] = pipe_lr_multi.predict(X_test)

## Trying LGBM

In [34]:
pipe_lgbm = make_pipeline(
    preprocessor,
    LGBMClassifier(random_state=42, objective="multiclass", n_jobs=-1)
)
param_lgbm = {
    "lgbmclassifier__boosting_type": ["gbdt", "dart"],
    "lgbmclassifier__learning_rate": [0.1, 0.3, 0.5, 0.7],
    "lgbmclassifier__max_depth": [10, 50, 100, 300, 500],
    "lgbmclassifier__reg_alpha": [0.1, 0.3, 0.5, 0.7],
    "lgbmclassifier__reg_lambda": [0.1, 0.3, 0.5, 0.7],
    "lgbmclassifier__class_weight": ["balanced", class_weight]
}

In [35]:
lgbm_random_search = RandomizedSearchCV(
    pipe_lgbm, param_lgbm, n_jobs=-1, n_iter=30, cv=5, random_state=42
)
lgbm_random_search.fit(X_train, y_train);

In [36]:
pd.DataFrame(lgbm_random_search.cv_results_)[
    [
        "param_lgbmclassifier__reg_lambda",
        "param_lgbmclassifier__reg_alpha",
        "param_lgbmclassifier__max_depth",
        "param_lgbmclassifier__learning_rate",
        "param_lgbmclassifier__boosting_type",
        "param_lgbmclassifier__class_weight",
        "mean_fit_time",
        "rank_test_score",
        "mean_test_score",
    ]
].set_index("rank_test_score").sort_index()

Unnamed: 0_level_0,param_lgbmclassifier__reg_lambda,param_lgbmclassifier__reg_alpha,param_lgbmclassifier__max_depth,param_lgbmclassifier__learning_rate,param_lgbmclassifier__boosting_type,param_lgbmclassifier__class_weight,mean_fit_time,mean_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0.7,0.3,50,0.3,dart,balanced,7.557903,0.449695
2,0.7,0.3,300,0.1,dart,balanced,9.634294,0.4455
3,0.5,0.1,300,0.3,dart,balanced,8.990151,0.444062
4,0.5,0.3,50,0.1,gbdt,"{0: 0.29, 1: 0.14, 2: 0.24, 3: 0.15, 4: 0.27}",4.791711,0.439806
5,0.1,0.1,500,0.3,gbdt,"{0: 0.29, 1: 0.14, 2: 0.24, 3: 0.15, 4: 0.27}",2.60412,0.438418
6,0.5,0.5,300,0.3,dart,"{0: 0.29, 1: 0.14, 2: 0.24, 3: 0.15, 4: 0.27}",4.568979,0.436959
7,0.5,0.5,500,0.1,gbdt,"{0: 0.29, 1: 0.14, 2: 0.24, 3: 0.15, 4: 0.27}",4.431766,0.432724
7,0.5,0.5,50,0.1,gbdt,"{0: 0.29, 1: 0.14, 2: 0.24, 3: 0.15, 4: 0.27}",4.945645,0.432724
9,0.3,0.3,10,0.3,dart,"{0: 0.29, 1: 0.14, 2: 0.24, 3: 0.15, 4: 0.27}",5.905249,0.431336
10,0.5,0.1,500,0.1,dart,balanced,11.326929,0.431306


In [37]:
lgbm_random_search.score(X_test, y_test)

0.6075949367088608

In [38]:
pipe_lgbm_dart = make_pipeline(
    preprocessor,
    LGBMClassifier(
        random_state=42, boosting_type="dart", reg_lambda=0.3, reg_alpha=0.3, max_depth=100, learning_rate=0.1, objective="multiclass", n_jobs=-1,
        class_weight="balanced"
    )
)
pipe_lgbm_dart.fit(X_train, y_train);

In [39]:
pipe_lgbm_dart.score(X_train, y_train)

1.0

In [40]:
pipe_lgbm_dart.score(X_test, y_test)

0.6329113924050633

In [41]:
pipe_lgbm_gbdt = make_pipeline(
    preprocessor,
    LGBMClassifier(
        random_state=42, reg_lambda=0.5, reg_alpha=0.5, max_depth=10, learning_rate=0.1, objective="multiclass", n_jobs=-1, class_weight=class_weight
    )
)
pipe_lgbm_gbdt.fit(X_train, y_train);

In [42]:
pipe_lgbm_gbdt.score(X_train, y_train)

0.9985855728429985

In [43]:
pipe_lgbm_gbdt.score(X_test, y_test)

0.569620253164557

In [44]:
prediction_result["pipe_lgbm_dart"] = pipe_lgbm_dart.predict(X_test)
prediction_result["pipe_lgbm_gbdt"] = pipe_lgbm_gbdt.predict(X_test)

## Trying XGBoost

In [45]:
pipe_xgb = make_pipeline(
    preprocessor,
    XGBClassifier(random_state=42, objective="multi:softmax", verbosity=0)
)
param_xgb = {
    "xgbclassifier__booster": ["gbtree", "dart"],
    "xgbclassifier__learning_rate": [0.1, 0.3, 0.5, 0.7],
    "xgbclassifier__max_depth": [10, 50, 100, 300],
    "xgbclassifier__reg_alpha": [0.1, 0.3, 0.5, 0.7],
    "xgbclassifier__reg_lambda": [0.1, 0.3, 0.5, 0.7]
}

In [46]:
xgb_random_search = RandomizedSearchCV(
    pipe_xgb, param_xgb, n_jobs=-1, n_iter=30, cv=5, random_state=42
)
xgb_random_search.fit(X_train, y_train);



In [47]:
pd.DataFrame(xgb_random_search.cv_results_)[
    [
        "param_xgbclassifier__reg_lambda",
        "param_xgbclassifier__reg_alpha",
        "param_xgbclassifier__max_depth",
        "param_xgbclassifier__learning_rate",
        "param_xgbclassifier__booster",
        "mean_fit_time",
        "rank_test_score",
        "mean_test_score",
    ]
].set_index("rank_test_score").sort_index()

Unnamed: 0_level_0,param_xgbclassifier__reg_lambda,param_xgbclassifier__reg_alpha,param_xgbclassifier__max_depth,param_xgbclassifier__learning_rate,param_xgbclassifier__booster,mean_fit_time,mean_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.3,0.5,50,0.1,dart,57.883606,0.441235
2,0.1,0.7,300,0.3,dart,47.723799,0.439796
2,0.1,0.7,300,0.3,gbtree,6.801034,0.439796
4,0.5,0.7,10,0.3,gbtree,7.264131,0.431316
5,0.1,0.7,10,0.3,gbtree,6.649984,0.431316
6,0.7,0.1,10,0.3,dart,22.723337,0.429937
7,0.1,0.3,10,0.3,dart,46.317994,0.428489
8,0.7,0.1,10,0.5,gbtree,6.06877,0.42721
9,0.3,0.1,100,0.1,dart,50.141155,0.42716
10,0.7,0.1,10,0.7,gbtree,6.406279,0.42711


In [48]:
xgb_random_search.score(X_test, y_test)

0.569620253164557

In [49]:
pipe_xgb = make_pipeline(
    preprocessor,
    XGBClassifier(random_state=42, verbosity=0, reg_alpha=0.3, reg_lambda=0.3, learning_rate=0.1)
)
pipe_xgb.fit(X_train, y_train);



In [50]:
pipe_xgb.score(X_train, y_train)

1.0

In [51]:
pipe_xgb.score(X_test, y_test)

0.6329113924050633

In [52]:
pipe_xgb_dart = make_pipeline(
    preprocessor,
    XGBClassifier(random_state=42, verbosity=0, reg_alpha=0.3, reg_lambda=0.3, learning_rate=0.1, booster="dart")
)
pipe_xgb_dart.fit(X_train, y_train);



In [53]:
pipe_xgb_dart.score(X_train, y_train)

1.0

In [54]:
pipe_xgb_dart.score(X_test, y_test)

0.6329113924050633

In [55]:
prediction_result["pipe_xgb"] = pipe_xgb.predict(X_test)
prediction_result["pipe_xgb_dart"] = pipe_xgb_dart.predict(X_test)
prediction_result.head(20)

Unnamed: 0,True label,pipe_lr,pipe_lr_lib,pipe_lr_multi,pipe_lgbm_dart,pipe_lgbm_gbdt,pipe_xgb,pipe_xgb_dart
763,2,2,2,2,2,0,0,0
39,0,0,0,0,0,0,0,0
214,0,0,0,0,0,0,0,0
202,0,2,2,2,2,2,2,2
239,4,4,4,4,4,4,4,4
218,0,0,0,0,0,0,0,0
556,4,4,4,4,1,1,1,1
610,2,2,2,2,2,2,2,2
305,4,4,4,4,3,4,4,4
139,2,0,2,2,2,2,2,2


## Trying RandomForest

In [56]:
pipe_rf = make_pipeline(
    preprocessor,
    RandomForestClassifier(n_jobs=-1, random_state=42)
)
pipe_rf.fit(X_train, y_train);

In [57]:
pipe_rf.score(X_train, y_train)

1.0

In [58]:
pipe_rf.score(X_test, y_test)

0.620253164556962

In [59]:
rf_param_grid = {
    "randomforestclassifier__n_estimators": [25, 50, 100, 150],
    "randomforestclassifier__max_features": ["sqrt", "log2", None],
    "randomforestclassifier__max_depth": [5, 10, 20, 50],
    "randomforestclassifier__max_leaf_nodes": [30, 50, 70],
    "randomforestclassifier__class_weight": ["balanced", class_weight]
}
rf_grid_search = GridSearchCV(
    pipe_rf, rf_param_grid, cv=5, n_jobs=-1, return_train_score=True
)
rf_grid_search.fit(X_train, y_train);

In [60]:
pd.DataFrame(rf_grid_search.cv_results_)[
    [
        "param_randomforestclassifier__n_estimators",
        "param_randomforestclassifier__max_features",
        "param_randomforestclassifier__max_depth",
        "param_randomforestclassifier__max_leaf_nodes",
        "param_randomforestclassifier__class_weight",
        "mean_fit_time",
        "rank_test_score",
        "mean_test_score",
    ]
].set_index("rank_test_score").sort_index()

Unnamed: 0_level_0,param_randomforestclassifier__n_estimators,param_randomforestclassifier__max_features,param_randomforestclassifier__max_depth,param_randomforestclassifier__max_leaf_nodes,param_randomforestclassifier__class_weight,mean_fit_time,mean_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,150,log2,50,70,balanced,0.975854,0.451204
1,150,log2,20,70,balanced,1.031243,0.451204
3,150,,10,70,balanced,2.822643,0.451194
4,100,sqrt,50,70,balanced,0.542643,0.451154
4,100,sqrt,20,70,balanced,0.581660,0.451154
...,...,...,...,...,...,...,...
284,25,log2,5,30,balanced,0.222729,0.394536
284,25,log2,5,70,balanced,0.190541,0.394536
284,25,log2,5,50,balanced,0.174575,0.394536
287,50,log2,10,70,"{0: 0.29, 1: 0.14, 2: 0.24, 3: 0.15, 4: 0.27}",0.432852,0.393148


In [61]:
rf_grid_search.score(X_train, y_train)

0.9688826025459689

In [62]:
rf_grid_search.score(X_test, y_test)

0.6455696202531646

In [63]:
rf_tuned = rf_grid_search.best_estimator_

In [64]:
prediction_result["rf_tuned"] = rf_tuned.predict(X_test)
prediction_result.head(20)

Unnamed: 0,True label,pipe_lr,pipe_lr_lib,pipe_lr_multi,pipe_lgbm_dart,pipe_lgbm_gbdt,pipe_xgb,pipe_xgb_dart,rf_tuned
763,2,2,2,2,2,0,0,0,2
39,0,0,0,0,0,0,0,0,0
214,0,0,0,0,0,0,0,0,0
202,0,2,2,2,2,2,2,2,0
239,4,4,4,4,4,4,4,4,4
218,0,0,0,0,0,0,0,0,0
556,4,4,4,4,1,1,1,1,1
610,2,2,2,2,2,2,2,2,2
305,4,4,4,4,3,4,4,4,0
139,2,0,2,2,2,2,2,2,2


In [65]:
prediction_result.iloc[20:60]

Unnamed: 0,True label,pipe_lr,pipe_lr_lib,pipe_lr_multi,pipe_lgbm_dart,pipe_lgbm_gbdt,pipe_xgb,pipe_xgb_dart,rf_tuned
758,2,0,0,0,2,0,2,2,2
454,2,2,2,2,2,2,2,2,2
291,0,0,0,0,0,0,0,0,0
342,1,4,4,4,4,4,4,4,4
23,2,2,2,2,2,2,2,2,2
251,0,0,0,0,2,0,2,2,0
608,2,2,2,2,2,2,2,2,2
385,0,0,0,0,0,0,0,0,0
622,3,1,1,1,3,1,1,1,1
437,4,4,4,4,4,4,4,4,4


In [66]:
prediction_result.tail(20)

Unnamed: 0,True label,pipe_lr,pipe_lr_lib,pipe_lr_multi,pipe_lgbm_dart,pipe_lgbm_gbdt,pipe_xgb,pipe_xgb_dart,rf_tuned
336,0,0,0,0,0,0,0,0,0
554,0,0,0,0,0,4,0,0,0
649,0,0,0,0,0,0,0,0,0
30,0,4,4,4,2,2,2,2,2
600,0,4,4,4,0,0,0,0,0
33,0,0,0,0,0,0,0,0,0
31,2,0,0,0,2,2,2,2,2
591,4,0,0,0,2,4,2,2,4
425,0,0,0,0,0,0,0,0,0
321,4,4,4,4,4,4,4,4,4


In [67]:
le.classes_

array(['HOME', 'OTHER', 'SHOPPING', 'TRAVEL', 'WORK'], dtype=object)

## Trying Averaging

In [68]:
classifiers = {
    "logistic regression": pipe_lr,
    "logistic regression multi": pipe_lr_multi,
    "logistic regression liblinear": pipe_lr_lib,
    "random forest": rf_tuned,
    # "lgbm gbdt": pipe_lgbm_gbdt,
    "lgbm dart": pipe_lgbm_dart,
    "XGBoost": pipe_xgb,
    "XGBoost dart": pipe_xgb_dart
}

In [69]:
averaging_model = VotingClassifier(
    list(classifiers.items()), voting="soft"
)
averaging_model.fit(X_train, y_train);



In [70]:
averaging_model.score(X_train, y_train)

0.9957567185289957

In [71]:
averaging_model.score(X_test, y_test)

0.620253164556962

In [72]:
averaging_model_hard = VotingClassifier(
    list(classifiers.items()), voting="hard"
)
averaging_model_hard.fit(X_train, y_train);



In [73]:
averaging_model_hard.score(X_train, y_train)

0.9816124469589816

In [74]:
averaging_model_hard.score(X_test, y_test)

0.6708860759493671

In [75]:
stacking_model = StackingClassifier(list(classifiers.items()))
stacking_model.fit(X_train, y_train);



In [76]:
stacking_model.score(X_train, y_train)

0.8175388967468176

In [77]:
stacking_model.score(X_test, y_test)

0.6075949367088608