In [28]:
from mlcomp.data.load import load_classification_train, load_classification_test
from mlcomp.data import preprocess
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import StackingClassifier, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
# from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as make_pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import pandas as pd

In [29]:

# Load the training set and apply the shared preprocessing:
# drop_ft2 first, then outlier handling with the CAP_AT_MIN_MAX method.
df = load_classification_train()
df = preprocess.drop_ft2(df)
df = preprocess.remove_outliers(df, handling_method=preprocess.HandlingMethod.CAP_AT_MIN_MAX)

# Feature matrix and target; 'label' is the target column.
X = df.drop(columns='label')
y = df['label']

# NOTE: no SMOTE / StandardScaler is applied to X, y here. Resampling is done
# inside the imblearn pipeline used for cross-validation, and scaling is done
# inside the SVC / MLP sub-pipelines, so both are fitted on training folds only.
# (Presumably this is the SMOTE-before-CV mistake mentioned in the notes below.)

# Fixed seed so the hold-out split is identical after Restart & Run All.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=RANDOM_STATE
)

In [30]:
# Base learners for the stacking ensemble.
# The "C1" / "C3" tags are the configuration names used in the experiment log
# at the bottom of the notebook.

# --- CatBoost, configuration C1 ---
cat_hp = dict(
    bagging_temperature=9.933171093235632,
    depth=4,
    l2_leaf_reg=5.420115711716861,
    random_strength=0.3008985550781157,
    logging_level="Silent",
)
cat = CatBoostClassifier(eval_metric="TotalF1", **cat_hp)

# --- HistGradientBoosting, configuration C3 ---
hgb_hp = dict(
    l2_regularization=0.04426091526172612,
    learning_rate=0.27215534692918875,
    max_bins=95,
    max_depth=18,
    max_leaf_nodes=45,
    min_samples_leaf=3,
)
hgb = HistGradientBoostingClassifier(**hgb_hp)

# --- XGBoost, configuration C3 ---
xgb_hp = dict(
    colsample_bylevel=0.6543301384218317,
    colsample_bynode=0.9439972406076952,
    colsample_bytree=0.7773758702787967,
    gamma=0,
    learning_rate=0.1827843463017609,
    max_delta_step=4,
    max_depth=8,
    min_child_weight=0,
    reg_alpha=0.1895331843304295,
    reg_lambda=0.6188481066931769,
    subsample=0.9979218768593112,
)
xgb = XGBClassifier(**xgb_hp)

# --- MLP, configuration C1 ---
mlp_hp = dict(
    activation="logistic",
    alpha=0.0006796700786147267,
    hidden_layer_sizes=108,
    learning_rate="adaptive",
    learning_rate_init=0.02899177860096097,
    momentum=0.1650630459189415,
    solver="adam",
)
mlp = MLPClassifier(**mlp_hp)

# SVC and MLP are each wrapped in their own pipeline with a StandardScaler;
# the other learners receive the features unscaled.
scaled_svc = make_pipeline(StandardScaler(), SVC())
scaled_mlp = make_pipeline(StandardScaler(), mlp)

# (name, estimator) pairs in the order handed to StackingClassifier.
estimators = [
    ("cat", cat),
    ("hgb", hgb),
    ("xgb", xgb),
    ("svm", scaled_svc),
    ("stdscale_mlp", scaled_mlp),
    ("lgbm", LGBMClassifier()),
    ("rf", RandomForestClassifier()),
]

In [None]:
# Stacked ensemble over the base learners defined above, using all cores.
clf = StackingClassifier(n_jobs=-1, estimators=estimators)

# SMOTE sits inside the (imblearn) pipeline, so it is re-fitted within each
# cross-validation split rather than applied to the full data set up front.
pipe = make_pipeline(SMOTE(), clf)

# 3-fold cross-validated macro-F1; the mean is the cell's displayed output.
fold_scores = cross_val_score(pipe, X, y, cv=3, scoring="f1_macro")
fold_scores.mean()


In [None]:
# Train the submission model on the full training set.
# Fix: the original fitted a bare StackingClassifier here, i.e. WITHOUT SMOTE,
# although the cross-validated pipeline (and every entry in the experiment log)
# uses SMOTE. Fit the same SMOTE + stack pipeline that was validated so the
# reported validation score describes the model that is actually submitted.
# The imblearn pipeline applies the sampler during fit only, not during predict.
pipe = make_pipeline(SMOTE(), StackingClassifier(estimators=estimators, n_jobs=-1))
pipe.fit(X, y)

# The test set goes through the same preprocessing calls as the training set.
test = load_classification_test()
test = preprocess.drop_ft2(test)
test = preprocess.remove_outliers(test, handling_method=preprocess.HandlingMethod.CAP_AT_MIN_MAX)

prediction = pipe.predict(test)

# NOTE: this rebinds `df` (previously the training frame) to the submission table.
# The row index is written as the "Id" column, the predictions as "label".
df = pd.DataFrame(prediction, columns=["label"])
df.to_csv("pred.csv", index_label="Id")


### !ATTENTION!: Please see doc.md. I made a mistake in how SMOTE was combined with cross-validation, which makes most of the validation results listed here wrong.


#### (1) Stack(Cat-C1, HGB-C3, XGB-C3), drop2, cap outlier, smote
- Val: 0.8683956033883564
- Kaggle: 0.78982

#### (2) Stack(Cat-C1, HGB-C3, XGB-C3, SVC), drop2, cap outlier, smote
- Val: 0.8714865289115579
- Kaggle: 0.79674

#### (3) Stack(Cat-C1, HGB-C3, XGB-C3, (StdScale + SVC)), drop2, cap outlier, smote
- Val: 0.8700799654038611
- Kaggle: 0.77174

#### (4) Stack(Cat-C1, HGB-C3, XGB-C3, (StdScale + SVC), (StdScale + KNN)), drop2, cap outlier, smote
- Val: 0.8675240682428484
- Kaggle: 0.78844

#### (5) Stack(Cat-C1, HGB-C3, XGB-C3, SVC, (StdScale + MLP)), drop2, cap outlier, smote
- Val: 0.864481808027894
- Kaggle: 0.81036

#### (6) Stack(Cat-C1, HGB-C3, XGB-C3, SVC, (StdScale + MLP-C1)), drop2, cap outlier, smote
- Val: 0.8629149982016588
- Kaggle: 0.79677

#### (7) Stack(Cat-C1, HGB-C3, XGB-C3, (StdScale + SVC), (StdScale + MLP-C1)), drop2, cap outlier, smote
- Val: 0.8753143683305312
- Kaggle: 0.76787




### Scores below here are correct again

#### (8) Stack(Cat-C1, HGB-C3, XGB-C3, (StdScale + SVC), (StdScale + MLP-C1), LGBM), drop2, cap outlier, smote
- Val: 0.8018078052395957
- Kaggle: 0.78283

