In [None]:
from smac import Scenario, HyperparameterOptimizationFacade, MultiFidelityFacade
from sklearn.ensemble import HistGradientBoostingClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.metrics import f1_score
from imblearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import ADASYN, SMOTE
from mlcomp.util import get_categorical_labels
from mlcomp.data.load import load_classification_train
from mlcomp.data import preprocess
from default_cs import get_default_cs
from ConfigSpace import Configuration, ConfigurationSpace
from datetime import datetime
from pathlib import Path

In [2]:

# Seed shared by the train/test split and the oversampling below, so that the
# hold-out data is identical on every Restart & Run All.
RANDOM_STATE = 42

df = load_classification_train()
df = preprocess.drop_ft2(df)
df = preprocess.remove_outliers(df, handling_method=preprocess.HandlingMethod.CAP_AT_MIN_MAX)

# Features are every column except the target column 'label'.
X = df.drop(columns='label')
y = df['label']

# THIS IS WRONG! Read doc.md for more info! X, y = SMOTE().fit_resample(X, y)
# (Resampling before the split would put synthetic samples into the test set.)

# X = StandardScaler().fit_transform(X)

# Stratify so the 10% hold-out keeps the class proportions of the full data set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=RANDOM_STATE
)

# Oversample the training part only; X_test / y_test stay untouched.
X_train, y_train = SMOTE(random_state=RANDOM_STATE).fit_resample(X_train, y_train)

In [3]:
# Settings read by the SMAC training function that is created below.
#   DO_CV          - score each configuration with cross-validation on (X, y)
#                    instead of a single fit on the pre-made train/test split.
#   CV             - value passed as `cv` to cross_val_score.
#   DO_SMOTE_IN_CV - when cross-validating, put SMOTE in a pipeline in front of
#                    the classifier.
DO_CV: bool = True
CV: int = 5
DO_SMOTE_IN_CV: bool = True


def make_train(algo):
    """Build the SMAC target function for one classifier class.

    Args:
        algo: Estimator class (not an instance). It is instantiated as
            ``algo(random_state=seed)``, so its constructor must accept a
            ``random_state`` keyword.

    Returns:
        A callable ``train(config, seed) -> float`` that returns
        ``1 - macro F1``, i.e. a cost where lower is better.
    """
    def train(config: Configuration, seed: int) -> float:
        """Evaluate one hyperparameter configuration and return its cost."""
        clf = algo(random_state=seed)
        clf.set_params(**config)

        if not DO_CV:
            # Single fit on the pre-made split. Score with macro averaging so the
            # objective is the same metric as the cross-validation branch.
            clf.fit(X_train, y_train)
            prediction = clf.predict(X_test)
            return 1 - f1_score(y_test, prediction, average="macro")

        if DO_SMOTE_IN_CV:
            # Seed SMOTE with the trial seed so that a given (config, seed) pair
            # always yields the same cost.
            estimator = make_pipeline(SMOTE(random_state=seed), clf)
        else:
            estimator = clf

        scores = cross_val_score(estimator, X, y, scoring="f1_macro", cv=CV)
        return 1 - scores.mean()

    return train

In [None]:
# The original plan was to let this cell run SMAC for several algorithms in one go
# and show the best one at the end. That turned out not to be the best idea, so
# every entry of the list is commented out except the one currently being optimized.

algos = [
    # HistGradientBoostingClassifier,
    # XGBClassifier,
    # XGBRFClassifier,
    # CatBoostClassifier,
    # GradientBoostingClassifier,
    MLPClassifier,
]

# Collects the result of `optimize()` for each entry of `algos`, in order.
incumbents = []

for algo in algos:
    print(f"### STARTING SMAC FOR {algo.__name__} ###")

    train = make_train(algo)
    cs = get_default_cs(algo)

    scenario_options = {
        "configspace": cs,
        "n_trials": 400,
        "walltime_limit": 7200,
        "min_budget": 100,
        "max_budget": 1000,
        "n_workers": 8,
        # Each algorithm writes to its own output directory.
        "output_directory": Path(f"smac3_{algo.__name__}"),
    }
    scenario = Scenario(**scenario_options)
    smac = HyperparameterOptimizationFacade(
        scenario=scenario,
        target_function=train,
    )

    incumbent = smac.optimize()
    incumbents.append(incumbent)

    print(f"### SMAC FINISHED FOR {algo.__name__} ###")

In [6]:
# Show the configuration SMAC returned for each optimized algorithm.
incumbents

[Configuration(values={
   'activation': 'logistic',
   'alpha': 0.0006796700786147267,
   'hidden_layer_sizes': 108,
   'learning_rate': 'adaptive',
   'learning_rate_init': 0.02899177860096097,
   'momentum': 0.1650630459189415,
   'solver': 'adam',
 })]

The cells below are only used for my own testing and for generating validation scores.

In [52]:
# Hyperparameters for the GradientBoostingClassifier that is scored below.
values = dict(
    ccp_alpha=0.0008966736741063057,
    learning_rate=0.4312350953167421,
    loss='exponential',
    max_depth=33,
    max_features=None,
    max_leaf_nodes=33,
    min_impurity_decrease=0.06360397078117137,
    min_samples_leaf=0.11120704068946179,
    min_samples_split=0.21042060422634004,
    min_weight_fraction_leaf=0.19354830546561044,
    n_estimators=188,
    subsample=0.9227895485953006,
)
clf = GradientBoostingClassifier(**values)

# SMOTE sits in front of the classifier inside the pipeline that gets cross-validated.
pipe = make_pipeline(SMOTE(), clf)

# 5-fold cross-validation scored with macro F1; the mean is shown as the cell output.
cv_result = cross_val_score(pipe, X, y, cv=5, scoring="f1_macro")
mean_f1 = cv_result.mean()
mean_f1



0.7457166976714453

In [None]:
# Hold-out check of a CatBoost model on the pre-made train/test split.
# NOTE(review): `hp` is not defined in any cell visible here; it has to be set
# (presumably to a dict of hyperparameters) before this cell can run.
# NOTE(review): the variable is called `mlp` although it holds a CatBoostClassifier.
mlp = CatBoostClassifier()
mlp.set_params(**hp)
mlp.fit(X_train, y_train)

pred = mlp.predict(X_test)
# f1_score is called without `average`, so this is not the "f1_macro" metric used
# in the cross-validation cells - TODO confirm this is intended.
f1_score(y_test, pred)

In [None]:
# List the parameters of a default-constructed LGBMClassifier.
x = LGBMClassifier()
x.get_params()