In [1]:
import pandas as pd
import numpy as np
from autogluon.features.generators import AutoMLPipelineFeatureGenerator
from autogluon.common import space
from autogluon.core import TabularDataset
from autogluon.tabular import TabularPredictor
from sklearn.model_selection import train_test_split
# auto_ml_pipeline_feature_generator = AutoMLPipelineFeatureGenerator()

In [2]:
print("loading data")
# Prepare data: read the labelled rows, then hold out a stratified 20% slice
# for validation (fixed seed so the split is repeatable).
labelled = pd.read_csv("../data/mushrooms/train.csv")
target = labelled["class"]
features = labelled.drop(columns="class")
X_train, X_val, y_train, y_val = train_test_split(
    features, target, test_size=0.2, random_state=42, stratify=target
)
# Re-attach the label as the last column of each split and wrap the frames
# as TabularDataset objects for AutoGluon.
train = TabularDataset(pd.concat([X_train, y_train], axis=1))
validate = TabularDataset(pd.concat([X_val, y_val], axis=1))
# The test file is loaded directly from disk by TabularDataset.
test = TabularDataset("../data/mushrooms/test.csv")

# Hyperparameter search space for the XGBoost ("XGB") model type.
# space.Real / space.Int entries declare (lower, upper, default) ranges for the
# tuner; plain values ("device", "enable_categorical") are fixed settings.
XGB = {
    "eta": space.Real(0.01, 0.2, default=0.1),
    # Integer depth between 1 and 3, i.e. shallow trees only.
    "max_depth": space.Int(1, 3, default=2),
    "min_child_weight": space.Int(1, 30, default=1),
    "gamma": space.Real(0.01, 0.2, default=0.1),
    # NOTE(review): 0.01-0.2 means each tree sees at most 20% of the rows; this
    # looks copied from the eta/gamma ranges. It is a valid XGBoost value, so it
    # is left unchanged - confirm it is intentional.
    "subsample": space.Real(0.01, 0.2, default=0.1),
    # XGBoost documents colsample_bytree as valid on (0, 1]. The lower bound was
    # 0, which is outside that range, so it is raised to 0.1.
    "colsample_bytree": space.Real(0.1, 1, default=0.5),
    "lambda": space.Real(0.01, 0.2, default=0.1),
    "alpha": space.Real(0.01, 0.2, default=0.1),
    # Fixed (non-searched) settings.
    # NOTE(review): "cuda" presumably requires a CUDA-capable GPU - confirm.
    "device": "cuda",
    "enable_categorical": True,
}
# Per-model-type hyperparameters, keyed by AutoGluon model name.
hyperparameters = {
    "XGB": XGB,
}
# Training budget in seconds: train various models for ~2 min.
time_limit = 2 * 60
# Try at most 5 different hyperparameter configurations for each type of model.
num_trials = 5
# Searcher name: tune hyperparameters using the random search routine with a
# local scheduler.
search_strategy = "auto"

# HPO is not performed unless hyperparameter_tune_kwargs is specified.
# Refer to the TabularPredictor.fit docstring for all valid values.
hyperparameter_tune_kwargs = dict(
    num_trials=num_trials,
    scheduler="local",
    searcher=search_strategy,
)
# Training
# Predictor settings: binary classification of the "class" column, scored by
# log loss.
predictor_config = {
    "label": "class",
    "problem_type": "binary",
    "eval_metric": "log_loss",
}
# fit() settings. The custom search space and HPO options stay disabled, so the
# "best_quality" preset alone decides which models are trained.
fit_config = {
    "train_data": train,
    "time_limit": time_limit,
    "presets": ["best_quality"],
    # "hyperparameters": hyperparameters,
    # "hyperparameter_tune_kwargs": hyperparameter_tune_kwargs,
}
predictor = TabularPredictor(**predictor_config).fit(**fit_config)

loading data


No path specified. Models will be saved in: "AutogluonModels/ag-20240804_175220"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 120 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: AutogluonModels/ag-20240804_175220/ds_sub_fit/sub_fit_ho.
Beginning AutoGluon training ... Time limit = 30s
AutoGluon will save models to "AutogluonModels/ag-20240804_175220/ds_sub_fit/sub_fit_ho"
AutoGluon Version:  1.0.0
Python Version:     3.9.18
Operating System:   Lin

In [3]:
# Rank every trained model on the held-out split. Per the AutoGluon docs,
# score_test is computed on the data passed here and score_val on AutoGluon's
# internal validation data.
predictor.leaderboard(data=validate)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMXT_BAG_L1,-0.150638,-0.151805,log_loss,2.49351,5.462252,34.484753,2.49351,5.462252,34.484753,1,True,1
1,WeightedEnsemble_L3,-0.150638,-0.151805,log_loss,2.496814,5.50449,53.20208,0.003305,0.042238,18.717327,3,True,5
2,LightGBMXT_BAG_L2,-0.646025,-0.646154,log_loss,4.417905,7.508646,47.957379,0.981072,1.065474,6.827056,2,True,4
3,LightGBM_BAG_L1,-0.654919,-0.654951,log_loss,0.943324,0.980921,6.64557,0.943324,0.980921,6.64557,1,True,2
4,WeightedEnsemble_L2,-0.694084,-0.151805,log_loss,2.535895,5.508975,48.735201,0.042386,0.046723,14.250448,2,True,3
