In [None]:
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import optuna.visualization as ov
import pandas as pd
from optuna.samplers import RandomSampler
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from tqdm.notebook import tqdm

warnings.simplefilter(action='ignore', category=Warning)

In [None]:
def objective(trial, scoring_metric='roc_auc'):
    """Score a default LightGBM classifier on one sampled synthetic dataset.

    The trial samples the arguments of ``make_classification`` (sample count,
    feature composition, clusters per class, class weight and class
    separation), generates a binary dataset from them and evaluates
    ``lgb.LGBMClassifier`` with 5-fold stratified cross-validation.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the dataset-generation parameters.
    scoring_metric : str, default 'roc_auc'
        Value forwarded to ``cross_val_score`` as ``scoring``.

    Returns
    -------
    float
        Mean of the per-fold cross-validation scores.
    """
    n_samples = trial.suggest_int('n_samples', 100, 10000000, log=True)
    n_features = trial.suggest_int('n_features', 3, 20)
    n_informative = trial.suggest_int('n_informative', 1, n_features - 2)
    n_redundant = trial.suggest_int('n_redundant', 0, n_features - n_informative - 1)
    # Every remaining column becomes a repeated feature; the upper bounds above
    # guarantee at least one, and the three counts always sum to n_features.
    n_repeated = n_features - n_informative - n_redundant

    # ensure the condition n_classes(2) * n_clusters_per_class <= 2**n_informative
    max_clusters = min(5, (2 ** n_informative) // 2)
    n_clusters_per_class = trial.suggest_int('n_clusters_per_class', 1, max_clusters)

    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
    # the sampled ranges (and the log scale of class_sep) are unchanged.
    weights = [trial.suggest_float('weights', 0.01, 0.99)]
    class_sep = trial.suggest_float('class_sep', 0.01, 10, log=True)

    # generate dataset
    X, y = make_classification(n_samples=n_samples, n_features=n_features,
                               n_informative=n_informative, n_redundant=n_redundant,
                               n_repeated=n_repeated, n_classes=2,
                               n_clusters_per_class=n_clusters_per_class,
                               weights=weights, class_sep=class_sep, flip_y=0.01,
                               random_state=42)

    # define model
    model = lgb.LGBMClassifier(verbose=-1)

    # perform 5-fold cross-validation; stratify so that strongly imbalanced
    # draws (weight near 0.01 or 0.99) keep both classes in as many folds as
    # the minority class size allows
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    score = cross_val_score(model, X, y, cv=cv, scoring=scoring_metric).mean()

    return score


In [None]:
def run_experiment(trials=1000, scoring_metric='roc_auc', seed=None,
                   output_path='simulation_results.csv'):
    """Run a random-search Optuna study over ``objective`` and save its trials.

    Parameters
    ----------
    trials : int, default 1000
        Number of trials passed to ``study.optimize``.
    scoring_metric : str, default 'roc_auc'
        Scoring name forwarded to ``objective``.
    seed : int or None, default None
        Seed for ``RandomSampler``. ``None`` keeps the sampler unseeded;
        pass an integer to make the sampled parameters repeatable.
    output_path : str, default 'simulation_results.csv'
        Destination of the CSV produced from ``study.trials_dataframe()``.

    Returns
    -------
    optuna.study.Study
        The study after optimization.

    Notes
    -----
    Sets Optuna's logging verbosity to ERROR as a side effect.
    """
    optuna.logging.set_verbosity(optuna.logging.ERROR)
    study = optuna.create_study(sampler=RandomSampler(seed=seed), direction='maximize')
    try:
        study.optimize(
            lambda trial: objective(trial, scoring_metric=scoring_metric),
            n_trials=trials,
            show_progress_bar=True,
        )
    finally:
        # Write whatever has been recorded even when the run is interrupted or
        # a trial raises, so a long study is not lost; the exception still
        # propagates to the caller.
        results = study.trials_dataframe()
        results.to_csv(output_path, index=False)
    return study

# Launch the simulation: 100,000 trials with the default scoring metric.
study = run_experiment(trials=100_000)

In [None]:
# Optimization-history figure for the finished study.
ov.plot_optimization_history(study=study)

In [None]:
# Parameter-importance figure for the sampled dataset parameters.
ov.plot_param_importances(study=study)

In [None]:
# Contour plots restricted to class separation, class weight and sample count.
ov.plot_contour(
    study=study,
    params=["class_sep", "weights", "n_samples"],
)