In [None]:
import time
import warnings

import numpy as np
import pandas as pd
import optuna
from optuna.samplers import RandomSampler

import lightgbm as lgb
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold

import shap
import matplotlib.pyplot as plt

warnings.simplefilter(action='ignore', category=Warning)

In [None]:
def objective(trial, scoring_metric='roc_auc'):
    """Score a default LightGBM classifier on a synthetic dataset sampled by Optuna.

    The trial picks the size, feature composition, cluster structure, class
    balance and class separation of a binary classification problem built
    with ``make_classification``. An untuned ``LGBMClassifier`` is then
    evaluated on it with 5-fold stratified cross-validation.

    :param trial: Optuna trial used to sample the dataset parameters.
    :param scoring_metric: scikit-learn scoring name passed to ``cross_val_score``.
    :return: Mean cross-validation score over the 5 folds.
    """

    # dataset size and feature composition; every feature ends up being
    # informative, redundant or repeated (no pure-noise columns), and the
    # upper bound on n_redundant guarantees at least one repeated feature
    n_samples = trial.suggest_int('n_samples', 100, 1000000, log=True)
    n_features = trial.suggest_int('n_features', 3, 20)
    n_informative = trial.suggest_int('n_informative', 1, n_features - 2)
    n_redundant = trial.suggest_int('n_redundant', 0, n_features - n_informative - 1)
    n_repeated = n_features - n_informative - n_redundant

    # ensure the condition: n_classes(2) * n_clusters_per_class <= 2**n_informative
    max_clusters = min(5, (2 ** n_informative) // 2)
    n_clusters_per_class = trial.suggest_int('n_clusters_per_class', 1, max_clusters)

    # class balance and separation; suggest_float replaces the deprecated
    # suggest_uniform / suggest_loguniform while keeping names and ranges
    weights = [trial.suggest_float('weights', 0.01, 0.99)]
    class_sep = trial.suggest_float('class_sep', 0.01, 10.0, log=True)

    # generate dataset
    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_informative,
        n_redundant=n_redundant,
        n_repeated=n_repeated,
        n_classes=2,
        n_clusters_per_class=n_clusters_per_class,
        weights=weights,
        class_sep=class_sep,
        flip_y=0.01,
        random_state=42
    )

    # define LightGBM classifier with default hyper-parameters
    model = lgb.LGBMClassifier(verbose=-1)

    # Stratify the folds: with class weights as low as 0.01 and as few as 100
    # samples, unstratified folds frequently contain a single class, which
    # makes metrics such as roc_auc undefined for that fold.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = cross_val_score(model, X, y, cv=cv, scoring=scoring_metric)

    return fold_scores.mean()

In [None]:
def run_experiment(trials=1000, scoring_metric='roc_auc', max_hours=None,
                   seed=None, results_path='simulation_results.csv'):
    """
    Runs the Optuna experiment up to 'trials' iterations OR
    until 'max_hours' hours have elapsed (whichever is reached first).

    The trials recorded so far are written to 'results_path' even when the
    optimization is interrupted or raises, so a long run is not lost.

    :param trials: Maximum number of trials.
    :param scoring_metric: Scoring metric for cross-validation.
    :param max_hours: If provided, time limit in hours for the experiment.
    :param seed: Optional seed for the random sampler; None keeps the
        sampler unseeded (a different parameter sequence on every run).
    :param results_path: Destination of the CSV with the trials table.
    :return: Optuna study object.
    """
    optuna.logging.set_verbosity(optuna.logging.ERROR)

    # create the study
    study = optuna.create_study(sampler=RandomSampler(seed=seed), direction='maximize')

    # convert hours to seconds for the 'timeout' parameter; the value is not
    # truncated to an integer so sub-second limits do not collapse to 0
    timeout_sec = None if max_hours is None else max_hours * 3600

    try:
        # run optimization
        study.optimize(
            lambda trial: objective(trial, scoring_metric=scoring_metric),
            n_trials=trials,
            timeout=timeout_sec,
            show_progress_bar=True
        )
    finally:
        # save whatever has been collected, including after an interrupt
        study.trials_dataframe().to_csv(results_path, index=False)

    return study

In [None]:
def shap_analysis(study):
    """
    Runs a SHAP analysis on the results of the study, using a surrogate model
    (LGBM Regressor) trained on the param-values -> final score relation.

    Only completed trials with a non-missing score are used; trials without
    a score (e.g. failed ones) appear with a missing ``value`` in the trials
    DataFrame and must not reach the regression target.

    :param study: Optuna study whose trials are analysed.
    :return: None. The beeswarm and bar plots are shown as a side effect; a
        message is printed and nothing is plotted when no usable data exists.
    """

    # convert study results to DataFrame
    results_df = study.trials_dataframe()

    # identify parameter columns (those that start with 'params_')
    param_cols = [col for col in results_df.columns if col.startswith('params_')]

    if not param_cols:
        print("No parameter columns found in study DataFrame.")
        return

    # keep only trials that actually produced a score
    usable = results_df['value'].notna()
    if 'state' in results_df.columns:
        usable &= results_df['state'] == 'COMPLETE'
    results_df = results_df.loc[usable]

    if results_df.empty:
        print("No completed trials with a valid score; skipping SHAP analysis.")
        return

    X = results_df[param_cols].copy()
    y = results_df['value'].copy()

    # fit surrogate model
    surrogate = lgb.LGBMRegressor(random_state=42)
    surrogate.fit(X, y)

    # explain predictions using SHAP
    explainer = shap.Explainer(surrogate, X)
    shap_values = explainer(X)

    # SHAP plots
    print("Generating SHAP plots ...")
    shap.plots.beeswarm(shap_values, show=True)
    shap.plots.bar(shap_values, show=True)

In [None]:
# Launch the random search: capped at 10,000 trials and at 2 hours of wall time.
study = run_experiment(max_hours=2, trials=10_000)

In [None]:
# With the study finished, explain the score via SHAP on a surrogate model.
shap_analysis(study=study)

In [None]:
# Objective value per trial; referenced through the already-imported `optuna`
# package because no `ov` alias is defined in this notebook.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Pairwise contour plots of the score over three dataset parameters; referenced
# through the already-imported `optuna` package because no `ov` alias is defined.
optuna.visualization.plot_contour(study, params=["class_sep", "weights", "n_samples"])