In [38]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA, KernelPCA
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
import optuna
import optuna.visualization as vis

n = 9

X = np.load('Datasets/kryptonite-%s-X.npy'%(n))
y = np.load('Datasets/kryptonite-%s-y.npy'%(n))

def denoise_bernoulli_data(X):
    """Denoise data by converting to binary values"""
    return (X >= 0.5).astype(float)

def objective(trial):
    ### ========== Shuffle and Split Data ========== ###

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    X_train_scaled = X_train
    X_val_scaled = X_val

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    ### ========== Dimensionality Reduction ========== ###

    # Suggest dimensionality reduction and kernel parameters
    # use_pca = trial.suggest_categorical('use_pca', ['True', 'False'])
    
    # params_pca = {}

    # if use_pca == 'True':
        # kernel_pca = trial.suggest_categorical('pca_kernel', ['linear', 'poly', 'rbf', 'sigmoid', 'cosine'])
        # kernel_pca = 'linear'

        # if kernel_pca == 'poly':
        #     params_pca['degree'] = trial.suggest_int('degree_pca', 1, 10)
        # if kernel_pca in ['poly', 'rbf', 'sigmoid']:
        #     params_pca['gamma'] = trial.suggest_float('gamma_pca', 0.1, 10)

        # params_pca['n_jobs'] = -1
        # params_pca['n_components'] = trial.suggest_int('n_pca_components', 1, max(1, X_train_scaled.shape[1]))


    # if use_pca == 'True':
    #     pca = PCA(**params_pca)

    #     X_train_scaled = pca.fit_transform(X_train_scaled)
    #     X_val_scaled = pca.transform(X_val)

    # LightGBM hyperparameters
    # learning_rate = trial.suggest_float('learning_rate', 1e-3, 0.3, log=True)
    # max_depth = trial.suggest_int('max_depth', -1, 20)
    num_leaves = trial.suggest_int('num_leaves', 2, 512)
    # n_estimators = trial.suggest_int('n_estimators', 50, 500, step=50)
    min_child_samples = trial.suggest_int('min_child_samples', 5, 100)
    # subsample = trial.suggest_float('subsample', 0.5, 1.0)
    # colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
    boosting_type = trial.suggest_categorical('boosting_type', ['dart', 'gbdt'])

    # Define and train classifier with suggested parameters
    # classifier = LGBMClassifier(
    #     learning_rate=learning_rate,
    #     max_depth=max_depth,
    #     num_leaves=num_leaves,
    #     n_estimators=n_estimators,
    #     min_child_samples=min_child_samples,
    #     subsample=subsample,
    #     colsample_bytree=colsample_bytree,
    #     n_jobs=-1,
    #     verbose=-1,
    #     verbose_eval=-1
    # )

    classifier = LGBMClassifier(
        num_leaves=num_leaves,
        min_child_samples=min_child_samples,
        objective='binary',
        boosting_type=boosting_type,
        n_jobs=-1,
        num_iterations=10000,
        device_type='cpu'
    )

    # Train the classifier
    classifier.fit(X_train_scaled, y_train)

    # Evaluate on the val set
    y_val_pred = classifier.predict(X_val_scaled)
    val_accuracy = accuracy_score(y_val, y_val_pred)

    # Report validation accuracy for pruning
    trial.report(val_accuracy, step=0)

    # Check if the trial should be pruned
    if trial.should_prune():
        raise optuna.TrialPruned()


    return val_accuracy

# Create a study object with a pruner
pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction="maximize", pruner=pruner)

# Optimize the objective function
study.optimize(objective, n_trials=50)

print("Best hyperparameters:", study.best_params)
print("Best value:", study.best_value)

# Visualize the optimization history and parameter importances
vis.plot_optimization_history(study).show()
vis.plot_param_importances(study).show()
vis.plot_slice(study).show()

[I 2024-11-12 10:49:23,012] A new study created in memory with name: no-name-366375f7-5e04-47ee-93c2-2efdd7b16223



Found `num_iterations` in params. Will use it instead of argument

[I 2024-11-12 10:49:28,791] Trial 0 finished with value: 0.5108333333333334 and parameters: {'num_leaves': 16, 'min_child_samples': 73, 'boosting_type': 'gbdt'}. Best is trial 0 with value: 0.5108333333333334.

Found `num_iterations` in params. Will use it instead of argument

[I 2024-11-12 10:50:40,167] Trial 1 finished with value: 0.5213888888888889 and parameters: {'num_leaves': 80, 'min_child_samples': 8, 'boosting_type': 'dart'}. Best is trial 1 with value: 0.5213888888888889.

Found `num_iterations` in params. Will use it instead of argument

[W 2024-11-12 10:54:55,002] Trial 2 failed with parameters: {'num_leaves': 391, 'min_child_samples': 79, 'boosting_type': 'dart'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/homes/fb224/fb224/MML-CW/venv/lib/python3.12/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
     

KeyboardInterrupt: 