In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA, KernelPCA
from sklearn.ensemble import RandomForestClassifier
import optuna
import optuna.visualization as vis

# Identifier substituted into the dataset file names below.
n = 9

# Feature matrix and labels are stored as two separate .npy files that share
# the same identifier in their names.
X = np.load('Datasets/kryptonite-%s-X.npy' % n)
y = np.load('Datasets/kryptonite-%s-y.npy' % n)

def denoise_bernoulli_data(X):
    """Binarize an array by thresholding at 0.5.

    Entries greater than or equal to 0.5 become 1.0; all others become 0.0.

    Args:
        X: Array supporting elementwise ``>=`` comparison with a float.

    Returns:
        Array of floats of the same shape as ``X`` containing only 0.0 and 1.0.
    """
    at_or_above_half = X >= 0.5
    return at_or_above_half.astype(float)

def objective(trial):
    """Optuna objective: validation accuracy of a random forest, optionally after PCA.

    The module-level ``X`` / ``y`` arrays are split with a fixed seed,
    standardized with statistics fitted on the training part only, optionally
    projected with PCA, and then used to fit a ``RandomForestClassifier`` whose
    hyperparameters are sampled from ``trial``.

    Args:
        trial: The Optuna trial used to sample hyperparameters and to report
            the resulting score.

    Returns:
        Accuracy of the fitted classifier on the validation split.

    Raises:
        optuna.TrialPruned: If ``trial.should_prune()`` is true after the
            validation accuracy has been reported.
    """
    ### ========== Shuffle and Split Data ========== ###

    # NOTE(review): test_size=0.8 trains on only 20% of the samples and
    # validates on the remaining 80% -- confirm this ratio is intended.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.8, random_state=42)

    # Fit the scaler on the training split only, then reuse it for validation.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    ### ========== Dimensionality Reduction ========== ###

    # The flag is sampled as the strings 'True' / 'False', not as booleans.
    use_pca = trial.suggest_categorical('use_pca', ['True', 'False'])

    if use_pca == 'True':
        # Only plain (linear) PCA is searched; the component count may range
        # from 1 up to the number of input features.
        n_components = trial.suggest_int('n_pca_components', 1, max(1, X_train_scaled.shape[1]))
        pca = PCA(n_components=n_components)

        X_train_scaled = pca.fit_transform(X_train_scaled)
        # Project the *scaled* validation data: the PCA was fitted on scaled
        # training features, so feeding it the raw X_val would put the
        # validation set in a different space than the training set.
        X_val_scaled = pca.transform(X_val_scaled)

    ### ========== Classifier ========== ###

    # Random Forest hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 100, 300, step=50)
    max_depth = trial.suggest_int('max_depth', 5, 30, step=5)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])

    classifier = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        n_jobs=-1
    )
    classifier.fit(X_train_scaled, y_train)

    # Evaluate on the validation split
    y_val_pred = classifier.predict(X_val_scaled)
    val_accuracy = accuracy_score(y_val, y_val_pred)

    # Report the score as a single intermediate value (step 0) so the study's
    # pruner can judge the trial.
    trial.report(val_accuracy, step=0)

    if trial.should_prune():
        raise optuna.TrialPruned()

    return val_accuracy

# Build a study that maximizes the objective and uses a median pruner.
pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction="maximize", pruner=pruner)

# Run the hyperparameter search.
study.optimize(objective, n_trials=100)

# Summarize the best trial found.
print("Best hyperparameters:", study.best_params)
print("Best value:", study.best_value)

# Render the diagnostic plots in order: optimization history, parameter
# importances, then per-parameter slices.
for make_figure in (
    vis.plot_optimization_history,
    vis.plot_param_importances,
    vis.plot_slice,
):
    make_figure(study).show()

[I 2024-11-12 09:49:28,485] A new study created in memory with name: no-name-b19b546a-b4bd-4ea9-8b7a-abd8ddcd02ce
[I 2024-11-12 09:49:28,987] Trial 0 finished with value: 0.5038194444444445 and parameters: {'use_pca': 'False', 'n_estimators': 250, 'max_depth': 5, 'min_samples_split': 15, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.5038194444444445.
[I 2024-11-12 09:49:29,293] Trial 1 finished with value: 0.5022222222222222 and parameters: {'use_pca': 'False', 'n_estimators': 100, 'max_depth': 20, 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 0 with value: 0.5038194444444445.
[I 2024-11-12 09:49:29,748] Trial 2 finished with value: 0.5070138888888889 and parameters: {'use_pca': 'False', 'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_features': 'log2'}. Best is trial 2 with value: 0.5070138888888889.
[I 2024-11-12 09:49:30,345] Trial 3 finished with value: 0.50381944444444

Best hyperparameters: {'use_pca': 'True', 'n_pca_components': 3, 'n_estimators': 100, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2'}
Best value: 0.5245833333333333
