In [9]:
import numpy as np
import joblib  # for saving the model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
import optuna
import optuna.visualization as vis

# Dataset variant to load; selects the kryptonite-<n> X/y file pair.
n = 9

X = np.load(f'Datasets/kryptonite-{n}-X.npy')
y = np.load(f'Datasets/kryptonite-{n}-y.npy')

# Hold out 10% of all data for the final model evaluation; the remaining 90%
# is what objective() splits into train/validation on every trial.
X_train_total, X_test, y_train_total, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Best artefacts found so far. Every name that objective() declares `global`
# is initialised here so it exists even before any trial has completed.
best_model = None
best_scaler = None
best_pca = None
best_accuracy = -1

def objective(trial):
    """Optuna objective: fit scaler -> PCA -> random forest, return validation accuracy.

    Args:
        trial: The Optuna trial used to sample ``n_components``, ``max_depth``,
            ``min_samples_split``, ``min_samples_leaf`` and ``max_features``.

    Returns:
        Accuracy of the fitted classifier on the validation split.

    Raises:
        optuna.TrialPruned: If the study's pruner rejects the reported accuracy.

    Side effects:
        When the accuracy beats the module-level ``best_accuracy``, the
        ``best_model`` / ``best_scaler`` / ``best_pca`` / ``best_accuracy``
        globals are updated and the classifier, scaler and PCA are written
        to disk with joblib.
    """
    global best_model, best_scaler, best_accuracy, best_pca

    # The fixed random_state makes this split identical on every trial, so
    # trials are compared on the same validation data.
    # NOTE(review): test_size=0.8 trains on only 20% of X_train_total and
    # validates on 80% -- confirm this is intended and not a swapped ratio.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_total, y_train_total, test_size=0.8, random_state=42
    )

    # Feature scaling: fit on the training part only, then apply to validation.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # PCA dimensionality reduction; the component count is a tuned parameter
    # ranging from 1 up to the full feature count.
    pca = PCA(n_components=trial.suggest_int('n_components', 1, X_train_scaled.shape[1]))
    X_train_reduced = pca.fit_transform(X_train_scaled)
    X_val_reduced = pca.transform(X_val_scaled)

    # Random forest with a fixed ensemble size; the tree-shape parameters are tuned.
    classifier = RandomForestClassifier(
        n_estimators=500,
        max_depth=trial.suggest_int('max_depth', 1, 30),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 20),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        random_state=42,
        n_jobs=-1,
    )
    classifier.fit(X_train_reduced, y_train)

    # Evaluate on the validation set
    val_accuracy = accuracy_score(y_val, classifier.predict(X_val_reduced))

    # Report the single score and let the pruner decide. This happens after the
    # full fit, so pruning does not save compute here; it only marks the trial
    # as pruned instead of completed.
    trial.report(val_accuracy, step=0)
    if trial.should_prune():
        raise optuna.TrialPruned()

    # If this is the best model so far, keep it in memory and on disk.
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        best_model = classifier
        best_scaler = scaler
        best_pca = pca
        joblib.dump(best_model, 'best_rf_model.joblib')
        # The classifier only accepts scaled, PCA-projected features, so the
        # fitted preprocessing steps are persisted next to it; without them
        # the saved model cannot be applied to raw inputs.
        joblib.dump(best_scaler, 'best_rf_scaler.joblib')
        joblib.dump(best_pca, 'best_rf_pca.joblib')
        print(f"New best model saved with accuracy: {best_accuracy:.4f}")

    return val_accuracy

# Build a maximising study; the median pruner stays inactive for the first
# 10 trials (n_startup_trials).
pruner = optuna.pruners.MedianPruner(n_startup_trials=10)
study = optuna.create_study(pruner=pruner, direction="maximize")

# Run the hyperparameter search.
study.optimize(objective, n_trials=100)

print("Best hyperparameters:", study.best_params)
print("Best value:", study.best_value)

# Evaluate the best model on the held-out test set, applying the same
# preprocessing it was trained behind (scaler first, then PCA).
X_test_scaled = best_pca.transform(best_scaler.transform(X_test))
y_test_pred = best_model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'accuracy on test set:{test_accuracy}')

# Optional: visualize optimization results
for make_plot in (
    vis.plot_optimization_history,
    vis.plot_param_importances,
    vis.plot_slice,
):
    make_plot(study).show()


[I 2024-11-13 22:34:02,613] A new study created in memory with name: no-name-c6d46762-4dfb-4f7b-9b2e-bf567e656b5f
[I 2024-11-13 22:34:03,508] Trial 0 finished with value: 0.8425154320987654 and parameters: {'n_components': 5, 'max_depth': 26, 'min_samples_split': 19, 'min_samples_leaf': 4, 'max_features': 'log2'}. Best is trial 0 with value: 0.8425154320987654.


New best model saved with accuracy: 0.8425


[I 2024-11-13 22:34:04,425] Trial 1 finished with value: 0.8758487654320988 and parameters: {'n_components': 9, 'max_depth': 24, 'min_samples_split': 16, 'min_samples_leaf': 15, 'max_features': 'log2'}. Best is trial 1 with value: 0.8758487654320988.


New best model saved with accuracy: 0.8758


[I 2024-11-13 22:34:05,136] Trial 2 finished with value: 0.7908950617283951 and parameters: {'n_components': 6, 'max_depth': 8, 'min_samples_split': 14, 'min_samples_leaf': 12, 'max_features': 'log2'}. Best is trial 1 with value: 0.8758487654320988.
[I 2024-11-13 22:34:05,830] Trial 3 finished with value: 0.7876543209876543 and parameters: {'n_components': 7, 'max_depth': 9, 'min_samples_split': 8, 'min_samples_leaf': 16, 'max_features': 'log2'}. Best is trial 1 with value: 0.8758487654320988.
[I 2024-11-13 22:34:06,541] Trial 4 finished with value: 0.837037037037037 and parameters: {'n_components': 5, 'max_depth': 13, 'min_samples_split': 9, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.8758487654320988.
[I 2024-11-13 22:34:07,231] Trial 5 finished with value: 0.6833333333333333 and parameters: {'n_components': 4, 'max_depth': 7, 'min_samples_split': 18, 'min_samples_leaf': 7, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.8758487654320988.
[I 2

New best model saved with accuracy: 0.8967


[I 2024-11-13 22:34:10,859] Trial 10 pruned. 
[I 2024-11-13 22:34:11,818] Trial 11 finished with value: 0.9053240740740741 and parameters: {'n_components': 9, 'max_depth': 22, 'min_samples_split': 15, 'min_samples_leaf': 10, 'max_features': 'log2'}. Best is trial 11 with value: 0.9053240740740741.


New best model saved with accuracy: 0.9053


[I 2024-11-13 22:34:12,758] Trial 12 finished with value: 0.9056327160493827 and parameters: {'n_components': 9, 'max_depth': 30, 'min_samples_split': 13, 'min_samples_leaf': 10, 'max_features': 'log2'}. Best is trial 12 with value: 0.9056327160493827.


New best model saved with accuracy: 0.9056


[I 2024-11-13 22:34:13,629] Trial 13 finished with value: 0.9056327160493827 and parameters: {'n_components': 9, 'max_depth': 29, 'min_samples_split': 12, 'min_samples_leaf': 10, 'max_features': 'log2'}. Best is trial 12 with value: 0.9056327160493827.
[I 2024-11-13 22:34:14,484] Trial 14 finished with value: 0.8873456790123457 and parameters: {'n_components': 8, 'max_depth': 30, 'min_samples_split': 12, 'min_samples_leaf': 10, 'max_features': 'log2'}. Best is trial 12 with value: 0.9056327160493827.
[I 2024-11-13 22:34:15,447] Trial 15 finished with value: 0.9097993827160494 and parameters: {'n_components': 9, 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 9, 'max_features': 'log2'}. Best is trial 15 with value: 0.9097993827160494.


New best model saved with accuracy: 0.9098


[I 2024-11-13 22:34:16,231] Trial 16 finished with value: 0.8749228395061729 and parameters: {'n_components': 7, 'max_depth': 17, 'min_samples_split': 2, 'min_samples_leaf': 8, 'max_features': 'log2'}. Best is trial 15 with value: 0.9097993827160494.
[I 2024-11-13 22:34:17,063] Trial 17 finished with value: 0.8733024691358025 and parameters: {'n_components': 8, 'max_depth': 27, 'min_samples_split': 6, 'min_samples_leaf': 12, 'max_features': 'log2'}. Best is trial 15 with value: 0.9097993827160494.
[I 2024-11-13 22:34:17,864] Trial 18 finished with value: 0.9094907407407408 and parameters: {'n_components': 7, 'max_depth': 17, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 15 with value: 0.9097993827160494.
[I 2024-11-13 22:34:18,677] Trial 19 finished with value: 0.9087191358024691 and parameters: {'n_components': 7, 'max_depth': 16, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 15 with value: 0.9097993827160

New best model saved with accuracy: 0.9193


[I 2024-11-13 22:34:22,166] Trial 23 finished with value: 0.9172067901234567 and parameters: {'n_components': 8, 'max_depth': 18, 'min_samples_split': 7, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 22 with value: 0.9192901234567902.
[I 2024-11-13 22:34:23,095] Trial 24 finished with value: 0.9165123456790123 and parameters: {'n_components': 8, 'max_depth': 19, 'min_samples_split': 7, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 22 with value: 0.9192901234567902.
[I 2024-11-13 22:34:23,982] Trial 25 finished with value: 0.9168981481481482 and parameters: {'n_components': 8, 'max_depth': 19, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 22 with value: 0.9192901234567902.
[I 2024-11-13 22:34:24,831] Trial 26 finished with value: 0.8966820987654321 and parameters: {'n_components': 8, 'max_depth': 13, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 22 with value: 0.9192901234567

New best model saved with accuracy: 0.9334


[I 2024-11-13 22:34:30,726] Trial 33 finished with value: 0.926466049382716 and parameters: {'n_components': 9, 'max_depth': 14, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 32 with value: 0.9334104938271605.
[I 2024-11-13 22:34:31,578] Trial 34 finished with value: 0.9248456790123457 and parameters: {'n_components': 9, 'max_depth': 14, 'min_samples_split': 11, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 32 with value: 0.9334104938271605.
[I 2024-11-13 22:34:32,425] Trial 35 finished with value: 0.9247685185185185 and parameters: {'n_components': 9, 'max_depth': 13, 'min_samples_split': 11, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 32 with value: 0.9334104938271605.
[I 2024-11-13 22:34:33,260] Trial 36 finished with value: 0.9151234567901234 and parameters: {'n_components': 9, 'max_depth': 14, 'min_samples_split': 11, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 32 with value: 0.933410493827

New best model saved with accuracy: 0.9362


[I 2024-11-13 22:35:16,639] Trial 85 finished with value: 0.9380401234567901 and parameters: {'n_components': 9, 'max_depth': 21, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 85 with value: 0.9380401234567901.


New best model saved with accuracy: 0.9380


[I 2024-11-13 22:35:17,586] Trial 86 finished with value: 0.9256172839506173 and parameters: {'n_components': 8, 'max_depth': 20, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 85 with value: 0.9380401234567901.
[I 2024-11-13 22:35:18,675] Trial 87 finished with value: 0.9374228395061729 and parameters: {'n_components': 9, 'max_depth': 22, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 85 with value: 0.9380401234567901.
[I 2024-11-13 22:35:19,709] Trial 88 finished with value: 0.9374228395061729 and parameters: {'n_components': 9, 'max_depth': 22, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 85 with value: 0.9380401234567901.
[I 2024-11-13 22:35:20,644] Trial 89 finished with value: 0.928858024691358 and parameters: {'n_components': 8, 'max_depth': 22, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 85 with value: 0.938040123456790

Best hyperparameters: {'n_components': 9, 'max_depth': 21, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
Best value: 0.9380401234567901
accuracy on test set:0.9366666666666666
