In [None]:
import numpy as np
import joblib  # for saving the model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
import optuna
import optuna.visualization as vis

# Dataset variant to load; the value is substituted into the file names below.
n = 18

# Feature matrix and labels, stored as separate .npy files.
X = np.load(f'Datasets/kryptonite-{n}-X.npy')
y = np.load(f'Datasets/kryptonite-{n}-y.npy')

# Hold out 10% of all data for the final model evaluation. The remaining 90%
# is the pool that objective() splits again into train/validation parts.
X_train_total, X_test, y_train_total, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Best-so-far state shared with objective() through `global`.
# All four names are initialised together so the test-set evaluation never
# hits an undefined name, and -1 guarantees the first finished trial wins.
best_model = None
best_scaler = None
best_pca = None
best_accuracy = -1

def objective(trial):
    """Optuna objective: fit scaler -> PCA -> random forest and score it.

    The training pool (``X_train_total`` / ``y_train_total``) is split with a
    fixed seed, so every trial sees the same train/validation partition and
    only the sampled hyperparameters differ between trials.

    Whenever a trial beats the best validation accuracy seen so far, the
    fitted classifier, scaler and PCA are stored in the module-level
    ``best_model`` / ``best_scaler`` / ``best_pca`` globals and written to
    disk with ``joblib``.

    Args:
        trial: The Optuna trial used to sample hyperparameters and to report
            the validation accuracy.

    Returns:
        The validation accuracy of the classifier trained in this trial.

    Raises:
        optuna.TrialPruned: If the study's pruner rejects the reported
            accuracy.
    """
    global best_model, best_scaler, best_accuracy, best_pca

    # Split the training pool into train and validation parts.
    # NOTE(review): test_size=0.8 puts 80% of the pool into the validation
    # set, so the model is fitted on only 20% of it -- confirm this is
    # intended and not a swapped 0.2.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_total, y_train_total, test_size=0.8, random_state=42
    )

    # Feature scaling: statistics come from the training part only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # PCA dimensionality reduction; the number of kept components is tuned
    # between 1 and the full feature count.
    pca = PCA(n_components=trial.suggest_int('n_components', 1, X_train_scaled.shape[1]))
    X_train_reduced = pca.fit_transform(X_train_scaled)
    X_val_reduced = pca.transform(X_val_scaled)

    # Random forest with tuned size/regularisation parameters; the seed is
    # fixed so a given parameter set always produces the same forest.
    classifier = RandomForestClassifier(
        n_estimators=trial.suggest_int('n_estimators', 10, 200),
        max_depth=trial.suggest_int('max_depth', 1, 30),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 20),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        random_state=42,
        n_jobs=-1
    )

    # Train the classifier.
    classifier.fit(X_train_reduced, y_train)

    # Evaluate on the validation set.
    y_val_pred = classifier.predict(X_val_reduced)
    val_accuracy = accuracy_score(y_val, y_val_pred)

    # Report the validation accuracy so the pruner can compare trials.
    # NOTE(review): the value is only known after the full fit, so a pruned
    # trial has already paid its whole training cost.
    trial.report(val_accuracy, step=0)

    if trial.should_prune():
        raise optuna.TrialPruned()

    # Keep and persist the best pipeline seen so far.
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        best_model = classifier
        best_scaler = scaler
        best_pca = pca
        joblib.dump(best_model, 'best_rf_model.joblib')
        # The classifier only understands scaled, PCA-projected inputs, so
        # the fitted preprocessing steps are saved next to it; without them
        # the stored model cannot be applied to raw features later.
        joblib.dump(best_scaler, 'best_rf_scaler.joblib')
        joblib.dump(best_pca, 'best_rf_pca.joblib')
        print(f"New best model saved with accuracy: {best_accuracy:.4f}")

    return val_accuracy

# Set up the search: maximise the value returned by objective(), using a
# median pruner configured with 10 start-up trials.
pruner = optuna.pruners.MedianPruner(n_startup_trials=10)
study = optuna.create_study(pruner=pruner, direction="maximize")

# Run 100 trials of the hyperparameter search.
study.optimize(objective, n_trials=100)

print("Best hyperparameters:", study.best_params)
print("Best value:", study.best_value)
# The best classifier itself is written to 'best_rf_model.joblib' from
# inside objective().

# Optional diagnostics: optimisation history, parameter importances and
# per-parameter slice plots, shown in that order.
for make_figure in (
    vis.plot_optimization_history,
    vis.plot_param_importances,
    vis.plot_slice,
):
    make_figure(study).show()

# Final check on the held-out test set: apply the best trial's scaler and
# PCA, in that order, before asking the best classifier for predictions.
X_test_scaled = best_pca.transform(best_scaler.transform(X_test))
y_test_pred = best_model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

[I 2024-11-13 22:25:27,562] A new study created in memory with name: no-name-848b6859-0e3c-43bb-aa51-ea009a014181
[I 2024-11-13 22:25:28,034] Trial 0 finished with value: 0.4979166666666667 and parameters: {'n_components': 3, 'n_estimators': 193, 'max_depth': 17, 'min_samples_split': 12, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 0 with value: 0.4979166666666667.


New best model saved with accuracy: 0.4979


[I 2024-11-13 22:25:28,631] Trial 1 finished with value: 0.5008873456790124 and parameters: {'n_components': 18, 'n_estimators': 166, 'max_depth': 25, 'min_samples_split': 16, 'min_samples_leaf': 8, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.5008873456790124.


New best model saved with accuracy: 0.5009


[I 2024-11-13 22:25:28,991] Trial 2 finished with value: 0.5010030864197531 and parameters: {'n_components': 3, 'n_estimators': 160, 'max_depth': 30, 'min_samples_split': 7, 'min_samples_leaf': 16, 'max_features': 'log2'}. Best is trial 2 with value: 0.5010030864197531.


New best model saved with accuracy: 0.5010


[I 2024-11-13 22:25:29,385] Trial 3 finished with value: 0.49753086419753084 and parameters: {'n_components': 8, 'n_estimators': 154, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 19, 'max_features': 'log2'}. Best is trial 2 with value: 0.5010030864197531.
[I 2024-11-13 22:25:29,558] Trial 4 finished with value: 0.5038966049382716 and parameters: {'n_components': 17, 'n_estimators': 38, 'max_depth': 14, 'min_samples_split': 19, 'min_samples_leaf': 10, 'max_features': 'sqrt'}. Best is trial 4 with value: 0.5038966049382716.
[I 2024-11-13 22:25:29,702] Trial 5 finished with value: 0.49888117283950617 and parameters: {'n_components': 7, 'n_estimators': 50, 'max_depth': 26, 'min_samples_split': 18, 'min_samples_leaf': 18, 'max_features': 'sqrt'}. Best is trial 4 with value: 0.5038966049382716.


New best model saved with accuracy: 0.5039


[I 2024-11-13 22:25:29,837] Trial 6 finished with value: 0.4977237654320988 and parameters: {'n_components': 4, 'n_estimators': 68, 'max_depth': 2, 'min_samples_split': 15, 'min_samples_leaf': 15, 'max_features': 'log2'}. Best is trial 4 with value: 0.5038966049382716.
[I 2024-11-13 22:25:30,252] Trial 7 finished with value: 0.5006172839506173 and parameters: {'n_components': 17, 'n_estimators': 172, 'max_depth': 8, 'min_samples_split': 13, 'min_samples_leaf': 18, 'max_features': 'log2'}. Best is trial 4 with value: 0.5038966049382716.
[I 2024-11-13 22:25:30,318] Trial 8 finished with value: 0.5034722222222222 and parameters: {'n_components': 4, 'n_estimators': 10, 'max_depth': 27, 'min_samples_split': 14, 'min_samples_leaf': 12, 'max_features': 'sqrt'}. Best is trial 4 with value: 0.5038966049382716.
[I 2024-11-13 22:25:30,466] Trial 9 finished with value: 0.49930555555555556 and parameters: {'n_components': 10, 'n_estimators': 78, 'max_depth': 2, 'min_samples_split': 7, 'min_samples_

New best model saved with accuracy: 0.5042


[I 2024-11-13 22:25:34,324] Trial 25 finished with value: 0.504591049382716 and parameters: {'n_components': 18, 'n_estimators': 53, 'max_depth': 23, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 25 with value: 0.504591049382716.


New best model saved with accuracy: 0.5046


[I 2024-11-13 22:25:34,583] Trial 26 finished with value: 0.5054783950617284 and parameters: {'n_components': 18, 'n_estimators': 54, 'max_depth': 22, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 26 with value: 0.5054783950617284.


New best model saved with accuracy: 0.5055


[I 2024-11-13 22:25:34,839] Trial 27 finished with value: 0.5047453703703704 and parameters: {'n_components': 18, 'n_estimators': 56, 'max_depth': 22, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 26 with value: 0.5054783950617284.
[I 2024-11-13 22:25:35,132] Trial 28 finished with value: 0.5032793209876543 and parameters: {'n_components': 15, 'n_estimators': 82, 'max_depth': 19, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 26 with value: 0.5054783950617284.
[I 2024-11-13 22:25:35,366] Trial 29 pruned. 
[I 2024-11-13 22:25:35,664] Trial 30 finished with value: 0.5020447530864197 and parameters: {'n_components': 16, 'n_estimators': 71, 'max_depth': 23, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 26 with value: 0.5054783950617284.
[I 2024-11-13 22:25:35,921] Trial 31 pruned. 
[I 2024-11-13 22:25:36,171] Trial 32 finished with value: 0.5018518518518519 and parameters:

New best model saved with accuracy: 0.5065
New best model saved with accuracy: 0.5096


[I 2024-11-13 22:25:38,797] Trial 39 finished with value: 0.5100308641975309 and parameters: {'n_components': 17, 'n_estimators': 29, 'max_depth': 29, 'min_samples_split': 3, 'min_samples_leaf': 7, 'max_features': 'log2'}. Best is trial 39 with value: 0.5100308641975309.
[I 2024-11-13 22:25:38,933] Trial 40 pruned. 


New best model saved with accuracy: 0.5100


[I 2024-11-13 22:25:39,155] Trial 41 pruned. 
[I 2024-11-13 22:25:39,320] Trial 42 finished with value: 0.5066743827160494 and parameters: {'n_components': 17, 'n_estimators': 25, 'max_depth': 28, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 39 with value: 0.5100308641975309.
[I 2024-11-13 22:25:39,481] Trial 43 finished with value: 0.503125 and parameters: {'n_components': 17, 'n_estimators': 25, 'max_depth': 29, 'min_samples_split': 6, 'min_samples_leaf': 8, 'max_features': 'log2'}. Best is trial 39 with value: 0.5100308641975309.
[I 2024-11-13 22:25:39,640] Trial 44 finished with value: 0.5058641975308642 and parameters: {'n_components': 17, 'n_estimators': 20, 'max_depth': 30, 'min_samples_split': 3, 'min_samples_leaf': 7, 'max_features': 'log2'}. Best is trial 39 with value: 0.5100308641975309.
[I 2024-11-13 22:25:39,760] Trial 45 finished with value: 0.504128086419753 and parameters: {'n_components': 15, 'n_estimators': 19, 'max_depth': 30

New best model saved with accuracy: 0.5102


[I 2024-11-13 22:25:41,917] Trial 57 finished with value: 0.5058256172839506 and parameters: {'n_components': 11, 'n_estimators': 38, 'max_depth': 27, 'min_samples_split': 4, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 56 with value: 0.5101851851851852.
[I 2024-11-13 22:25:42,082] Trial 58 pruned. 
[I 2024-11-13 22:25:42,393] Trial 59 pruned. 
[I 2024-11-13 22:25:42,619] Trial 60 finished with value: 0.5080632716049382 and parameters: {'n_components': 17, 'n_estimators': 45, 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 56 with value: 0.5101851851851852.
[I 2024-11-13 22:25:42,854] Trial 61 finished with value: 0.5095293209876544 and parameters: {'n_components': 17, 'n_estimators': 39, 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 56 with value: 0.5101851851851852.
[I 2024-11-13 22:25:43,094] Trial 62 finished with value: 0.5074074074074074 and parameters:

Best hyperparameters: {'n_components': 17, 'n_estimators': 38, 'max_depth': 28, 'min_samples_split': 4, 'min_samples_leaf': 5, 'max_features': 'log2'}
Best value: 0.5101851851851852
Best model saved as 'best_rf_model_dim_{n}.joblib'.


Test Accuracy: 0.4939
