In [1]:
import numpy as np
import joblib  # for saving the model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import optuna
import optuna.visualization as vis
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dimensionality
n = 18

In [5]:
X_train_total = np.load('../../Datasets_Train_Test_Split/kryptonite-%s-X_train.npy' % n)
y_train_total = np.load('../../Datasets_Train_Test_Split/kryptonite-%s-y_train.npy' % n)
X_test = np.load('../../Datasets_Train_Test_Split/kryptonite-%s-X_test.npy' % n)
y_test = np.load('../../Datasets_Train_Test_Split/kryptonite-%s-y_test.npy' % n)

# To hold the best model and accuracy score
best_model = None
best_accuracy = -1
SAVE_BEST_MODEL = False

In [6]:
def objective(trial):
    """
    Objective function for Optuna to optimize the hyperparameters of a Random Forest Classifier.

    Parameters:
    ----------
    trial : optuna.trial.Trial
        A trial object containing the current set of hyperparameters to test.

    Returns:
    -------
    float
        Validation accuracy for the current trial.

    Notes:
    -----
    - The function implements data scaling, PCA, and model training.
    - It supports pruning of underperforming trials.
    - The best model, scaler, and PCA instance are saved globally.
    """
    global best_model, best_scaler, best_accuracy, best_pca

    # Shuffle and split data
    X_train, X_val, y_train, y_val = train_test_split(X_train_total, y_train_total, test_size=0.8, random_state=42)

    # Feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # PCA dimensionality reduction
    pca = PCA(n_components=trial.suggest_int('n_components', 1, X_train_scaled.shape[1]))
    X_train_scaled = pca.fit_transform(X_train_scaled)
    X_val_scaled = pca.transform(X_val_scaled)

    # Random Forest Classifier parameters
    classifier = RandomForestClassifier(
        n_estimators=500,
        max_depth=trial.suggest_int('max_depth', 1, 30),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 20),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        random_state=42,
        n_jobs=-1
    )

    # Train the classifier
    classifier.fit(X_train_scaled, y_train)

    # Evaluate on the validation set
    y_val_pred = classifier.predict(X_val_scaled)
    val_accuracy = accuracy_score(y_val, y_val_pred)

    # Report validation accuracy for pruning
    trial.report(val_accuracy, step=0)

    # Check if the trial should be pruned
    if trial.should_prune():
        raise optuna.TrialPruned()

    # If this is the best model so far, save it
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        best_model = classifier  # Update the best model
        best_scaler = scaler
        best_pca = pca

        if SAVE_BEST_MODEL:
        # Save the best model
            joblib.dump(best_model, 'best_rf_model.joblib')
            print(f"New best model saved with accuracy: {best_accuracy:.4f}")

    return val_accuracy

In [None]:
# Create a study object with a pruner
pruner = optuna.pruners.MedianPruner(n_startup_trials=10)
study = optuna.create_study(direction="maximize", pruner=pruner)

# Optimize the objective function
study.optimize(objective, n_trials=100)

print("Best hyperparameters:", study.best_params)
print("Best value:", study.best_value)

# Evaluate on the validation set
X_test_scaled = best_scaler.transform(X_test)
X_test_scaled = best_pca.transform(X_test_scaled)
y_test_pred = best_model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'accuracy on test set:{test_accuracy}')

# Optional: Visualize optimization results
vis.plot_optimization_history(study).show()
vis.plot_param_importances(study).show()
vis.plot_slice(study).show()

[I 2024-11-25 17:16:06,356] A new study created in memory with name: no-name-074b392e-778c-4aa3-8f0d-d8e0057612db
[I 2024-11-25 17:16:07,629] Trial 0 finished with value: 0.5021267361111111 and parameters: {'n_components': 18, 'max_depth': 22, 'min_samples_split': 9, 'min_samples_leaf': 7, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.5021267361111111.
[I 2024-11-25 17:16:08,299] Trial 1 finished with value: 0.4967447916666667 and parameters: {'n_components': 1, 'max_depth': 6, 'min_samples_split': 15, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 0 with value: 0.5021267361111111.
[I 2024-11-25 17:16:09,274] Trial 2 finished with value: 0.5003472222222223 and parameters: {'n_components': 16, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 0 with value: 0.5021267361111111.
[I 2024-11-25 17:16:10,152] Trial 3 finished with value: 0.5018663194444445 and parameters: {'n_components': 7, 'max_depth': 18, 'min_samp

Best hyperparameters: {'n_components': 18, 'max_depth': 22, 'min_samples_split': 9, 'min_samples_leaf': 7, 'max_features': 'sqrt'}
Best value: 0.5021267361111111
accuracy on test set:0.48875


In [8]:
optimal_params_9: dict = {'n_estimators': 500, 'max_depth': 20, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'log2', 'n_components': 9}
optimal_params_12: dict = {'n_estimators': 500, 'max_depth': 25, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'n_components': 11}
optimal_params_15: dict = {'n_estimators': 500, 'n_components': 4, 'max_depth': 29, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
optimal_params_18: dict = {'n_estimators': 500, 'n_components': 15, 'max_depth': 17, 'min_samples_split': 14, 'min_samples_leaf': 17, 'max_features': 'log2'}

optimal_params: dict[int, dict] = {9: optimal_params_9, 12: optimal_params_12, 15: optimal_params_15, 18: optimal_params_18}

### Prediction for McNemar Test

In [None]:
def run_random_forest_fixed_datasets(n, optimal_params):
    """
    Runs a Random Forest classifier on fixed datasets, applies scaling, PCA, and evaluates the model's performance.

    Parameters:
    ----------
    n : int
        Dataset identifier to load the corresponding train and test datasets.
    optimal_params : dict
        Dictionary of hyperparameters for the Random Forest and PCA. Expected keys:
            - 'n_components': int, number of PCA components.
            - 'n_estimators': int, number of trees in the Random Forest.
            - 'max_depth': int, maximum depth of the trees.
            - 'min_samples_split': int, minimum samples required to split a node.
            - 'min_samples_leaf': int, minimum samples required at a leaf node.
            - 'max_features': int or str, number of features to consider for best split.

    Returns:
    -------
    None
        Outputs model predictions and probabilities to `.npy` files for future use.
        Print test accuracy to the console.

    Notes:
    -----
    - Datasets are expected to be stored in `../../Datasets_Train_Test_Split/` directory.
    - Predictions and probabilities are saved in `.npy` format for use in McNemar's test.
    """
    X_train = np.load('../../Datasets_Train_Test_Split/kryptonite-%s-X_train.npy' % n)
    y_train = np.load('../../Datasets_Train_Test_Split/kryptonite-%s-y_train.npy' % n)
    X_test = np.load('../../Datasets_Train_Test_Split/kryptonite-%s-X_test.npy' % n)
    y_test = np.load('../../Datasets_Train_Test_Split/kryptonite-%s-y_test.npy' % n)

    print(X_train.shape, X_test.shape)

    optimal_params = optimal_params

    # Feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # PCA dimensionality reduction
    pca = PCA(n_components=optimal_params['n_components'])
    X_train_scaled = pca.fit_transform(X_train_scaled)
    X_test_scaled = pca.transform(X_test_scaled)

    # Random Forest Classifier parameters
    classifier = RandomForestClassifier(
        n_estimators=optimal_params['n_estimators'],
        max_depth=optimal_params['max_depth'],
        min_samples_split=optimal_params['min_samples_split'],
        min_samples_leaf=optimal_params['min_samples_leaf'],
        max_features=optimal_params['max_features'],
        random_state=42,
        n_jobs=-1
    )

    # Train the classifier
    classifier.fit(X_train_scaled, y_train)

    # Evaluate on the test
    y_test_pred = classifier.predict(X_test_scaled)
    y_test_pred_proba = classifier.predict_proba(X_test_scaled)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print(f'accuracy on test set: {test_accuracy}')

    # Save predictions for McNemar test
    np.save('../../Datasets_Train_Test_Split/kryptonite_%s_pred_rf.npy' % n, y_test_pred)
    np.save('../../Datasets_Train_Test_Split/kryptonite_%s_pred_proba_rf.npy' % n, y_test_pred_proba)

In [None]:
list_n: list = [9, 12, 15, 18]

for n in list_n:
    run_random_forest_fixed_datasets(n=n, optimal_params=optimal_params[n])

(14400, 9) (3600, 9)
accuracy on test set: 0.9572222222222222


### K-Fold Cross-Validation


In [13]:
def run_k_fold_random_forest(n, optimal_params):
    """
    Performs k-fold cross-validation with a Random Forest Classifier on the given dataset.

    Parameters:
    ----------
    n : int
        Identifier for the dataset to load (e.g. 9, 12, 15, 18).
    optimal_params : dict
        Dictionary of hyperparameters for the Random Forest and PCA. Expected keys:
            - 'n_components': int, number of PCA components.
            - 'n_estimators': int, number of trees in the Random Forest.
            - 'max_depth': int, maximum depth of the trees.
            - 'min_samples_split': int, minimum samples required to split a node.
            - 'min_samples_leaf': int, minimum samples required at a leaf node.
            - 'max_features': str, method to select the number of features for splits ('sqrt' or 'log2').

    Returns:
    -------
    None
        Prints the mean accuracy, precision, recall, and F1 score across k folds.

    Notes:
    -----
    - The dataset is expected to be stored in the `../../Datasets` directory in `.npy` format.
    - Performs shuffling of the dataset before splitting it into folds.
    - Applies feature scaling using `StandardScaler` and dimensionality reduction using `PCA`.
    - Evaluation metrics include accuracy, precision, recall, and F1 score for each fold.
    - Uses weighted averaging for precision, recall, and F1 to handle class imbalance.
    """
    # Load the data
    X = np.load('../../Datasets/kryptonite-%s-X.npy' % n)
    y = np.load('../../Datasets/kryptonite-%s-y.npy' % n)

    print(X.shape, y.shape)

    # Hyperparameters
    optimal_params = optimal_params_9
    random_seed = 42
    k = 4  # Number of folds

    # Shuffle data
    np.random.seed(random_seed)
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    X = X[indices]
    y = y[indices]

    # Split indices into k folds
    fold_sizes = np.full(k, len(X) // k)
    fold_starts = np.cumsum(fold_sizes)
    folds = [indices[fold_starts[i - 1] if i > 0 else 0:fold_starts[i]] for i in range(k)]

    print(X.shape, folds[0])

    # k-Fold Cross-Validation
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []

    for i in range(k):
        # Create train/test splits
        test_idx = folds[i]
        train_idx = np.setdiff1d(indices, test_idx)
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Feature scaling
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # PCA dimensionality reduction
        pca = PCA(n_components=optimal_params['n_components'])
        X_train_scaled = pca.fit_transform(X_train_scaled)
        X_test_scaled = pca.transform(X_test_scaled)

        # Train the Random Forest Classifier
        classifier = RandomForestClassifier(
            n_estimators=optimal_params['n_estimators'],
            max_depth=optimal_params['max_depth'],
            min_samples_split=optimal_params['min_samples_split'],
            min_samples_leaf=optimal_params['min_samples_leaf'],
            max_features=optimal_params['max_features'],
            random_state=random_seed,
            n_jobs=-1
        )
        classifier.fit(X_train_scaled, y_train)

        # Evaluate on the test set
        y_test_pred = classifier.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_test_pred)
        precision = precision_score(y_test, y_test_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test, y_test_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_test_pred, average='weighted', zero_division=0)

        # Store results for each fold
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)

    # Compute average metrics
    mean_accuracy = np.mean(accuracies)
    mean_precision = np.mean(precisions)
    mean_recall = np.mean(recalls)
    mean_f1 = np.mean(f1_scores)

    print(f'Mean Accuracy over {k} folds: {mean_accuracy:.4f}')
    print(f'Mean Precision over {k} folds: {mean_precision:.4f}')
    print(f'Mean Recall over {k} folds: {mean_recall:.4f}')
    print(f'Mean F1 Score over {k} folds: {mean_f1:.4f}')

In [14]:
list_n: list = [9, 12, 15, 18]

for n in list_n:
    run_k_fold_random_forest(n=n, optimal_params=optimal_params[n])

(18000, 9) (18000,)
(18000, 9) [ 2574  7496  9210 ... 10260  4915 10423]
Mean Accuracy over 4 folds: 0.9519
Mean Precision over 4 folds: 0.9520
Mean Recall over 4 folds: 0.9519
Mean F1 Score over 4 folds: 0.9519
(24000, 12) (24000,)
(24000, 12) [ 3111 18679 17472 ... 20866 23445  9529]
Mean Accuracy over 4 folds: 0.8658
Mean Precision over 4 folds: 0.8661
Mean Recall over 4 folds: 0.8658
Mean F1 Score over 4 folds: 0.8658
(30000, 15) (30000,)
(30000, 15) [ 2308 22404 23397 ... 28201 19705 28313]
Mean Accuracy over 4 folds: 0.5952
Mean Precision over 4 folds: 0.5962
Mean Recall over 4 folds: 0.5952
Mean F1 Score over 4 folds: 0.5948
(36000, 18) (36000,)
(36000, 18) [16461 23579 23640 ...  8910 34919  7671]
Mean Accuracy over 4 folds: 0.5097
Mean Precision over 4 folds: 0.5099
Mean Recall over 4 folds: 0.5097
Mean F1 Score over 4 folds: 0.5094
