In [1]:
from sklearn.ensemble import IsolationForest
import numpy as np
import sys, os
import pandas as pd

In [2]:
sys.path.insert(0, '../../')
from src.functions import Data, Modeling, Evaluation

dt = Data()
mod = Modeling()
eval = Evaluation()

In [3]:
# Get data directory
data_dir = os.path.join(os.getcwd(), '../../data/processed')


# relative paths
rel_path_X_train = 'X_train.pkl'
rel_path_X_val = 'X_val.pkl'
rel_path_X_test = 'X_test.pkl'
rel_path_y_train = 'y_train.pkl'
rel_path_y_val = 'y_val.pkl'
rel_path_y_test = 'y_test.pkl'

# absolute paths
abs_path_X_train = os.path.join(data_dir, rel_path_X_train)
abs_path_X_val = os.path.join(data_dir, rel_path_X_val)
abs_path_X_test = os.path.join(data_dir, rel_path_X_test)
abs_path_y_train = os.path.join(data_dir, rel_path_y_train)
abs_path_y_val = os.path.join(data_dir, rel_path_y_val)
abs_path_y_test = os.path.join(data_dir, rel_path_y_test)

# read files
X_train = pd.read_pickle(abs_path_X_train)
X_val = pd.read_pickle(abs_path_X_val)
X_test = pd.read_pickle(abs_path_X_test)
y_train = pd.read_pickle(abs_path_y_train)
y_val = pd.read_pickle(abs_path_y_val)
y_test = pd.read_pickle(abs_path_y_test)


In [4]:
# Save column names
X_val_df = X_val

# Convert pandas dataframes to numpy arrays for memory efficiency
X_train = X_train.values
X_val = X_val.values
y_train = y_train.values
y_val = y_val.values

In [5]:
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import precision_score, f1_score, average_precision_score

In [11]:
def if_hypertune(param_grid, X_train, y_train):
    '''Hyperparameter tuning for a model using GridSearchCV'''

    # Initialize the model
    clf = IsolationForest(random_state=42)

    # Initialize the grid search
    grid_search = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=5, n_jobs=-1, refit=True)

    # Fit the grid search on your training data
    grid_search.fit(X_train, y_train)

    # Print the best parameters and the best score
    print(f"Best parameters: {grid_search.best_params_}")
    best_auroc = grid_search.best_score_
    print(f"Best AUROC: {best_auroc}")

    # Get the best parameters and the best estimator
    best_params = grid_search.best_params_
    best_estimator = grid_search.best_estimator_

    # Predict the labels of the training set using cross-validation
    y_train_pred = cross_val_predict(best_estimator, X_train, y_train, cv=5)

    # Calculate the precision, F1 score, and PR AUC
    precision = precision_score(y_train, y_train_pred)
    f1 = f1_score(y_train, y_train_pred)
    pr_auc = average_precision_score(y_train, y_train_pred)

    # Return the best parameters, the best estimator, and the metrics
    return best_params, best_estimator, {'AUROC': best_auroc, 'Precision': precision, 'F1 Score': f1, 'PR AUC': pr_auc}

In [13]:
# Define the parameter grid
tm_param_grid = {
    'n_estimators': [50, 100, 150, 200, 250, 300, 350, 400],
    'max_samples': ['auto', 50, 100, 128, 200, 256, 300],
    'contamination': [0.01, 0.05, 0.1, 0.15, 0.2],
}


# Fine tune the model
tm_best_params, tm_best_estimator, metrics = if_hypertune(tm_param_grid, X_train, y_train)

# Use the best estimator to make predictions
tm_pred = tm_best_estimator.predict(X_val)

Best parameters: {'contamination': 0.01, 'max_samples': 50, 'n_estimators': 350}
Best AUROC: 0.4723178043317059


In [14]:
from sklearn.model_selection import StratifiedKFold


In [15]:
def if_hypertune(param_grid, X_train, y_train):
    '''Hyperparameter tuning for a model using GridSearchCV'''

    # Initialize the model
    clf = IsolationForest(random_state=42)

    # Initialize the grid search
    grid_search = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=5, n_jobs=-1, refit=True)

    # Fit the grid search on your training data
    grid_search.fit(X_train, y_train)

    # Print the best parameters and the best score
    print(f"Best parameters: {grid_search.best_params_}")
    best_auroc = grid_search.best_score_
    print(f"Best AUROC: {best_auroc}")

    # Get the best parameters and the best estimator
    best_params = grid_search.best_params_
    best_estimator = grid_search.best_estimator_

    # Initialize StratifiedKFold
    skf = StratifiedKFold(n_splits=5)

    # Use it in cross_val_predict
    y_train_pred = cross_val_predict(best_estimator, X_train, y_train, cv=skf)

    # Calculate the precision, F1 score, and PR AUC
    precision = precision_score(y_train, y_train_pred)
    f1 = f1_score(y_train, y_train_pred)
    pr_auc = average_precision_score(y_train, y_train_pred)

    # Return the best parameters, the best estimator, and the metrics
    return best_params, best_estimator, {'AUROC': best_auroc, 'Precision': precision, 'F1 Score': f1, 'PR AUC': pr_auc}

In [16]:
# Define the parameter grid
tm_param_grid = {
    'n_estimators': [50, 100, 200, 300, 400],
    'max_samples': ['auto', 50, 128, 256],
    'contamination': [0.01, 0.05, 0.1, 0.2],
}


# Fine tune the model
tm_best_params, tm_best_estimator, metrics = if_hypertune(tm_param_grid, X_train, y_train)

# Use the best estimator to make predictions
tm_pred = tm_best_estimator.predict(X_val)

Best parameters: {'contamination': 0.01, 'max_samples': 50, 'n_estimators': 300}
Best AUROC: 0.47095260623075247
