In [1]:
from sklearn.ensemble import IsolationForest
import numpy as np
import sys, os
import pandas as pd

In [2]:
sys.path.insert(0, '../../')
from src.functions import Data, Modeling, Evaluation

dt = Data()
mod = Modeling()
eval = Evaluation()

In [3]:
# Get data directory
data_dir = os.path.join(os.getcwd(), '../../data/processed')


# relative paths
rel_path_X_train = 'X_train.pkl'
rel_path_X_val = 'X_val.pkl'
rel_path_X_test = 'X_test.pkl'
rel_path_y_train = 'y_train.pkl'
rel_path_y_val = 'y_val.pkl'
rel_path_y_test = 'y_test.pkl'

# absolute paths
abs_path_X_train = os.path.join(data_dir, rel_path_X_train)
abs_path_X_val = os.path.join(data_dir, rel_path_X_val)
abs_path_X_test = os.path.join(data_dir, rel_path_X_test)
abs_path_y_train = os.path.join(data_dir, rel_path_y_train)
abs_path_y_val = os.path.join(data_dir, rel_path_y_val)
abs_path_y_test = os.path.join(data_dir, rel_path_y_test)

# read files
X_train = pd.read_pickle(abs_path_X_train)
X_val = pd.read_pickle(abs_path_X_val)
X_test = pd.read_pickle(abs_path_X_test)
y_train = pd.read_pickle(abs_path_y_train)
y_val = pd.read_pickle(abs_path_y_val)
y_test = pd.read_pickle(abs_path_y_test)


In [4]:
# Save column names
X_val_df = X_val

# Convert pandas dataframes to numpy arrays for memory efficiency
X_train = X_train.values
X_val = X_val.values
y_train = y_train.values
y_val = y_val.values

In [90]:
from sklearn.neighbors import LocalOutlierFactor

bm = LocalOutlierFactor(n_neighbors=35,
                        contamination=0.05, # Lower contamination to make the model more conservative in predicting anomalies
                        novelty=True,
                        algorithm='auto',
                        leaf_size=50,
                        metric='minkowski',
                        p=3)
bm.fit(X_train)
bm_pred = bm.predict(X_val)

In [91]:
bm_cm = eval.get_cm(bm_pred, y_val)
eval.cm_inf(bm_cm, 'LOF')

Confusion Matrix for model LOF:

True Positives: 402
True Negatives: 108528
False Positives: 5447
False Negatives: 3731




In [88]:
# Initialize the model
bm = IsolationForest(contamination=0.001, # Lower contamination makes the model more conservative in classifying instances as anomalies
                     n_jobs=-1,
                     random_state=42,
                     n_estimators=2000, # Increase the number of base estimators
                     max_samples=1.0, # Use all samples for training each base estimator
                     max_features=1.0, # Use all features for training each base estimator
                     bootstrap=True)
bm.fit(X_train)
bm_pred = bm.predict(X_val)

In [6]:
# Get the threshold used by the predict method
threshold = bm.offset_
threshold

-0.6168985779915752

In [89]:
bm_cm = eval.get_cm(bm_pred, y_val)
eval.cm_inf(bm_cm, 'Base Model - Isolation Forest')

Confusion Matrix for model Base Model - Isolation Forest:

True Positives: 7
True Negatives: 113856
False Positives: 119
False Negatives: 4126




In [None]:
y_val

array([[ 1],
       [ 1],
       [ 1],
       ...,
       [ 1],
       [-1],
       [ 1]])

In [9]:
import numpy as np

# Flatten y_val to 1D array
y_val_flat = y_val.flatten()

# Count the number of normal instances (1) and anomalies (-1)
normal_count = np.sum(y_val_flat == 1)
anomaly_count = np.sum(y_val_flat == -1)

print(f'Number of normal instances: {normal_count}')
print(f'Number of anomalies: {anomaly_count}')

Number of normal instances: 113975
Number of anomalies: 4133


In [53]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import confusion_matrix

# Initialize the model
bm = IsolationForest(contamination=0.01, n_jobs=-1, random_state=42)
bm.fit(X_train)

# Get the anomaly scores
scores = bm.decision_function(X_val)

# Set the threshold
threshold = 0.1

# Convert scores into class predictions
bm_pred = (scores <= threshold).astype('int')

# Convert the -1/1 anomaly labels to 0/1
bm_pred = ((bm_pred + 1) / 2).astype('int')
y_val = ((y_val + 1) / 2).astype('int')

# Get the confusion matrix
tn, fp, fn, tp = confusion_matrix(y_val, bm_pred).ravel()

In [54]:
from sklearn.metrics import confusion_matrix

# Assuming y_val and bm_pred are your true and predicted labels respectively
cm = confusion_matrix(y_val, bm_pred)

print('Confusion Matrix:')
print(cm)

Confusion Matrix:
[[ 3658   475]
 [96191 17784]]


In [61]:
# Initialize the model with a lower contamination parameter
bm = IsolationForest(contamination=0.5, n_jobs=-1, random_state=42)
bm.fit(X_train)

bm_pred = bm.predict(X_val)

bm_cm = eval.get_cm(bm_pred, y_val)
eval.cm_inf(bm_cm, 'Base Model - Isolation Forest')

Confusion Matrix for model Base Model - Isolation Forest:

True Positives: 1654
True Negatives: 56674
False Positives: 57301
False Negatives: 2479




In [62]:
# Get the threshold used by the predict method
threshold = bm.offset_
threshold

-0.4561186085728175

In [77]:
# Set a higher threshold
threshold = 0.1

# Convert scores into class predictions
bm_pred = (scores <= threshold).astype('int')

# Convert the -1/1 anomaly labels to 0/1
bm_pred = ((bm_pred + 1) / 2).astype('int')

# Get the confusion matrix
tn, fp, fn, tp = confusion_matrix(y_val, bm_pred).ravel()

# Print the confusion matrix
cm = confusion_matrix(y_val, bm_pred)
print('Confusion Matrix:')
print(cm)

Confusion Matrix:
[[ 3658   475]
 [96191 17784]]


In [67]:
[[TN FP]
 [FN TP]]

SyntaxError: invalid syntax. Perhaps you forgot a comma? (1400656058.py, line 1)

In [None]:
normal, normal predicted anomaly
anomaly predicted normal, anomaly

In [None]:
# Get the predicted anomaly scores
scores_pred = bm.decision_function(X_val)

In [None]:
from sklearn.metrics import roc_curve

# Compute the ROC curve
fpr, tpr, thresholds = roc_curve(y_val, scores_pred)

# Compute the Euclidean distance from each point on the ROC curve to the top left corner
distances = np.sqrt((1 - tpr) ** 2 + fpr ** 2)

# Find the index of the point with the minimum distance
min_index = np.argmin(distances)

# Get the threshold corresponding to this point
optimal_threshold = thresholds[min_index]

# Apply the optimal threshold to the anomaly scores to get the final predictions
bm_pred_hilfe = np.where(scores_pred <= optimal_threshold, -1, 1)

# Evaluate the model
bm_cm_hilfe = eval.get_cm(bm_pred_hilfe, y_val)
eval.cm_inf(bm_cm_hilfe, 'Base Model - Isolation Forest')

# Print the classification report
eval.print_classreport(y_val, bm_pred_hilfe, 'Base Model - Isolation Forest')

bm_metrics = eval.get_metrics(bm_cm_hilfe, y_val, bm_pred_hilfe)
eval.print_metrics(bm_metrics, 'Base Model - Isolation Forest')

Confusion Matrix for model Base Model - Isolation Forest:

True Positives: 1894
True Negatives: 53730
False Positives: 60245
False Negatives: 2239


Classification Report for Base Model - Isolation Forest:

              precision    recall  f1-score   support

          -1       0.03      0.46      0.06      4133
           1       0.96      0.47      0.63    113975

    accuracy                           0.47    118108
   macro avg       0.50      0.46      0.34    118108
weighted avg       0.93      0.47      0.61    118108



Metrics of Base Model - Isolation Forest:

Recall: 0.4583
Precision: 0.0305
F1 Score: 0.0572
PR AUC: 0.0329
AU ROC: 0.4648
Specificity: 0.4714




In [None]:
y_test.value_counts(normalize=True)

isFraud
 1         0.965007
-1         0.034993
Name: proportion, dtype: float64

In [None]:
from imblearn.over_sampling import SMOTE, ADASYN

# Apply SMOTE
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Apply ADASYN
ad = ADASYN(random_state=42)
X_train_res_adasyn, y_train_res_adasyn = ad.fit_resample(X_train, y_train)

In [None]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 354324 entries, 32965 to 587621
Data columns (total 37 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   D15_to_std_card1              354324 non-null  float64
 1   D15_to_mean_card1             354324 non-null  float64
 2   D15_to_std_card4              354324 non-null  float64
 3   card2                         354324 non-null  float64
 4   TransactionAmt_to_std_addr1   354324 non-null  float64
 5   TransactionAmt_to_mean_addr2  354324 non-null  float64
 6   D11                           354324 non-null  float64
 7   D2                            354324 non-null  float64
 8   M4                            354324 non-null  float64
 9   M1                            354324 non-null  float64
 10  TransactionAmt_to_mean_addr1  354324 non-null  float64
 11  D1                            354324 non-null  float64
 12  M7                            354324 non-null

In [None]:
X_train_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 683854 entries, 0 to 683853
Data columns (total 37 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   D15_to_std_card1              683854 non-null  float64
 1   D15_to_mean_card1             683854 non-null  float64
 2   D15_to_std_card4              683854 non-null  float64
 3   card2                         683854 non-null  float64
 4   TransactionAmt_to_std_addr1   683854 non-null  float64
 5   TransactionAmt_to_mean_addr2  683854 non-null  float64
 6   D11                           683854 non-null  float64
 7   D2                            683854 non-null  float64
 8   M4                            683854 non-null  float64
 9   M1                            683854 non-null  float64
 10  TransactionAmt_to_mean_addr1  683854 non-null  float64
 11  D1                            683854 non-null  float64
 12  M7                            683854 non-nul

In [None]:
len(bm_pred)

118108

In [None]:
len(y_val)

118108

In [None]:
bm_cm = eval.get_cm(bm_pred, y_val)
eval.cm_inf(bm_cm, 'Base Model - Isolation Forest')

Confusion Matrix for model Base Model - Isolation Forest:

True Positives: 43
True Negatives: 112571
False Positives: 1404
False Negatives: 4090




In [None]:
# Print the classification report
eval.print_classreport(y_val, bm_pred, 'Base Model - Isolation Forest')

Classification Report for Base Model - Isolation Forest:

              precision    recall  f1-score   support

          -1       0.03      0.01      0.02      4133
           1       0.96      0.99      0.98    113975

    accuracy                           0.95    118108
   macro avg       0.50      0.50      0.50    118108
weighted avg       0.93      0.95      0.94    118108





In [None]:
bm_metrics = eval.get_metrics(bm_cm, y_val, bm_pred)
eval.print_metrics(bm_metrics, 'Base Model - Isolation Forest')

Metrics of Base Model - Isolation Forest:

Recall: 0.0104
Precision: 0.0297
F1 Score: 0.0154
PR AUC: 0.0349
AU ROC: 0.499
Specificity: 0.9877




In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot a histogram for each feature in your dataset
for column in X_train.columns:
    plt.figure(figsize=(10, 5))
    sns.histplot(X_train[column], kde=True)
    plt.title(f'Distribution of {column}')
    plt.show()

AttributeError: 'numpy.ndarray' object has no attribute 'columns'