## Random Forest and Logistic Regression Model

In [None]:
# Import preprocessed data
import pandas as pd

X_train_smote = pd.read_csv('X_train_smote_dropped.csv')
y_train_smote = pd.read_csv('y_train_smote.csv')['fraudulent']
X_train_selected = pd.read_csv('X_train_dropped.csv')
y_train = pd.read_csv('y_train.csv')['fraudulent']
X_test_selected = pd.read_csv('X_test_dropped.csv')
y_test = pd.read_csv('y_test.csv')['fraudulent']

X_train_smote = X_train_smote.drop(columns='Unnamed: 0')
X_train_selected = X_train_selected.drop(columns='Unnamed: 0')
X_test_selected = X_test_selected.drop(columns='Unnamed: 0')

#### Hyperparameter Tuning and Evaluation
##### Random Forest model

In [4]:
## Random Forest Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import classification_report

# Define a simple grid of hyperparameters
param_distributions = {
    "n_estimators": [200, 500, 1000],  # Number of trees
    "max_depth": [None, 10, 20],      # Maximum depth of trees
    "min_samples_split": [2, 5, 10],      # Minimum samples required to split a node
    "min_samples_leaf": [1, 2, 5]       # Minimum samples required at a leaf node
}

# Initialize Random Forest model
rf = RandomForestClassifier(random_state=42, class_weight='balanced')

# Setup RandomizedSearchCV
rf_smote_model = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_distributions,
    n_iter=50,
    scoring="roc_auc",  # Metric to optimize
    random_state=42,
    verbose=1,
    n_jobs=-1,
    cv=3
)

rf_smote_model.fit(X_train_smote, y_train_smote)
best_rf_smote = rf_smote_model.best_estimator_

# Best parameters and score
print("Best parameters:", rf_smote_model.best_params_)
print("Best ROC AUC score:", rf_smote_model.best_score_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best parameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None}
Best ROC AUC score: 0.9999886552556165


In [3]:
# Random Forest Evaluation
from sklearn.metrics import roc_auc_score, brier_score_loss, classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import classification_report

## Random Forest with SMOTE 
# Train the model with the best parameters on the full dataset
best_params = {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None}
best_rf_smote = RandomForestClassifier(**best_params, random_state=42, class_weight='balanced')
best_rf_smote.fit(X_train_smote, y_train_smote)

# Evaluate on train and test set
rf_smote_y_train_pred = best_rf_smote.predict(X_train_smote)
rf_smote_y_test_pred = best_rf_smote.predict(X_test_selected)

print("RF with SMOTE Classification Report:")
print("Training:")
print(classification_report(y_train_smote, rf_smote_y_train_pred))
print(f"ROC AUC Score: {roc_auc_score(y_train_smote, rf_smote_y_train_pred):.4f}")
print(f"Brier Score: {brier_score_loss(y_train_smote, rf_smote_y_train_pred):.4f}")
print("Test:")
print(classification_report(y_test, rf_smote_y_test_pred))
print(f"ROC AUC Score: {roc_auc_score(y_test, rf_smote_y_test_pred):.4f}")
print(f"Brier Score: {brier_score_loss(y_test, rf_smote_y_test_pred):.4f}")

## Random Forest no SMOTE 
# Train the model with the best parameters on the full dataset
best_rf_smote.fit(X_train_selected, y_train)

# Evaluate on train and test set
rf_y_train_pred = best_rf_smote.predict(X_train_selected)
rf_y_test_pred = best_rf_smote.predict(X_test_selected)

print("RF original dataset Classification Report:")
print("Training:")
print(classification_report(y_train, rf_y_train_pred))
print(f"ROC AUC Score: {roc_auc_score(y_train, rf_y_train_pred):.4f}")
print(f"Brier Score: {brier_score_loss(y_train, rf_y_train_pred):.4f}")
print("Test:")
print(classification_report(y_test, rf_y_test_pred))
print(f"ROC AUC Score: {roc_auc_score(y_test, rf_y_test_pred):.4f}")
print(f"Brier Score: {brier_score_loss(y_test, rf_y_test_pred):.4f}")

RF with SMOTE Classification Report:
Training:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13620
           1       1.00      1.00      1.00     13620

    accuracy                           1.00     27240
   macro avg       1.00      1.00      1.00     27240
weighted avg       1.00      1.00      1.00     27240

ROC AUC Score: 1.0000
Brier Score: 0.0000
Test:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3394
           1       0.96      0.73      0.83       182

    accuracy                           0.98      3576
   macro avg       0.97      0.86      0.91      3576
weighted avg       0.98      0.98      0.98      3576

ROC AUC Score: 0.8619
Brier Score: 0.0154
RF original dataset Classification Report:
Training:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13620
           1       1.00      1.00      1.00    

##### Pseudolabeling on SMOTE and original dataset on Random Forest model result

In [4]:
## Pseudolabeling Random Forest SMOTE 
# Initialize parameters
confidence_threshold = 0.9  # Confidence threshold for pseudolabeling
n_iterations = 10  # Reduced iterations for speed
top_k = 500  # Max high-confidence samples to include per iteration

# Initial training with the labeled training set
best_rf_smote.fit(X_train_smote, y_train_smote)

# Iterative pseudolabeling
for i in range(n_iterations):
    print("Iteration" + str(i))
    test_pred_proba = best_rf_smote.predict_proba(X_test_selected)

    # Select top-k confident samples
    confident_indices = np.argsort(test_pred_proba.max(axis=1))[-top_k:]
    pseudolabeled_data = X_test_selected.iloc[confident_indices]
    pseudolabel_targets = test_pred_proba[confident_indices].argmax(axis=1)

    # Augment training data
    augmented_train_data = np.vstack([X_train_smote, pseudolabeled_data])
    augmented_train_labels = np.concatenate([y_train_smote, pseudolabel_targets])

    # Retrain with warm_start
    best_rf_smote.set_params(warm_start=True, n_estimators=best_rf_smote.n_estimators + 10)
    best_rf_smote.fit(augmented_train_data, augmented_train_labels)

# Evaluate pseudolabelling result
rf_smote_pseudo_y_train_pred = best_rf_smote.predict(X_train_smote)
rf_smote_pseudo_y_test_pred = best_rf_smote.predict(X_test_selected)

print("Pseudolabeling Random Forest Classification Report:")
print("Training:")
print(classification_report(y_train_smote, rf_smote_pseudo_y_train_pred))
print(f"ROC AUC Score: {roc_auc_score(y_train_smote, rf_smote_pseudo_y_train_pred):.4f}")
print(f"Brier Score: {brier_score_loss(y_train_smote, rf_smote_pseudo_y_train_pred):.4f}")
print("Test:")
print(classification_report(y_test, rf_smote_pseudo_y_test_pred))
print(f"ROC AUC Score: {roc_auc_score(y_test, rf_smote_pseudo_y_test_pred):.4f}")
print(f"Brier Score: {brier_score_loss(y_test, rf_smote_pseudo_y_test_pred):.4f}")

Iteration0


  warn(


Iteration1


  warn(


Iteration2


  warn(


Iteration3


  warn(


Iteration4


  warn(


Iteration5


  warn(


Iteration6


  warn(


Iteration7


  warn(


Iteration8


  warn(


Iteration9


  warn(


Pseudolabeling Random Forest Classification Report:
Training:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13620
           1       1.00      1.00      1.00     13620

    accuracy                           1.00     27240
   macro avg       1.00      1.00      1.00     27240
weighted avg       1.00      1.00      1.00     27240

ROC AUC Score: 1.0000
Brier Score: 0.0000
Test:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3394
           1       0.97      0.73      0.83       182

    accuracy                           0.99      3576
   macro avg       0.98      0.86      0.91      3576
weighted avg       0.98      0.99      0.98      3576

ROC AUC Score: 0.8648
Brier Score: 0.0148


In [5]:
## Pseudolabeling Random Forest 
# Initialize parameters
confidence_threshold = 0.9  # Confidence threshold for pseudolabeling
n_iterations = 10  # Reduced iterations for speed
top_k = 500  # Max high-confidence samples to include per iteration

# Initial training with the labeled training set
best_rf_smote.fit(X_train_selected, y_train)

# Iterative pseudolabeling
for i in range(n_iterations):
    print("Iteration" + str(i))
    test_pred_proba = best_rf_smote.predict_proba(X_test_selected)

    # Select top-k confident samples
    confident_indices = np.argsort(test_pred_proba.max(axis=1))[-top_k:]
    pseudolabeled_data = X_test_selected.iloc[confident_indices]
    pseudolabel_targets = test_pred_proba[confident_indices].argmax(axis=1)

    # Augment training data
    augmented_train_data = np.vstack([X_train_selected, pseudolabeled_data])
    augmented_train_labels = np.concatenate([y_train, pseudolabel_targets])

    # Retrain with warm_start
    best_rf_smote.set_params(warm_start=True, n_estimators=best_rf_smote.n_estimators + 10)
    best_rf_smote.fit(augmented_train_data, augmented_train_labels)

# Evaluate pseudolabelling result
rf_pseudo_y_train_pred = best_rf_smote.predict(X_train_selected)
rf_pseudo_y_test_pred = best_rf_smote.predict(X_test_selected)

print("Pseudolabeling Random Forest Classification Report:")
print("Training:")
print(classification_report(y_train, rf_pseudo_y_train_pred))
print(f"ROC AUC Score: {roc_auc_score(y_train, rf_pseudo_y_train_pred):.4f}")
print(f"Brier Score: {brier_score_loss(y_train, rf_pseudo_y_train_pred):.4f}")
print("Test:")
print(classification_report(y_test, rf_pseudo_y_test_pred))
print(f"ROC AUC Score: {roc_auc_score(y_test, rf_pseudo_y_test_pred):.4f}")
print(f"Brier Score: {brier_score_loss(y_test, rf_pseudo_y_test_pred):.4f}")

Iteration0


  warn(
  warn(
  warn(


Iteration1


  warn(


Iteration2


  warn(


Iteration3


  warn(


Iteration4


  warn(


Iteration5


  warn(


Iteration6


  warn(


Iteration7


  warn(


Iteration8


  warn(


Iteration9


  warn(


Pseudolabeling Random Forest Classification Report:
Training:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13620
           1       1.00      1.00      1.00       683

    accuracy                           1.00     14303
   macro avg       1.00      1.00      1.00     14303
weighted avg       1.00      1.00      1.00     14303

ROC AUC Score: 1.0000
Brier Score: 0.0000
Test:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3394
           1       0.98      0.66      0.79       182

    accuracy                           0.98      3576
   macro avg       0.98      0.83      0.89      3576
weighted avg       0.98      0.98      0.98      3576

ROC AUC Score: 0.8321
Brier Score: 0.0176


##### Logistic Regression

In [6]:
## Logistic Regression Hyperparameter Tuning 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, roc_auc_score
from scipy.stats import loguniform

# Define the model
logreg = LogisticRegression(random_state=42, class_weight='balanced')

# Define the hyperparameter space
param_distributions = {
    'C': loguniform(0.01, 1, 10),  # Log-uniform distribution for regularization strength
    'penalty': ['l1', 'l2', 'elasticnet'],  # Type of regularization
    'solver': ['lbfgs', 'saga']
}

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=logreg,
    param_distributions=param_distributions,
    n_iter=50,  # Number of random combinations to try
    scoring='roc_auc',  # Optimize for AUC score
    cv=5,
    n_jobs=-1,  # Use all available cores
    verbose=1,
    random_state=42
)

# Fit to the training data
random_search.fit(X_train_smote, y_train_smote)

# Get the best estimator
best_logreg_smote = random_search.best_estimator_

# Print best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


125 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
55 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Joseph Nathanael\AppData\Roaming\Python\Python311\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Joseph Nathanael\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Joseph Nathanael\AppData\Roaming\Python\Python311\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(s

Best Hyperparameters: {'C': 10.488728535883558, 'penalty': 'l1', 'solver': 'saga'}




In [None]:
# Logistic Regression Evaluation
from sklearn.metrics import roc_auc_score, brier_score_loss, classification_report

## Logistic Regression with SMOTE 
# Train the model with the best parameters on the full dataset
best_logreg_smote.fit(X_train_smote, y_train_smote)

# Evaluate on validation set
log_smote_y_train_pred = best_logreg_smote.predict(X_train_smote)
log_smote_y_test_pred = best_logreg_smote.predict(X_test_selected)

print("Logistic Regression with SMOTE Classification Report:")
print("Training:")
print(classification_report(y_train_smote, log_smote_y_train_pred))
print(f"ROC AUC Score: {roc_auc_score(y_train_smote, log_smote_y_train_pred):.4f}")
print(f"Brier Score: {brier_score_loss(y_train_smote, log_smote_y_train_pred):.4f}")
print("Test:")
print(classification_report(y_test, log_smote_y_test_pred))
print(f"ROC AUC Score: {roc_auc_score(y_test, log_smote_y_test_pred):.4f}")
print(f"Brier Score: {brier_score_loss(y_test, log_smote_y_test_pred):.4f}")

## Random Forest no SMOTE 
# Train the model with the best parameters on the full dataset
best_logreg_smote.fit(X_train_selected, y_train)

# Evaluate on train and test set
log_y_train_pred = best_logreg_smote.predict(X_train_selected)
log_y_test_pred = best_logreg_smote.predict(X_test_selected)

print("Logistic Regression original dataset Classification Report:")
print("Training:")
print(classification_report(y_train, log_y_train_pred))
print(f"ROC AUC Score: {roc_auc_score(y_train, log_y_train_pred):.4f}")
print(f"Brier Score: {brier_score_loss(y_train, log_y_train_pred):.4f}")
print("Test:")
print(classification_report(y_test, log_y_test_pred))
print(f"ROC AUC Score: {roc_auc_score(y_test, log_y_test_pred):.4f}")
print(f"Brier Score: {brier_score_loss(y_test, log_y_test_pred):.4f}")



Logistic Regression with SMOTE Classification Report:
Training:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     13620
           1       0.99      1.00      0.99     13620

    accuracy                           0.99     27240
   macro avg       0.99      0.99      0.99     27240
weighted avg       0.99      0.99      0.99     27240

ROC AUC Score: 0.9941
Brier Score: 0.0059
Test:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      3394
           1       0.69      0.85      0.76       182

    accuracy                           0.97      3576
   macro avg       0.84      0.92      0.87      3576
weighted avg       0.98      0.97      0.97      3576

ROC AUC Score: 0.9157
Brier Score: 0.0268
Logistic Regression original dataset Classification Report:
Training:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     13620
           1 



##### Pseudolabeling on SMOTE and original dataset on Random Forest model result

In [8]:
## Pseudolabeling Logistic Regression with SMOTE
# Initialize parameters
confidence_threshold = 0.9  # Confidence threshold for pseudolabeling
n_iterations = 10  # Reduced iterations for speed
top_k = 500  # Max high-confidence samples to include per iteration

# Initial training with the labeled training set
best_logreg_smote.fit(X_train_smote, y_train_smote)

# Iterative pseudolabeling
for i in range(n_iterations):
    print("Iteration" + str(i))
    test_pred_proba = best_logreg_smote.predict_proba(X_test_selected)

    # Select top-k confident samples
    confident_indices = np.argsort(test_pred_proba.max(axis=1))[-top_k:]
    pseudolabeled_data = X_test_selected.iloc[confident_indices]
    pseudolabel_targets = test_pred_proba[confident_indices].argmax(axis=1)

    # Augment training data
    augmented_train_data = np.vstack([X_train_smote, pseudolabeled_data])
    augmented_train_labels = np.concatenate([y_train_smote, pseudolabel_targets])

    # Retrain Logistic Regression on the augmented dataset
    best_logreg_smote.fit(augmented_train_data, augmented_train_labels)

# Evaluate pseudolabeling result
log_smote_pseudo_y_train_pred = best_logreg_smote.predict(X_train_smote)
log_smote_pseudo_y_test_pred = best_logreg_smote.predict(X_test_selected)

print("Pseudolabeling Logistic Regression with SMOTE Classification Report:")
print("Training:")
print(classification_report(y_train_smote, log_smote_pseudo_y_train_pred))
print(f"ROC AUC Score: {roc_auc_score(y_train_smote, log_smote_pseudo_y_train_pred):.4f}")
print(f"Brier Score: {brier_score_loss(y_train_smote, log_smote_pseudo_y_train_pred):.4f}")
print("Test:")
print(classification_report(y_test, log_smote_pseudo_y_test_pred))
print(f"ROC AUC Score: {roc_auc_score(y_test, log_smote_pseudo_y_test_pred):.4f}")
print(f"Brier Score: {brier_score_loss(y_test, log_smote_pseudo_y_test_pred):.4f}")



Iteration0




Iteration1




Iteration2




Iteration3




Iteration4




Iteration5




Iteration6




Iteration7




Iteration8




Iteration9
Pseudolabeling Logistic Regression with SMOTE Classification Report:
Training:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     13620
           1       0.99      1.00      0.99     13620

    accuracy                           0.99     27240
   macro avg       0.99      0.99      0.99     27240
weighted avg       0.99      0.99      0.99     27240

ROC AUC Score: 0.9939
Brier Score: 0.0061
Test:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      3394
           1       0.69      0.85      0.76       182

    accuracy                           0.97      3576
   macro avg       0.84      0.92      0.87      3576
weighted avg       0.98      0.97      0.97      3576

ROC AUC Score: 0.9157
Brier Score: 0.0268




In [9]:
## Pseudolabeling Logistic Regression
# Initialize parameters
confidence_threshold = 0.9  # Confidence threshold for pseudolabeling
n_iterations = 10  # Reduced iterations for speed
top_k = 500  # Max high-confidence samples to include per iteration

# Initial training with the labeled training set
best_logreg_smote.fit(X_train_selected, y_train)

# Iterative pseudolabeling
for i in range(n_iterations):
    print("Iteration" + str(i))
    test_pred_proba = best_logreg_smote.predict_proba(X_test_selected)

    # Select top-k confident samples
    confident_indices = np.argsort(test_pred_proba.max(axis=1))[-top_k:]
    pseudolabeled_data = X_test_selected.iloc[confident_indices]
    pseudolabel_targets = test_pred_proba[confident_indices].argmax(axis=1)

    # Augment training data
    augmented_train_data = np.vstack([X_train_selected, pseudolabeled_data])
    augmented_train_labels = np.concatenate([y_train, pseudolabel_targets])

    # Retrain Logistic Regression on the augmented dataset
    best_logreg_smote.fit(augmented_train_data, augmented_train_labels)

# Evaluate pseudolabeling result
log_pseudo_y_train_pred = best_logreg_smote.predict(X_train_selected)
log_pseudo_y_test_pred = best_logreg_smote.predict(X_test_selected)


print("Pseudolabeling Logistic Regression with SMOTE Classification Report:")
print("Training:")
print(classification_report(y_train, log_pseudo_y_train_pred))
print(f"ROC AUC Score: {roc_auc_score(y_train, log_pseudo_y_train_pred):.4f}")
print(f"Brier Score: {brier_score_loss(y_train, log_pseudo_y_train_pred):.4f}")
print("Test:")
print(classification_report(y_test, log_pseudo_y_test_pred))
print(f"ROC AUC Score: {roc_auc_score(y_test, log_pseudo_y_test_pred):.4f}")
print(f"Brier Score: {brier_score_loss(y_test, log_pseudo_y_test_pred):.4f}")



Iteration0




Iteration1




Iteration2




Iteration3




Iteration4




Iteration5




Iteration6




Iteration7




Iteration8




Iteration9
Pseudolabeling Logistic Regression with SMOTE Classification Report:
Training:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     13620
           1       0.61      1.00      0.76       683

    accuracy                           0.97     14303
   macro avg       0.80      0.98      0.87     14303
weighted avg       0.98      0.97      0.97     14303

ROC AUC Score: 0.9818
Brier Score: 0.0306
Test:
              precision    recall  f1-score   support

           0       0.99      0.96      0.98      3394
           1       0.56      0.90      0.69       182

    accuracy                           0.96      3576
   macro avg       0.78      0.93      0.84      3576
weighted avg       0.97      0.96      0.96      3576

ROC AUC Score: 0.9292
Brier Score: 0.0405


