In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    brier_score_loss,
    classification_report,
    roc_auc_score
)

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re

**Import files**

In [2]:
X_train_smote_dropped =pd.read_csv('X_train_smote_dropped.csv').drop(columns=['Unnamed: 0'])
X_train_dropped = pd.read_csv('X_train_dropped.csv').drop(columns=['Unnamed: 0'])
X_test_dropped = pd.read_csv('X_test_dropped.csv').drop(columns=['Unnamed: 0']) 

y_train_smote = pd.read_csv('y_train_smote.csv')['fraudulent']
y_test = pd.read_csv('y_test.csv')['fraudulent']
y_train = pd.read_csv('y_train.csv')['fraudulent']

**XGBoost Hyperparameter tuning using SMOTE Dataset**

In [4]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, brier_score_loss, roc_curve
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
import xgboost as xgb

xgb_model = xgb.XGBClassifier(random_state=15)


param_dist = {
    'n_estimators': [100, 200, 300],  
    'max_depth': [4, 5],             
    'learning_rate': [0.05, 0.1],    
    'subsample': [0.8, 1.0],         
    'colsample_bytree': [0.8],       
    'gamma': [0.1, 0.3],             
    'alpha': [0.1, 0.3, 0.5],        
    'lambda': [0.1, 0.3, 0.5],       
}

random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=50,                            
    scoring='roc_auc',                    
    cv=3,                                 
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train_smote_dropped, y_train_smote)

best_model = random_search.best_estimator_
best_params = random_search.best_params_

best_model.fit(X_train_smote_dropped, y_train_smote)

from sklearn.metrics import roc_auc_score, brier_score_loss, classification_report
y_train_pred = best_model.predict(X_train_smote_dropped)
y_train_proba = best_model.predict_proba(X_train_smote_dropped)[:, 1]
print("\nTraining Set Metrics:")
print(classification_report(y_train_smote, y_train_pred))
print("ROC AUC Score:", roc_auc_score(y_train_smote, y_train_proba))
print("Brier Score:", brier_score_loss(y_train_smote, y_train_proba))

y_test_pred = best_model.predict(X_test_dropped)
y_test_proba = best_model.predict_proba(X_test_dropped)[:, 1]
print("\nTest Set Metrics:")
print(classification_report(y_test, y_test_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_test_proba))
print("Brier Score:", brier_score_loss(y_test, y_test_proba))


Fitting 3 folds for each of 50 candidates, totalling 150 fits

Training Set Metrics:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13620
           1       1.00      1.00      1.00     13620

    accuracy                           1.00     27240
   macro avg       1.00      1.00      1.00     27240
weighted avg       1.00      1.00      1.00     27240

ROC AUC Score: 0.9999999730464615
Brier Score: 0.0003258478816547882

Test Set Metrics:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3394
           1       0.93      0.79      0.85       182

    accuracy                           0.99      3576
   macro avg       0.96      0.89      0.92      3576
weighted avg       0.99      0.99      0.99      3576

ROC AUC Score: 0.9876737876148602
Brier Score: 0.011100783346473223


In [9]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train_smote), y=y_train_smote)
class_weight_dict = dict(enumerate(class_weights))

sample_weights = np.array([class_weight_dict[label] for label in y_train_smote])

mod_xgb_model = xgb.XGBClassifier(**best_params)
mod_xgb_model.fit(X_train_smote_dropped, y_train_smote, sample_weight=sample_weights)

y_train_pred = mod_xgb_model.predict(X_train_smote_dropped)
y_train_proba = mod_xgb_model.predict_proba(X_train_smote_dropped)[:, 1]
print("\nTraining Set Metrics:")
print(classification_report(y_train_smote, y_train_pred))
print("ROC AUC Score:", roc_auc_score(y_train_smote, y_train_proba))
print("Brier Score:", brier_score_loss(y_train_smote, y_train_proba))

y_test_pred = mod_xgb_model.predict(X_test_dropped)
y_test_proba = mod_xgb_model.predict_proba(X_test_dropped)[:, 1]
print("\nTest Set Metrics:")
print(classification_report(y_test, y_test_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_test_proba))
print("Brier Score:", brier_score_loss(y_test, y_test_proba))


Training Set Metrics:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13620
           1       1.00      1.00      1.00     13620

    accuracy                           1.00     27240
   macro avg       1.00      1.00      1.00     27240
weighted avg       1.00      1.00      1.00     27240

ROC AUC Score: 1.0
Brier Score: 0.00030942367333941647

Test Set Metrics:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3394
           1       0.93      0.80      0.86       182

    accuracy                           0.99      3576
   macro avg       0.96      0.90      0.93      3576
weighted avg       0.99      0.99      0.99      3576

ROC AUC Score: 0.9876203643145305
Brier Score: 0.010758967088551789


**XGBoost Pseudolabelling using SMOTE Dataset**

In [10]:
from sklearn.metrics import roc_auc_score, brier_score_loss, classification_report

mod_xgb_model_pseudo = xgb.XGBClassifier(**best_params)
mod_xgb_model_pseudo.fit(X_train_smote_dropped, y_train_smote, sample_weight=sample_weights)

confidence_threshold = 0.9  
n_iterations = 20  

for i in range(n_iterations):
    print(f"\n--- Pseudolabeling Iteration {i+1} ---")

    val_pred_proba = mod_xgb_model_pseudo.predict_proba(X_test_dropped)

    pseudolabels = np.where(val_pred_proba.max(axis=1) > confidence_threshold, val_pred_proba.argmax(axis=1), -1)

    pseudolabeled_data = X_test_dropped[pseudolabels != -1]
    pseudolabel_targets = pseudolabels[pseudolabels != -1]

    if len(pseudolabeled_data) == 0:
        print("No high-confidence pseudolabels found. Stopping iteration.")
        break

    augmented_train_data = np.vstack([X_train_smote_dropped, pseudolabeled_data])
    augmented_train_labels = np.concatenate([y_train_smote, pseudolabel_targets])

    mod_xgb_model_pseudo.fit(augmented_train_data, augmented_train_labels)

print("\n--- Final Model Evaluation ---")

y_train_pred = mod_xgb_model_pseudo.predict(X_train_smote_dropped)
y_train_proba = mod_xgb_model_pseudo.predict_proba(X_train_smote_dropped)[:, 1]
print("\nTraining Set Metrics:")
print(classification_report(y_train_smote, y_train_pred))
print("ROC AUC Score:", roc_auc_score(y_train_smote, y_train_proba))
print("Brier Score:", brier_score_loss(y_train_smote, y_train_proba))

y_test_pred = mod_xgb_model_pseudo.predict(X_test_dropped)
y_test_proba = mod_xgb_model_pseudo.predict_proba(X_test_dropped)[:, 1]
print("\nTest Set Metrics:")
print(classification_report(y_test, y_test_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_test_proba))
print("Brier Score:", brier_score_loss(y_test, y_test_proba))


--- Pseudolabeling Iteration 1 ---

--- Pseudolabeling Iteration 2 ---

--- Pseudolabeling Iteration 3 ---

--- Pseudolabeling Iteration 4 ---

--- Pseudolabeling Iteration 5 ---

--- Pseudolabeling Iteration 6 ---

--- Pseudolabeling Iteration 7 ---

--- Pseudolabeling Iteration 8 ---

--- Pseudolabeling Iteration 9 ---

--- Pseudolabeling Iteration 10 ---

--- Pseudolabeling Iteration 11 ---

--- Pseudolabeling Iteration 12 ---

--- Pseudolabeling Iteration 13 ---

--- Pseudolabeling Iteration 14 ---

--- Pseudolabeling Iteration 15 ---

--- Pseudolabeling Iteration 16 ---

--- Pseudolabeling Iteration 17 ---

--- Pseudolabeling Iteration 18 ---

--- Pseudolabeling Iteration 19 ---

--- Pseudolabeling Iteration 20 ---

--- Final Model Evaluation ---

Training Set Metrics:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13620
           1       1.00      1.00      1.00     13620

    accuracy                           1.00     27

**XGBoost using Best Hyperparameters on Non-SMOTE Dataset**

In [11]:
xgb_model_no_smote = xgb.XGBClassifier(**best_params)
sample_weights = np.ones(len(y_train))  
xgb_model_no_smote.fit(X_train_dropped, y_train, sample_weight=sample_weights)

y_train_pred = xgb_model_no_smote.predict(X_train_dropped)
y_train_proba = xgb_model_no_smote.predict_proba(X_train_dropped)[:, 1]
print("\nTraining Set Metrics:")
print(classification_report(y_train, y_train_pred))
print("ROC AUC Score:", roc_auc_score(y_train, y_train_proba))
print("Brier Score:", brier_score_loss(y_train, y_train_proba))

y_test_pred = xgb_model_no_smote.predict(X_test_dropped)
y_test_proba = xgb_model_no_smote.predict_proba(X_test_dropped)[:, 1]
print("\nTest Set Metrics:")
print(classification_report(y_test, y_test_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_test_proba))
print("Brier Score:", brier_score_loss(y_test, y_test_proba))


Training Set Metrics:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13620
           1       1.00      1.00      1.00       683

    accuracy                           1.00     14303
   macro avg       1.00      1.00      1.00     14303
weighted avg       1.00      1.00      1.00     14303

ROC AUC Score: 0.9999998925015534
Brier Score: 0.00042806360279484655

Validation Set Metrics:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3394
           1       0.95      0.75      0.84       182

    accuracy                           0.99      3576
   macro avg       0.97      0.88      0.92      3576
weighted avg       0.99      0.99      0.98      3576

ROC AUC Score: 0.9901037383359127
Brier Score: 0.01138209963162096

Test Set Metrics:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3394
           1       0.95      0.75

**Pseudolabelling using XGBoost on Non-SMOTE Data**

In [8]:
xgb_model_no_smote_pseudo = xgb.XGBClassifier(**best_params)
xgb_model_no_smote_pseudo.fit(X_train_dropped, y_train, sample_weight=sample_weights)

confidence_threshold = 0.9  
n_iterations = 20  

for i in range(n_iterations):
    print(f"\n--- Pseudolabeling Iteration {i+1} ---")

    val_pred_proba = xgb_model_no_smote_pseudo.predict_proba(X_test_dropped)

    pseudolabels = np.where(val_pred_proba.max(axis=1) > confidence_threshold, val_pred_proba.argmax(axis=1), -1)

    pseudolabeled_data = X_test_dropped[pseudolabels != -1]
    pseudolabel_targets = pseudolabels[pseudolabels != -1]

    if len(pseudolabeled_data) == 0:
        print("No high-confidence pseudolabels found. Stopping iteration.")
        break

    augmented_train_data = np.vstack([X_train_dropped, pseudolabeled_data])
    augmented_train_labels = np.concatenate([y_train, pseudolabel_targets])

    xgb_model_no_smote_pseudo.fit(augmented_train_data, augmented_train_labels)


print("\n--- Model Evaluation (No smote + Pseudolabelling) ---")

y_train_pred = xgb_model_no_smote_pseudo.predict(X_train_dropped)
y_train_proba = xgb_model_no_smote_pseudo.predict_proba(X_train_dropped)[:, 1]
print("\nTraining Set Metrics:")
print(classification_report(y_train, y_train_pred))
print("ROC AUC Score:", roc_auc_score(y_train, y_train_proba))
print("Brier Score:", brier_score_loss(y_train, y_train_proba))

y_test_pred = xgb_model_no_smote_pseudo.predict(X_test_dropped)
y_test_proba = xgb_model_no_smote_pseudo.predict_proba(X_test_dropped)[:, 1]
print("\nTest Set Metrics:")
print(classification_report(y_test, y_test_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_test_proba))
print("Brier Score:", brier_score_loss(y_test, y_test_proba))


--- Pseudolabeling Iteration 1 ---

--- Pseudolabeling Iteration 2 ---

--- Pseudolabeling Iteration 3 ---

--- Pseudolabeling Iteration 4 ---

--- Pseudolabeling Iteration 5 ---

--- Pseudolabeling Iteration 6 ---

--- Pseudolabeling Iteration 7 ---

--- Pseudolabeling Iteration 8 ---

--- Pseudolabeling Iteration 9 ---

--- Pseudolabeling Iteration 10 ---

--- Pseudolabeling Iteration 11 ---

--- Pseudolabeling Iteration 12 ---

--- Pseudolabeling Iteration 13 ---

--- Pseudolabeling Iteration 14 ---

--- Pseudolabeling Iteration 15 ---

--- Pseudolabeling Iteration 16 ---

--- Pseudolabeling Iteration 17 ---

--- Pseudolabeling Iteration 18 ---

--- Pseudolabeling Iteration 19 ---

--- Pseudolabeling Iteration 20 ---

--- Model Evaluation (No smote + Pseudolabelling) ---

Training Set Metrics:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13620
           1       1.00      1.00      1.00       683

    accuracy               