In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.decomposition import LatentDirichletAllocation

from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
import xgboost as xgb

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    roc_auc_score,
    brier_score_loss
)

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.stats import uniform, randint

from itertools import product
import warnings
# Silence every warning of category UserWarning for the rest of the session.
# NOTE(review): this filter is broad -- library warnings raised as UserWarning
# (e.g. scikit-learn's feature-name mismatch warning) are hidden as well;
# consider narrowing it with the `module=` / `message=` arguments.
warnings.filterwarnings("ignore", category=UserWarning)


In [None]:
import os

from google.colab import drive

# Attach Google Drive to the Colab filesystem at /content/drive.
drive.mount('/content/drive')

# Every CSV read later in the notebook uses a bare file name, so the working
# directory is switched to the project folder on the mounted drive.
project_path = '/content/drive/My Drive/BT4012 Project/Testing'
os.chdir(project_path)

# Echo the working directory so a wrong mount/path is visible immediately.
print("Current Directory:", os.getcwd())

Mounted at /content/drive
Current Directory: /content/drive/My Drive/BT4012 Project/Testing


# Import Imputed Dataset & Further Processing



In [None]:
# Feature matrices: each CSV carries an 'Unnamed: 0' column that is discarded
# (presumably the index written out by an earlier to_csv call -- TODO confirm).
X_train_smote, X_train, X_test = (
    pd.read_csv(csv_name).drop(columns=['Unnamed: 0'])
    for csv_name in (
        'X_train_smote_dropped.csv',
        'X_train_dropped.csv',
        'X_test_dropped.csv',
    )
)

# Targets: only the 'fraudulent' column of each label file is kept, as a Series.
y_train_smote, y_train, y_test = (
    pd.read_csv(csv_name)['fraudulent']
    for csv_name in ('y_train_smote.csv', 'y_train.csv', 'y_test.csv')
)

# Decision Tree

## Decision Tree Hyperparameter Tuning using SMOTE dataset

In [None]:

# Discrete search space for the decision tree: the randomized search samples
# one value per key from these lists.
param_distributions = dict(
    max_depth=[5, 10, 20, 40, None],
    min_samples_split=[2, 10, 30, 50, 100],
    min_samples_leaf=[1, 10, 30, 50, 100],
    ccp_alpha=[0.0, 0.01, 0.1, 0.5],
    class_weight=[None, 'balanced', {0: 0.1, 1: 0.9}, {0: 0.2, 1: 0.8}, {0: 0.3, 1: 0.7}],
)

model_dt = DecisionTreeClassifier(random_state=15)

# 100 sampled candidates, each scored by 5-fold cross-validated ROC AUC.
# NOTE(review): if X_train_smote was oversampled before this cell (as its name
# suggests), synthetic rows can sit in a validation fold next to the rows they
# were generated from, which would make the CV score optimistic -- confirm.
random_search = RandomizedSearchCV(
    model_dt,
    param_distributions,
    n_iter=100,
    scoring="roc_auc",
    cv=5,
    random_state=15,
    n_jobs=-1,
    verbose=2,
)
random_search.fit(X_train_smote, y_train_smote)

# Keep the refit winner, its parameters and its mean CV score for later cells.
best_model_dt, best_params_dt, best_score = (
    random_search.best_estimator_,
    random_search.best_params_,
    random_search.best_score_,
)

print("\n--- Best Parameters ---")
print(best_params_dt)
print("\n--- Best Cross-Validation ROC AUC ---")
print(best_score)

# Training-set metrics: compute everything first, then report.
print("\n--- Training Set Evaluation ---")
y_train_pred = best_model_dt.predict(X_train_smote)
y_train_proba = best_model_dt.predict_proba(X_train_smote)[:, 1]  # P(class 1)
class_report_train = classification_report(y_train_smote, y_train_pred)
roc_auc_train = roc_auc_score(y_train_smote, y_train_proba)
brier_score_train = brier_score_loss(y_train_smote, y_train_proba)
print("Training Classification Report:\n", class_report_train)
print("Training ROC AUC Score:", roc_auc_train)
print("Training Brier Score:", brier_score_train)

# Held-out test-set metrics, same layout as above.
print("\n--- Test Set Evaluation ---")
y_test_pred = best_model_dt.predict(X_test)
y_test_proba = best_model_dt.predict_proba(X_test)[:, 1]  # P(class 1)
class_report_test = classification_report(y_test, y_test_pred)
roc_auc_test = roc_auc_score(y_test, y_test_proba)
brier_score_test = brier_score_loss(y_test, y_test_proba)
print("Test Classification Report:\n", class_report_test)
print("Test ROC AUC Score:", roc_auc_test)
print("Test Brier Score:", brier_score_test)

Fitting 5 folds for each of 100 candidates, totalling 500 fits

--- Best Parameters ---
{'min_samples_split': 2, 'min_samples_leaf': 10, 'max_depth': 40, 'class_weight': None, 'ccp_alpha': 0.0}

--- Best Cross-Validation ROC AUC ---
0.986097405236749

--- Training Set Evaluation ---
Training Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98     13620
           1       0.98      0.98      0.98     13620

    accuracy                           0.98     27240
   macro avg       0.98      0.98      0.98     27240
weighted avg       0.98      0.98      0.98     27240

Training ROC AUC Score: 0.9963511916698472
Training Brier Score: 0.012622118810493747

--- Test Set Evaluation ---
Test Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98      3394
           1       0.61      0.76      0.68       182

    accuracy                           0.96     

In [None]:
# Display the hyperparameter combination stored in best_params_dt.
best_params_dt

{'min_samples_split': 2,
 'min_samples_leaf': 10,
 'max_depth': 40,
 'class_weight': None,
 'ccp_alpha': 0.0}

## Decision Tree Pseudolabelling (using SMOTE Dataset)

In [None]:

# Decision tree rebuilt from the tuned hyperparameters in best_params_dt.
decision_tree_pseudolabelling = DecisionTreeClassifier(
    max_depth=best_params_dt["max_depth"],
    min_samples_split=best_params_dt["min_samples_split"],
    min_samples_leaf=best_params_dt["min_samples_leaf"],
    ccp_alpha=best_params_dt["ccp_alpha"],
    class_weight=best_params_dt['class_weight'],
    random_state=15
)

# Initial fit on the labelled (SMOTE) training data only.
decision_tree_pseudolabelling.fit(X_train_smote, y_train_smote)

confidence_threshold = 0.9  # minimum predicted probability to accept a pseudo-label
n_iterations = 20           # upper bound on refit rounds

# NOTE(review): the pseudo-labelled rows come from X_test, which is also the set
# scored under "Test Set Metrics" below, so those metrics are computed on rows
# the model was refit on. A separate unlabelled pool would keep X_test untouched.
previous_pseudolabels = None
for i in range(n_iterations):
    val_pred_proba = decision_tree_pseudolabelling.predict_proba(X_test)
    # Keep the arg-max class where the model is confident enough, -1 otherwise.
    pseudolabels = np.where(
        val_pred_proba.max(axis=1) > confidence_threshold,
        val_pred_proba.argmax(axis=1),
        -1,
    )

    # Each round starts again from the original training data, so an unchanged
    # pseudo-label vector means the next refit would use exactly the same data
    # (and, with the fixed random_state, reproduce the same tree): stop early.
    if previous_pseudolabels is not None and np.array_equal(pseudolabels, previous_pseudolabels):
        break
    previous_pseudolabels = pseudolabels

    confident_mask = pseudolabels != -1
    pseudolabeled_data = X_test[confident_mask]
    pseudolabel_targets = pseudolabels[confident_mask]

    # pd.concat (rather than np.vstack) keeps the column names, so the model is
    # refit on the same DataFrame representation it is later asked to predict on.
    augmented_train_data = pd.concat([X_train_smote, pseudolabeled_data], ignore_index=True)
    augmented_train_labels = np.concatenate([y_train_smote, pseudolabel_targets])

    decision_tree_pseudolabelling.fit(augmented_train_data, augmented_train_labels)

print("\n--- Pseudolabelling Decision Tree SMOTE Evaluation ---")

# Metrics on the labelled training data (pseudo-labelled rows are not included).
y_train_pred = decision_tree_pseudolabelling.predict(X_train_smote)
y_train_proba = decision_tree_pseudolabelling.predict_proba(X_train_smote)[:, 1]
print("\nTraining Set Metrics:")
print(classification_report(y_train_smote, y_train_pred))
print("ROC AUC Score:", roc_auc_score(y_train_smote, y_train_proba))
print("Brier Score:", brier_score_loss(y_train_smote, y_train_proba))

# Metrics on X_test against the true labels (see the NOTE above the loop).
y_test_pred = decision_tree_pseudolabelling.predict(X_test)
y_test_proba = decision_tree_pseudolabelling.predict_proba(X_test)[:, 1]
print("\nTest Set Metrics:")
print(classification_report(y_test, y_test_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_test_proba))
print("Brier Score:", brier_score_loss(y_test, y_test_proba))


--- Pseudolabelling Decision Tree SMOTE Evaluation ---

Training Set Metrics:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     13620
           1       0.98      0.98      0.98     13620

    accuracy                           0.98     27240
   macro avg       0.98      0.98      0.98     27240
weighted avg       0.98      0.98      0.98     27240

ROC AUC Score: 0.9971087586062648
Brier Score: 0.012558387909913605

Test Set Metrics:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      3394
           1       0.61      0.78      0.69       182

    accuracy                           0.96      3576
   macro avg       0.80      0.88      0.83      3576
weighted avg       0.97      0.96      0.97      3576

ROC AUC Score: 0.9133279802107144
Brier Score: 0.0343062826317488


## Decision Tree with Original/Non-SMOTE Dataset

In [None]:
# Same tuned hyperparameters, this time fitted on the original (non-SMOTE)
# training data. best_params_dt holds exactly the five searched constructor
# arguments, so it can be unpacked straight into the estimator.
decision_tree_non_smote = DecisionTreeClassifier(**best_params_dt, random_state=15)
decision_tree_non_smote.fit(X_train, y_train)

print("\n--- Decision Tree Non-SMOTE Evaluation ---")

# Training-set predictions and class-1 probabilities.
y_train_proba = decision_tree_non_smote.predict_proba(X_train)[:, 1]
y_train_pred = decision_tree_non_smote.predict(X_train)
print("\nTraining Set Metrics:")
print(classification_report(y_train, y_train_pred))
print("ROC AUC Score:", roc_auc_score(y_train, y_train_proba))
print("Brier Score:", brier_score_loss(y_train, y_train_proba))

# Test-set predictions and class-1 probabilities.
y_test_proba = decision_tree_non_smote.predict_proba(X_test)[:, 1]
y_test_pred = decision_tree_non_smote.predict(X_test)
print("\nTest Set Metrics:")
print(classification_report(y_test, y_test_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_test_proba))
print("Brier Score:", brier_score_loss(y_test, y_test_proba))


--- Decision Tree Non-SMOTE Evaluation ---

Training Set Metrics:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     13620
           1       0.88      0.69      0.77       683

    accuracy                           0.98     14303
   macro avg       0.93      0.84      0.88     14303
weighted avg       0.98      0.98      0.98     14303

ROC AUC Score: 0.9935934150751522
Brier Score: 0.013226738406949145

Test Set Metrics:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3394
           1       0.78      0.63      0.70       182

    accuracy                           0.97      3576
   macro avg       0.88      0.81      0.84      3576
weighted avg       0.97      0.97      0.97      3576

ROC AUC Score: 0.8995439592817318
Brier Score: 0.02379026840292163


## Decision Tree Pseudolabelling with Original/Non-SMOTE Dataset

In [None]:
# Decision tree rebuilt from the tuned hyperparameters in best_params_dt.
decision_tree_pseudolabelling_non_smote = DecisionTreeClassifier(
    max_depth=best_params_dt["max_depth"],
    min_samples_split=best_params_dt["min_samples_split"],
    min_samples_leaf=best_params_dt["min_samples_leaf"],
    ccp_alpha=best_params_dt["ccp_alpha"],
    class_weight=best_params_dt['class_weight'],
    random_state=15
)

# Initial fit on the original (non-SMOTE) labelled training data only.
decision_tree_pseudolabelling_non_smote.fit(X_train, y_train)

confidence_threshold = 0.9  # minimum predicted probability to accept a pseudo-label
n_iterations = 20           # upper bound on refit rounds

# NOTE(review): the pseudo-labelled rows come from X_test, which is also the set
# scored under "Test Set Metrics" below, so those metrics are computed on rows
# the model was refit on. A separate unlabelled pool would keep X_test untouched.
previous_pseudolabels = None
for i in range(n_iterations):

    val_pred_proba = decision_tree_pseudolabelling_non_smote.predict_proba(X_test)

    # Keep the arg-max class where the model is confident enough, -1 otherwise.
    pseudolabels = np.where(
        val_pred_proba.max(axis=1) > confidence_threshold,
        val_pred_proba.argmax(axis=1),
        -1,
    )

    # Each round starts again from the original training data, so an unchanged
    # pseudo-label vector means the next refit would use exactly the same data
    # (and, with the fixed random_state, reproduce the same tree): stop early.
    if previous_pseudolabels is not None and np.array_equal(pseudolabels, previous_pseudolabels):
        break
    previous_pseudolabels = pseudolabels

    confident_mask = pseudolabels != -1
    pseudolabeled_data = X_test[confident_mask]
    pseudolabel_targets = pseudolabels[confident_mask]

    # pd.concat (rather than np.vstack) keeps the column names, so the model is
    # refit on the same DataFrame representation it is later asked to predict on.
    augmented_train_data = pd.concat([X_train, pseudolabeled_data], ignore_index=True)
    augmented_train_labels = np.concatenate([y_train, pseudolabel_targets])

    decision_tree_pseudolabelling_non_smote.fit(augmented_train_data, augmented_train_labels)

print("\n--- Decision Tree Pseudolabelling NON-SMOTE Evaluation ---")

# Metrics on the labelled training data (pseudo-labelled rows are not included).
y_train_pred = decision_tree_pseudolabelling_non_smote.predict(X_train)
y_train_proba = decision_tree_pseudolabelling_non_smote.predict_proba(X_train)[:, 1]
print("\nTraining Set Metrics:")
print(classification_report(y_train, y_train_pred))
print("ROC AUC Score:", roc_auc_score(y_train, y_train_proba))
print("Brier Score:", brier_score_loss(y_train, y_train_proba))

# Metrics on X_test against the true labels (see the NOTE above the loop).
y_test_pred = decision_tree_pseudolabelling_non_smote.predict(X_test)
y_test_proba = decision_tree_pseudolabelling_non_smote.predict_proba(X_test)[:, 1]
print("\nTest Set Metrics:")
print(classification_report(y_test, y_test_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_test_proba))
print("Brier Score:", brier_score_loss(y_test, y_test_proba))


--- Decision Tree Pseudolabelling NON-SMOTE Evaluation ---

Training Set Metrics:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     13620
           1       0.91      0.69      0.79       683

    accuracy                           0.98     14303
   macro avg       0.95      0.85      0.89     14303
weighted avg       0.98      0.98      0.98     14303

ROC AUC Score: 0.9945103230758315
Brier Score: 0.012258627644121736

Test Set Metrics:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3394
           1       0.82      0.63      0.71       182

    accuracy                           0.97      3576
   macro avg       0.90      0.81      0.85      3576
weighted avg       0.97      0.97      0.97      3576

ROC AUC Score: 0.9106276752122363
Brier Score: 0.021828664882374282


# LightGBM



## LightGBM Hyperparameter Tuning using SMOTE Dataset

In [None]:
# LightGBM refuses feature names that contain JSON special characters; fitting
# on the raw column names raised an error, which is why this helper exists.

def clean_feature_names(df):
    """Replace characters LightGBM rejects in feature names with "_".

    The characters , " { } [ ] and : are each mapped to an underscore. Column
    labels that are not strings are converted with ``str`` first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels should be sanitised.

    Returns
    -------
    pandas.DataFrame
        The *same* object that was passed in: the columns are reassigned in
        place, so the caller's original frame is renamed as well.
    """
    # One translation table handles every forbidden character in a single pass.
    forbidden_to_underscore = str.maketrans({char: "_" for char in ',"{}[]:'})
    df.columns = [str(col).translate(forbidden_to_underscore) for col in df.columns]
    return df

# Sanitise the column names of all three feature frames when working with
# DataFrames. clean_feature_names reassigns the columns on the frame it is
# given and returns that frame, so each *_col_edited name refers to the same
# object as its source variable.
if isinstance(X_train_smote, pd.DataFrame):
    X_train_smote_col_edited, X_train_col_edited, X_test_col_edited = (
        clean_feature_names(frame) for frame in (X_train_smote, X_train, X_test)
    )

In [None]:

# Search space for LightGBM; the randomized search samples one value per key.
param_dist_lgbm = {
    'n_estimators': [100, 200, 300],         # Number of boosting rounds
    'max_depth': [5, 10, 20],                # Maximum tree depth
    'learning_rate': [0.05, 0.1, 0.3],       # Learning rate for boosting
    'subsample': [0.8, 1.0],                 # Row subsampling (bagging) fraction
    # LightGBM only applies `subsample` when bagging is enabled through a
    # non-zero `subsample_freq` (default 0 = disabled). It is listed here as a
    # single-valued entry so that it is part of best_params_lgbm and therefore
    # carried over to every model rebuilt from those parameters.
    'subsample_freq': [1],
    'colsample_bytree': [0.8, 1.0],          # Feature fraction (similar to colsample_bytree in XGBoost)
    'reg_alpha': [0.1, 0.3, 0.5],            # L1 regularization
    'reg_lambda': [0.1, 0.3, 0.5],           # L2 regularization
    'min_child_samples': [10, 30, 50, 100],  # Minimum number of data points in a leaf
}


lgbm_smote = LGBMClassifier(random_state=15)

# 100 sampled candidates, each scored by 5-fold cross-validated ROC AUC.
# NOTE(review): if the SMOTE frame was oversampled before this cell (as its name
# suggests), synthetic rows can sit in a validation fold next to the rows they
# were generated from, which would make the CV score optimistic -- confirm.
random_search = RandomizedSearchCV(
    estimator=lgbm_smote,
    param_distributions=param_dist_lgbm,
    n_iter=100,
    scoring="roc_auc",
    cv=5,
    random_state=15,
    n_jobs=-1,
    verbose=2
)

random_search.fit(X_train_smote_col_edited, y_train_smote)

# Keep the refit winner, its parameters and its mean CV score for later cells.
best_model_lgbm = random_search.best_estimator_
best_params_lgbm = random_search.best_params_
best_score = random_search.best_score_

print("\n--- Best Parameters ---")
print(best_params_lgbm)
print("\n--- Best Cross-Validation ROC AUC ---")
print(best_score)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[LightGBM] [Info] Number of positive: 13620, number of negative: 13620
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.251869 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 138990
[LightGBM] [Info] Number of data points in the train set: 27240, number of used features: 586
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

--- Best Parameters ---
{'subsample': 0.8, 'reg_lambda': 0.1, 'reg_alpha': 0.1, 'n_estimators': 300, 'min_child_samples': 10, 'max_depth': 5, 'learning_rate': 0.3, 'colsample_bytree': 1.0}

--- Best Cross-Validation ROC AUC ---
0.9998655287960825


In [None]:
# Training-set metrics for the tuned LightGBM model: compute first, then report.
print("\n--- Training Set Evaluation ---")
y_train_proba = best_model_lgbm.predict_proba(X_train_smote_col_edited)[:, 1]  # P(class 1)
y_train_pred = best_model_lgbm.predict(X_train_smote_col_edited)
class_report_train = classification_report(y_train_smote, y_train_pred)
roc_auc_train = roc_auc_score(y_train_smote, y_train_proba)
brier_score_train = brier_score_loss(y_train_smote, y_train_proba)
print("Training Classification Report:\n", class_report_train)
print("Training ROC AUC Score:", roc_auc_train)
print("Training Brier Score:", brier_score_train)

# Held-out test-set metrics, same layout as above.
print("\n--- Test Set Evaluation ---")
y_test_proba = best_model_lgbm.predict_proba(X_test_col_edited)[:, 1]  # P(class 1)
y_test_pred = best_model_lgbm.predict(X_test_col_edited)
class_report_test = classification_report(y_test, y_test_pred)
roc_auc_test = roc_auc_score(y_test, y_test_proba)
brier_score_test = brier_score_loss(y_test, y_test_proba)
print("Test Classification Report:\n", class_report_test)
print("Test ROC AUC Score:", roc_auc_test)
print("Test Brier Score:", brier_score_test)


--- Training Set Evaluation ---
Training Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     13620
           1       1.00      1.00      1.00     13620

    accuracy                           1.00     27240
   macro avg       1.00      1.00      1.00     27240
weighted avg       1.00      1.00      1.00     27240

Training ROC AUC Score: 1.0
Training Brier Score: 1.0553367262462046e-06

--- Test Set Evaluation ---
Test Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      3394
           1       0.94      0.80      0.86       182

    accuracy                           0.99      3576
   macro avg       0.97      0.90      0.93      3576
weighted avg       0.99      0.99      0.99      3576

Test ROC AUC Score: 0.9899564195380341
Test Brier Score: 0.010832074038771188


## LightGBM Pseudolabelling with SMOTE Dataset

In [None]:
confidence_threshold = 0.9  # minimum predicted probability to accept a pseudo-label
n_iterations = 20           # upper bound on refit rounds

# Rebuild the tuned model. random_state matches the estimator used during the
# search; verbose=-1 silences LightGBM's per-fit logging, which otherwise
# floods the cell output across the repeated refits below.
lgbm_pseudolabelling = LGBMClassifier(**best_params_lgbm, random_state=15, verbose=-1)

# Initial fit on the labelled (SMOTE) training data only.
lgbm_pseudolabelling.fit(X_train_smote_col_edited, y_train_smote)

# NOTE(review): the pseudo-labelled rows come from the test frame, which is also
# the set scored under "Test Set Evaluation" below, so those metrics are computed
# on rows the model was refit on. A separate unlabelled pool would avoid this.
previous_pseudolabels = None
for i in range(n_iterations):

    test_pred_proba = lgbm_pseudolabelling.predict_proba(X_test_col_edited)

    # Keep the arg-max class where the model is confident enough, -1 otherwise.
    pseudolabels = np.where(
        test_pred_proba.max(axis=1) > confidence_threshold,
        test_pred_proba.argmax(axis=1),
        -1,
    )

    # Each round starts again from the original training data, so an unchanged
    # pseudo-label vector means the next refit would see exactly the same
    # training set as the previous one: stop instead of repeating it.
    if previous_pseudolabels is not None and np.array_equal(pseudolabels, previous_pseudolabels):
        break
    previous_pseudolabels = pseudolabels

    confident_mask = pseudolabels != -1
    pseudolabeled_data = X_test_col_edited[confident_mask]
    pseudolabel_targets = pseudolabels[confident_mask]

    # pd.concat (rather than np.vstack) keeps the column names, so the model is
    # refit on the same DataFrame representation it is later asked to predict on.
    augmented_train_data = pd.concat([X_train_smote_col_edited, pseudolabeled_data], ignore_index=True)
    augmented_train_labels = np.concatenate([y_train_smote, pseudolabel_targets])
    lgbm_pseudolabelling.fit(augmented_train_data, augmented_train_labels)


# Metrics on the labelled training data (pseudo-labelled rows are not included).
print("\n--- Training Set Evaluation ---")
y_train_pred = lgbm_pseudolabelling.predict(X_train_smote_col_edited)
y_train_proba = lgbm_pseudolabelling.predict_proba(X_train_smote_col_edited)[:, 1]
class_report_train = classification_report(y_train_smote, y_train_pred)
print("Training Classification Report:\n", class_report_train)
roc_auc_train = roc_auc_score(y_train_smote, y_train_proba)
print("Training ROC AUC Score:", roc_auc_train)
brier_score_train = brier_score_loss(y_train_smote, y_train_proba)
print("Training Brier Score:", brier_score_train)

# Metrics on the test frame against the true labels (see the NOTE above the loop).
print("\n--- Test Set Evaluation ---")
y_test_pred = lgbm_pseudolabelling.predict(X_test_col_edited)
y_test_proba = lgbm_pseudolabelling.predict_proba(X_test_col_edited)[:, 1]
class_report_test = classification_report(y_test, y_test_pred)
print("Test Classification Report:\n", class_report_test)
roc_auc_test = roc_auc_score(y_test, y_test_proba)
print("Test ROC AUC Score:", roc_auc_test)
brier_score_test = brier_score_loss(y_test, y_test_proba)
print("Test Brier Score:", brier_score_test)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Number of positive: 13761, number of negative: 17037
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.434461 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 139328
[LightGBM] [Info] Number of data points in the train set: 30798, number of used features: 586
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.446815 -> initscore=-0.213549
[LightGBM] [Info] Start training from score -0.213549
[LightGBM] [Info] Number of positive: 13762, number of negative: 17037
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.277332 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 139329
[LightGBM] [Info] Number of data points in the train set: 30799, number of used features: 586
[LightGBM] 

## LightGBM Fitting using Original/Non-SMOTE Dataset

In [None]:
# Tuned LightGBM hyperparameters, fitted on the original (non-SMOTE) training
# data. random_state is passed explicitly so this model is seeded the same way
# as the estimator used during the hyperparameter search.
lgbm_non_smote = LGBMClassifier(**best_params_lgbm, random_state=15)

lgbm_non_smote.fit(X_train_col_edited, y_train)

print("\n--- LGBM Non Smote Evaluation ---")

# Training-set predictions and class-1 probabilities.
y_train_pred = lgbm_non_smote.predict(X_train_col_edited)
y_train_proba = lgbm_non_smote.predict_proba(X_train_col_edited)[:, 1]
print("\nTraining Set Metrics:")
print(classification_report(y_train, y_train_pred))
print("ROC AUC Score:", roc_auc_score(y_train, y_train_proba))
print("Brier Score:", brier_score_loss(y_train, y_train_proba))

# Test-set predictions and class-1 probabilities.
y_test_pred = lgbm_non_smote.predict(X_test_col_edited)
y_test_proba = lgbm_non_smote.predict_proba(X_test_col_edited)[:, 1]
print("\nTest Set Metrics:")
print(classification_report(y_test, y_test_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_test_proba))
print("Brier Score:", brier_score_loss(y_test, y_test_proba))

[LightGBM] [Info] Number of positive: 683, number of negative: 13620
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.109231 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 123213
[LightGBM] [Info] Number of data points in the train set: 14303, number of used features: 584
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047752 -> initscore=-2.992800
[LightGBM] [Info] Start training from score -2.992800

--- LGBM Non Smote Evaluation ---

Training Set Metrics:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13620
           1       1.00      1.00      1.00       683

    accuracy                           1.00     14303
   macro avg       1.00      1.00      1.00     14303
weighted avg       1.00      1.00      1.00     14303

ROC AUC Score: 1.0
Brier Score: 2.48414068492782e-06

Te

## LightGBM Pseudolabelling with Original/Non-SMOTE Dataset

In [None]:
# Rebuild the tuned model. random_state matches the estimator used during the
# search; verbose=-1 silences LightGBM's per-fit logging, which otherwise
# floods the cell output across the repeated refits below.
lgbm_pseudolabelling_non_smote = LGBMClassifier(**best_params_lgbm, random_state=15, verbose=-1)

# Initial fit on the original (non-SMOTE) labelled training data only.
lgbm_pseudolabelling_non_smote.fit(X_train_col_edited, y_train)

confidence_threshold = 0.9  # minimum predicted probability to accept a pseudo-label
n_iterations = 20           # upper bound on refit rounds

# NOTE(review): the pseudo-labelled rows come from the test frame, which is also
# the set scored under "Test Set Metrics" below, so those metrics are computed
# on rows the model was refit on. A separate unlabelled pool would avoid this.
previous_pseudolabels = None
for i in range(n_iterations):

    val_pred_proba = lgbm_pseudolabelling_non_smote.predict_proba(X_test_col_edited)

    # Keep the arg-max class where the model is confident enough, -1 otherwise.
    pseudolabels = np.where(
        val_pred_proba.max(axis=1) > confidence_threshold,
        val_pred_proba.argmax(axis=1),
        -1,
    )

    # Each round starts again from the original training data, so an unchanged
    # pseudo-label vector means the next refit would see exactly the same
    # training set as the previous one: stop instead of repeating it.
    if previous_pseudolabels is not None and np.array_equal(pseudolabels, previous_pseudolabels):
        break
    previous_pseudolabels = pseudolabels

    confident_mask = pseudolabels != -1
    pseudolabeled_data = X_test_col_edited[confident_mask]
    pseudolabel_targets = pseudolabels[confident_mask]

    # pd.concat (rather than np.vstack) keeps the column names, so the model is
    # refit on the same DataFrame representation it is later asked to predict on.
    augmented_train_data = pd.concat([X_train_col_edited, pseudolabeled_data], ignore_index=True)
    augmented_train_labels = np.concatenate([y_train, pseudolabel_targets])


    lgbm_pseudolabelling_non_smote.fit(augmented_train_data, augmented_train_labels)

print("\n--- LGBM Pseudoabelling Original/Non-SMOTE Evaluation ---")

# Metrics on the labelled training data (pseudo-labelled rows are not included).
y_train_pred = lgbm_pseudolabelling_non_smote.predict(X_train_col_edited)
y_train_proba = lgbm_pseudolabelling_non_smote.predict_proba(X_train_col_edited)[:, 1]
print("\nTraining Set Metrics:")
print(classification_report(y_train, y_train_pred))
print("ROC AUC Score:", roc_auc_score(y_train, y_train_proba))
print("Brier Score:", brier_score_loss(y_train, y_train_proba))

# Metrics on the test frame against the true labels (see the NOTE above the loop).
y_test_pred = lgbm_pseudolabelling_non_smote.predict(X_test_col_edited)
y_test_proba = lgbm_pseudolabelling_non_smote.predict_proba(X_test_col_edited)[:, 1]
print("\nTest Set Metrics:")
print(classification_report(y_test, y_test_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_test_proba))
print("Brier Score:", brier_score_loss(y_test, y_test_proba))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Number of positive: 817, number of negative: 17044
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.137065 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 126710
[LightGBM] [Info] Number of data points in the train set: 17861, number of used features: 584
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.045742 -> initscore=-3.037914
[LightGBM] [Info] Start training from score -3.037914
[LightGBM] [Info] Number of positive: 817, number of negative: 17044
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.200761 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 126710
[LightGBM] [Info] Number of data points in th