In [None]:
! pip install category_encoders

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
from os.path import join
from google.colab import drive
import xgboost as xgb
from imblearn.over_sampling import RandomOverSampler
import category_encoders as ce


print('Modules imported')

Modules imported


In [3]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
ROOT = '/content/gdrive/MyDrive/data_finance'
os.listdir(ROOT)

['finance.csv']

Reading data

In [5]:
dt = pd.read_csv(join(ROOT, 'finance.csv'))

Resampling function, otherwise the heavy class imbalance can introduce bias to the classifier and utterly wreck it.

In [8]:
def resamp(dt, target='fraud'):
  X = dt.drop(target, axis=1)
  y = dt[target]
  # Apply random oversampling
  ovr = RandomOverSampler(random_state=42)
  X_resampled, y_resampled = ovr.fit_resample(X, y)

  # Create a new DataFrame with the resampled data
  resDt = pd.concat([X_resampled, y_resampled], axis=1)
  return resDt


In [10]:
vcNoSampl = dt['fraud'].value_counts(normalize=True)*100
print(f"Class percentages: {round(vcNoSampl[0],2)}, {round(vcNoSampl[1],2)}")
dtSample = resamp(dt)
vcSampl = dtSample['fraud'].value_counts(normalize=True)*100
print(f"Class percentages: {round(vcSampl[0],2)}, {round(vcSampl[1],2)}")

Class percentages: 98.79, 1.21
Class percentages: 50.0, 50.0


### Training XGBoost
------------------------------------------------

In [None]:
xgbC = xgb.XGBClassifier()
# Assuming you have a DataFrame called 'dt' with the target column named 'fraud'


# Separate the features (X) and the target (y)
X = dtSample.drop('fraud', axis=1)
y = dtSample['fraud']
non_numeric_cols = dtSample.select_dtypes(exclude=['number']).columns.tolist()
print(non_numeric_cols)
# X = dtSample.drop('fraud', axis=1)
# y = dtSample['fraud']
# split data in train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Perform target encoding
encoder = ce.TargetEncoder(cols=non_numeric_cols)
X_train_encoded = encoder.fit_transform(X_train, y_train)
X_test_encoded = encoder.transform(X_test)

['customer', 'age', 'gender', 'zipcodeOri', 'merchant', 'zipMerchant', 'category']


In [None]:
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(X_train_encoded, y_train)


In [None]:
y_pred = xgb_classifier.predict(X_test_encoded)

# Evaluate the classifier
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("\nConfusion Matrix:\n")
print(cm)
print("\n\nClassification Report:")
print(report)


Confusion Matrix:

[[116595   1030]
 [     0 117353]]


Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00    117625
           1       0.99      1.00      1.00    117353

    accuracy                           1.00    234978
   macro avg       1.00      1.00      1.00    234978
weighted avg       1.00      1.00      1.00    234978



Function to preprocess incoming entry