In [None]:
# https://www.kaggle.com/datasets/kartik2112/fraud-detection?select=fraudTrain.csv
# dataset preprocessing
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Read the train and test splits from CSV files in the current working directory.
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ("fraudTrain.csv", "fraudTest.csv")
)

# data preprocessing plus exploratory data analysis
def preprocess(df):
    """Return a time-ordered copy of ``df`` reduced to the baseline columns.

    The cardholder name columns (``first``/``last``) are discarded rather than
    combined. ``trans_date_trans_time`` is parsed to datetimes, used to sort
    the rows in ascending order, and then dropped together with the other
    columns excluded from the baseline run (they can be added back later to
    experiment with richer feature sets).

    The caller's frame is not modified, and the original row index labels are
    kept (the index is not reset after sorting).
    """
    # Columns left out of the baseline feature set; removed only after sorting
    # because the timestamp is needed for the ordering.
    baseline_excluded = [
        'trans_date_trans_time', 'job', 'dob', 'unix_time', 'city_pop',
        'category', 'street', 'city', 'state', 'zip',
        'gender', 'cc_num', 'trans_num',
    ]

    return (
        df.drop(columns=['first', 'last'])
        .assign(
            trans_date_trans_time=lambda frame: pd.to_datetime(
                frame['trans_date_trans_time']
            )
        )
        .sort_values(by="trans_date_trans_time", ascending=True)
        .drop(columns=baseline_excluded)
    )

# Run the same cleaning over both splits (train first, then test).
df_train, df_test = preprocess(df_train), preprocess(df_test)

# Split each frame into its feature columns and the 'is_fraud' label.
# training data
X_training, y_training = df_train.drop(columns='is_fraud'), df_train['is_fraud']

# testing data
X_test, y_test = df_test.drop(columns='is_fraud'), df_test['is_fraud']


In [None]:
# One-hot encode the 'merchant' column. The encoder is fitted on the training
# frame only; handle_unknown='ignore' keeps transform() from raising on
# merchants that were not seen during fit.
ohe_merchant = OneHotEncoder(handle_unknown='ignore').fit(df_train[['merchant']])
X_training_ohe, X_testing_ohe = (
    ohe_merchant.transform(split[['merchant']]) for split in (X_training, X_test)
)

In [4]:
# logistic regression model

from sklearn.metrics import classification_report, confusion_matrix, average_precision_score
from sklearn.linear_model import LogisticRegression

# Logistic regression on the one-hot merchant features.
# class_weight='balanced' lets the model increase weighting for fraud cases.
logistic_regression = LogisticRegression(
    class_weight='balanced',
    max_iter=10000,
    random_state=0
)

logistic_regression.fit(X_training_ohe, y_training)

# Score the test set once: hard labels are derived from the fraud probability
# (column 1 of predict_proba) with a custom cutoff, so no separate .predict()
# pass is needed.
lr_prob = logistic_regression.predict_proba(X_testing_ohe)[:, 1]

# Threshold is set to get some amount in each category of the confusion matrix.
# NOTE(review): this cutoff appears to be picked by looking at test-set results,
# so the thresholded metrics below are optimistic; prefer tuning it on a
# validation split. PRAUC is threshold-independent and unaffected.
threshold = 0.4
lr_pred = (lr_prob >= threshold).astype(int)

# print results (target_names is reused by the later model cells)
target_names = ["legit", "fraud"]
print(classification_report(y_test, lr_pred, target_names=target_names, zero_division=0))
print("PRAUC: ", average_precision_score(y_test, lr_prob))

# Confusion matrix as a labelled frame: rows are true classes, columns are predictions.
cm = confusion_matrix(y_test, lr_pred)
cm_df = pd.DataFrame(
    cm,
    index=["Actual Legit", "Actual Fraud"],
    columns=["Predicted Legit", "Predicted Fraud"]
)

print(cm_df)

              precision    recall  f1-score   support

       legit       1.00      0.55      0.71    553574
       fraud       0.01      0.77      0.01      2145

    accuracy                           0.55    555719
   macro avg       0.50      0.66      0.36    555719
weighted avg       0.99      0.55      0.71    555719

PRAUC:  0.00910538255392279
              Predicted Legit  Predicted Fraud
Actual Legit           304399           249175
Actual Fraud              498             1647


In [5]:
# SVM model
# LinearSVC is used because the kernel-based SVC scales very poorly to a dataset of this size
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler

# Class-weighted linear SVM trained on the one-hot merchant features.
svm = LinearSVC(
    class_weight="balanced",
    max_iter=10000,
    random_state=0
)

svm.fit(X_training_ohe, y_training)

# Hard labels are derived by thresholding decision_function scores, so no
# separate .predict() pass is needed. These scores are signed margins, not
# probabilities, so the cutoff below is on the margin scale.
decision_scores = svm.decision_function(X_testing_ohe)

# Threshold is set to get some amount in each category of the confusion matrix.
# NOTE(review): this cutoff appears to be picked by looking at test-set results;
# prefer tuning it on a validation split.
threshold = 0.2
svm_pred = (decision_scores >= threshold).astype(int)

# target_names is defined in the logistic regression cell.
print(classification_report(y_test, svm_pred, target_names=target_names, zero_division=0))
print("PRAUC: ", average_precision_score(y_test, decision_scores))

# Confusion matrix as a labelled frame: rows are true classes, columns are predictions.
cm_svm = confusion_matrix(y_test, svm_pred)
cm_svm_df = pd.DataFrame(
    cm_svm,
    index=["Actual Legit", "Actual Fraud"],
    columns=["Predicted Legit", "Predicted Fraud"]
)

print(cm_svm_df)


              precision    recall  f1-score   support

       legit       1.00      0.76      0.86    553574
       fraud       0.01      0.60      0.02      2145

    accuracy                           0.76    555719
   macro avg       0.50      0.68      0.44    555719
weighted avg       0.99      0.76      0.86    555719

PRAUC:  0.009063840551307855
              Predicted Legit  Predicted Fraud
Actual Legit           421531           132043
Actual Fraud              850             1295


In [6]:
# neural network (multilayer perceptron) (~2 min to run)
from sklearn.neural_network import MLPClassifier
from sklearn.utils.class_weight import compute_sample_weight

# Baseline hyperparameters for the MLP.
# The scaler uses with_mean=False, i.e. it only rescales and does not centre
# the one-hot features.
# NOTE(review): unlike the other models in this notebook, no class re-weighting
# is applied here.
mlp = make_pipeline(StandardScaler(with_mean=False), MLPClassifier(
    hidden_layer_sizes=(128, 64), # larger dataset requires more nodes in each layer
    activation='relu',
    solver='adam',
    alpha=0.001,
    max_iter=500,
    random_state=0,
))

mlp.fit(X_training_ohe, y_training)

# Score the test set once: hard labels are derived from the fraud probability
# (column 1 of predict_proba), so no separate .predict() pass is needed.
mlp_prob = mlp.predict_proba(X_testing_ohe)[:, 1]

# Threshold is set to get some amount in each category of the confusion matrix.
# 0.05 and above results in model not finding anything.
# NOTE(review): this cutoff appears to be picked by looking at test-set results;
# prefer tuning it on a validation split.
threshold = 0.01
mlp_pred = (mlp_prob >= threshold).astype(int)

# target_names is defined in the logistic regression cell.
print(classification_report(y_test, mlp_pred, target_names=target_names, zero_division=0))
print("PRAUC: ", average_precision_score(y_test, mlp_prob))

# Confusion matrix as a labelled frame: rows are true classes, columns are predictions.
cm_mlp = confusion_matrix(y_test, mlp_pred)
cm_mlp_df = pd.DataFrame(
    cm_mlp,
    index=["Actual Legit", "Actual Fraud"],
    columns=["Predicted Legit", "Predicted Fraud"]
)

print(cm_mlp_df)

              precision    recall  f1-score   support

       legit       1.00      0.79      0.88    553574
       fraud       0.01      0.56      0.02      2145

    accuracy                           0.79    555719
   macro avg       0.50      0.67      0.45    555719
weighted avg       0.99      0.79      0.88    555719

PRAUC:  0.009030535032463377
              Predicted Legit  Predicted Fraud
Actual Legit           435305           118269
Actual Fraud              938             1207


In [26]:
# XGBoost
from xgboost import XGBClassifier

# Class counts in the training labels, used to up-weight the fraud class.
num_pos = y_training.sum()
num_neg = len(y_training) - num_pos
xgb = XGBClassifier(
    objective='binary:logistic',
    n_estimators=500,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=0,
    tree_method='hist',
    scale_pos_weight=num_neg / num_pos, # balance the classes to have more weight on fraud cases
)

xgb.fit(X_training_ohe, y_training)

# Score the test set once: hard labels are derived from the fraud probability
# (column 1 of predict_proba), so no separate .predict() pass is needed.
xgb_prob = xgb.predict_proba(X_testing_ohe)[:, 1]

# Threshold is set to get some amount in each category of the confusion matrix.
# NOTE(review): this cutoff appears to be picked by looking at test-set results;
# prefer tuning it on a validation split.
threshold = 0.4
xgb_pred = (xgb_prob >= threshold).astype(int)

# target_names is defined in the logistic regression cell.
print(classification_report(y_test, xgb_pred, target_names=target_names, zero_division=0))
print("PRAUC: ", average_precision_score(y_test, xgb_prob))

# Confusion matrix as a labelled frame: rows are true classes, columns are predictions.
cm_xgb = confusion_matrix(y_test, xgb_pred)
cm_xgb_df = pd.DataFrame(
    cm_xgb,
    index=["Actual Legit", "Actual Fraud"],
    columns=["Predicted Legit", "Predicted Fraud"]
)

print(cm_xgb_df)

              precision    recall  f1-score   support

       legit       1.00      0.19      0.33    553574
       fraud       0.00      0.93      0.01      2145

    accuracy                           0.20    555719
   macro avg       0.50      0.56      0.17    555719
weighted avg       0.99      0.20      0.32    555719

PRAUC:  0.008352688758024767
              Predicted Legit  Predicted Fraud
Actual Legit           107595           445979
Actual Fraud              151             1994


In [8]:
# Random Forest (~1-2 min)
from sklearn.ensemble import RandomForestClassifier

# Class-weighted random forest on the one-hot merchant features.
# NOTE(review): random_state is 42 here while the other models use 0; left
# unchanged so the recorded results stay reproducible.
rf = RandomForestClassifier(
    n_estimators=500,
    min_samples_split=2,
    random_state=42,
    class_weight='balanced',
    max_depth=15
)

# baseline test
rf.fit(X_training_ohe, y_training)

# Score the test set once: hard labels are derived from the fraud probability
# (column 1 of predict_proba), so no separate .predict() pass over the
# 500 trees is needed.
rf_prob = rf.predict_proba(X_testing_ohe)[:, 1]

# Threshold is set to get some amount in each category of the confusion matrix.
# NOTE(review): this cutoff appears to be picked by looking at test-set results;
# prefer tuning it on a validation split.
threshold = 0.45
rf_pred = (rf_prob >= threshold).astype(int)

# target_names is defined in the logistic regression cell.
print(classification_report(y_test, rf_pred, target_names=target_names, zero_division=0))
print("PRAUC: ", average_precision_score(y_test, rf_prob))

# Confusion matrix as a labelled frame: rows are true classes, columns are predictions.
cm_rf = confusion_matrix(y_test, rf_pred)
cm_rf_df = pd.DataFrame(
    cm_rf,
    index=["Actual Legit", "Actual Fraud"],
    columns=["Predicted Legit", "Predicted Fraud"]
)

print(cm_rf_df)

              precision    recall  f1-score   support

       legit       1.00      0.01      0.01    553574
       fraud       0.00      1.00      0.01      2145

    accuracy                           0.01    555719
   macro avg       0.50      0.50      0.01    555719
weighted avg       0.99      0.01      0.01    555719

PRAUC:  0.008063305459718513
              Predicted Legit  Predicted Fraud
Actual Legit             4161           549413
Actual Fraud                5             2140
