In [2]:
# https://www.kaggle.com/datasets/kartik2112/fraud-detection?select=fraudTrain.csv
# dataset preprocessing
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

df_train = pd.read_csv("fraudTrain.csv")
df_test = pd.read_csv("fraudTest.csv")

# data preprocessing plus exploratory data analysis
def preprocess(df):
    # combine first and last name to one column
    #df['name'] = df['first'] + df['last']
    df = df.drop(columns=['first','last'])

    # combine address into

    # drop time, also drop data for a baseline test (can manipulate what data to add back in later)
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
    df = df.sort_values(by="trans_date_trans_time", ascending=True)
    df = df.drop(columns=['trans_date_trans_time', 'job', 'dob', 'unix_time', 'city_pop', 'category', 'street', 'city', 'state', 'zip'
                          , 'gender', 'cc_num', 'trans_num'])

    return df

df_train = preprocess(df_train)
df_test = preprocess(df_test)

# training data
X_training = df_train.drop('is_fraud', axis=1)
y_training = df_train['is_fraud']

# testing data
X_test = df_test.drop('is_fraud', axis=1)
y_test = df_test['is_fraud']


In [3]:
# encoding for categorical data
ct = ColumnTransformer(
    transformers=[
        ('merchants', OneHotEncoder(handle_unknown='ignore'), ['merchant'])
    ],
    remainder='passthrough'
)

ct.fit(X_training)

X_training_ohe = ct.transform(X_training)
X_testing_ohe  = ct.transform(X_test)
print(ct.get_feature_names_out())

['merchants__merchant_fraud_Abbott-Rogahn'
 'merchants__merchant_fraud_Abbott-Steuber'
 'merchants__merchant_fraud_Abernathy and Sons'
 'merchants__merchant_fraud_Abshire PLC'
 'merchants__merchant_fraud_Adams, Kovacek and Kuhlman'
 'merchants__merchant_fraud_Adams-Barrows'
 'merchants__merchant_fraud_Altenwerth, Cartwright and Koss'
 'merchants__merchant_fraud_Altenwerth-Kilback'
 'merchants__merchant_fraud_Ankunding LLC'
 'merchants__merchant_fraud_Ankunding-Carroll'
 'merchants__merchant_fraud_Armstrong, Walter and Gottlieb'
 'merchants__merchant_fraud_Auer LLC'
 'merchants__merchant_fraud_Auer-Mosciski'
 'merchants__merchant_fraud_Auer-West'
 'merchants__merchant_fraud_Bahringer Group'
 'merchants__merchant_fraud_Bahringer, Bergnaum and Quitzon'
 'merchants__merchant_fraud_Bahringer, Osinski and Block'
 'merchants__merchant_fraud_Bahringer, Schoen and Corkery'
 'merchants__merchant_fraud_Bahringer-Larson'
 'merchants__merchant_fraud_Bahringer-Streich'
 'merchants__merchant_fraud_Ba

In [4]:
# logistic regression model

from sklearn.metrics import classification_report, confusion_matrix, average_precision_score
from sklearn.linear_model import LogisticRegression


# use balanced to let the model increase weighting for fraud cases
logistic_regression = LogisticRegression(
    class_weight='balanced',
    max_iter=10000,
    random_state=0
)

logistic_regression.fit(X_training_ohe, y_training)
lr_pred = logistic_regression.predict(X_testing_ohe)

# manipulating the threshold (threshold here is set to get some amount in each category of confusion matrix)
lr_prob = logistic_regression.predict_proba(X_testing_ohe)[:, 1]
threshold = 0.4
lr_pred = (lr_prob >= threshold).astype(int)

# print results
target_names = ["legit", "fraud"]
print(classification_report(y_test, lr_pred, target_names=target_names, zero_division=0))
print("PRAUC: ", average_precision_score(y_test, lr_prob))

cm = confusion_matrix(y_test,lr_pred)
cm_df = pd.DataFrame(
    cm,
    index=["Actual Legit", "Actual Fraud"],
    columns=["Predicted Legit", "Predicted Fraud"]
)

print(cm_df)

              precision    recall  f1-score   support

       legit       1.00      0.88      0.94    553574
       fraud       0.02      0.76      0.05      2145

    accuracy                           0.88    555719
   macro avg       0.51      0.82      0.49    555719
weighted avg       1.00      0.88      0.93    555719

PRAUC:  0.13636489718082287
              Predicted Legit  Predicted Fraud
Actual Legit           489024            64550
Actual Fraud              506             1639


In [5]:
# SVM model
# have to use linearSVM because SVM scales horribly with this dataset
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler


svm = LinearSVC(
    class_weight="balanced",
    max_iter=10000,
    random_state=0
)

svm.fit(X_training_ohe, y_training)
svm_pred = svm.predict(X_testing_ohe)

# manipulating the threshold (threshold here is set to get some amount in each category of confusion matrix)
decision_scores = svm.decision_function(X_testing_ohe)
threshold = 0.2 
svm_pred = (decision_scores >= threshold).astype(int)

print(classification_report(y_test, svm_pred, target_names=target_names, zero_division=0))
print("PRAUC: ", average_precision_score(y_test, decision_scores))

cm_svm = confusion_matrix(y_test, svm_pred)
cm_svm_df = pd.DataFrame(
    cm_svm,
    index=["Actual Legit", "Actual Fraud"],
    columns=["Predicted Legit", "Predicted Fraud"]
)

print(cm_svm_df)


              precision    recall  f1-score   support

       legit       1.00      0.94      0.97    553574
       fraud       0.04      0.72      0.08      2145

    accuracy                           0.94    555719
   macro avg       0.52      0.83      0.52    555719
weighted avg       1.00      0.94      0.96    555719

PRAUC:  0.16810179212455229
              Predicted Legit  Predicted Fraud
Actual Legit           518268            35306
Actual Fraud              609             1536


In [6]:
# neural network (multilayer perceptron) (~2 min to run)
from sklearn.neural_network import MLPClassifier
from sklearn.utils.class_weight import compute_sample_weight


# baseline hyperparameters,
mlp = make_pipeline(StandardScaler(with_mean=False), MLPClassifier(
    hidden_layer_sizes=(128, 64), # larger dataset requires more nodes in each layer
    activation='relu',            
    solver='adam',
    alpha = 0.001,               
    max_iter=500,        
    random_state=0,
))

mlp.fit(X_training_ohe, y_training)
mlp_pred = mlp.predict(X_testing_ohe)

mlp_prob = mlp.predict_proba(X_testing_ohe)[:, 1]

# manipulating the threshold (threshold here is set to get some amount in each category of confusion matrix)
# 0.05 and above results in model not finding anything
threshold = 0.01
mlp_pred = (mlp_prob >= threshold).astype(int)

print(classification_report(y_test, mlp_pred, target_names=target_names, zero_division=0))
print("PRAUC: ", average_precision_score(y_test, mlp_prob))

cm_mlp = confusion_matrix(y_test, mlp_pred)
cm_mlp_df = pd.DataFrame(
    cm_mlp,
    index=["Actual Legit", "Actual Fraud"],
    columns=["Predicted Legit", "Predicted Fraud"]
)

print(cm_mlp_df)

              precision    recall  f1-score   support

       legit       1.00      0.99      0.99    553574
       fraud       0.21      0.83      0.33      2145

    accuracy                           0.99    555719
   macro avg       0.60      0.91      0.66    555719
weighted avg       1.00      0.99      0.99    555719

PRAUC:  0.6665958545528449
              Predicted Legit  Predicted Fraud
Actual Legit           546810             6764
Actual Fraud              374             1771


In [7]:
# XGBoost
from xgboost import XGBClassifier


num_pos = y_training.sum()
num_neg = len(y_training) - num_pos
xgb = XGBClassifier(
    objective='binary:logistic',
    n_estimators=500,  
    max_depth=5,        
    learning_rate=0.05, 
    subsample=0.8,      
    colsample_bytree=0.8, 
    random_state=0,
    tree_method='hist', 
    scale_pos_weight=num_neg /num_pos, # balance the classes to have more weight on fraud cases
)

xgb.fit(X_training_ohe, y_training)
xgb_pred = xgb.predict(X_testing_ohe)

xgb_prob = xgb.predict_proba(X_testing_ohe)[:, 1]

# manipulating the threshold (threshold here is set to get some amount in each category of confusion matrix)
threshold = 0.4
xgb_pred = (xgb_prob >= threshold).astype(int)

print(classification_report(y_test, xgb_pred, target_names=target_names, zero_division=0))
print("PRAUC: ", average_precision_score(y_test, xgb_prob))

cm_xgb = confusion_matrix(y_test, xgb_pred)
cm_xgb_df = pd.DataFrame(
    cm_xgb,
    index=["Actual Legit", "Actual Fraud"],
    columns=["Predicted Legit", "Predicted Fraud"]
)

print(cm_xgb_df)

              precision    recall  f1-score   support

       legit       1.00      0.90      0.95    553574
       fraud       0.03      0.91      0.07      2145

    accuracy                           0.90    555719
   macro avg       0.52      0.90      0.51    555719
weighted avg       1.00      0.90      0.95    555719

PRAUC:  0.22927798211939696
              Predicted Legit  Predicted Fraud
Actual Legit           499762            53812
Actual Fraud              200             1945


In [8]:
# Random Forest (~1-2 min)
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=500, 
    min_samples_split=2,
    random_state=42,
    class_weight='balanced',
    max_depth=15
)

# baseline test
rf.fit(X_training_ohe, y_training)
rf_pred = rf.predict(X_testing_ohe)

# manipulating the threshold (threshold here is set to get some amount in each category of confusion matrix)
rf_prob = rf.predict_proba(X_testing_ohe)[:, 1]
threshold = 0.45
rf_pred = (rf_prob >= threshold).astype(int)

print(classification_report(y_test, rf_pred, target_names=target_names, zero_division=0))
print("PRAUC: ", average_precision_score(y_test, rf_prob))

cm_rf = confusion_matrix(y_test, rf_pred)
cm_rf_df = pd.DataFrame(
    cm_rf,
    index=["Actual Legit", "Actual Fraud"],
    columns=["Predicted Legit", "Predicted Fraud"]
)

print(cm_rf_df)

              precision    recall  f1-score   support

       legit       1.00      0.97      0.99    553574
       fraud       0.09      0.74      0.16      2145

    accuracy                           0.97    555719
   macro avg       0.55      0.86      0.57    555719
weighted avg       1.00      0.97      0.98    555719

PRAUC:  0.1499692341120678
              Predicted Legit  Predicted Fraud
Actual Legit           537994            15580
Actual Fraud              554             1591


In [9]:

# make the fraud test data for a model (20% of all false negatives)
def get_FN(pred):
    all_fn_indices = np.where((pred == 0) & (y_test.to_numpy() == 1))[0]

    picked_indices = np.random.choice(all_fn_indices, size=int(0.2 * len(all_fn_indices)), replace=False)
    return X_testing_ohe[picked_indices, :]

# functions for predicting with a threshold
def predict_with_threshold(model, threshold, data):
    prob = model.predict_proba(data)[:, 1]
    pred = (prob >= threshold).astype(int)

    return pred

def predict_with_threshold_svm(model, threshold, data):
    decision_scores = model.decision_function(data) 
    pred = (decision_scores >= threshold).astype(int)

    return pred

# get prediction based on specific model and data
def get_pred(index, data):
    if index == 0:
        return predict_with_threshold(logistic_regression, 0.4, data)
    elif index == 1:
        return predict_with_threshold_svm(svm, 0.2, data)
    elif index == 2:
        return predict_with_threshold(mlp, 0.01, data)
    elif index == 3:
        return predict_with_threshold(xgb, 0.4, data)
    else:
        return predict_with_threshold(rf, 0.45, data)
    
print(type(X_testing_ohe))
print(X_testing_ohe.shape)

<class 'scipy.sparse._csr.csr_matrix'>
(555719, 699)


In [18]:
# experiment
from numpy import random

models = [logistic_regression, svm, mlp, xgb, rf]
fraud_sets = [get_FN(lr_pred), get_FN(svm_pred), get_FN(mlp_pred), get_FN(xgb_pred), get_FN(rf_pred)]
model_losses = [0,0,0,0,0]

# probability of committing fraud (fraud procedure for now)
# can manipulate
fraud_prob = 0.1

# threshold to switch to another model if this monetary value is not reached 
# can manipulate
threshold = 100000

# amount of txns to pass before potentially switching
# can manipulate
window = 100000

# extract the amount column
feature_names = ct.get_feature_names_out()
amt_idx = list(feature_names).index("remainder__amt")

# total amount of money lost by all models
total_loss = 0
# total amount of money lost in a window
loss_at_model = 0

# counter for window
count = 0

# picks an index for a model
m = random.randint(0,4)

# fraud txn counter
fraud_count = 0
window_count = 0

loss_at_window = 0
loss_at_model = 0

for i in X_testing_ohe:
    # generate a float to decide whether or not to do fraud
    do_fraud = random.random()
    if do_fraud <= fraud_prob:
        # sample one txn uniformally
        rows = fraud_sets[m].shape[0]
        rand_idx = random.randint(0, rows - 1)
        fraud_txn = fraud_sets[m][rand_idx]

        fraud_count += 1
        # get the prediction for the transaction
        model_pred = get_pred(m, fraud_txn)
        if model_pred == 0: # money lost if model labels incorrectly
            loss_at_model += i[0, amt_idx]
            loss_at_window += i[0, amt_idx]

    if count == window: # end of window
        model_losses[m] += loss_at_model
        if loss_at_model <= threshold: # check threshold; switch to another model if threshold is not met
            m_prev = m
            while m_prev == m:
                m = random.randint(0,4)
            print("switch\n") # model switch indicated
            print("loss_at_model: ", loss_at_model)
            loss_at_model = 0

        total_loss += loss_at_window # add loss at window to total loss
        count = -1 # reset count

        window_count += 1
        print("loss_at_window: ", loss_at_window)
        loss_at_window = 0
    count += 1

print("Total loss: ",total_loss)
print("Ave. number of fraud_txns per window: ",(fraud_count / window_count))
print("Ave. loss per window: ",(total_loss / window_count))
print("Loss per model: ", model_losses)
        


loss_at_window:  699367.4299999982
loss_at_window:  671011.6599999975
loss_at_window:  665396.4099999999
loss_at_window:  710402.2499999997
loss_at_window:  701928.6299999987
Total loss:  3448106.3799999943
Ave. number of fraud_txns per window:  11059.0
Ave. loss per window:  689621.2759999989
Loss per model:  [np.float64(10299806.149999946), 0, 0, 0, 0]
