In [49]:
import warnings
warnings.filterwarnings("ignore")
from datetime import datetime
from pathlib import Path

import kagglehub

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, precision_recall_fscore_support, precision_recall_curve
from sklearn.ensemble import IsolationForest

In [None]:
# Fetch the Kaggle dataset; the printed value below is the local directory
# that kagglehub reports for the downloaded files.
path = kagglehub.dataset_download("valakhorasani/bank-transaction-dataset-for-fraud-detection")
print("Path to dataset files:", path)

# Build the CSV path with pathlib instead of a hard-coded "\\" separator,
# which only resolves on Windows; this form works on any OS.
df = pd.read_csv(Path(path) / "bank_transactions_data_2.csv")

# Quick sanity check: first rows and (rows, columns).
display(df.head())
print(df.shape)

Path to dataset files: C:\Users\mia.jensen\.cache\kagglehub\datasets\valakhorasani\bank-transaction-dataset-for-fraud-detection\versions\4


Unnamed: 0,TransactionID,AccountID,TransactionAmount,TransactionDate,TransactionType,Location,DeviceID,IP Address,MerchantID,Channel,CustomerAge,CustomerOccupation,TransactionDuration,LoginAttempts,AccountBalance,PreviousTransactionDate
0,TX000001,AC00128,14.09,2023-04-11 16:29:14,Debit,San Diego,D000380,162.198.218.92,M015,ATM,70,Doctor,81,1,5112.21,2024-11-04 08:08:08
1,TX000002,AC00455,376.24,2023-06-27 16:44:19,Debit,Houston,D000051,13.149.61.4,M052,ATM,68,Doctor,141,1,13758.91,2024-11-04 08:09:35
2,TX000003,AC00019,126.29,2023-07-10 18:16:08,Debit,Mesa,D000235,215.97.143.157,M009,Online,19,Student,56,1,1122.35,2024-11-04 08:07:04
3,TX000004,AC00070,184.5,2023-05-05 16:32:11,Debit,Raleigh,D000187,200.13.225.150,M002,Online,26,Student,25,1,8569.06,2024-11-04 08:09:06
4,TX000005,AC00411,13.45,2023-10-16 17:51:24,Credit,Atlanta,D000308,65.164.3.100,M091,Online,26,Student,198,1,7429.4,2024-11-04 08:06:39


(2512, 16)


In [69]:
# List the distinct values of each categorical column, one printed list per
# column, to see what will need encoding.
for col_name in ("TransactionType", "Location", "Channel", "CustomerOccupation"):
    print(list(df[col_name].unique()))

['Debit', 'Credit']
['San Diego', 'Houston', 'Mesa', 'Raleigh', 'Atlanta', 'Oklahoma City', 'Seattle', 'Indianapolis', 'Detroit', 'Nashville', 'Albuquerque', 'Memphis', 'Louisville', 'Denver', 'Austin', 'Columbus', 'Los Angeles', 'Las Vegas', 'Fort Worth', 'Miami', 'Milwaukee', 'Baltimore', 'New York', 'San Francisco', 'San Jose', 'San Antonio', 'Philadelphia', 'Charlotte', 'Tucson', 'Chicago', 'Sacramento', 'Kansas City', 'Omaha', 'Virginia Beach', 'Dallas', 'Boston', 'Jacksonville', 'Phoenix', 'Washington', 'El Paso', 'Colorado Springs', 'Fresno', 'Portland']
['ATM', 'Online', 'Branch']
['Doctor', 'Student', 'Retired', 'Engineer']


In [70]:
# Categorical columns that get one-hot encoded (reused by later cells).
cat_cols = ["TransactionType", "Location", "Channel", "CustomerOccupation"]

# drop_first=True omits one level per column; the indicators are then cast
# to 0/1 integers.
encoded = pd.get_dummies(df[cat_cols], drop_first=True)
df_dummies = encoded.astype(int)

# Attach the indicator columns to the original frame, aligning on row index.
df_dummy = df.merge(df_dummies, left_index=True, right_index=True)
df_dummy.head()

Unnamed: 0,TransactionID,AccountID,TransactionAmount,TransactionDate,TransactionType,Location,DeviceID,IP Address,MerchantID,Channel,CustomerAge,CustomerOccupation,TransactionDuration,LoginAttempts,AccountBalance,PreviousTransactionDate,TransactionType_Debit,Location_Atlanta,Location_Austin,Location_Baltimore,Location_Boston,Location_Charlotte,Location_Chicago,Location_Colorado Springs,Location_Columbus,Location_Dallas,Location_Denver,Location_Detroit,Location_El Paso,Location_Fort Worth,Location_Fresno,Location_Houston,Location_Indianapolis,Location_Jacksonville,Location_Kansas City,Location_Las Vegas,Location_Los Angeles,Location_Louisville,Location_Memphis,Location_Mesa,Location_Miami,Location_Milwaukee,Location_Nashville,Location_New York,Location_Oklahoma City,Location_Omaha,Location_Philadelphia,Location_Phoenix,Location_Portland,Location_Raleigh,Location_Sacramento,Location_San Antonio,Location_San Diego,Location_San Francisco,Location_San Jose,Location_Seattle,Location_Tucson,Location_Virginia Beach,Location_Washington,Channel_Branch,Channel_Online,CustomerOccupation_Engineer,CustomerOccupation_Retired,CustomerOccupation_Student
0,TX000001,AC00128,14.09,2023-04-11 16:29:14,Debit,San Diego,D000380,162.198.218.92,M015,ATM,70,Doctor,81,1,5112.21,2024-11-04 08:08:08,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,TX000002,AC00455,376.24,2023-06-27 16:44:19,Debit,Houston,D000051,13.149.61.4,M052,ATM,68,Doctor,141,1,13758.91,2024-11-04 08:09:35,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,TX000003,AC00019,126.29,2023-07-10 18:16:08,Debit,Mesa,D000235,215.97.143.157,M009,Online,19,Student,56,1,1122.35,2024-11-04 08:07:04,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
3,TX000004,AC00070,184.5,2023-05-05 16:32:11,Debit,Raleigh,D000187,200.13.225.150,M002,Online,26,Student,25,1,8569.06,2024-11-04 08:09:06,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1
4,TX000005,AC00411,13.45,2023-10-16 17:51:24,Credit,Atlanta,D000308,65.164.3.100,M091,Online,26,Student,198,1,7429.4,2024-11-04 08:06:39,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1


In [71]:
# Work on a copy so the one-hot encoded frame stays untouched.
df_feat_eng = df_dummy.copy()

# Parse both timestamp columns into datetimes.
for date_col in ("TransactionDate", "PreviousTransactionDate"):
    df_feat_eng[date_col] = pd.to_datetime(df_feat_eng[date_col])

# Gap between the two timestamps, expressed in fractional days.
# NOTE(review): computed as PreviousTransactionDate - TransactionDate; in the
# rows displayed below PreviousTransactionDate is the later of the two, so the
# values come out positive -- confirm this direction is intended.
seconds_per_day = 60 * 60 * 24
gap = df_feat_eng["PreviousTransactionDate"] - df_feat_eng["TransactionDate"]
df_feat_eng["DaysSinceLastTransaction"] = gap.dt.total_seconds() / seconds_per_day

# Calendar components of the transaction timestamp.
txn_date = df_feat_eng["TransactionDate"].dt
df_feat_eng["Year"] = txn_date.year
df_feat_eng["Month"] = txn_date.month
df_feat_eng["Day"] = txn_date.day

df_feat_eng.head()

Unnamed: 0,TransactionID,AccountID,TransactionAmount,TransactionDate,TransactionType,Location,DeviceID,IP Address,MerchantID,Channel,CustomerAge,CustomerOccupation,TransactionDuration,LoginAttempts,AccountBalance,PreviousTransactionDate,TransactionType_Debit,Location_Atlanta,Location_Austin,Location_Baltimore,Location_Boston,Location_Charlotte,Location_Chicago,Location_Colorado Springs,Location_Columbus,Location_Dallas,Location_Denver,Location_Detroit,Location_El Paso,Location_Fort Worth,Location_Fresno,Location_Houston,Location_Indianapolis,Location_Jacksonville,Location_Kansas City,Location_Las Vegas,Location_Los Angeles,Location_Louisville,Location_Memphis,Location_Mesa,Location_Miami,Location_Milwaukee,Location_Nashville,Location_New York,Location_Oklahoma City,Location_Omaha,Location_Philadelphia,Location_Phoenix,Location_Portland,Location_Raleigh,Location_Sacramento,Location_San Antonio,Location_San Diego,Location_San Francisco,Location_San Jose,Location_Seattle,Location_Tucson,Location_Virginia Beach,Location_Washington,Channel_Branch,Channel_Online,CustomerOccupation_Engineer,CustomerOccupation_Retired,CustomerOccupation_Student,DaysSinceLastTransaction,Year,Month,Day
0,TX000001,AC00128,14.09,2023-04-11 16:29:14,Debit,San Diego,D000380,162.198.218.92,M015,ATM,70,Doctor,81,1,5112.21,2024-11-04 08:08:08,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,572.652014,2023,4,11
1,TX000002,AC00455,376.24,2023-06-27 16:44:19,Debit,Houston,D000051,13.149.61.4,M052,ATM,68,Doctor,141,1,13758.91,2024-11-04 08:09:35,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,495.642546,2023,6,27
2,TX000003,AC00019,126.29,2023-07-10 18:16:08,Debit,Mesa,D000235,215.97.143.157,M009,Online,19,Student,56,1,1122.35,2024-11-04 08:07:04,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,482.577037,2023,7,10
3,TX000004,AC00070,184.5,2023-05-05 16:32:11,Debit,Raleigh,D000187,200.13.225.150,M002,Online,26,Student,25,1,8569.06,2024-11-04 08:09:06,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,548.650637,2023,5,5
4,TX000005,AC00411,13.45,2023-10-16 17:51:24,Credit,Atlanta,D000308,65.164.3.100,M091,Online,26,Student,198,1,7429.4,2024-11-04 08:06:39,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,384.593924,2023,10,16


In [72]:
# Raw categorical, timestamp and identifier columns are excluded from the
# feature matrix; only numeric columns remain.
non_feature_cols = cat_cols + [
    "TransactionDate", "PreviousTransactionDate",
    "TransactionID", "AccountID", "DeviceID", "IP Address", "MerchantID",
]

# This cell writes its results back into df_feat_eng (see below). Without
# excluding them here, re-running the cell would feed the previous run's
# scores and labels to the model as features.
result_cols = ["Anomaly Score", "Is Anomaly"]

df_model = (
    df_feat_eng
    .drop(columns=non_feature_cols)
    # errors="ignore": the result columns do not exist on the first run.
    .drop(columns=result_cols, errors="ignore")
)

# Isolation Forest: every tree is built on all rows (max_samples=len(df_model))
# and 10% of rows are flagged as anomalies (contamination=0.1).
model_if = IsolationForest(
    n_estimators=100,
    max_samples=len(df_model),
    contamination=0.1,
    random_state=42,
    verbose=0
)

model_if.fit(df_model)
print(model_if)

# decision_function: lower values are more abnormal.
scores = model_if.decision_function(df_model)

# predict returns -1 for outliers and 1 for inliers; remap to 1 = anomaly,
# 0 = normal in a single step.
y_pred = (model_if.predict(df_model) == -1).astype(int)

# Negate the score so that larger values mean "more anomalous", then show the
# most anomalous transactions first.
df_feat_eng["Anomaly Score"] = -scores
df_feat_eng["Is Anomaly"] = y_pred
df_feat_eng.sort_values(by=["Anomaly Score"], ascending=[False])

IsolationForest(contamination=0.1, max_samples=2512, random_state=42)


Unnamed: 0,TransactionID,AccountID,TransactionAmount,TransactionDate,TransactionType,Location,DeviceID,IP Address,MerchantID,Channel,CustomerAge,CustomerOccupation,TransactionDuration,LoginAttempts,AccountBalance,PreviousTransactionDate,TransactionType_Debit,Location_Atlanta,Location_Austin,Location_Baltimore,Location_Boston,Location_Charlotte,Location_Chicago,Location_Colorado Springs,Location_Columbus,Location_Dallas,Location_Denver,Location_Detroit,Location_El Paso,Location_Fort Worth,Location_Fresno,Location_Houston,Location_Indianapolis,Location_Jacksonville,Location_Kansas City,Location_Las Vegas,Location_Los Angeles,Location_Louisville,Location_Memphis,Location_Mesa,Location_Miami,Location_Milwaukee,Location_Nashville,Location_New York,Location_Oklahoma City,Location_Omaha,Location_Philadelphia,Location_Phoenix,Location_Portland,Location_Raleigh,Location_Sacramento,Location_San Antonio,Location_San Diego,Location_San Francisco,Location_San Jose,Location_Seattle,Location_Tucson,Location_Virginia Beach,Location_Washington,Channel_Branch,Channel_Online,CustomerOccupation_Engineer,CustomerOccupation_Retired,CustomerOccupation_Student,DaysSinceLastTransaction,Year,Month,Day,Anomaly Score,Is Anomaly
871,TX000872,AC00482,154.32,2024-01-01 16:53:32,Credit,Tucson,D000055,8.244.162.185,M015,ATM,38,Engineer,269,1,6972.50,2024-11-04 08:11:43,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,307.637627,2024,1,1,0.067466,1
556,TX000557,AC00083,2.03,2024-01-01 16:07:05,Credit,Detroit,D000271,112.86.148.212,M012,ATM,33,Engineer,28,1,3012.31,2024-11-04 08:08:50,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,307.667882,2024,1,1,0.054270,1
1213,TX001214,AC00170,1192.20,2023-12-21 16:21:27,Credit,Jacksonville,D000174,40.10.25.102,M093,Branch,60,Retired,103,5,7816.41,2024-11-04 08:09:14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,318.658183,2023,12,21,0.052116,1
274,TX000275,AC00454,1176.28,2023-12-20 16:08:02,Credit,Kansas City,D000476,50.202.8.53,M074,ATM,54,Engineer,174,5,323.69,2024-11-04 08:11:44,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,319.669236,2023,12,20,0.051725,1
117,TX000118,AC00498,56.98,2024-01-01 16:34:10,Credit,Oklahoma City,D000256,101.90.40.193,M086,ATM,43,Engineer,122,2,5688.37,2024-11-04 08:07:42,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,307.648287,2024,1,1,0.046616,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1989,TX001990,AC00311,56.96,2023-06-26 17:38:30,Debit,Albuquerque,D000379,70.130.169.235,M011,ATM,22,Student,135,1,1597.85,2024-11-04 08:07:18,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,496.603333,2023,6,26,-0.064065,0
2009,TX002010,AC00341,17.43,2023-02-23 16:21:38,Debit,Albuquerque,D000574,155.19.119.0,M042,ATM,56,Doctor,33,1,10658.88,2024-11-04 08:08:36,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,619.657616,2023,2,23,-0.065731,0
1627,TX001628,AC00141,398.70,2023-08-28 16:02:11,Debit,Albuquerque,D000426,198.39.35.23,M006,ATM,47,Doctor,276,1,7987.74,2024-11-04 08:09:04,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,433.671447,2023,8,28,-0.066621,0
181,TX000182,AC00149,453.54,2023-04-19 16:36:03,Debit,Albuquerque,D000150,223.32.70.156,M017,ATM,58,Doctor,82,1,12046.30,2024-11-04 08:12:06,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,564.650035,2023,4,19,-0.066728,0
