In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [44]:
# 1. Data load
bank_fraud_df = pd.read_csv("Bank_transaction_fraud_detection.csv")

In [45]:
bank_fraud_df

Unnamed: 0,Customer_ID,Customer_Name,Gender,Age,State,City,Bank_Branch,Account_Type,Transaction_ID,Transaction_Date,...,Merchant_Category,Account_Balance,Transaction_Device,Transaction_Location,Device_Type,Is_Fraud,Transaction_Currency,Customer_Contact,Transaction_Description,Customer_Email
0,d5f6ec07-d69e-4f47-b9b4-7c58ff17c19e,Osha Tella,Male,60,Kerala,Thiruvananthapuram,Thiruvananthapuram Branch,Savings,4fa3208f-9e23-42dc-b330-844829d0c12c,23-01-2025,...,Restaurant,74557.27,Voice Assistant,"Thiruvananthapuram, Kerala",POS,0,INR,+9198579XXXXXX,Bitcoin transaction,oshaXXXXX@XXXXX.com
1,7c14ad51-781a-4db9-b7bd-67439c175262,Hredhaan Khosla,Female,51,Maharashtra,Nashik,Nashik Branch,Business,c9de0c06-2c4c-40a9-97ed-3c7b8f97c79c,11/1/2025,...,Restaurant,74622.66,POS Mobile Device,"Nashik, Maharashtra",Desktop,0,INR,+9191074XXXXXX,Grocery delivery,hredhaanXXXX@XXXXXX.com
2,3a73a0e5-d4da-45aa-85f3-528413900a35,Ekani Nazareth,Male,20,Bihar,Bhagalpur,Bhagalpur Branch,Savings,e41c55f9-c016-4ff3-872b-cae72467c75c,25-01-2025,...,Groceries,66817.99,ATM,"Bhagalpur, Bihar",Desktop,0,INR,+9197745XXXXXX,Mutual fund investment,ekaniXXX@XXXXXX.com
3,7902f4ef-9050-4a79-857d-9c2ea3181940,Yamini Ramachandran,Female,57,Tamil Nadu,Chennai,Chennai Branch,Business,7f7ee11b-ff2c-45a3-802a-49bc47c02ecb,19-01-2025,...,Entertainment,58177.08,POS Mobile App,"Chennai, Tamil Nadu",Mobile,0,INR,+9195889XXXXXX,Food delivery,yaminiXXXXX@XXXXXXX.com
4,3a4bba70-d9a9-4c5f-8b92-1735fd8c19e9,Kritika Rege,Female,43,Punjab,Amritsar,Amritsar Branch,Savings,f8e6ac6f-81a1-4985-bf12-f60967d852ef,30-01-2025,...,Entertainment,16108.56,Virtual Card,"Amritsar, Punjab",Mobile,0,INR,+9195316XXXXXX,Debt repayment,kritikaXXXX@XXXXXX.com
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156342,f596668a-39e9-4f58-bc33-1c97f54ae343,Ansh Agrawal,Female,52,Arunachal Pradesh,Itanagar,Itanagar Branch,Savings,abaf2883-2ab2-4360-93ae-6eebb9a5ba37,17-01-2025,...,Health,48152.65,ATM Booth Kiosk,"Itanagar, Arunachal Pradesh",ATM,0,INR,+9199893XXXXXX,Moving services payment,anshXXXX@XXXXXXX.com
156343,07513777-8e89-4abc-b561-5ed3331a8f4a,Suhani Bhasin,Female,64,Delhi,South Delhi,South Delhi Branch,Business,e520daaa-2d08-4939-a9d5-df67512ad4a9,12/1/2025,...,Clothing,69956.08,POS Mobile App,"South Delhi, Delhi",Desktop,0,INR,+9191151XXXXXX,Bitcoin transaction,suhaniXXXX@XXXXX.com
156344,dfdd6356-5c6e-4eb6-b6a6-9365c9ee5399,Karan Dass,Female,34,Lakshadweep,Kavaratti,Kavaratti Branch,Checking,035ef4d9-f670-4c9c-954c-38e023661a13,7/1/2025,...,Entertainment,62197.09,Voice Assistant,"Kavaratti, Lakshadweep",Desktop,0,INR,+9199760XXXXXX,Medical treatment payment,karanXXXX@XXXXX.com
156345,953268d2-f0ec-41be-9b5e-5cd38032c0a4,Dev Sarma,Male,20,Puducherry,Mahe,Mahe Branch,Savings,7568fb14-76c3-46a8-9dfd-192ad6e29046,9/1/2025,...,Clothing,27862.23,POS Mobile App,"Mahe, Puducherry",Mobile,0,INR,+9195478XXXXXX,Gift card purchase,devXXXXX@XXXXX.com


In [46]:
## Count of records  with Is fraud
## Count shows  the dataset is imbalancced
bank_fraud_df["Is_Fraud"].value_counts()

0    148436
1      7911
Name: Is_Fraud, dtype: int64

In [47]:
# 2. Label NaN rows remove
bank_fraud_df = bank_fraud_df.dropna(subset=['Is_Fraud'])

# 3. Independent (X) & dependent (y)
independent = bank_fraud_df[[
    'Age',
    'Transaction_Amount',
    'Account_Balance',
    'Transaction_Type',
    'Merchant_Category',
    'Device_Type',
    'State',
    'City'
]]
dependent = bank_fraud_df['Is_Fraud']


In [48]:
# 4. Train–test split (stratify for imbalance)
X_train, X_test, y_train, y_test = train_test_split(
    independent,
    dependent,
    test_size=0.30,
    random_state=0,
    stratify=dependent
)

# 5. Columns by type
num_cols = ['Age', 'Transaction_Amount', 'Account_Balance']
cat_cols = ['Transaction_Type', 'Merchant_Category', 'Device_Type', 'State', 'City']


In [49]:
# 6. Preprocessing: pass numeric, OneHot encode categorical
preprocess = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ]
)



# 1.Logistic Regression

In [50]:
# 1a. Model pipeline: preprocessing + Logistic Regression (balanced)
lr_model = Pipeline(steps=[
    ('prep', preprocess),
    ('clf', LogisticRegression(
        max_iter=1000,
        class_weight='balanced',
        solver='liblinear'
    ))
])

In [51]:
#1b. Train model
lr_model.fit(X_train, y_train)

#1c. Evaluation – default threshold 0.5
y_pred = lr_model.predict(X_test)
print("Default threshold (0.5)")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))



  if LooseVersion(joblib_version) < '0.12':


Default threshold (0.5)
Accuracy: 0.5319688732544505
              precision    recall  f1-score   support

           0       0.95      0.53      0.68     44532
           1       0.05      0.48      0.09      2373

    accuracy                           0.53     46905
   macro avg       0.50      0.51      0.39     46905
weighted avg       0.90      0.53      0.65     46905



In [52]:
#1d. Evaluation – custom threshold 0.3
y_proba = lr_model.predict_proba(X_test)[:, 1]      # probability of fraud
y_pred_03 = (y_proba >= 0.3).astype(int)

print("Custom threshold (0.3)")
print(classification_report(y_test, y_pred_03))

Custom threshold (0.3)
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     44532
           1       0.05      1.00      0.10      2373

    accuracy                           0.05     46905
   macro avg       0.03      0.50      0.05     46905
weighted avg       0.00      0.05      0.00     46905



  'precision', 'predicted', average, warn_for)


In [53]:
#1e.Classification Report
from sklearn.metrics import confusion_matrix
lr_cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix")
print(lr_cm)

Confusion Matrix
[[23822 20710]
 [ 1243  1130]]


# 2.Random Forest

In [54]:
#2a. Model pipeline: preprocessing + Random Forest
from sklearn.ensemble import RandomForestClassifier
rf_model = Pipeline(steps=[
    ('prep', preprocess),
    ('clf', RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0))])

In [55]:
#2b. Train model
rf_model.fit(X_train, y_train)

# 2c. Evaluation – default threshold 0.5
y_pred = rf_model.predict(X_test)
print("Default threshold (0.5)")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

  if LooseVersion(joblib_version) < '0.12':
  if _joblib.__version__ >= LooseVersion('0.12'):


Default threshold (0.5)
Accuracy: 0.9487687879756955
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     44532
           1       0.03      0.00      0.00      2373

    accuracy                           0.95     46905
   macro avg       0.49      0.50      0.49     46905
weighted avg       0.90      0.95      0.92     46905



  if _joblib.__version__ >= LooseVersion('0.12'):


In [57]:
#2d. Evaluation – custom threshold 0.3
y_proba = rf_model.predict_proba(X_test)[:, 1]      # probability of fraud
y_pred_03 = (y_proba >= 0.3).astype(int)

print("Custom threshold (0.3)")
print(classification_report(y_test, y_pred_03))

  if _joblib.__version__ >= LooseVersion('0.12'):


Custom threshold (0.3)
              precision    recall  f1-score   support

           0       0.95      0.97      0.96     44532
           1       0.05      0.03      0.04      2373

    accuracy                           0.93     46905
   macro avg       0.50      0.50      0.50     46905
weighted avg       0.90      0.93      0.91     46905



In [58]:
#2e.Classification Report
from sklearn.metrics import confusion_matrix
rf_cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix")
print(rf_cm)

Confusion Matrix
[[44501    31]
 [ 2372     1]]


# 3.Decision Tree

In [59]:
#3a. Model pipeline: preprocessing + Decision Tree
from sklearn.tree import DecisionTreeClassifier
dt_model = Pipeline(steps=[
    ('prep', preprocess),
    ('clf', DecisionTreeClassifier(criterion = 'entropy', random_state = 0))])

In [60]:
#3b. Train model
dt_model.fit(X_train, y_train)

# 3c. Evaluation – default threshold 0.5
y_pred = dt_model.predict(X_test)
print("Default threshold (0.5)")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

  if LooseVersion(joblib_version) < '0.12':


Default threshold (0.5)
Accuracy: 0.9010766442809935
              precision    recall  f1-score   support

           0       0.95      0.95      0.95     44532
           1       0.05      0.05      0.05      2373

    accuracy                           0.90     46905
   macro avg       0.50      0.50      0.50     46905
weighted avg       0.90      0.90      0.90     46905



In [62]:
#3d. Evaluation – custom threshold 0.3
y_proba = dt_model.predict_proba(X_test)[:, 1]      # probability of fraud
y_pred_03 = (y_proba >= 0.3).astype(int)

print("Custom threshold (0.3)")
print(classification_report(y_test, y_pred_03))

Custom threshold (0.3)
              precision    recall  f1-score   support

           0       0.95      0.95      0.95     44532
           1       0.05      0.05      0.05      2373

    accuracy                           0.90     46905
   macro avg       0.50      0.50      0.50     46905
weighted avg       0.90      0.90      0.90     46905



In [63]:
#3e.Classification Report
from sklearn.metrics import confusion_matrix
dt_cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix")
print(dt_cm)

Confusion Matrix
[[42142  2390]
 [ 2250   123]]
