In [125]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix



In [126]:
# Load dataset
df = pd.read_csv(r"C:\Users\NASEEMA NASRIN\Desktop\fraudTest.csv")

# Display first 5 rows
print(df.head())


   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2020-06-21 12:14:25  2291163933867244   
1           1   2020-06-21 12:14:33  3573030041201292   
2           2   2020-06-21 12:14:53  3598215285024754   
3           3   2020-06-21 12:15:15  3591919803438423   
4           4   2020-06-21 12:15:17  3526826139003047   

                               merchant        category    amt   first  \
0                 fraud_Kirlin and Sons   personal_care   2.86    Jeff   
1                  fraud_Sporer-Keebler   personal_care  29.84  Joanne   
2  fraud_Swaniawski, Nitzsche and Welch  health_fitness  41.28  Ashley   
3                     fraud_Haley Group        misc_pos  60.05   Brian   
4                 fraud_Johnston-Casper          travel   3.19  Nathan   

       last gender                       street  ...      lat      long  \
0   Elliott      M            351 Darlene Green  ...  33.9659  -80.9355   
1  Williams      F             3638 Marsh Union  ...  40.3207 

In [127]:
print("\nDataset Info:")
print(df.info())

print("\nClass Distribution:")
print(df['is_fraud'].value_counts())



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             555719 non-null  int64  
 1   trans_date_trans_time  555719 non-null  object 
 2   cc_num                 555719 non-null  int64  
 3   merchant               555719 non-null  object 
 4   category               555719 non-null  object 
 5   amt                    555719 non-null  float64
 6   first                  555719 non-null  object 
 7   last                   555719 non-null  object 
 8   gender                 555719 non-null  object 
 9   street                 555719 non-null  object 
 10  city                   555719 non-null  object 
 11  state                  555719 non-null  object 
 12  zip                    555719 non-null  int64  
 13  lat                    555719 non-null  float64
 14  long                 

In [128]:
# Features and Target
X = df.select_dtypes(include=['int64', 'float64'])
X = X.drop('is_fraud', axis=1)
X = X.fillna(0)

y = df['is_fraud']

In [129]:
x_train, x_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)


In [130]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
lr = LogisticRegression(
    max_iter=1000,
    class_weight='balanced'
)

lr.fit(x_train, y_train)
y_pred_lr = lr.predict(x_test)

print("\n--- Logistic Regression ---")
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))



--- Logistic Regression ---
[[166072      0]
 [   644      0]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    166072
           1       0.00      0.00      0.00       644

    accuracy                           1.00    166716
   macro avg       0.50      0.50      0.50    166716
weighted avg       0.99      1.00      0.99    166716



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [131]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

rf.fit(x_train, y_train)
y_pred_rf = rf.predict(x_test)

print("\n--- Random Forest ---")
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))



--- Random Forest ---
[[166042     30]
 [   334    310]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    166072
           1       0.91      0.48      0.63       644

    accuracy                           1.00    166716
   macro avg       0.95      0.74      0.81    166716
weighted avg       1.00      1.00      1.00    166716



In [132]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

dt = DecisionTreeClassifier(
    class_weight='balanced',
    random_state=42
)

dt.fit(x_train, y_train)
y_pred_dt = dt.predict(x_test)

print("\n--- Decision Tree ---")
print(confusion_matrix(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))



--- Decision Tree ---
[[165739    333]
 [   371    273]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    166072
           1       0.45      0.42      0.44       644

    accuracy                           1.00    166716
   macro avg       0.72      0.71      0.72    166716
weighted avg       1.00      1.00      1.00    166716



In [133]:
print("Logistic Regression:", lr.score(x_test, y_test))
print("Decision Tree:", dt.score(x_test, y_test))
print("Random Forest:", rf.score(x_test, y_test))


Logistic Regression: 0.9961371434055519
Decision Tree: 0.9957772499340195
Random Forest: 0.9978166462727033
