<a href="https://colab.research.google.com/github/nikhilcn-ec/Credit_Card-Fraud-Detection/blob/main/credit_card.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
data_path = "/content/drive/MyDrive/datasets/credit_card.csv"


In [None]:
import pandas as pd
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [None]:
df.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [None]:
import numpy as np
len_not_fraud = len(df['isFraud'][df.isFraud == 0])
len_is_fraud = len(df['isFraud'][df.isFraud == 1])

arr = np.array([len_not_fraud, len_is_fraud])
labels = ['Not Fraudulent', 'Fraudulent']
print(f"Total No. of Non-Fraudulent Cases: {len_not_fraud}")
print(f"Total No. Fraudulent Cases: {len_is_fraud}")

Total No. of Non-Fraudulent Cases: 6354407
Total No. Fraudulent Cases: 8213


In [None]:
df = df.drop(columns=['nameOrig', 'nameDest',
                      'oldbalanceOrg', 'newbalanceOrig',
                      'oldbalanceDest', 'newbalanceDest'])

In [None]:
df = df.drop(columns=['isFlaggedFraud'])

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['type'] = le.fit_transform(df['type'])

print(df['type'].unique())

[3 4 1 2 0]


In [None]:
df.head()

Unnamed: 0,step,type,amount,isFraud
0,1,3,9839.64,0
1,1,3,1864.28,0
2,1,4,181.0,1
3,1,1,181.0,1
4,1,3,11668.14,0


In [None]:
nulls = df.isnull().sum()
print("Null values in each column:\n", nulls)


Null values in each column:
 step       0
type       0
amount     0
isFraud    0
dtype: int64


In [None]:
print("\nData types of each column:")
print(df.dtypes)


Data types of each column:
step         int64
type         int64
amount     float64
isFraud      int64
dtype: object


In [None]:
print("\nShape of dataset after cleaning:", df.shape)


Shape of dataset after cleaning: (6362620, 4)


In [None]:
X = df.drop(columns=['isFraud'])
y = df['isFraud']

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)


Shape of X: (6362620, 3)
Shape of y: (6362620,)


In [None]:
# Keep all fraud cases
fraud = df[df['isFraud'] == 1]
# Take 10–30% of majority class
non_fraud = df[df['isFraud'] == 0].sample(frac=0.1, random_state=42)

df_small = pd.concat([fraud, non_fraud])
X_small = df_small.drop(columns=['isFraud'])
y_small = df_small['isFraud']

print("Shape before SMOTE:", X_small.shape)

Shape before SMOTE: (643654, 3)


In [None]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_resampled_rus, y_resampled_rus = rus.fit_resample(X, y)

print("Before RUS:", y.value_counts())
print("After RUS:", y_resampled_rus.value_counts())


Before RUS: isFraud
0    6354407
1       8213
Name: count, dtype: int64
After RUS: isFraud
0    8213
1    8213
Name: count, dtype: int64


In [None]:
from imblearn.over_sampling import SMOTE


smote = SMOTE(random_state=42, k_neighbors=3)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_small, y_small)

print("Before SMOTE:", y.value_counts())
print("After SMOTE:", y_resampled_smote.value_counts())


Before SMOTE: isFraud
0    6354407
1       8213
Name: count, dtype: int64
After SMOTE: isFraud
1    635441
0    635441
Name: count, dtype: int64


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import pandas as pd


In [None]:
# Split original dataset for evaluation
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Example: undersampled training set
# X_train_rus, y_train_rus = X_resampled_rus, y_resampled_rus
# SMOTE training set
# X_train_smote, y_train_smote = X_resampled_smote, y_resampled_smote


In [None]:
def evaluate_models(X_train, y_train, X_test, y_test):
    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
        'Decision Tree': DecisionTreeClassifier(random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
    }

    results = []

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:,1] if hasattr(model, "predict_proba") else y_pred

        results.append({
            'Model': name,
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred, zero_division=0),
            'Recall': recall_score(y_test, y_pred, zero_division=0),
            'F1 Score': f1_score(y_test, y_pred, zero_division=0),
            'ROC-AUC': roc_auc_score(y_test, y_prob),
            'PR-AUC': average_precision_score(y_test, y_prob)
        })

    return pd.DataFrame(results)


In [None]:
print("===== Random Undersampling Results =====")
results_rus = evaluate_models(X_resampled_rus, y_resampled_rus, X_test, y_test)
print(results_rus)

print("\n===== SMOTE Results =====")
results_smote = evaluate_models(X_resampled_smote, y_resampled_smote, X_test, y_test)
print(results_smote)


===== Random Undersampling Results =====
                 Model  Accuracy  Precision    Recall  F1 Score   ROC-AUC  \
0  Logistic Regression  0.889809   0.008044  0.689592  0.015903  0.841010   
1        Decision Tree  0.868261   0.009706  1.000000  0.019225  0.934045   
2        Random Forest  0.885586   0.011159  1.000000  0.022071  0.986311   
3    Gradient Boosting  0.884110   0.009960  0.902009  0.019703  0.962788   

     PR-AUC  
0  0.019622  
1  0.009706  
2  0.105424  
3  0.246318  

===== SMOTE Results =====
                 Model  Accuracy  Precision    Recall  F1 Score   ROC-AUC  \
0  Logistic Regression  0.886021   0.007900  0.700548  0.015623  0.832515   
1        Decision Tree  0.952368   0.026391  1.000000  0.051425  0.976154   
2        Random Forest  0.954984   0.027882  1.000000  0.054251  0.997390   
3    Gradient Boosting  0.911103   0.012080  0.839927  0.023817  0.953455   

     PR-AUC  
0  0.020650  
1  0.026391  
2  0.419215  
3  0.252072  


In [None]:
import joblib
from sklearn.ensemble import RandomForestClassifier

# 1️⃣ Train the Random Forest on SMOTE-resampled data
rf_model_smote = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model_smote.fit(X_resampled_smote, y_resampled_smote)

# 2️⃣ Save the trained model
joblib.dump(rf_model_smote, "rf_smote_model.pkl")

# ✅ Later in your Streamlit app, you can load it:
# rf_model = joblib.load("rf_smote_model.pkl")


['rf_smote_model.pkl']

In [None]:
joblib.dump(rf_model_smote, "rf_smote_model.pkl")

['rf_smote_model.pkl']

In [None]:
model_path = "/content/drive/MyDrive/datasets/rf_smote_model.pkl"

['/content/drive/MyDrive/datasets/rf_smote_model.pkl']