<a href="https://colab.research.google.com/github/nitingoyal123/Credit-Card-Fraud-Detection/blob/main/credit_card_fraud_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# The Kaggle dataset URL is an HTML landing page (the download itself requires
# authentication), so it cannot be parsed as CSV. Download `creditcard.csv` from
# https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud and place it next to
# this notebook, or point DATA_PATH at wherever it was saved.
DATA_PATH = 'creditcard.csv'

# No `on_bad_lines='skip'`: a malformed file should fail loudly instead of
# silently losing rows.
data = pd.read_csv(DATA_PATH)
data.shape

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the 'Amount' column in place. The scaler is fitted on the whole
# frame, i.e. before any train/test split is made further down.
sc = StandardScaler()
data['Amount'] = sc.fit_transform(data[['Amount']])

In [None]:
def data_cleaning(data) :
  """Return a cleaned copy of the transactions frame.

  Steps, in order:
    1. drop fully duplicated rows,
    2. drop the 'Time' column,
    3. move the 'Class' target to the last column,
    4. drop rows whose 'Class' value is missing.

  The input frame is not modified.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain the columns 'Time' and 'Class'.

  Returns
  -------
  pandas.DataFrame
      Feature columns followed by 'Class'.

  Raises
  ------
  KeyError
      If 'Time' or 'Class' is not a column of ``data``.
  """
  # Carry the de-duplicated frame forward; previously its result was computed
  # and then discarded, so duplicates were never actually removed.
  deduplicated = data.drop_duplicates()
  without_time = deduplicated.drop('Time', axis=1)

  feature_columns = [column for column in without_time.columns if column != 'Class']
  ordered = without_time[feature_columns + ['Class']]

  return ordered.dropna(subset=['Class'])

In [None]:
# Class distribution of the loaded data; used to see how imbalanced the two classes are.
data['Class'].value_counts()

In [None]:
# Hyper-parameter search spaces, keyed by the same display names used for the
# models. Every parameter carries the 'classifier__' prefix because the
# estimator is the pipeline step named 'classifier'.
param_grid = {
    'Logistic Regression': dict(
        classifier__C=[0.01, 0.1, 1, 10, 100],
        classifier__solver=['lbfgs', 'liblinear'],
    ),
    'Decision Tree Classifier': dict(
        classifier__max_depth=[None, 10, 20, 30],
        classifier__min_samples_split=[2, 5, 10],
        classifier__min_samples_leaf=[1, 2, 4],
    ),
    'Random Forest Classifier': dict(
        classifier__n_estimators=[50, 100, 200],
        classifier__max_depth=[None, 10, 20, 30],
        classifier__min_samples_split=[2, 5, 10],
        classifier__min_samples_leaf=[1, 2, 4],
    ),
}

In [None]:
def evaluate_model(Y_pred, Y_test, name) :
  """Print accuracy, precision, recall and F1 for one model's predictions.

  Parameters
  ----------
  Y_pred : array-like
      Labels predicted by the model.
  Y_test : array-like
      Ground-truth labels for the same rows.
  name : str
      Model name shown in the banner line.

  Returns
  -------
  None
      The scores are only printed.
  """
  print(f"\n============={name}============")
  # scikit-learn metrics expect (y_true, y_pred). Passing them the other way
  # round leaves accuracy and F1 unchanged but swaps precision with recall.
  print(f"\nAccuracy : {accuracy_score(Y_test, Y_pred)}")
  print(f"\nPrecision : {precision_score(Y_test, Y_pred)}")
  print(f"\nRecall : {recall_score(Y_test, Y_pred)}")
  print(f"\nF1 Score : {f1_score(Y_test, Y_pred)}")

# Undersampling

In [None]:
# UNDERSAMPLING
# Keep every fraud row and pair it with an equally sized random subset of the
# normal rows, giving a 50/50 class balance.

new_data = data_cleaning(data)

normal = new_data[new_data['Class'] == 0]
fraud = new_data[new_data['Class'] == 1]

# Fixed seed: without it a different subset of normal rows is drawn on every
# run, so the reported metrics would not be reproducible.
normal_sample = normal.sample(n=fraud.shape[0], random_state=42)
new_data = pd.concat([normal_sample, fraud])
new_data.shape

In [None]:
# Separate the target from the predictors of the balanced sample.
Y_clean = new_data['Class']
X_clean = new_data.drop(columns=['Class'])

Y_clean.value_counts()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the balanced sample for evaluation; the seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_clean,
    Y_clean,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

def create_pipeline(clf) :
  """Build a two-step pipeline: imputation followed by ``clf``.

  Parameters
  ----------
  clf : estimator
      Classifier placed in the final step, which is named 'classifier'.

  Returns
  -------
  sklearn.pipeline.Pipeline
      Steps 'imputer' (a ``SimpleImputer`` with default settings) and
      'classifier'.
  """
  steps = [
      ('imputer', SimpleImputer()),
      ('classifier', clf),
  ]
  return Pipeline(steps)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
# Candidate models, keyed by the names used in `param_grid`. The tree-based
# models are seeded so repeated runs give the same fitted models and scores.
classifiers = {
    "Logistic Regression" : LogisticRegression(max_iter=1000),
    "Decision Tree Classifier" : DecisionTreeClassifier(random_state=42),
    "Random Forest Classifier" : RandomForestClassifier(random_state=42)
}

# For each model: 5-fold grid search scored by ROC AUC on the training split,
# then report the held-out metrics of the best configuration found.
for name, clf in classifiers.items() :
  grid_search = GridSearchCV(create_pipeline(clf), param_grid[name], cv=5, scoring='roc_auc')
  grid_search.fit(X_train, Y_train)
  Y_pred = grid_search.predict(X_test)
  evaluate_model(Y_pred, Y_test, name)

# Oversampling

In [None]:
# Start again from the full cleaned dataset and separate target from predictors.
new_data = data_cleaning(data)
Y = new_data['Class']
X = new_data.drop(columns=['Class'])

In [None]:
# Class counts of the cleaned target, before any resampling is applied.
Y.value_counts()

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class with SMOTE and show the resulting class counts.
# Seeded so the synthetic rows are identical on every run.
X_res, Y_res = SMOTE(random_state=42).fit_resample(X,Y)
Y_res.value_counts()

In [None]:
from sklearn.model_selection import train_test_split
# Split the ORIGINAL rows first and oversample only the training part.
# Splitting an already-resampled dataset puts synthetic points interpolated
# from test rows into the training set and evaluates on an artificially
# balanced test set, which inflates every reported metric.
# `stratify=Y` keeps the rare fraud class represented in the held-out split.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)
X_train, Y_train = SMOTE(random_state=42).fit_resample(X_train, Y_train)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
# Candidate models, keyed by the names used in `param_grid`. The tree-based
# models are seeded so repeated runs give the same fitted models and scores.
classifiers = {
    "Logistic Regression" : LogisticRegression(max_iter=1000),
    "Decision Tree Classifier" : DecisionTreeClassifier(random_state=42),
    "Random Forest Classifier" : RandomForestClassifier(random_state=42)
}

# For each model: 5-fold grid search scored by ROC AUC on the training split,
# then report the held-out metrics of the best configuration found.
for name, clf in classifiers.items() :
  grid_search = GridSearchCV(create_pipeline(clf), param_grid[name], cv=5, scoring='roc_auc')
  grid_search.fit(X_train, Y_train)
  Y_pred = grid_search.predict(X_test)
  evaluate_model(Y_pred, Y_test, name)



Accuracy : 0.9821291103046299

Precision : 0.9778613882981624

Recall : 0.9865764427573307

F1 Score : 0.9821995837441122


Accuracy : 0.9993951391180028

Precision : 0.9992911281967392

Recall : 0.9995091355331334

F1 Score : 0.9994001199760049
