<a href="https://colab.research.google.com/github/ndegwaanth/Credit_Card_Fraud_Detection/blob/main/Credit_card_fraud_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the credit-card transactions CSV into a DataFrame and preview the first rows.
# NOTE(review): '/content/...' looks like a Colab runtime path — the file must be
# uploaded there before this cell runs; confirm for other environments.
df = pd.read_csv('/content/creditcard.csv')
df.head()

In [None]:
# List the column labels of the loaded frame.
df.columns

In [None]:
# Show the frame's dimensions as (rows, columns).
df.shape

In [None]:
# Print per-column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

In [None]:
# Remove the 'Time' column. Rebinding `df` (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it no longer raises
# KeyError once 'Time' has already been dropped.
df = df.drop(columns=['Time'], errors='ignore')

In [None]:
# Preview the frame after dropping 'Time'.
df.head()

In [None]:
# Standardise the 'Amount' column with StandardScaler and write it back in place.
# NOTE(review): the scaler is fitted on every row, before any train/test split
# is made further down — confirm that this is acceptable for the evaluation.
amount_column = df['Amount'].to_numpy().reshape(-1, 1)  # scaler expects a 2-D (n, 1) array
scaler = StandardScaler()
df['Amount'] = scaler.fit_transform(amount_column)

In [None]:
# Preview the frame after scaling 'Amount'.
df.head()

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Keep only the first occurrence of each duplicated row (original index labels are kept).
df = df.drop_duplicates()

In [None]:
# Sanity check: no duplicate rows should remain after drop_duplicates().
df.duplicated().sum()

In [None]:
# Dimensions after de-duplication.
df.shape

In [None]:
# Number of rows per value of the 'Class' label.
df['Class'].value_counts()

In [None]:
# Missing-value count for each column.
df.isna().sum()

In [None]:
# Replace missing 'Class' labels with the most frequent label.
# Assign the result back to the column instead of calling fillna(inplace=True) on
# `df['Class']`: the in-place call operates on a column selection (chained
# assignment), which under pandas copy-on-write leaves `df` unchanged — and the
# warning about it is hidden because warnings are globally ignored in this notebook.
df['Class'] = df['Class'].fillna(df['Class'].mode()[0])

In [None]:
# Per-column flag: True if the column still contains any missing value.
df.isna().any()

In [None]:
# Bar chart of how many rows fall into each 'Class' value.
sns.countplot(x='Class', data=df)
plt.show()

In [None]:
# Plot the distribution of every column, one figure per column.
# countplot draws one bar per distinct value, which is only meaningful for
# discrete columns; for columns with many distinct values it is unreadable and
# extremely slow, so those are drawn as binned histograms instead.
MAX_DISCRETE_VALUES = 10  # columns with at most this many distinct values get a countplot

for col in df.columns:
  plt.figure(figsize=(12, 4))
  if df[col].nunique() <= MAX_DISCRETE_VALUES:
    sns.countplot(data=df, x=col)
  else:
    sns.histplot(data=df, x=col, bins=50)
  plt.title(f'credit fraud distribution for {col}')
  plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

In [None]:
# Separate the target label from the feature matrix.
y = df['Class']
X = df.drop(columns=['Class'])

In [None]:
# Hold out 20% of the rows for testing (fixed seed for reproducibility).
# stratify=y keeps the class proportions of `y` identical in both portions, so
# the test set cannot end up with a different share of the rare class by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Fit each classifier on X_train / y_train and report accuracy, precision,
# recall, F1 and a confusion-matrix heatmap on X_test / y_test.
# The loop variable is `clf`, not `model`: iterating with `model` as the loop
# variable would rebind `model` from the dict to the last estimator.
# random_state is fixed on the tree-based estimators so results are repeatable.
model = {
    'Logistics Regression': LogisticRegression(),
    'Decision Tress Classifier': DecisionTreeClassifier(random_state=42),
    'Random Forest Classifier': RandomForestClassifier(random_state=42)
}

for model_name, clf in model.items():
  print(f"\n=========== {model_name}==============")
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  print(f"Accuracy Score: {accuracy_score(y_test, y_pred)}")
  print(f"\n Precision Score: {precision_score(y_test, y_pred)}")
  print(f"\n Recall Score: {recall_score(y_test, y_pred)}")
  print(f"\n F1 Score: {f1_score(y_test, y_pred)}")

  # Rows are true labels, columns are predicted labels.
  cnf = confusion_matrix(y_test, y_pred)
  sns.heatmap(cnf, annot=True, fmt='d', cmap='Blues')
  plt.title(f'Confusion Matrix for {model_name}')
  plt.xlabel('Predicted Labels')
  plt.ylabel('True Labels')
  plt.show()

### UNDERSAMPLING

In [None]:
# Partition the rows by label: Class == 0 -> norma_df, Class == 1 -> fraud_df.
class_labels = df['Class']
norma_df = df[class_labels == 0]
fraud_df = df[class_labels == 1]

In [None]:
# Dimensions of the Class == 0 subset.
norma_df.shape

In [None]:
# Dimensions of the Class == 1 subset.
fraud_df.shape

In [None]:
# Undersample the Class == 0 rows down to the number of Class == 1 rows.
# The sample size is derived from fraud_df rather than hard-coded, so the two
# classes stay balanced if the data changes, and random_state makes the draw
# reproducible across runs.
normal_sample = norma_df.sample(n=len(fraud_df), random_state=42)

In [None]:
# Dimensions of the undersampled Class == 0 subset.
normal_sample.shape

In [None]:
# Stack the sampled Class == 0 rows on top of the Class == 1 rows and renumber
# the index from 0. The rows are not shuffled here, so the preview below shows
# only rows that came from normal_sample.
new_df = pd.concat((normal_sample, fraud_df), axis=0, ignore_index=True)
new_df.head()

In [None]:
# Rows per 'Class' value in the combined frame.
new_df['Class'].value_counts()

In [None]:
# Separate the target label from the features of the undersampled frame.
y_df = new_df['Class']
X_df = new_df.drop(columns=['Class'])

In [None]:
# 80/20 split of the undersampled data.
# train_test_split returns (X_train, X_test, y_train, y_test) in that order;
# unpacking it as (X_test_1, X_train_1, ...) swaps the names, so models would be
# fitted on the 20% portion and evaluated on the 80% portion.
# stratify=y_df keeps the class balance identical in both portions.
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    X_df, y_df, test_size=0.2, random_state=42, stratify=y_df
)

In [None]:
# Fit each classifier on X_train_1 / y_train_1 and report accuracy, precision,
# recall, F1 and a confusion-matrix heatmap on X_test_1 / y_test_1.
# The loop variable is `clf`, not `model`: iterating with `model` as the loop
# variable would rebind `model` from the dict to the last estimator.
# random_state is fixed on the tree-based estimators so results are repeatable.
model = {
    'Logistics Regression': LogisticRegression(),
    'Decision Tress Classifier': DecisionTreeClassifier(random_state=42),
    'Random Forest Classifier': RandomForestClassifier(random_state=42)
}

for model_name, clf in model.items():
  print(f"\n=========== {model_name}==============")
  clf.fit(X_train_1, y_train_1)
  y_pred_1 = clf.predict(X_test_1)
  print(f"Accuracy Score: {accuracy_score(y_test_1, y_pred_1)}")
  print(f"\n Precision Score: {precision_score(y_test_1, y_pred_1)}")
  print(f"\n Recall Score: {recall_score(y_test_1, y_pred_1)}")
  print(f"\n F1 Score: {f1_score(y_test_1, y_pred_1)}")

  # Rows are true labels, columns are predicted labels.
  cnf = confusion_matrix(y_test_1, y_pred_1)
  sns.heatmap(cnf, annot=True, fmt='d', cmap='Blues')
  plt.title(f'Confusion Matrix for {model_name}')
  plt.xlabel('Predicted Labels')
  plt.ylabel('True Labels')
  plt.show()

### OVERSAMPLING

In [None]:
from imblearn.over_sampling import SMOTE


In [None]:
# Create a seeded SMOTE oversampler and resample the full X / y
# (every row, i.e. not restricted to a training split).
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [None]:
# Dimensions of the resampled feature matrix.
X_resampled.shape

In [None]:
# Length of the resampled label vector.
y_resampled.shape

In [None]:
# Split the ORIGINAL data first, then oversample only the training portion.
# Splitting data that was already SMOTE-resampled puts synthetic points —
# interpolated from rows that end up in the training set — into the test set,
# and makes the test set artificially balanced; both inflate the reported metrics.
# With this order the test set contains only real rows in their real proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
# Fit each classifier on X_train / y_train and report accuracy, precision,
# recall, F1 and a confusion-matrix heatmap on X_test / y_test.
# The loop variable is `clf`, not `model`: iterating with `model` as the loop
# variable would rebind `model` from the dict to the last estimator, and any
# later lookup such as model['Logistics Regression'] would then raise TypeError.
# After the loop `model` is still the name -> fitted-estimator dict.
# random_state is fixed on the tree-based estimators so results are repeatable.
model = {
    'Logistics Regression': LogisticRegression(),
    'Decision Tress Classifier': DecisionTreeClassifier(random_state=42),
    'Random Forest Classifier': RandomForestClassifier(random_state=42)
}

for model_name, clf in model.items():
  print(f"\n=========== {model_name}==============")
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  print(f"Accuracy Score: {accuracy_score(y_test, y_pred)}")
  print(f"\n Precision Score: {precision_score(y_test, y_pred)}")
  print(f"\n Recall Score: {recall_score(y_test, y_pred)}")
  print(f"\n F1 Score: {f1_score(y_test, y_pred)}")

  # Rows are true labels, columns are predicted labels.
  cnf = confusion_matrix(y_test, y_pred)
  sns.heatmap(cnf, annot=True, fmt='d', cmap='Blues')
  plt.title(f'Confusion Matrix for {model_name}')
  plt.xlabel('Predicted Labels')
  plt.ylabel('True Labels')
  plt.show()

In [None]:
# Rows per class in the SMOTE-resampled labels.
y_resampled.value_counts()

In [None]:
import pickle

# Serialise each estimator to its own pickle file in the working directory.
# `model` must be the name -> estimator dict at this point (not a single
# estimator), because it is indexed by model name below.
pickle_targets = {
    "Logistic_credic_model.pkl": 'Logistics Regression',
    "Decision_Tress_Classifier_model.pkl": 'Decision Tress Classifier',
    "Random_Forest_Classifier.pkl": 'Random Forest Classifier',
}

for pickle_path, model_key in pickle_targets.items():
  with open(pickle_path, "wb") as file:
    pickle.dump(model[model_key], file)

In [None]:
# Load the three pickled estimators back from disk.
# Files are opened in binary READ mode ("rb"): opening them with "wb" truncates
# each file to zero bytes (destroying the saved model) and makes pickle.load
# fail because the handle is not readable.
# NOTE: pickle.load can execute arbitrary code — only load files this notebook
# (or another trusted source) produced.
with open("Logistic_credic_model.pkl", "rb") as file:
  logistic_model = pickle.load(file)
with open("Decision_Tress_Classifier_model.pkl", "rb") as file:
  decision_model = pickle.load(file)
with open("Random_Forest_Classifier.pkl", "rb") as file:
  random_model = pickle.load(file)


In [None]:
def model_prediction_with_logistic(input_data):
