In [None]:
import numpy as np
import pandas as pd
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)
        

In [None]:
!pip install LazyPredict
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from collections import Counter
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score, 
    confusion_matrix, classification_report
)
from sklearn.ensemble import (
    AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, 
    GradientBoostingClassifier, RandomForestClassifier
)
from sklearn.linear_model import (
    LogisticRegression, PassiveAggressiveClassifier, Perceptron, 
    RidgeClassifier, RidgeClassifierCV, SGDClassifier
)
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.dummy import DummyClassifier
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.under_sampling import RandomUnderSampler
from lazypredict.Supervised import LazyClassifier

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides warnings that may matter
# (e.g. deprecations); consider narrowing it to specific categories.
warnings.filterwarnings(action='ignore')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the 'ggplot' style sheet to all subsequent matplotlib figures.
plt.style.use('ggplot')

In [None]:
# Load the credit-card transactions CSV from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/creditcardfraud/creditcard.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Summary statistics for the 'Time' column.
df['Time'].describe()

In [None]:
# Column dtypes and non-null counts for the whole frame.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
import plotly.express as px

# Aggregate to one row per class before plotting. Handing the raw frame to
# px.pie makes plotly embed one label per transaction in the figure, which
# bloats the notebook output; the resulting chart is the same either way.
class_counts = (
    df['Class']
    .value_counts()
    .rename_axis('Class')
    .reset_index(name='count')
)

fig = px.pie(
    data_frame=class_counts,
    names='Class',
    values='count',
    title='Distribution of Classes',
)
fig.show()

In [None]:
# Compare the 'Amount' values of normal (Class == 0) and fraud (Class == 1) rows.
normal_amount = df.loc[df['Class'] == 0, 'Amount']
fraud_amount = df.loc[df['Class'] == 1, 'Amount']

fig, ax = plt.subplots(1, 2, figsize=(15, 6))
fig.suptitle('Comparison of Normal and Fraud Transactions wrt Amount', fontsize=16)

# Left panel: kernel density estimates of Amount, drawn on a symlog x-axis.
sns.kdeplot(normal_amount, label='Amount', ax=ax[0])
sns.kdeplot(fraud_amount, label='FraudAmount', ax=ax[0])
ax[0].set_xscale('symlog')
ax[0].set_xlabel('log(Amount)')
# A KDE curve shows probability density, not probability.
ax[0].set_ylabel('Density')
# Attach the legend to the KDE panel explicitly: plt.legend() acts on the
# *current* axes, which after plt.subplots() is the right-hand panel, so the
# KDE curves would otherwise be left without a legend.
ax[0].legend()

# Right panel: describe() summary statistics of Amount for each class, on a
# log y-axis so that values of very different magnitude stay readable.
sns.scatterplot(data=normal_amount.describe(), label='Amount', ax=ax[1])
sns.scatterplot(data=fraud_amount.describe(), label='FraudAmount', ax=ax[1])
ax[1].set_yscale('log')
ax[1].set_xlabel('')
ax[1].set_ylabel('log(Value)')

fig.tight_layout()
plt.show()

In [None]:
# Histogram grid (9 rows x 3 columns) for the columns at positions 1..27 of df,
# one column per subplot.
fig, ax = plt.subplots(9, 3, figsize=(30, 40))
ax = ax.flatten()
histogram_columns = df.columns[1:28]

for axis, column in zip(ax, histogram_columns):
    sns.histplot(data=df, x=column, ax=axis)

plt.tight_layout()
plt.show()

In [None]:
# Correlation-based feature pruning: plot the full correlation matrix, then
# drop every column whose absolute correlation with the target is below the
# threshold.
TARGET_COL = 'Class'
MIN_ABS_CORR = 0.13  # columns with |corr(target)| below this are dropped

# Compute the correlation matrix once and reuse it for both the plot and the
# column selection.
corr_matrix = df.corr()

plt.figure(figsize=(30, 20))
plt.title('Correlation Heatmap of All Columns')
# The colour map is selected with `cmap`; `cbar` is only an on/off flag for
# the colour bar.
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True, fmt="0.2f")
plt.show()

# Look the target up by name instead of by a hard-coded row position, so the
# cell keeps working (and becomes a no-op) when it is re-run after the drop.
target_corr = corr_matrix[TARGET_COL]
weak_columns = target_corr[target_corr.abs() < MIN_ABS_CORR].index.tolist()

# NOTE(review): the correlations are computed on the full frame, i.e. before
# the train/test split performed later in the notebook.
df = df.drop(columns=weak_columns)

In [None]:
# Correlation heatmap of the frame after the weakly correlated columns were removed.
plt.figure(figsize=(15, 6))
reduced_corr = df.corr()
heat_ax = sns.heatmap(reduced_corr, annot=True, fmt='0.2f', cmap='coolwarm')
heat_ax.set_title('Heatmap of Modified DataFrame')
plt.show()

In [None]:
# Box plot of every remaining feature (all columns except the target) to
# inspect outliers, one feature per subplot.
feature_columns = [column for column in df.columns if column != 'Class']

N_PLOT_COLS = 3
# Ceiling division: enough rows to fit every feature, however many survived
# the earlier column pruning.
n_plot_rows = max(1, -(-len(feature_columns) // N_PLOT_COLS))

fig, ax = plt.subplots(n_plot_rows, N_PLOT_COLS, figsize=(20, 20), squeeze=False)
ax = ax.flatten()
fig.suptitle('Box Plot to check for Outliers')

for axis, column in zip(ax, feature_columns):
    sns.boxplot(data=df, x=column, ax=axis)
    # Title each subplot on its own axes; plt.title() would only ever touch
    # the current (last created) axes.
    axis.set_title(column)

# Hide any grid cells left over when the feature count is not a multiple of
# the column count.
for unused_axis in ax[len(feature_columns):]:
    unused_axis.set_visible(False)

plt.show()

In [None]:
# Separate features from the 'Class' target and make a single stratified
# 80/20 train/test split.
y = df['Class']
X = df.drop(columns=['Class'])

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Only one split is generated (n_splits=1), so take it directly from the iterator.
train_index, test_index = next(sss.split(X, y))
X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_test, y_test = X.iloc[test_index], y.iloc[test_index]

print(f' shape of X_train is {X_train.shape}, shape of X_test is {X_test.shape}, shape of y_train is {y_train.shape}, shape of y_test is {y_test.shape}')

In [None]:
# Randomly undersample the training split only; the test split is left untouched.
rus = RandomUnderSampler(random_state=7)
X_res, y_res = rus.fit_resample(X_train, y_train)

class_distribution = Counter(y_res)
print(f' shape of X_res_rus is {X_res.shape}, shape of y_res_rus is {y_res.shape}')
print(f'Distribution of y_res_rus: {class_distribution}')

In [None]:
# Fit LazyPredict's LazyClassifier on the undersampled training data and score
# it against X_test / y_test. `models` receives the first return value and
# `predictions` the second (requested via predictions=True).
clf = LazyClassifier(predictions=True)
models, predictions = clf.fit(X_res, X_test, y_res, y_test)

# NOTE (original author): LazyPredict reports the *weighted* F1 score, which is
# misleading on imbalanced data.

In [None]:
# Display the first value returned by LazyClassifier.fit above.
models

In [None]:
# Re-evaluate a set of classifiers by hand so that macro-averaged metrics are
# reported next to the weighted ones.
#
# The estimator dictionary gets its own name so that it does not overwrite
# `models`, the result returned by LazyClassifier.fit in an earlier cell.
MODEL_SEED = 42  # shared seed so repeated runs give the same scores

candidate_models = {
    "ExtraTreesClassifier": ExtraTreesClassifier(),
    "LabelPropagation": LabelPropagation(),
    "LabelSpreading": LabelSpreading(),
    "LGBMClassifier": LGBMClassifier(),
    "LinearSVC": LinearSVC(),
    "CalibratedClassifierCV": CalibratedClassifierCV(),
    "RandomForestClassifier": RandomForestClassifier(),
    "SGDClassifier": SGDClassifier(),
    "LogisticRegression": LogisticRegression(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "Perceptron": Perceptron(),
    "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(),
    "SVC": SVC(),
    "GaussianNB": GaussianNB(),
    "BernoulliNB": BernoulliNB(),
    "XGBClassifier": XGBClassifier(),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "BaggingClassifier": BaggingClassifier(),
    "NuSVC": NuSVC(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RidgeClassifier": RidgeClassifier(),
    "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(),
    "ExtraTreeClassifier": ExtraTreeClassifier(),
    "NearestCentroid": NearestCentroid(),
    "RidgeClassifierCV": RidgeClassifierCV(),
    "PassiveAggressiveClassifier": PassiveAggressiveClassifier(),
    "DummyClassifier": DummyClassifier(),
}

# Seed every estimator that exposes a `random_state` parameter; the others
# are left exactly as constructed.
for estimator in candidate_models.values():
    if 'random_state' in estimator.get_params():
        estimator.set_params(random_state=MODEL_SEED)

# Result column name -> scoring function(y_true, y_pred). zero_division=0 keeps
# the value sklearn already falls back to when a class is never predicted,
# but states it explicitly instead of relying on a (silenced) warning.
metric_fns = {
    "Accuracy": accuracy_score,
    "F1 Score Weighted": lambda y_true, y_hat: f1_score(
        y_true, y_hat, average='weighted', zero_division=0),
    "F1 Score Macro": lambda y_true, y_hat: f1_score(
        y_true, y_hat, average='macro', zero_division=0),
    "Precision_weighted": lambda y_true, y_hat: precision_score(
        y_true, y_hat, average='weighted', zero_division=0),
    "Recall_weighted": lambda y_true, y_hat: recall_score(
        y_true, y_hat, average='weighted', zero_division=0),
    "Precision_macro_avg": lambda y_true, y_hat: precision_score(
        y_true, y_hat, average='macro', zero_division=0),
    "Recall_macro_avg": lambda y_true, y_hat: recall_score(
        y_true, y_hat, average='macro', zero_division=0),
}

# Column-oriented accumulator: one list per output column, same order as before.
results = {"Model": []}
results.update({metric_name: [] for metric_name in metric_fns})

for model_name, model in candidate_models.items():
    # Train on the undersampled training set, evaluate on the untouched test split.
    model.fit(X_res, y_res)
    y_pred = model.predict(X_test)

    results["Model"].append(model_name)
    for metric_name, metric_fn in metric_fns.items():
        results[metric_name].append(metric_fn(y_test, y_pred))

results_df = pd.DataFrame(results)

In [None]:
# Display the per-model metric table built in the previous cell.
results_df

In [None]:
# Fit a NearestCentroid classifier on the undersampled training data and print
# the per-class report for its predictions on the test split.
FinalModel = NearestCentroid().fit(X_res, y_res)
pred = FinalModel.predict(X_test)
report = classification_report(y_test, pred)
print(report)

In [None]:
# Confusion matrix of the final model's test-set predictions, drawn as an
# annotated heatmap (rows: actual class, columns: predicted class).
conf_matrix = confusion_matrix(y_test, pred)

plt.figure(figsize=(10, 7))
matrix_ax = sns.heatmap(conf_matrix, cmap='Blues', fmt='d', annot=True)
matrix_ax.set_xlabel('Predicted')
matrix_ax.set_ylabel('Actual')
matrix_ax.set_title('Confusion Matrix For NearestCentroid Model')
plt.show()