In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt



In [2]:
# Read the credit-card transactions CSV into a DataFrame and preview its top rows
df = pd.read_csv("../input/creditcardfraud/creditcard.csv")
df.head()

In [5]:
# Pairwise correlation between all columns; show the top of the matrix
corr = df.corr()
corr.head()


In [6]:
# Absolute correlation of every column with the target "Class", strongest first
class_corr_abs = corr['Class'].abs()
class_corr_abs.sort_values(ascending=False).to_frame()

In [None]:
# Dimension of data: (number of rows, number of columns)
df.shape

In [7]:
# Review the column names, their non-null counts, and their data types
df.info()

In [8]:
# Review the summary statistics of the columns
df.describe()

# **EDA: Univariate Analysis**

In [9]:
# Histogram of the Time column, 50 bins
sns.histplot(data=df, x="Time", bins=50)

Since df.Amount is very skewed, we use log(df.Amount+1) for plotting.  

In [10]:
# Distribution of log(Amount + 1).
# np.log1p(x) computes log(1 + x), so zero amounts map to 0 instead of -inf.
sns.histplot(np.log1p(df.Amount), bins=50)
# Label the axis with the transform that is actually plotted
plt.xlabel("log(Amount + 1)")

In [11]:
# Bar chart of how many rows fall into each Class value
class_counts = df["Class"].value_counts()
class_labels = ['Normal', 'Fraudulent']
g = sns.barplot(x=class_counts.index, y=class_counts.values)
g.set_xticklabels(class_labels)
g.set_ylabel('counts')

In [12]:
# Same bar chart on a log scale, so the small fraud bar is visible next to the normal one
class_counts = df["Class"].value_counts()
log_counts = np.log(class_counts.values)
g = sns.barplot(x=class_counts.index, y=log_counts)
g.set_xticklabels(['Normal', 'Fraudulent'])
g.set_ylabel('log(counts)')

In [13]:
# Distribution of Class column: number of rows per Class value
df.Class.value_counts()

In [14]:
# Share of fraudulent transactions (Class == 1) among all rows.
# The count is computed once; the original also evaluated value_counts() and discarded the result.
fraud_count = df.Class.value_counts()[1]
print(f"Percentage of fraudulent transactions is {fraud_count/df.shape[0]*100:.3f}%")

So, the data is highly imbalanced. 

# **EDA Bivariate Analysis**

In [15]:
# Amount values plotted separately for each Class
g = sns.catplot(data=df, x="Class", y="Amount")
g.set_xticklabels(['Normal', 'Fraudulent'])

The range of transaction amounts for fraudulent transactions is narrower than that for normal ones. 

In [16]:
# Time values plotted separately for each Class
g = sns.catplot(data=df, x="Class", y="Time")
g.set_xticklabels(['Normal', 'Fraudulent'])

The range of transaction time is almost the same for both types of transactions. 

In [17]:
# Columns whose absolute correlation with the target (Class) is at least 0.2
corr_df = df.corr()
corr_df_class = corr_df["Class"].abs().sort_values(ascending=False)
strongly_correlated = corr_df_class[corr_df_class >= 0.2]
print(strongly_correlated.index)

In [18]:
# Per-Class box plots of V17, V14, V12 and V10, one panel each, left to right
fig, ax = plt.subplots(1, 4, figsize=(15, 5))
for axis, column in zip(ax, ("V17", "V14", "V12", "V10")):
    sns.boxplot(x="Class", y=column, data=df, ax=axis)


In [None]:
# Per-Class box plots of V1..V28 on a 7x4 grid, filled row by row
fig, ax = plt.subplots(7, 4, figsize=(15, 13))
for k, axis in enumerate(ax.flat, start=1):
    sns.boxplot(x="Class", y=f"V{k}", data=df, ax=axis)

# Data pre-processing

In [19]:
# Checking for duplicated entries: count rows that exactly repeat an earlier row.
# The count is computed once and reused in the message.
duplicates = df.duplicated().sum()
if duplicates == 0:
    print("There are no duplicated rows in this data")
else:
    print(f"There are {duplicates} duplicated rows.")

Since duplicates are an extreme case of nonrandom sampling, and they might bias the fitted models, and lead to the model overfitting, we remove them. 

In [20]:
# Drop the duplicated rows (the first occurrence of each row is kept)
df=df.drop_duplicates()

In [21]:
# Keep only the columns used by the ML models: remove 'Time'.
# errors='ignore' makes this cell safe to re-run after 'Time' has already been dropped
# (without it a second execution raises KeyError).
df = df.drop(columns=['Time'], errors='ignore')

Since the Classes are highly imbalanced, we use oversampling from the minority class, which is adding more copies to the minority class. Oversampling is a good choice here as we don’t have millions of rows to work with, however, it can cause overfitting. 


In [22]:
from sklearn.utils import resample

# Split the frame by class: Class == 0 rows form the majority, Class == 1 rows the minority
df_majority = df.loc[df["Class"] == 0]
df_minority = df.loc[df["Class"] == 1]

# Draw minority rows with replacement until there are as many as majority rows.
# NOTE(review): the result repeats each minority row many times, so a random
# train/test split of this frame can place copies of the same row on both sides.
df_minority_oversampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42,  # reproducible draw
)

# Stack the oversampled minority rows on top of the untouched majority rows
df_oversampled = pd.concat([df_minority_oversampled, df_majority])

In [23]:
# Number of rows per Class value in the oversampled frame
df_oversampled.Class.value_counts()

In [24]:
# Count plot of Class in the oversampled frame.
# The Series is passed as x= explicitly: in seaborn >= 0.12 the first positional
# argument is `data`, so the positional form no longer plots the class counts.
g = sns.countplot(x=df_oversampled.Class)
g.set_xticklabels(['Normal','Fraudulent'])

Now we have balanced data. 

In [26]:
# Separate the oversampled frame into the target (Class) and the feature columns
y = df_oversampled["Class"]
X = df_oversampled.drop(columns=['Class'])

In [27]:
# Split the data into train and test sets WITHOUT leakage.
#
# Splitting the already-oversampled frame puts copies of the same fraud row into both
# the train and the test set, which inflates every test metric. Instead:
#   1. split the original (non-oversampled) rows in `df`, stratified on Class;
#   2. oversample the minority class inside the training portion only.
# The test set keeps the real class imbalance, so the metrics reflect unseen data.
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

X_orig = df.drop(columns=['Class'])
y_orig = df['Class']
X_train_orig, X_test, y_train_orig, y_test = train_test_split(X_orig, y_orig,
                                                              test_size=0.3, stratify=y_orig, random_state=101)

# Re-attach the target so that features and labels are resampled together
train_orig = X_train_orig.assign(Class=y_train_orig)
train_majority = train_orig[train_orig.Class == 0]
train_minority = train_orig[train_orig.Class == 1]
train_minority_oversampled = resample(train_minority,
                                      replace=True,                     # sample with replacement
                                      n_samples=len(train_majority),    # match the majority class size
                                      random_state=42)                  # reproducible results

# Combine and shuffle, so the training rows are not ordered by class
train_balanced = pd.concat([train_minority_oversampled, train_majority]).sample(frac=1, random_state=101)
X_train = train_balanced.drop(columns=['Class'])
y_train = train_balanced['Class']

from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix, roc_curve

ML algorithms that rely on distances between data points, or on gradient-based or regularized optimization, benefit from feature scaling (Standardization or Normalization); examples are SVM (Support Vector Machine) and Logistic Regression. However, tree-based ML algorithms, such as Random Forests and
Gradient Boosted Decision Trees, do not need feature scaling.


In [28]:
from sklearn.preprocessing import StandardScaler

# Learn each feature's mean and standard deviation from the training split only,
# then apply that same transform to both the training and the test features.
ss = StandardScaler().fit(X_train)
X_train_ss, X_test_ss = ss.transform(X_train), ss.transform(X_test)

# **ML Models**

In [29]:
# A function to store the performance of each classifier model
def ML_models_performance(model, X_train, y_train ,X_test ,y_test, y_pred, test_probs, model_name):
    """Summarise a fitted classifier's performance as a one-row DataFrame.

    Parameters
    ----------
    model : fitted estimator exposing ``score(X, y)``.
    X_train, y_train : training features and labels, passed to ``model.score``.
    X_test, y_test : test features and true test labels.
    y_pred : class labels predicted for ``X_test``.
    test_probs : 2-D array of class probabilities for ``X_test``; column 1 is
        used as the score for ``roc_auc_score``.
    model_name : label used as the index of the returned row.

    Returns
    -------
    pandas.DataFrame with columns Train_accuracy, Test_accuracy, Precision,
    Recall, F1_Score and roc_auc_score, indexed by ``model_name``.
    """
    metrics = {
        'Train_accuracy': model.score(X_train, y_train),
        "Test_accuracy": model.score(X_test, y_test),
        # The metric functions take (y_true, y_pred) in that order; passing them
        # the other way round swaps precision and recall.
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1_Score": f1_score(y_test, y_pred),
        "roc_auc_score": roc_auc_score(y_test, test_probs[:, 1]),
    }
    return pd.DataFrame(metrics, index=[model_name])

In [None]:
from sklearn.linear_model import LogisticRegression
# Create the LogisticRegression model
lr = LogisticRegression(solver = 'sag', random_state=1)
# Fit the model on the standardized training features
lr.fit(X_train_ss, y_train)
# Use the trained model to predict Classes
y_pred_lr = lr.predict(X_test_ss)
# Use the trained model to calculate the probabilities of each Class
test_probs_lr = lr.predict_proba(X_test_ss)

# model performance
# The metric functions take (y_true, y_pred): true labels first, predictions second.
# Passing them the other way round swaps precision and recall.
print(f'Model train accuracy: {lr.score(X_train_ss, y_train)*100:.3f}%')
print(f'Model test accuracy: {lr.score(X_test_ss, y_test)*100:.3f}%')
print(f'Model test precision: {precision_score(y_test, y_pred_lr):.3f}')
print(f'Model test recall: {recall_score(y_test, y_pred_lr):.3f}')
print(f'Model test f1_score: {f1_score(y_test, y_pred_lr):.3f}')
print(f'Model test roc_auc_score: {roc_auc_score(y_test, test_probs_lr[:, 1]):.3f}')

In [None]:
# One-row performance table for the logistic-regression model
lr_performance = ML_models_performance(
    model=lr,
    X_train=X_train_ss, y_train=y_train,
    X_test=X_test_ss, y_test=y_test,
    y_pred=y_pred_lr, test_probs=test_probs_lr,
    model_name="Logisitc Regression",
)
lr_performance

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Create the RandomForestClassifier model
rf = RandomForestClassifier(random_state=1)
# Fit the model on the standardized training features
rf.fit(X_train_ss, y_train)
# Use the trained model to predict Classes
y_pred_rf = rf.predict(X_test_ss)
# Use the trained model to calculate the probabilities of each Class
test_probs_rf = rf.predict_proba(X_test_ss)

# model performance
# The metric functions take (y_true, y_pred): true labels first, predictions second.
# Passing them the other way round swaps precision and recall.
print(f'Model train accuracy: {rf.score(X_train_ss, y_train)*100:.3f}%')
print(f'Model test accuracy: {rf.score(X_test_ss, y_test)*100:.3f}%')
print(f'Model test precision: {precision_score(y_test, y_pred_rf):.3f}')
print(f'Model test recall: {recall_score(y_test, y_pred_rf):.3f}')
print(f'Model test f1_score: {f1_score(y_test, y_pred_rf):.3f}')
print(f'Model test roc_auc_score: {roc_auc_score(y_test, test_probs_rf[:, 1]):.3f}')

In [None]:
# One-row performance table for the random-forest model
rf_performance = ML_models_performance(
    model=rf,
    X_train=X_train_ss, y_train=y_train,
    X_test=X_test_ss, y_test=y_test,
    y_pred=y_pred_rf, test_probs=test_probs_rf,
    model_name="Random Forest",
)
rf_performance

In [None]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
# Create the LinearSVC model
lsvc = LinearSVC(random_state=1, dual=False)
# Fit the model on the standardized training features
lsvc.fit(X_train_ss, y_train)
# Use the trained model to predict Classes
y_pred_lsvc = lsvc.predict(X_test_ss)
# As LinearSVC does not generate probabilities directly, we use CalibratedClassifierCV to calculate the probabilities of each Class
clf = CalibratedClassifierCV(lsvc) 
clf.fit(X_train_ss, y_train)
test_probs_lsvc = clf.predict_proba(X_test_ss)

# model performance
# The metric functions take (y_true, y_pred): true labels first, predictions second.
# Passing them the other way round swaps precision and recall.
print(f'Model train accuracy: {lsvc.score(X_train_ss, y_train)*100:.3f}%')
print(f'Model test accuracy: {lsvc.score(X_test_ss, y_test)*100:.3f}%')
print(f'Model test precision: {precision_score(y_test, y_pred_lsvc):.3f}')
print(f'Model test recall: {recall_score(y_test, y_pred_lsvc):.3f}')
print(f'Model test f1_score: {f1_score(y_test, y_pred_lsvc):.3f}')
print(f'Model test roc_auc_score: {roc_auc_score(y_test, test_probs_lsvc[:, 1]):.3f}')

In [None]:
# One-row performance table for the LinearSVC model
lsvc_performance = ML_models_performance(
    model=lsvc,
    X_train=X_train_ss, y_train=y_train,
    X_test=X_test_ss, y_test=y_test,
    y_pred=y_pred_lsvc, test_probs=test_probs_lsvc,
    model_name="LinearSVC",
)
lsvc_performance

In [None]:
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
# Create the SVC model with 'rbf' kernel (the SVC default)
svcg = SVC(random_state=1)
# Fit the model on the standardized training features
svcg.fit(X_train_ss, y_train)
# Use the trained model to predict Classes
y_pred_svcg = svcg.predict(X_test_ss)
# we use CalibratedClassifierCV to calculate the probabilities of each Class
clf = CalibratedClassifierCV(svcg) 
clf.fit(X_train_ss, y_train)
test_probs_svcg = clf.predict_proba(X_test_ss)

# model performance
# The metric functions take (y_true, y_pred): true labels first, predictions second.
# Passing them the other way round swaps precision and recall.
print(f'Model train accuracy: {svcg.score(X_train_ss, y_train)*100:.3f}%')
print(f'Model test accuracy: {svcg.score(X_test_ss, y_test)*100:.3f}%')
print(f'Model test precision: {precision_score(y_test, y_pred_svcg):.3f}')
print(f'Model test recall: {recall_score(y_test, y_pred_svcg):.3f}')
print(f'Model test f1_score: {f1_score(y_test, y_pred_svcg):.3f}')
print(f'Model test roc_auc_score: {roc_auc_score(y_test, test_probs_svcg[:, 1]):.3f}')

In [None]:
# One-row performance table for the SVC model
svcg_performance = ML_models_performance(
    model=svcg,
    X_train=X_train_ss, y_train=y_train,
    X_test=X_test_ss, y_test=y_test,
    y_pred=y_pred_svcg, test_probs=test_probs_svcg,
    model_name="SVC",
)
svcg_performance

In [None]:
from xgboost import XGBClassifier
# Create the XGBClassifier model 
xgb=XGBClassifier(random_state=1)
# Fit the model on the standardized training features
xgb.fit(X_train_ss,y_train)
# Use the trained model to predict Classes and the probabilities of each Class
y_pred_xgb = xgb.predict(X_test_ss)
test_probs_xgb = xgb.predict_proba(X_test_ss)

# model performance
# The metric functions take (y_true, y_pred): true labels first, predictions second.
# Passing them the other way round swaps precision and recall.
print(f'Model train accuracy: {xgb.score(X_train_ss, y_train)*100:.3f}%')
print(f'Model test accuracy: {xgb.score(X_test_ss, y_test)*100:.3f}%')
print(f'Model test precision: {precision_score(y_test, y_pred_xgb):.3f}')
print(f'Model test recall: {recall_score(y_test, y_pred_xgb):.3f}')
print(f'Model test f1_score: {f1_score(y_test, y_pred_xgb):.3f}')
print(f'Model test roc_auc_score: {roc_auc_score(y_test, test_probs_xgb[:, 1]):.3f}')

In [None]:
# One-row performance table for the XGBClassifier model
xgb_performance = ML_models_performance(
    model=xgb,
    X_train=X_train_ss, y_train=y_train,
    X_test=X_test_ss, y_test=y_test,
    y_pred=y_pred_xgb, test_probs=test_probs_xgb,
    model_name="XGBClassifier",
)
xgb_performance

In [None]:
# Stack the per-model performance rows into one comparison table
performance_rows = [lr_performance, rf_performance, lsvc_performance, xgb_performance]
comparison_df = pd.concat(performance_rows)
comparison_df

Note that ROC curves are appropriate when there are roughly equal numbers of observations for each class, while
Precision-Recall curves are preferred when there is class imbalance. ROC curves can give an overly optimistic picture of an ML model on datasets with a class imbalance. The reason is that the False Positive Rate (the x-axis of the ROC curve) is computed using true negatives, which dominate when the negative class is large, whereas the Precision-Recall curve avoids using true negatives altogether.

In [None]:
# ROC curve points (false-positive rate, true-positive rate) for each classifier,
# computed from the predicted probability of Class 1.
# The SVC model is not included in this plot.
lr_fpr, lr_tpr, _ = roc_curve(y_test, test_probs_lr[:, 1])
rf_fpr, rf_tpr, _ = roc_curve(y_test, test_probs_rf[:, 1])
lsvc_fpr, lsvc_tpr, _ = roc_curve(y_test, test_probs_lsvc[:, 1])
xgb_fpr, xgb_tpr, _ = roc_curve(y_test, test_probs_xgb[:, 1])

# Draw one line per classifier, each with its own marker
roc_lines = [
    (lr_fpr, lr_tpr, '.', 'Logistic REgression'),
    (rf_fpr, rf_tpr, 'o', 'Random Forest'),
    (lsvc_fpr, lsvc_tpr, 'v', 'LinearSVC'),
    (xgb_fpr, xgb_tpr, '*', 'XGBClassifier'),
]
for fpr, tpr, marker, label in roc_lines:
    plt.plot(fpr, tpr, marker=marker, label=label)

# Reference line: a "no skill" model that gives every sample the same score of 0
ns_probs = [0] * len(y_test)
ns_auc = roc_auc_score(y_test, ns_probs)
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')

# Axis labels, legend, and render
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve 

# Precision-recall curve points for each classifier, computed from the predicted
# probability of Class 1. The SVC model is not included in this plot.
lr_precision, lr_recall, _ = precision_recall_curve(y_test, test_probs_lr[:, 1])
rf_precision, rf_recall, _ = precision_recall_curve(y_test, test_probs_rf[:, 1])
lsvc_precision, lsvc_recall, _ = precision_recall_curve(y_test, test_probs_lsvc[:, 1])
xgb_precision, xgb_recall, _ = precision_recall_curve(y_test, test_probs_xgb[:, 1])

# Draw recall on the x-axis and precision on the y-axis, one line per classifier
pr_lines = [
    (lr_recall, lr_precision, '.', 'Logistic REgression'),
    (rf_recall, rf_precision, 'o', 'Random Forest'),
    (lsvc_recall, lsvc_precision, 'v', 'LinearSVC'),
    (xgb_recall, xgb_precision, '*', 'XGBClassifier'),
]
for recall, precision, marker, label in pr_lines:
    plt.plot(recall, precision, marker=marker, label=label)

# Reference line: a "no skill" model's precision equals the share of Class 1 in y_test
positives = y_test[y_test==1]
no_skill = len(positives) / len(y_test)
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')

# Axis labels, legend, and render
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()

So, based on the comparison_df table, the roc_curve and the precision_recall_curve, XGBClassifier and Random Forest are considered good classifier models for this dataset.

In general, tree-based algorithms usually perform well on imbalanced data, as they work by learning a hierarchy of if/else questions, which forces both classes to be addressed. 