In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, classification_report, precision_recall_curve
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV

from collections import Counter

In [2]:
import os

# Location of the credit-card CSV. Set the CREDITCARD_CSV environment variable to
# run the notebook on another machine; the fallback is the original local path.
DATA_PATH = os.environ.get(
    'CREDITCARD_CSV',
    'C:/Users/000K1G744/Desktop/EDA -Feature Engineering/creditcard.csv',
)
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the loaded frame; the rendered output shows only leading and trailing rows.
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [4]:
# (n_rows, n_columns) of the loaded frame.
df.shape

(284807, 31)

In [5]:
# Number of rows per label in the 'Class' column (shows how skewed the target is).
df.loc[:, 'Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

## Data Preprocessing ##

In [6]:
# Missing-value count for every column (isna is the same check as isnull).
df.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

## Train and Test Split ##

In [7]:
# Separate the target column from the predictors.
y = df['Class']
x = df.drop(columns=['Class'])

# 80% of the rows go to training; stratifying on y keeps the class ratio the same
# in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    stratify=y,
    random_state=100,
)

In [8]:
# Sizes of the training and test target vectors.
(y_train.shape, y_test.shape)

((227845,), (56962,))

In [9]:
# Share of each class within the training target.
y_train.value_counts().div(len(y_train))

0    0.998271
1    0.001729
Name: Class, dtype: float64

In [10]:
# Share of each class within the test target.
y_test.value_counts().div(len(y_test))


0    0.99828
1    0.00172
Name: Class, dtype: float64

In [11]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the same
# transformation to both splits so no information from the test rows is used.
# Note: X_train / X_test are rebound to the scaled arrays from here on.
Scaler_X = StandardScaler().fit(X_train)
X_train = Scaler_X.transform(X_train)
X_test = Scaler_X.transform(X_test)

## Handling Class imbalance using SMOTE ##

# A) SMOTE Technique

In [12]:
from imblearn.over_sampling import SMOTE

# Class counts of the training split before any resampling.
counter = Counter(y_train)
print('Before',counter)

# Oversample the training split only; the test split is left untouched.
# A fixed random_state makes the synthetic samples reproducible across runs,
# matching the seeded resamplers used elsewhere in this notebook.
smt = SMOTE(random_state=101)
X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train)

# Class counts after oversampling.
counter = Counter(y_train_sm)
print('After',counter)

Before Counter({0: 227451, 1: 394})
After Counter({0: 227451, 1: 227451})


# B) Borderline-SMOTE Technique

In [14]:
from imblearn.over_sampling import BorderlineSMOTE

# Class counts of the training split before resampling.
print('Before', Counter(y_train))

# Oversample the training split with Borderline-SMOTE (variant 'borderline-1'), seeded.
bsmote = BorderlineSMOTE(kind='borderline-1', random_state=101)
X_train_borderline, y_train_borderline = bsmote.fit_resample(X_train, y_train)

# Class counts after resampling.
counter = Counter(y_train_borderline)
print('After', counter)


Before Counter({0: 227451, 1: 394})
After Counter({0: 227451, 1: 227451})


## C) SVM-SMOTE ##

In [15]:
from imblearn.over_sampling import SVMSMOTE

# Class counts of the training split before resampling.
print('Before', Counter(y_train))

# Oversample the training split with SVM-SMOTE, seeded for repeatability.
svmsmote = SVMSMOTE(random_state=101)
X_train_svm, y_train_svm = svmsmote.fit_resample(X_train, y_train)

# Class counts after resampling.
counter = Counter(y_train_svm)
print('After', counter)

Before Counter({0: 227451, 1: 394})
After Counter({0: 227451, 1: 227451})


# B) ADASYN Technique

In [16]:
from imblearn.over_sampling import ADASYN

# Class counts of the training split before resampling.
print('Before', Counter(y_train))

# Oversample the training split with ADASYN, seeded for repeatability.
# The recorded output shows the two classes end up close to, but not exactly, equal.
ada = ADASYN(random_state=130)
X_train_ada, y_train_ada = ada.fit_resample(X_train, y_train)

# Class counts after resampling.
counter = Counter(y_train_ada)
print('After', counter)

Before Counter({0: 227451, 1: 394})
After Counter({0: 227451, 1: 227434})


# C) Hybrid Techniques

C.1 SMOTE + Tomek Links

In [None]:
from imblearn.combine import SMOTETomek

# Class counts of the training split before resampling.
print('Before', Counter(y_train))

# Hybrid resampling of the training split with SMOTETomek, seeded for repeatability.
smtom = SMOTETomek(random_state=139)
X_train_smtom, y_train_smtom = smtom.fit_resample(X_train, y_train)

# Class counts after resampling.
counter = Counter(y_train_smtom)
print('After', counter)

Before Counter({0: 227451, 1: 394})


# C.2) SMOTE + ENN

In [None]:
from imblearn.combine import SMOTEENN

# Class counts of the training split before resampling.
counter = Counter(y_train)
print('Before',counter)

# Hybrid resampling of the training split with SMOTEENN.
# A fixed random_state makes the resampled set reproducible across runs, in line
# with the seeded SMOTETomek cell.
smenn = SMOTEENN(random_state=139)
X_train_smenn, y_train_smenn = smenn.fit_resample(X_train, y_train)

# Class counts after resampling.
counter = Counter(y_train_smenn)
print('After',counter)

## Model Building on Imbalanced Data ##

In [None]:
# Parallel result columns: test_eval appends one entry to each list per evaluation.
model = []      # algorithm label
resample = []   # resampling-strategy label
precision = []  # precision on the test split
recall = []     # recall on the test split
F1score = []    # F1 on the test split
AUCROC = []     # ROC AUC on the test split

In [None]:
def test_eval(clf_model, X_test, y_test, algo=None, sampling=None):
    """Evaluate a fitted classifier on the test split and record its scores.

    Prints the confusion matrix, the classification report and the ROC AUC, then
    appends one entry to each of the module-level result lists ``model``,
    ``resample``, ``precision``, ``recall``, ``F1score`` and ``AUCROC``.

    Parameters
    ----------
    clf_model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test : features passed to the estimator.
    y_test : true labels for ``X_test``.
    algo : label appended to ``model`` (e.g. the algorithm name).
    sampling : label appended to ``resample`` (e.g. the resampling strategy).

    Returns
    -------
    None
    """
    # Column 1 of predict_proba is the score fed to ROC AUC.
    y_score = clf_model.predict_proba(X_test)[:, 1]
    y_pred = clf_model.predict(X_test)

    # Computed once and reused for both the printout and the results list.
    auc = roc_auc_score(y_test, y_score)

    print('Confusion Matrix')
    print('='*60)
    print(confusion_matrix(y_test,y_pred),"\n")
    print('Classification Report')
    print('='*60)
    print(classification_report(y_test,y_pred),"\n")
    print('AUC-ROC')
    print('='*60)
    print(auc)

    # Keep the parallel result lists aligned: exactly one append per list.
    model.append(algo)
    precision.append(precision_score(y_test,y_pred))
    recall.append(recall_score(y_test,y_pred))
    F1score.append(f1_score(y_test,y_pred))
    AUCROC.append(auc)
    resample.append(sampling)

## Model 1 : Logistic Regression ##

# Original Unsampled Data

In [None]:
# The grid below searches both 'l1' and 'l2' penalties. The default 'lbfgs' solver
# does not support 'l1', so every l1 candidate would fail to fit and be scored NaN;
# 'liblinear' supports both penalties.
log_model = LogisticRegression(solver='liblinear')

# Regularisation strength, class weighting and penalty type to search over.
params={'C':np.logspace(-10, 1, 15),'class_weight':[None,'balanced'],'penalty':['l1','l2']}

# Stratified, shuffled 5-fold CV with a fixed seed so fold assignment is repeatable.
cv = StratifiedKFold(n_splits=5, random_state=100, shuffle=True)

# Grid search scored by ROC AUC, using all available cores.
clf_LR = GridSearchCV(log_model, params, cv=cv, scoring='roc_auc', n_jobs=-1)
clf_LR.fit(X_train, y_train)
clf_LR.best_estimator_

In [None]:
# Score the grid-searched model fitted on the original (non-resampled) training data.
test_eval(clf_LR, X_test, y_test, algo='Logistic Regression', sampling='actual')

# SMOTE Resampling 

In [None]:
# Re-run the same grid search on the SMOTE-resampled training data (this replaces the
# previous fit stored in clf_LR) and display the selected estimator.
# NOTE(review): the CV folds are cut from data that was oversampled beforehand, so
# validation folds contain synthetic points; CV scores may be optimistic - confirm.
clf_LR.fit(X_train_sm, y_train_sm).best_estimator_

In [None]:
# Score the model fitted on the SMOTE-resampled training data against the test split.
test_eval(clf_LR, X_test, y_test, algo='Logistic Regression', sampling='smote')

## ADASYN Resampling ##

In [None]:
# Re-run the same grid search on the ADASYN-resampled training data (this replaces the
# previous fit stored in clf_LR) and display the selected estimator.
# NOTE(review): the CV folds are cut from data that was oversampled beforehand, so
# validation folds contain synthetic points; CV scores may be optimistic - confirm.
clf_LR.fit(X_train_ada, y_train_ada).best_estimator_

In [None]:
# Score the model fitted on the ADASYN-resampled training data against the test split.
test_eval(clf_LR, X_test, y_test, algo='Logistic Regression', sampling='adasyn')

## SMOTE + Tomek Resampling ##

In [None]:
# Re-run the same grid search on the SMOTETomek-resampled training data (this replaces
# the previous fit stored in clf_LR) and display the selected estimator.
# NOTE(review): the CV folds are cut from data that was resampled beforehand, so
# validation folds contain synthetic points; CV scores may be optimistic - confirm.
clf_LR.fit(X_train_smtom, y_train_smtom).best_estimator_

In [None]:
# Score the model fitted on the SMOTETomek-resampled training data against the test split.
test_eval(clf_LR, X_test, y_test, algo='Logistic Regression', sampling='smote+tomek')

## SMOTE + ENN Resampling ##

In [None]:
# Re-run the same grid search on the SMOTEENN-resampled training data (this replaces
# the previous fit stored in clf_LR) and display the selected estimator.
# NOTE(review): the CV folds are cut from data that was resampled beforehand, so
# validation folds contain synthetic points; CV scores may be optimistic - confirm.
clf_LR.fit(X_train_smenn, y_train_smenn).best_estimator_

In [None]:
# Score the model fitted on the SMOTEENN-resampled training data against the test split.
test_eval(clf_LR, X_test, y_test, algo='Logistic Regression', sampling='smote+enn')