In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score,roc_auc_score


In [1]:
from mlxtend.plotting import plot_learning_curves

In [2]:
df = pd.read_csv('creditcard.csv')

### Splitting Train and Test sets

In [3]:
X = df.drop('Class', axis=1)
y = df['Class']

sss = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

for train_index, test_index in sss.split(X, y):
    print("Train:", train_index, "Test:", test_index)
    original_Xtrain, original_Xtest = X.iloc[train_index], X.iloc[test_index]
    original_ytrain, original_ytest = y.iloc[train_index], y.iloc[test_index]

Train: [ 30473  30496  31002 ... 284804 284805 284806] Test: [    0     1     2 ... 57017 57018 57019]
Train: [     0      1      2 ... 284804 284805 284806] Test: [ 30473  30496  31002 ... 113964 113965 113966]
Train: [     0      1      2 ... 284804 284805 284806] Test: [ 81609  82400  83053 ... 170946 170947 170948]
Train: [     0      1      2 ... 284804 284805 284806] Test: [150654 150660 150661 ... 227866 227867 227868]
Train: [     0      1      2 ... 227866 227867 227868] Test: [212516 212644 213092 ... 284804 284805 284806]


In [4]:
print(Counter(original_ytrain))
print(Counter(original_ytest))


Counter({0: 227452, 1: 394})
Counter({0: 56863, 1: 98})


In [5]:
original_ytrain = original_ytrain.values.reshape(-1,1)
original_ytest = original_ytest.values.reshape(-1,1)

### Feature Scaling

In [6]:
column_indices = []
column_indices.append(original_Xtrain.columns.get_loc('Time'))
column_indices.append(original_Xtrain.columns.get_loc('Amount'))

In [7]:
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import RobustScaler 
column_trans = ColumnTransformer([('scaler', RobustScaler(), column_indices)], remainder='passthrough') 
column_trans.fit(original_Xtrain)
original_Xtrain = column_trans.transform(original_Xtrain)
original_Xtest = column_trans.transform(original_Xtest)

### Over Sampling

In [9]:
smt = SMOTE(sampling_strategy='minority')
X_res, y_res = smt.fit_sample(original_Xtrain, original_ytrain)

[100, 200, 300, 400, 500]

In [11]:
max_iter = list(range(500,1100,100))
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
log_reg_params = [{'penalty': ['l1','l2'], 'solver': ['liblinear'], 'max_iter': max_iter, 'C': C_values},
              {'penalty': ['l2'], 'solver': ['lbfgs'], 'max_iter': max_iter , 'C': C_values}]
rand_log_reg = RandomizedSearchCV(LogisticRegression(), log_reg_params, n_iter=4)
rand_log_reg.fit(X_res, y_res)
best_est = rand_log_reg.best_estimator_

In [12]:
best_est

LogisticRegression(C=0.01, max_iter=600)

In [13]:
type(best_est)

sklearn.linear_model._logistic.LogisticRegression

In [22]:
accuracy_lst = []
precision_lst = []
recall_lst = []
f1_lst = []
auc_lst = []

max_iter = list(range(500,1100,100))
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

log_reg_params = [{'penalty': ['l1','l2'], 'solver': ['liblinear'], 'max_iter': max_iter, 'C': C_values},
              {'penalty': ['l2'], 'solver': ['lbfgs'], 'max_iter': max_iter , 'C': C_values}]

# log_reg_params = {"penalty": ['l1', 'l2'],"solver": ['lbfgs', 'liblinear'], 'max_iter': list(range(100,600,100)) , 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
# rand_log_reg = RandomizedSearchCV(LogisticRegression(), log_reg_params, n_iter=4)
pipeline = imbalanced_make_pipeline(SMOTE(sampling_strategy='minority'), rand_log_reg.best_estimator_)

for train, test in sss.split(original_Xtrain, original_ytrain):   
    model = pipeline.fit(original_Xtrain[train], original_ytrain[train])
#     best_est = rand_log_reg.best_estimator_
    prediction = model.predict(original_Xtrain[test])
    
    accuracy_lst.append(pipeline.score(original_Xtrain[test], original_ytrain[test]))
    precision_lst.append(precision_score(original_ytrain[test], prediction))
    recall_lst.append(recall_score(original_ytrain[test], prediction))
    f1_lst.append(f1_score(original_ytrain[test], prediction))
    auc_lst.append(roc_auc_score(original_ytrain[test], prediction))

    

In [35]:
y_pred = best_est.predict(original_Xtest)

In [36]:
y_score = best_est.decision_function(original_Xtest)

In [37]:
from sklearn.metrics import average_precision_score
average_precision = average_precision_score(original_ytest, y_score)

print('Average precision-recall score: {0:0.2f}'.format(
      average_precision))

Average precision-recall score: 0.74


In [38]:
from sklearn.metrics import classification_report
labels = ['No Fraud', 'Fraud']
smote_prediction = best_est.predict(original_Xtest)
print(classification_report(original_ytest, smote_prediction, target_names=labels))

              precision    recall  f1-score   support

    No Fraud       1.00      0.99      0.99     56863
       Fraud       0.11      0.86      0.20        98

    accuracy                           0.99     56961
   macro avg       0.56      0.92      0.60     56961
weighted avg       1.00      0.99      0.99     56961

