### Credit Card Fraud Detection
#### https://www.kaggle.com/mlg-ulb/creditcardfraud

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

df = pd.read_csv("./creditcard.csv")[:80_000]
df.head(3)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0


In [2]:
X = df.drop(columns=['Time', 'Amount', 'Class']).values
y = df['Class'].values

f"Shapes of X={X.shape} y={y.shape}, #Fraud Cases={y.sum()}"

'Shapes of X=(80000, 28) y=(80000,), #Fraud Cases=196'

In [3]:
from sklearn.linear_model import LogisticRegression

mod = LogisticRegression(class_weight={0: 1, 1: 2}, max_iter=1000)
mod.fit(X, y).predict(X).sum()

171

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, make_scorer

grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid={'class_weight': [{0: 1, 1: v} for v in np.linspace(1, 20, 30)]},
    scoring={'precision': make_scorer(precision_score), 'recall_score': make_scorer(recall_score)},
    refit='precision',
    return_train_score=True,
    cv=7,
    n_jobs=-1
)
grid.fit(X, y)



GridSearchCV(cv=7, estimator=LogisticRegression(max_iter=1000), n_jobs=-1,
             param_grid={'class_weight': [{0: 1, 1: 1.0},
                                          {0: 1, 1: 1.6551724137931034},
                                          {0: 1, 1: 2.310344827586207},
                                          {0: 1, 1: 2.9655172413793105},
                                          {0: 1, 1: 3.6206896551724137},
                                          {0: 1, 1: 4.275862068965517},
                                          {0: 1, 1: 4.931034482758621},
                                          {0: 1, 1: 5.586206896551724},
                                          {0: 1, 1: 6.241379310344827},
                                          {0: 1, 1: 6.896551724137931},
                                          {...
                                          {0: 1, 1: 14.758620689655173},
                                          {0: 1, 1: 15.413793103448276},
                       

In [5]:
from sklearn.metrics import precision_score, recall_score

precision_score(y, grid.predict(X)) # Given that I predict fraud how accurate am I?
recall_score(y, grid.predict(X)) # Did I get all the fraud cases

0.7602040816326531

In [6]:
pd.DataFrame(grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,params,split0_test_precision,split1_test_precision,split2_test_precision,split3_test_precision,...,rank_test_recall_score,split0_train_recall_score,split1_train_recall_score,split2_train_recall_score,split3_train_recall_score,split4_train_recall_score,split5_train_recall_score,split6_train_recall_score,mean_train_recall_score,std_train_recall_score
0,1.117415,0.224456,0.010706,0.003305,"{0: 1, 1: 1.0}","{'class_weight': {0: 1, 1: 1.0}}",0.6875,0.46,1.0,0.9375,...,30,0.625,0.755952,0.589286,0.607143,0.553571,0.636905,0.630952,0.628401,0.058482
1,1.031042,0.235943,0.011126,0.00373,"{0: 1, 1: 1.6551724137931034}","{'class_weight': {0: 1, 1: 1.6551724137931034}}",0.722222,0.470588,1.0,0.944444,...,29,0.672619,0.827381,0.666667,0.678571,0.613095,0.708333,0.714286,0.697279,0.061319
2,1.009762,0.144431,0.007915,0.003632,"{0: 1, 1: 2.310344827586207}","{'class_weight': {0: 1, 1: 2.310344827586207}}",0.736842,0.480769,1.0,0.947368,...,28,0.690476,0.845238,0.702381,0.72619,0.678571,0.75,0.75,0.734694,0.051975
3,0.912665,0.126548,0.008063,0.003702,"{0: 1, 1: 2.9655172413793105}","{'class_weight': {0: 1, 1: 2.9655172413793105}}",0.814815,0.480769,1.0,0.947368,...,27,0.732143,0.857143,0.72619,0.738095,0.702381,0.779762,0.761905,0.756803,0.047023
4,0.837731,0.120331,0.006994,0.002505,"{0: 1, 1: 3.6206896551724137}","{'class_weight': {0: 1, 1: 3.6206896551724137}}",0.833333,0.480769,1.0,0.95,...,26,0.761905,0.857143,0.767857,0.755952,0.72619,0.791667,0.767857,0.77551,0.037876
5,0.807956,0.084218,0.007025,0.003516,"{0: 1, 1: 4.275862068965517}","{'class_weight': {0: 1, 1: 4.275862068965517}}",0.83871,0.480769,0.833333,0.904762,...,25,0.797619,0.857143,0.77381,0.785714,0.767857,0.809524,0.779762,0.795918,0.028228
6,0.760501,0.106206,0.006242,0.002039,"{0: 1, 1: 4.931034482758621}","{'class_weight': {0: 1, 1: 4.931034482758621}}",0.83871,0.480769,0.857143,0.904762,...,24,0.815476,0.857143,0.791667,0.797619,0.791667,0.821429,0.791667,0.809524,0.022498
7,0.628679,0.104905,0.005519,0.002036,"{0: 1, 1: 5.586206896551724}","{'class_weight': {0: 1, 1: 5.586206896551724}}",0.8125,0.480769,0.857143,0.869565,...,23,0.833333,0.869048,0.815476,0.833333,0.803571,0.827381,0.803571,0.826531,0.020967
8,0.78309,0.129498,0.006259,0.0023,"{0: 1, 1: 6.241379310344827}","{'class_weight': {0: 1, 1: 6.241379310344827}}",0.787879,0.480769,0.888889,0.869565,...,21,0.833333,0.869048,0.821429,0.845238,0.815476,0.833333,0.821429,0.834184,0.016964
9,0.8301,0.145683,0.006875,0.003158,"{0: 1, 1: 6.896551724137931}","{'class_weight': {0: 1, 1: 6.896551724137931}}",0.787879,0.480769,0.888889,0.869565,...,21,0.839286,0.875,0.833333,0.85119,0.821429,0.839286,0.85119,0.844388,0.015726


In [8]:
plt.figure(figsize=(12, 4))

df = pd.DataFrame(grid.cv_results_)
for score in ['mean_test_recall', 'mean_test_precision']:
    plt.plot([_[1] for _ in df['param_class_weight']], df[score], label=score)
    
plt.legend()

KeyError: 'mean_test_recall'

<Figure size 864x288 with 0 Axes>