### Own Approach

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

# Load only the first 80,000 transactions. Passing `nrows` stops the parser
# after that many rows instead of reading the whole file and slicing afterwards.
df = pd.read_csv("creditcard.csv", nrows=80_000)
df.head(3)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0


In [2]:
# Target: the `Class` column; its sum is reported below as the fraud count.
y = df["Class"]
# Features: every remaining column except `Time`.
X = df.drop(["Class", "Time"], axis=1)

print(f'fraud cases: {y.sum()}')

fraud cases: 196


In [3]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with class 1 weighted twice as heavily as class 0.
model = LogisticRegression(max_iter=1000, class_weight={0: 1, 1: 2})
model.fit(X, y)
# In-sample predictions: the model predicts on the same rows it was fitted on.
pred = model.predict(X)

In [4]:
# Sum of the in-sample predictions, i.e. how many rows were predicted as class 1.
pred.sum()

171

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, make_scorer

def min_recall_precision(est, X, y_true, sample_weight = None):
    """Score an estimator by the smaller of its recall and precision.

    Has the ``scorer(estimator, X, y)`` call shape, so it can be used directly
    as an entry of a ``scoring`` dict.

    Parameters
    ----------
    est : fitted estimator exposing ``predict``.
    X : feature matrix handed to ``est.predict``.
    y_true : ground-truth labels for ``X``.
    sample_weight : optional per-sample weights, forwarded to both metrics.
        ``None`` (the default) leaves the metrics unweighted.

    Returns
    -------
    float
        ``min(recall, precision)`` of the predictions on ``X``.
    """
    y_pred = est.predict(X)
    # Forward the weights instead of silently dropping them.
    recall = recall_score(y_true, y_pred, sample_weight=sample_weight)
    precision = precision_score(y_true, y_pred, sample_weight=sample_weight)
    return min(recall, precision)

# Candidate weights for class 1: 30 evenly spaced values from 1 to 20;
# class 0 always keeps weight 1.
positive_class_weights = np.linspace(1, 20, 30)
class_weight_candidates = [{0: 1, 1: weight} for weight in positive_class_weights]

# Three metrics are recorded for every candidate; `min_both` is the one used
# to pick the model that is refitted at the end.
scorers = {
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'min_both': min_recall_precision,
}

grid = GridSearchCV(
    LogisticRegression(max_iter=10000),
    {'class_weight': class_weight_candidates},
    scoring=scorers,
    refit='min_both',
    return_train_score=True,
    cv=10,
    n_jobs=-1,
)

grid.fit(X, y);

In [9]:
# Tabulate the cross-validation results of the search as a DataFrame.
pd.DataFrame(grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,params,split0_test_precision,split1_test_precision,split2_test_precision,split3_test_precision,...,split2_train_min_both,split3_train_min_both,split4_train_min_both,split5_train_min_both,split6_train_min_both,split7_train_min_both,split8_train_min_both,split9_train_min_both,mean_train_min_both,std_train_min_both
0,7.483978,1.086105,0.049485,0.034434,"{0: 1, 1: 1.0}","{'class_weight': {0: 1, 1: 1.0}}",1.0,0.463415,0.583333,1.0,...,0.627119,0.548023,0.579545,0.579545,0.5625,0.619318,0.636364,0.602273,0.613887,0.05427
1,8.39454,1.488478,0.053515,0.025685,"{0: 1, 1: 1.6551724137931034}","{'class_weight': {0: 1, 1: 1.6551724137931034}}",1.0,0.463415,0.583333,1.0,...,0.689266,0.627119,0.676136,0.647727,0.630682,0.681818,0.698864,0.6875,0.680804,0.050224
2,8.36197,1.448769,0.029724,0.013954,"{0: 1, 1: 2.310344827586207}","{'class_weight': {0: 1, 1: 2.310344827586207}}",1.0,0.452381,0.583333,1.0,...,0.734463,0.683616,0.710227,0.693182,0.6875,0.715909,0.744318,0.727273,0.723321,0.044069
3,8.247654,0.92774,0.038243,0.013857,"{0: 1, 1: 2.9655172413793105}","{'class_weight': {0: 1, 1: 2.9655172413793105}}",1.0,0.452381,0.583333,1.0,...,0.779661,0.706215,0.744318,0.727273,0.715909,0.755682,0.772727,0.738636,0.748844,0.039393
4,7.885362,1.538442,0.03641,0.013292,"{0: 1, 1: 3.6206896551724137}","{'class_weight': {0: 1, 1: 3.6206896551724137}}",1.0,0.452381,0.583333,1.0,...,0.824859,0.734463,0.755682,0.75,0.732955,0.772727,0.784091,0.761364,0.771501,0.037419
5,8.931915,1.565046,0.058799,0.059043,"{0: 1, 1: 4.275862068965517}","{'class_weight': {0: 1, 1: 4.275862068965517}}",1.0,0.44186,0.583333,1.0,...,0.841808,0.768362,0.778409,0.795455,0.778409,0.789773,0.801136,0.772727,0.79419,0.029025
6,9.215779,0.806553,0.029829,0.012952,"{0: 1, 1: 4.931034482758621}","{'class_weight': {0: 1, 1: 4.931034482758621}}",1.0,0.44186,0.583333,1.0,...,0.842697,0.79661,0.789773,0.813559,0.801136,0.795455,0.804469,0.789773,0.809449,0.021987
7,7.390618,0.936552,0.031014,0.011737,"{0: 1, 1: 5.586206896551724}","{'class_weight': {0: 1, 1: 5.586206896551724}}",1.0,0.431818,0.583333,1.0,...,0.842697,0.803371,0.798913,0.811111,0.80226,0.8125,0.80663,0.804469,0.815083,0.020193
8,8.154003,1.028557,0.039515,0.019281,"{0: 1, 1: 6.241379310344827}","{'class_weight': {0: 1, 1: 6.241379310344827}}",1.0,0.44186,0.583333,0.947368,...,0.842697,0.80663,0.801075,0.811111,0.8,0.815642,0.806452,0.80663,0.816127,0.019782
9,9.338525,1.42812,0.050642,0.031872,"{0: 1, 1: 6.896551724137931}","{'class_weight': {0: 1, 1: 6.896551724137931}}",0.944444,0.44186,0.583333,0.947368,...,0.839779,0.807692,0.801075,0.812155,0.801105,0.812155,0.808511,0.809783,0.81654,0.018964


### freeCodeCamp Copy

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

# Load only the first 80,000 transactions. Passing `nrows` stops the parser
# after that many rows instead of reading the whole file and slicing afterwards.
df = pd.read_csv("creditcard.csv", nrows=80_000)
df.head(3)

In [None]:
# Work with plain NumPy arrays here: labels from `Class`, features from every
# column except `Time`, `Amount` and the label itself.
y = df['Class'].to_numpy()
X = df.drop(['Time', 'Amount', 'Class'], axis=1).to_numpy()
f"Shapes of X={X.shape} y={y.shape}, #Fraud Cases={y.sum()}"

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: class 1 is weighted twice as heavily as class 0.
mod = LogisticRegression(max_iter=1000, class_weight={0: 1, 1: 2})
mod.fit(X, y)
# Sum of the in-sample predictions, i.e. the number of rows predicted as class 1.
mod.predict(X).sum()

In [None]:
# Imported here so the cell does not depend on a later cell having run first.
from sklearn.metrics import precision_score, recall_score, make_scorer


def min_recall_precision(y_true, y_pred):
    """Return the smaller of recall and precision for a set of predictions.

    Parameters
    ----------
    y_true : ground-truth labels.
    y_pred : predicted labels, aligned with ``y_true``.

    Returns
    -------
    float
        ``min(recall, precision)``; it is high only when both metrics are high.
    """
    recall = recall_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    return min(recall, precision)

# Higher values of this metric are better, so keep make_scorer's default
# `greater_is_better=True`. Passing False would sign-flip the score and make a
# search prefer the worst candidate.
make_scorer(min_recall_precision)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, make_scorer

def min_recall_precision(est, X, y_true, sample_weight=None):
    """Score an estimator by the smaller of its recall and precision.

    Has the ``scorer(estimator, X, y)`` call shape, so it can be used directly
    as an entry of a ``scoring`` dict.

    Parameters
    ----------
    est : fitted estimator exposing ``predict``.
    X : feature matrix handed to ``est.predict``.
    y_true : ground-truth labels for ``X``.
    sample_weight : optional per-sample weights, forwarded to both metrics.
        ``None`` (the default) leaves the metrics unweighted.

    Returns
    -------
    float
        ``min(recall, precision)`` of the predictions on ``X``.
    """
    y_pred = est.predict(X)
    # Forward the weights instead of silently dropping them.
    recall = recall_score(y_true, y_pred, sample_weight=sample_weight)
    precision = precision_score(y_true, y_pred, sample_weight=sample_weight)
    return min(recall, precision)

# Candidate weights for class 1: 30 evenly spaced values from 1 to 20;
# class 0 always keeps weight 1.
positive_class_weights = np.linspace(1, 20, 30)
class_weight_candidates = [{0: 1, 1: weight} for weight in positive_class_weights]

# Three metrics are recorded for every candidate; `min_both` is the one used
# to pick the model that is refitted at the end.
scorers = {
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'min_both': min_recall_precision,
}

grid = GridSearchCV(
    LogisticRegression(max_iter=1000),
    {'class_weight': class_weight_candidates},
    scoring=scorers,
    refit='min_both',
    return_train_score=True,
    cv=10,
    n_jobs=-1,
)
grid.fit(X, y);

In [None]:
# s = make_scorer(min_recall_precision)
# ??s

Here's a summary of the test metrics.

In [None]:
df_results = pd.DataFrame(grid.cv_results_)
# x-axis: the weight given to class 1 in each candidate `class_weight` dict.
positive_weights = [cw[1] for cw in df_results['param_class_weight']]

plt.figure(figsize=(12, 4))
for score in ('mean_test_recall', 'mean_test_precision', 'mean_test_min_both'):
    plt.plot(positive_weights, df_results[score], label=score)
plt.legend();

And here are the train metrics.

In [None]:
df_results = pd.DataFrame(grid.cv_results_)
# x-axis: the weight given to class 1 in each candidate `class_weight` dict.
positive_weights = [cw[1] for cw in df_results['param_class_weight']]

plt.figure(figsize=(12, 4))
# Name the train columns directly so every legend label matches the series
# that is actually drawn.
for score in ['mean_train_recall', 'mean_train_precision', 'mean_train_min_both']:
    plt.scatter(x=positive_weights, y=df_results[score], label=score)
plt.legend();

# Using Outlier Detection Models

In [None]:
from collections import Counter
from sklearn.ensemble import IsolationForest
# IsolationForest is randomised; fixing `random_state` makes the flagged rows
# reproducible from run to run.
mod = IsolationForest(random_state=42).fit(X)
# Recode the detector's output: predictions of -1 become 1, everything else 0.
np.where(mod.predict(X) == -1, 1, 0)

And now in a gridsearch.

In [None]:
def outlier_precision(mod, X, y):
    """Precision of an outlier detector, scored against the labels ``y``.

    Predictions equal to -1 are recoded as the positive class (1) and every
    other prediction as 0 before ``precision_score`` is applied.
    """
    is_outlier = mod.predict(X) == -1
    return precision_score(y, np.where(is_outlier, 1, 0))

def outlier_recall(mod, X, y):
    """Recall of an outlier detector, scored against the labels ``y``.

    Predictions equal to -1 are recoded as the positive class (1) and every
    other prediction as 0 before ``recall_score`` is applied.
    """
    is_outlier = mod.predict(X) == -1
    return recall_score(y, np.where(is_outlier, 1, 0))

# Search over the `contamination` setting of an IsolationForest, recording
# precision and recall on each of 5 folds; the best-precision candidate is refitted.
# `random_state` is fixed so the randomised forest gives the same scores on re-run.
grid = GridSearchCV(
    estimator=IsolationForest(random_state=42),
    param_grid={'contamination': np.linspace(0.001, 0.02, 10)},
    scoring={'precision': outlier_precision,
             'recall': outlier_recall},
    refit='precision',
    cv=5,
    n_jobs=-1
)
grid.fit(X, y);

df_results = pd.DataFrame(grid.cv_results_)
# x-axis: the contamination value tried for each candidate.
contamination_values = df_results['param_contamination']

plt.figure(figsize=(12, 4))
for score in ('mean_test_recall', 'mean_test_precision'):
    plt.plot(contamination_values, df_results[score], label=score)
plt.legend();

<br><br><br><br><br><br><br><br><br><br><br><br>

In [None]:
# Keep the results in `df_results`; binding them to `df` would overwrite the
# transaction table that earlier cells read from.
# NOTE(review): this reads `param_class_weight`, so `grid` must be a search over
# `class_weight`; the search defined immediately above tunes `contamination`.
df_results = pd.DataFrame(grid.cv_results_)
# x-axis: the weight given to class 1 in each candidate, computed once.
positive_weights = [cw[1] for cw in df_results['param_class_weight']]
plt.plot(positive_weights, df_results['mean_test_recall'])
plt.plot(positive_weights, df_results['mean_test_precision']);

In [None]:
def min_pre_rec(y, y_true):
    """Return the smaller of recall and precision.

    Both metrics are called as ``metric(y, y_true)``, so ``y`` fills their
    first (ground-truth) argument and ``y_true`` their second (prediction)
    argument.
    NOTE(review): the second parameter's name is misleading for that reason;
    it is left as-is to keep the signature stable.
    """
    recall = recall_score(y, y_true)
    precision = precision_score(y, y_true)
    return min(recall, precision)

In [None]:
def outlier_precision(mod, X, y):
    """Precision of an outlier detector, scored against the labels ``y``.

    Predictions equal to 1 are recoded as 0; every other prediction becomes
    the positive class (1) before ``precision_score`` is applied.
    """
    flagged = np.where(mod.predict(X) != 1, 1, 0)
    return precision_score(y, flagged)

def outlier_recall(mod, X, y):
    """Recall of an outlier detector, scored against the labels ``y``.

    Predictions equal to 1 are recoded as 0; every other prediction becomes
    the positive class (1) before ``recall_score`` is applied.
    """
    flagged = np.where(mod.predict(X) != 1, 1, 0)
    return recall_score(y, flagged)

In [None]:
# Number of class-1 weights to try between 1 and 40.
# NOTE(review): this count was previously the undefined name `t5`, which raises
# NameError; 15 is an assumed intent — confirm.
N_WEIGHTS = 15

# The estimator is built without a `class_weight`: every candidate in the grid
# sets it, so the former placeholder value (the integer 10) never took effect.
grid = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid={'class_weight': [{0: 1, 1: v} for v in np.linspace(1, 40, N_WEIGHTS)]},
    scoring={'precision': make_scorer(precision_score), 'recall': make_scorer(recall_score), 'min_pre_rec': make_scorer(min_pre_rec)},
    refit='precision',
    cv = 10,
    n_jobs=-1
)
grid.fit(X, y)

In [None]:
# Keep the results in `df_results`; binding them to `df` would overwrite the
# transaction table that earlier cells read from.
df_results = pd.DataFrame(grid.cv_results_)
# x-axis: the weight given to class 1 in each candidate, computed once.
positive_weights = [cw[1] for cw in df_results['param_class_weight']]
plt.plot(positive_weights, df_results['mean_test_recall'])
plt.plot(positive_weights, df_results['mean_test_precision'])
plt.plot(positive_weights, df_results['mean_test_min_pre_rec']);

In [None]:
# Wrap recall_score as a scorer object. The result is bound to `_` and is not
# read by any of the cells that follow.
_ = make_scorer(recall_score)

In [None]:
import numpy as np 

# Display a 4x4 identity matrix.
np.eye(4)

In [None]:
# Trivial expression; the cell evaluates to 2.
1 + 1