In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the credit-card transactions CSV into a DataFrame. The path is relative
# to the directory the notebook is run from.
df = pd.read_csv(filepath_or_buffer='../data/creditcard.csv')

# Bare last expression: render the loaded frame as this cell's output.
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


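Split the data into features `x` and target `y`. Passing `stratify=y` to `train_test_split` keeps the proportion of each class in `y` the same in the training and test sets.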

In [2]:
# Separate the target from the predictors: "Class" is the label, and every
# remaining column is used as a feature.
y = df["Class"]
x = df.drop(columns="Class")

# Hold out 20% of the rows for testing. stratify=y keeps the class ratio the
# same in both splits, and the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Two-step pipeline: standardize the features, then fit a logistic regression.
# class_weight="balanced" asks scikit-learn to weight the classes inversely to
# their frequency; max_iter=10000 raises the solver's iteration cap.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        (
            "clf",
            LogisticRegression(max_iter=10000, class_weight="balanced"),
        ),
    ]
)

In [5]:
# Fit the whole pipeline (scaler + classifier) on the training split, then
# predict hard class labels for the held-out test split. Pipeline.fit returns
# the fitted pipeline itself, so the two calls can be chained.
y_pred = pipe.fit(x_train, y_train).predict(x_test)

In [6]:
from sklearn.metrics import classification_report,roc_auc_score

# Class-1 probability for every test row (second column of predict_proba).
# These scores are threshold-free and feed the ROC AUC computation.
y_score = pipe.predict_proba(x_test)[:, 1]
roc = roc_auc_score(y_true=y_test, y_score=y_score)

# Per-class precision / recall / F1 for the hard labels in y_pred, followed by
# the ranking quality of the raw scores.
print(classification_report(y_true=y_test, y_pred=y_pred))
print("ROC AUC Score:", roc)


              precision    recall  f1-score   support

           0       1.00      0.98      0.99     56864
           1       0.06      0.92      0.11        98

    accuracy                           0.98     56962
   macro avg       0.53      0.95      0.55     56962
weighted avg       1.00      0.98      0.99     56962

ROC AUC Score: 0.9720881652464024


The high ROC AUC score indicates that the model has an outstanding ability to detect potential fraud. The low precision indicates that the model is too sensitive: it flags many non-fraud transactions as fraud. We want to improve the precision, so we make the model less sensitive by raising the probability threshold above which a transaction is labelled as fraud.

In [7]:
# Stricter decision rule: only label a transaction as class 1 when its
# predicted probability reaches this cutoff.
# NOTE(review): the cutoff is applied to, and evaluated on, the same test-set
# scores. If it was chosen by inspecting these scores, the report below is
# optimistic -- consider selecting it on a separate validation split.
fraud_threshold = 0.9999999999
y_pred_new = (y_score >= fraud_threshold).astype(int)

print(classification_report(y_test, y_pred_new))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.85      0.76      0.80        98

    accuracy                           1.00     56962
   macro avg       0.93      0.88      0.90     56962
weighted avg       1.00      1.00      1.00     56962


In [8]:
def predict_fraud(X_new, model, threshold=0.9999999999):
    """Label rows of ``X_new`` as 1 or 0 using a probability cutoff.

    Args:
        X_new: Samples to score; passed unchanged to ``model.predict_proba``.
        model: Object exposing ``predict_proba(X)`` whose result can be
            indexed as ``[:, 1]`` (the second column is used as the score).
        threshold: Minimum second-column probability for a row to be
            labelled 1. The comparison is inclusive (``>=``).

    Returns:
        Integer array with one entry per row: 1 where the score is at least
        ``threshold``, otherwise 0.
    """
    positive_scores = model.predict_proba(X_new)[:, 1]
    meets_cutoff = positive_scores >= threshold
    return meets_cutoff.astype(int)

In [9]:
import os

import joblib

# Persist the fitted pipeline (scaler + classifier) to disk.
# Create the target directory first so the dump does not fail with
# FileNotFoundError when ../models does not exist yet; exist_ok=True makes
# this a no-op when the directory is already there.
# Note: only `pipe` is saved here -- the custom decision threshold used by
# predict_fraud is not part of this artifact.
os.makedirs('../models', exist_ok=True)
joblib.dump(pipe, '../models/logistic_model.pkl')

['../models/logistic_model.pkl']

In [31]:
# Export this notebook (main.ipynb) to a plain Python script via nbconvert;
# the logged output shows the result is written to main.py.
# NOTE(review): this cell's execution count (31) is out of sequence and the
# log reports only 52 bytes written -- presumably the notebook on disk was
# stale when this ran; save the notebook and re-run to confirm.
!jupyter nbconvert --to script main.ipynb

[NbConvertApp] Converting notebook main.ipynb to script
[NbConvertApp] Writing 52 bytes to main.py


In [10]:
from sklearn.metrics import (
    average_precision_score,
    precision_score,
    recall_score,
    f1_score
)

# Baseline metrics. y_pred holds the labels produced with the default 0.5
# threshold; y_score holds the raw class-1 probabilities, which are
# threshold-free and are used for the ranking metrics (AUC, PR-AUC).
auc_base = roc
pra_base = average_precision_score(y_test, y_score)

# The three label-based metrics share the same (y_true, y_pred) arguments.
precision_base, recall_base, f1_base = (
    metric(y_test, y_pred)
    for metric in (precision_score, recall_score, f1_score)
)

print("Baseline Logistic Regression Metrics:")
for label, value in (
    ("AUC:", auc_base),
    ("PR-AUC:", pra_base),
    ("Precision:", precision_base),
    ("Recall:", recall_base),
    ("F1:", f1_base),
):
    print(label, value)


Baseline Logistic Regression Metrics:
AUC: 0.9720881652464024
PR-AUC: 0.718934567134695
Precision: 0.061016949152542375
Recall: 0.9183673469387755
F1: 0.11443102352193261


In [11]:
import pandas as pd
import os

# Make sure the output folder exists before writing; exist_ok=True means no
# error is raised if it is already there.
os.makedirs("../results", exist_ok=True)

# One-row summary table of the baseline metrics, one column per metric.
baseline_results = pd.DataFrame(
    {
        'Model': ['Logistic-Baseline'],
        'AUC': [auc_base],
        'PR-AUC': [pra_base],
        'Recall': [recall_base],
        'Precision': [precision_base],
        'F1': [f1_base],
    }
)

# Write the table without the index column, then display it as the cell output.
baseline_results.to_csv("../results/logistic_baseline_results.csv", index=False)
baseline_results


Unnamed: 0,Model,AUC,PR-AUC,Recall,Precision,F1
0,Logistic-Baseline,0.972088,0.718935,0.918367,0.061017,0.114431
