In [None]:
# Kaggle dataset download
import kagglehub

# Download latest version
path = kagglehub.dataset_download("ealaxi/paysim1")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'paysim1' dataset.
Path to dataset files: /kaggle/input/paysim1


In [2]:
import pandas as pd
df = pd.read_csv("/kaggle/input/paysim1/PS_20174392719_1491204439457_log.csv")

In [3]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [4]:
df.info()
df.isna().sum()
df["isFraud"].value_counts(normalize=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


Unnamed: 0_level_0,proportion
isFraud,Unnamed: 1_level_1
0,0.998709
1,0.001291


In [5]:
pd.crosstab(df["type"], df["isFraud"], normalize="index")

isFraud,0,1
type,Unnamed: 1_level_1,Unnamed: 2_level_1
CASH_IN,1.0,0.0
CASH_OUT,0.99816,0.00184
DEBIT,1.0,0.0
PAYMENT,1.0,0.0
TRANSFER,0.992312,0.007688


In [6]:
split_step = df["step"].quantile(0.8)

train_df = df[df["step"] <= split_step].copy()
test_df  = df[df["step"] > split_step].copy()


In [7]:
train_df["balance_error"] = (
    train_df["oldbalanceOrg"] - train_df["amount"]
).clip(lower=0)


In [8]:
train_df = train_df.sort_values(["nameOrig", "step"])

train_df["tx_count_24h"] = (
    train_df.groupby("nameOrig")["step"]
    .rolling(24)
    .count()
    .reset_index(level=0, drop=True)
)

train_df["avg_amount_24h"] = (
    train_df.groupby("nameOrig")["amount"]
    .rolling(24)
    .mean()
    .reset_index(level=0, drop=True)
)


In [9]:
train_df["tx_count_24h"] = train_df["tx_count_24h"].fillna(0)
train_df["avg_amount_24h"] = train_df["avg_amount_24h"].fillna(train_df["amount"])


In [10]:
test_df["balance_error"] = (
    test_df["oldbalanceOrg"] - test_df["amount"]
).clip(lower=0)

test_df["tx_count_24h"] = 0
test_df["avg_amount_24h"] = test_df["amount"]


In [11]:
FEATURES = [
    "amount",
    "oldbalanceOrg",
    "balance_error",
    "tx_count_24h",
    "avg_amount_24h"
]


In [12]:
X_train = train_df[FEATURES]
y_train = train_df["isFraud"]

X_test  = test_df[FEATURES]
y_test  = test_df["isFraud"]

In [13]:
from sklearn.ensemble import HistGradientBoostingClassifier

model = HistGradientBoostingClassifier(
    max_depth=6,
    learning_rate=0.05,
    max_iter=300,
    class_weight="balanced",
    random_state=3
)

model.fit(X_train, y_train)

y_prob = model.predict_proba(X_test)[:, 1]

In [14]:
from sklearn.metrics import precision_recall_curve, average_precision_score

precision, recall, thresholds = precision_recall_curve(y_test, y_prob)
pr_auc = average_precision_score(y_test, y_prob)

pr_auc

np.float64(0.8987627934806167)

In [29]:
import numpy as np

threshold = np.percentile(y_prob, 99.8)
y_pred = (y_prob >= threshold).astype(int)

In [30]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1244486
           1       0.94      0.55      0.70      4250

    accuracy                           1.00   1248736
   macro avg       0.97      0.78      0.85   1248736
weighted avg       1.00      1.00      1.00   1248736



In [33]:
import joblib

joblib.dump(model, "fraud_model_v1.pkl")
joblib.dump(threshold, "best_threshold.pkl")

['best_threshold.pkl']