In [14]:
import polars as pl
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.metrics import average_precision_score
import xgboost as xgb

In [15]:
# Load the engineered feature table: polars reads the parquet file and the
# result is converted to a pandas DataFrame bound to `fe`.
fe = (
    pl.read_parquet("../data/features_eng.parquet")
    .to_pandas()
)

# Print the (rows, columns) shape, then show the first rows as the cell output.
print("Shape:", fe.shape)
fe.head()

Shape: (2512, 19)


Unnamed: 0,TransactionID,AccountID,DeviceID,Channel,TransactionType,TransactionAmount,TransactionDuration,LoginAttempts,AccountBalance,CustomerAge,CustomerOccupation,is_fraud,is_online,is_debit,many_login_attempts,amount_ratio,acct_txn_cnt,acct_fraud_cnt,acct_avg_amt
0,TX000006,AC00393,D000579,ATM,Debit,92.15,172,1,781.68,18,Student,0,0,1,0,0.117736,8,0.0,209.20625
1,TX000143,AC00163,D000439,ATM,Debit,227.14,294,1,341.94,18,Student,0,0,1,0,0.662332,2,0.0,307.955
2,TX000254,AC00442,D000556,ATM,Debit,218.96,13,1,754.21,18,Student,0,0,1,0,0.289933,9,0.0,186.591111
3,TX000413,AC00421,D000451,ATM,Credit,242.39,271,1,1328.73,18,Student,0,0,0,0,0.182285,3,0.0,221.373333
4,TX000463,AC00074,D000630,ATM,Debit,17.45,88,1,1959.52,18,Student,0,0,1,0,0.008901,3,0.0,132.366667


In [16]:
# Train / test split
# Target is the `is_fraud` column; the feature matrix is every other column
# except `TransactionID`, which is dropped alongside the label.
y = fe["is_fraud"]
X = fe.drop(columns=["is_fraud", "TransactionID"])

# Hold out 25% of the rows, stratified on the label, with a fixed seed so the
# split is repeatable.
# NOTE(review): `acct_fraud_cnt` remains in X. If it was aggregated from
# `is_fraud` over rows that include the hold-out set, it leaks the label into
# the features -- confirm how it was built upstream.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

print("Train size:", X_train.shape, "Test size:", X_test.shape)

Train size: (1884, 17) Test size: (628, 17)


In [None]:
# Partition the training columns by dtype: object-dtype columns are treated as
# categorical; every remaining column goes to the numeric group.
cat_cols = list(X_train.select_dtypes(include="object").columns)
num_cols = [col for col in X_train.columns if col not in cat_cols]

# Preprocessing: one-hot encode the categorical group (categories not seen
# during fit are ignored at transform time) and pass the numeric group through
# unchanged.
preproc = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols),
    ]
)

In [18]:
# Class-imbalance weight: (number of training rows - sum of labels) divided by
# the sum of labels, i.e. negatives / positives when the labels are 0/1.
scale_pos = (len(y_train) - y_train.sum()) / y_train.sum()

# Gradient-boosted tree classifier.
model = xgb.XGBClassifier(
    # learning task and metric
    objective="binary:logistic",
    eval_metric="aucpr",
    scale_pos_weight=scale_pos,
    # ensemble size and tree shape
    n_estimators=400,
    learning_rate=0.05,
    max_depth=4,
    # row / column subsampling per tree
    subsample=0.8,
    colsample_bytree=0.8,
    # runtime: use all cores, fixed seed
    n_jobs=-1,
    random_state=42,
)

# Chain preprocessing and classifier so both are fitted and applied together.
pipe = make_pipeline(preproc, model)

In [19]:
# Fit the pipeline on the training split, then score the hold-out rows.
# Column 1 of `predict_proba` is kept as the score for the positive class.
proba = pipe.fit(X_train, y_train).predict_proba(X_test)[:, 1]

# Average precision (area under the precision-recall curve) on the hold-out.
auprc = average_precision_score(y_true=y_test, y_score=proba)
print(f"AUPRC on hold-out: {auprc:.3f}")

AUPRC on hold-out: 0.917


In [10]:
import os
import pickle

# Persist the trained artifacts under ../models (created if missing).
os.makedirs("../models", exist_ok=True)

# 1) The final pipeline step (the XGBoost classifier) in XGBoost's JSON format.
#    Path and message are unchanged so existing consumers keep working.
pipe[-1].save_model("../models/fraud_xgb.json")
print("Model saved to ../models/fraud_xgb.json")

# 2) The whole fitted pipeline. The classifier was trained on the output of the
#    ColumnTransformer (one-hot columns + passthrough numerics), so the JSON
#    file alone cannot score raw feature rows: the fitted encoder would be
#    lost. Saving the pipeline keeps preprocessing and model together.
#    Pickles are tied to the library versions that wrote them and must only be
#    loaded from trusted sources.
with open("../models/fraud_pipeline.pkl", "wb") as fh:
    pickle.dump(pipe, fh)
print("Pipeline saved to ../models/fraud_pipeline.pkl")

Model saved to ../models/fraud_xgb.json
