In [3]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score, average_precision_score

# --- Configuration ---
# Lift the column limit and widen the console so result tables print unwrapped.
pd.options.display.max_columns = None
pd.options.display.width = 1400

# Input feature table and the CSV that receives the ablation scores.
DATA_PATH = "../data/processed/iyzico_featured_leakfree.csv"
OUT_PATH = "../data/processed/ablation_results.csv"

RANDOM_STATE = 42
TARGET = "is_fraud_transaction"
MISSING_CAT = "MISSING"  # placeholder written into empty categorical cells

# CatBoost settings shared by every experiment.
# Iterations are capped at 600 so the feature sets can be compared quickly.
PARAMS = {
    "iterations": 600,
    "depth": 8,
    "learning_rate": 0.01,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "random_seed": RANDOM_STATE,
    "verbose": 0,
    "allow_writing_files": False,
}

def time_split(df, train_q=0.70, valid_q=0.85):
    """Partition ``df`` chronologically into train, validation and test frames.

    The rows are ordered by ``payment_date`` and cut at the ``train_q`` and
    ``valid_q`` quantiles of that column. Rows up to and including the first
    cut-off form the training set, rows after it up to and including the
    second cut-off form the validation set, and everything later is the test
    set. A row whose ``payment_date`` is missing satisfies none of the
    comparisons and therefore lands in no partition.

    Returns:
        A ``(train, valid, test)`` tuple of independent copies; the input
        frame is not modified.
    """
    ordered = df.sort_values("payment_date").reset_index(drop=True)
    stamps = ordered["payment_date"]

    # Cut-off timestamps separating the three windows.
    train_end = stamps.quantile(train_q)
    valid_end = stamps.quantile(valid_q)

    # One boolean mask per window, evaluated against the sorted frame.
    in_train = stamps <= train_end
    in_valid = (stamps > train_end) & (stamps <= valid_end)
    in_test = stamps > valid_end

    return ordered[in_train].copy(), ordered[in_valid].copy(), ordered[in_test].copy()

def _categorical_columns(frame):
    """Return the names of the columns in ``frame`` that hold text/categorical data."""
    is_object = pd.api.types.is_object_dtype
    is_string = pd.api.types.is_string_dtype
    return [
        name
        for name, dtype in frame.dtypes.items()
        if isinstance(dtype, pd.CategoricalDtype) or is_object(dtype) or is_string(dtype)
    ]


def _fill_categoricals(frame, cat_features):
    """Replace missing values in ``cat_features`` with ``MISSING_CAT`` and cast to ``str`` (in place)."""
    for col in cat_features:
        values = frame[col]
        if isinstance(values.dtype, pd.CategoricalDtype):
            # fillna() on a Categorical rejects a value that is not already one
            # of its categories, so drop the categorical dtype before filling.
            values = values.astype(object)
        frame[col] = values.fillna(MISSING_CAT).astype(str)


def _top_fraction_recall(y_true, scores, fraction=0.01):
    """Return the share of all positives found among the highest-scored ``fraction`` of rows."""
    k = max(int(len(y_true) * fraction), 1)
    top_idx = np.argsort(-scores)[:k]
    # Guard against division by zero when the labels contain no positives.
    total_pos = max(y_true.sum(), 1)
    return y_true.iloc[top_idx].sum() / total_pos


def train_and_evaluate(train_df, valid_df, test_df, feature_cols, exp_name):
    """Train a CatBoost classifier on ``feature_cols`` and score it on the test frame.

    Args:
        train_df: Frame used to fit the model; must contain ``feature_cols`` and ``TARGET``.
        valid_df: Frame passed to CatBoost as the evaluation set.
        test_df: Frame the reported metrics are computed on.
        feature_cols: Names of the columns used as model inputs.
        exp_name: Label stored in the ``"Experiment"`` field of the result.

    Returns:
        A dict with the keys ``"Experiment"``, ``"Feat_Count"``, ``"ROC_AUC"``,
        ``"PR_AUC"`` and ``"Top1%_Recall"``; the three scores are rounded to
        four decimals.
    """
    # Work on copies so that the categorical clean-up never touches the caller's frames.
    X_train = train_df[feature_cols].copy()
    X_valid = valid_df[feature_cols].copy()
    X_test = test_df[feature_cols].copy()

    y_train = train_df[TARGET].astype(int)
    y_valid = valid_df[TARGET].astype(int)
    y_test = test_df[TARGET].astype(int)

    # Categorical columns are decided from the training frame and then applied
    # identically to all three frames.
    cat_features = _categorical_columns(X_train)
    for frame in (X_train, X_valid, X_test):
        _fill_categoricals(frame, cat_features)

    # Handle class imbalance (fraud is rare): weight positives by the
    # negative-to-positive ratio of the training labels.
    pos_count = y_train.sum()
    neg_count = len(y_train) - pos_count
    scale_pos_weight = neg_count / max(pos_count, 1)

    model = CatBoostClassifier(**PARAMS, scale_pos_weight=scale_pos_weight)

    train_pool = Pool(X_train, y_train, cat_features=cat_features)
    valid_pool = Pool(X_valid, y_valid, cat_features=cat_features)

    # The validation pool is the eval set used together with early_stopping_rounds=50.
    model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=50)

    # Probability of the positive class for every test row.
    preds_proba = model.predict_proba(X_test)[:, 1]

    roc = roc_auc_score(y_test, preds_proba)
    pr_auc = average_precision_score(y_test, preds_proba)

    # Business metric: recall among the top 1% highest-scored test rows.
    top_recall = _top_fraction_recall(y_test, preds_proba, 0.01)

    return {
        "Experiment": exp_name,
        "Feat_Count": len(feature_cols),
        "ROC_AUC": round(roc, 4),
        "PR_AUC": round(pr_auc, 4),
        "Top1%_Recall": round(top_recall, 4)
    }

def _select_feature_sets(columns):
    """Return the ``(raw, card_velocity, full)`` feature-name lists built from ``columns``."""
    # Engineered (velocity) features are recognised by these name fragments.
    velocity_keywords = ("_cnt_", "_sum_", "_avg_", "_price_to_avg_", "_tsl_")

    def is_velocity(name):
        return any(keyword in name for keyword in velocity_keywords)

    # The label and the split timestamp are never model inputs.
    all_cols = [c for c in columns if c not in (TARGET, "payment_date")]

    # 1. Raw features only (no engineered features).
    raw_feats = [c for c in all_cols if not is_velocity(c)]

    # 2. Raw features plus the velocity features whose name starts with "card_id".
    #    A name with that prefix cannot also start with another entity's prefix,
    #    so no separate exclusion list is needed.
    card_velocity_feats = [
        c for c in all_cols if not is_velocity(c) or c.startswith("card_id")
    ]

    # 3. Full model: every candidate column.
    return raw_feats, card_velocity_feats, all_cols


def main():
    """Run the three ablation experiments, print the score table and save it to ``OUT_PATH``."""
    df = pd.read_csv(DATA_PATH, low_memory=False)

    # Parse the timestamp column so the chronological split compares datetimes.
    df["payment_date"] = pd.to_datetime(df["payment_date"])

    train, valid, test = time_split(df)

    raw_feats, card_velocity_feats, full_feats = _select_feature_sets(df.columns)

    # Experiments run in this order; each reuses the same three splits.
    experiments = [
        ("A) Baseline (Raw Only)", raw_feats),
        ("B) Card Velocity Only", card_velocity_feats),
        ("C) Full Multi-Entity", full_feats),
    ]
    results = [
        train_and_evaluate(train, valid, test, feats, name)
        for name, feats in experiments
    ]

    # --- Show Results ---
    res_df = pd.DataFrame(results)
    print("\n=== ABLATION STUDY RESULTS ===")
    print(res_df.to_string(index=False))

    res_df.to_csv(OUT_PATH, index=False)

# Run the ablation study only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


=== ABLATION STUDY RESULTS ===
            Experiment  Feat_Count  ROC_AUC  PR_AUC  Top1%_Recall
A) Baseline (Raw Only)          44   0.9811  0.0364        0.4472
 B) Card Velocity Only          54   0.9469  0.0210        0.0369
  C) Full Multi-Entity          68   0.9826  0.0399        0.4988
