In [11]:
import json
import os
import warnings

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier

# Display settings
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 1400)

# Paths
DATA_PATH = "../data/processed/iyzico_featured_leakfree.csv"
MODEL_PATH = "../models/final_catboost_model.cbm"
ARTIFACT_PATH = "../models/training_artifacts.json"
FEATCOLS_PATH = "../models/feature_columns.json"

# 1. Load Training Artifacts
# We need these to ensure the inference input matches training schema exactly
with open(ARTIFACT_PATH, "r") as f:
    artifact = json.load(f)

with open(FEATCOLS_PATH, "r") as f:
    feat_cols = json.load(f)

TARGET = artifact["target"]
DROP_COLS = artifact["drop_cols"]
CAT_COLS = artifact["cat_cols"]
MISSING_CAT = artifact.get("missing_cat", "MISSING")

# 2. Load Model
model = CatBoostClassifier()
model.load_model(MODEL_PATH)

# 3. Prepare Test Data
# In a real scenario, this would be live incoming data.
# Here, we simulate it by loading the processed dataset.
df = pd.read_csv(DATA_PATH, low_memory=False)
df["payment_date"] = pd.to_datetime(df["payment_date"], errors="coerce")
df = df.dropna(subset=["payment_date"]).reset_index(drop=True)

# Select 'Future' Data (Test Set Period)
t1 = df["payment_date"].quantile(0.85)
future_data = df[df["payment_date"] > t1]

# 4. Sampling Strategy: Real Fraud Cases
# We specifically select known fraud cases to demonstrate the model's detection capability.
real_frauds = future_data[future_data[TARGET] == 1]

if len(real_frauds) > 0:
    # Take up to 10 real fraud examples
    demo_sample = real_frauds.sample(min(10, len(real_frauds)), random_state=42).copy()
else:
    # Fallback to random sampling if no fraud found in this slice
    demo_sample = future_data.sample(10, random_state=42).copy()

# 5. Preprocess Input
# Drop columns that were excluded during training
X_input = demo_sample.drop(columns=[c for c in DROP_COLS if c in demo_sample.columns]).copy()

# Enforce column order to match training
X_input = X_input.reindex(columns=feat_cols)

# Enforce data types (Categorical & Numeric)
for c in CAT_COLS:
    if c in X_input.columns:
        X_input[c] = X_input[c].astype("string").fillna(MISSING_CAT)

num_cols = [c for c in X_input.columns if c not in CAT_COLS]
for c in num_cols:
    X_input[c] = pd.to_numeric(X_input[c], errors="coerce")

# 6. Predict
# Get probability of class 1 (Fraud)
probabilities = model.predict_proba(X_input)[:, 1]
predictions = (probabilities >= 0.5).astype(int)

# 7. Generate Output Report
# Identify key columns for the report
id_col = "card_id" if "card_id" in demo_sample.columns else "bin_number"
report_cols = ["payment_date", id_col, TARGET]
report = demo_sample[report_cols].copy()

report["pred_label"] = predictions
report["fraud_proba"] = probabilities

# Add Risk Levels based on probability
report["risk_level"] = pd.cut(
    report["fraud_proba"],
    bins=[-0.1, 0.2, 0.5, 0.8, 1.1],
    labels=["LOW", "MEDIUM", "HIGH", "CRITICAL"]
)

# Sort by risk (Highest first)
report = report.sort_values("fraud_proba", ascending=False).reset_index(drop=True)

print("\n=== INFERENCE RESULTS (Real Fraud Samples) ===")
print(report.to_string(index=False))

# Summary stats
caught_count = report['pred_label'].sum()
total_count = len(report)
print(f"\nSummary: Caught {caught_count}/{total_count} fraud attempts in this batch.")


=== INFERENCE RESULTS (Real Fraud Samples) ===
           payment_date     card_id  is_fraud_transaction  pred_label  fraud_proba risk_level
2024-09-23 03:02:39.554 699963_9630                     1           1     0.977150   CRITICAL
2024-09-28 04:50:20.265 625766_4253                     1           1     0.975470   CRITICAL
2024-09-18 13:16:41.610 251584_9188                     1           1     0.954394   CRITICAL
2024-09-17 23:03:25.211 835654_8443                     1           1     0.936742   CRITICAL
2024-09-20 04:48:21.970 708115_2317                     1           1     0.898968   CRITICAL
2024-09-30 19:14:20.321 945021_4483                     1           1     0.895425   CRITICAL
2024-09-29 10:12:23.959 144131_2835                     1           1     0.865930   CRITICAL
2024-09-19 23:13:12.809 894116_9087                     1           1     0.746167       HIGH
2024-09-24 19:52:08.076 714697_1730                     1           0     0.417341     MEDIUM
2024-09-18 0