In [4]:
import pandas as pd
import numpy as np

# Notebook display: never truncate the column list, and use a wide console
# so that wide frames print on a single line.
pd.options.display.max_columns = None
pd.options.display.width = 1400

# Sentinel written into categorical/ID columns whose value is missing.
UNKNOWN = "UNKNOWN"

# Input (raw export) and output (engineered features) locations.
RAW_PATH = "../data/raw/iyzico_fraud_data.csv"
OUT_PATH = "../data/processed/iyzico_featured_leakfree.csv"

# Identifier-like columns are read as strings so that nothing is lost to
# numeric parsing (e.g. leading zeros in phone numbers or card digits).
DTYPE_MAP = dict.fromkeys(
    (
        "buyer_gsm",
        "buyer_email",
        "bin_number",
        "last_four_digits",
        "merchant_id",
        "payment_source_id",
    ),
    "string",
)

def add_time_since_last(df, key_col):
    """
    Seconds elapsed since the previous transaction of the same ``key_col`` value.

    Rows are ordered by ``key_col`` and then ``payment_date``; each row is
    compared with the row directly before it inside its own key group.
    Rows that have no predecessor get -1.

    Returns a tuple ``(seconds, index)``: a float32 Series and the index of
    the sorted frame it is labelled with, so the caller can write the values
    back with ``df.loc[index, col] = seconds``.
    """
    ordered = df.sort_values(by=[key_col, "payment_date"])
    stamps = ordered["payment_date"]

    # Timestamp of the preceding row within the same key (NaT for the first one)
    previous = ordered.groupby(key_col)["payment_date"].shift(periods=1)

    elapsed = (stamps - previous).dt.total_seconds()
    elapsed = elapsed.fillna(-1).astype(np.float32)

    return elapsed, ordered.index

def add_velocity_features(df, key_col, value_col="price", windows=("1h", "24h")):
    """
    Add per-entity rolling "velocity" statistics for each time window.

    For every window ``w`` in ``windows`` four float32 columns are added:

    * ``{key_col}_cnt_{w}``   - number of earlier rows of the same key inside
      the window (0 when there are none)
    * ``{key_col}_sum_{w}``   - sum of ``value_col`` over those rows (NaN when
      the window is empty under pandas' default ``min_periods``)
    * ``{key_col}_avg_{w}``   - sum / count (NaN when count is 0)
    * ``{key_col}_ratio_{w}`` - current ``value_col`` / average; 1.0 when the
      average is missing or not positive

    Important: closed='left' ensures we don't use current data (prevents
    leakage). The window covers [t - w, t), so rows sharing the current row's
    timestamp are excluded as well.

    Rows whose ``key_col`` is missing are kept and treated as having no
    history (count 0, sum/avg NaN, ratio 1.0).

    Parameters
    ----------
    df : DataFrame with a datetime ``payment_date`` column, ``key_col`` and
        ``value_col``. It is not modified.
    key_col : column identifying the entity (card, phone, merchant, ...).
    value_col : numeric column that is summed / averaged.
    windows : iterable of pandas offset strings, e.g. ``("1h", "24h")``.

    Returns
    -------
    A new DataFrame sorted by ``key_col`` then ``payment_date`` with a fresh
    integer index and the feature columns appended.
    """
    # Sort data for rolling window calculation
    temp = df.sort_values([key_col, "payment_date"]).copy()
    temp = temp.set_index("payment_date")

    n_rows = len(temp)
    current = temp[value_col].to_numpy()

    # groupby drops rows with a missing key, so its output only covers the
    # keyed rows; remember where they are so results can be scattered back.
    has_key = temp[key_col].notna().to_numpy(dtype=bool)
    any_keyed = bool(has_key.any())

    # Only the key and value columns are needed for the rolling statistics.
    history_cols = list(dict.fromkeys([key_col, value_col]))
    grouped = temp.loc[has_key, history_cols].groupby(key_col)
    grouped_key = grouped[key_col]
    grouped_val = grouped[value_col]

    for w in windows:
        cnt = np.zeros(n_rows, dtype=np.float32)
        summ = np.full(n_rows, np.nan, dtype=np.float32)

        if any_keyed:
            # The grouped result is ordered by (sorted key, row order inside
            # the group), which is exactly the order of the keyed rows in
            # ``temp`` - hence positional assignment is valid.
            cnt[has_key] = grouped_key.rolling(w, closed="left").count().to_numpy()
            summ[has_key] = grouped_val.rolling(w, closed="left").sum().to_numpy()

        # Depending on the pandas version an empty window counts as 0 or NaN;
        # normalise to 0 so the feature does not depend on the installed version.
        cnt[np.isnan(cnt)] = 0.0

        # Add new features
        temp[f"{key_col}_cnt_{w}"] = cnt
        temp[f"{key_col}_sum_{w}"] = summ

        # np.where evaluates both branches eagerly, so silence the expected
        # 0/0 and x/NaN warnings; those positions are replaced anyway.
        with np.errstate(divide="ignore", invalid="ignore"):
            # Calculate Average (handle division by zero)
            avg = np.where(cnt > 0, summ / cnt, np.nan).astype(np.float32)

            # Calculate Ratio: Current Price / Average
            usable = np.isfinite(avg) & (avg > 0)
            ratio = np.where(usable, current / avg, 1.0).astype(np.float32)

        temp[f"{key_col}_avg_{w}"] = avg
        temp[f"{key_col}_ratio_{w}"] = ratio

    return temp.reset_index()

def _add_entity_features(df, key_col, windows):
    """Add rolling velocity columns and ``{key_col}_tsl_sec`` for one entity key."""
    df = add_velocity_features(df, key_col, windows=windows)

    # Time since the previous transaction of the same entity; the Series is
    # labelled with df's own index, so .loc writes each value to its row.
    tsl_vals, tsl_idx = add_time_since_last(df, key_col)
    df.loc[tsl_idx, f"{key_col}_tsl_sec"] = tsl_vals
    return df


def main():
    """
    Build the feature dataset: read RAW_PATH, engineer features, write OUT_PATH.

    Steps:
      1. Load the CSV, drop rows whose ``payment_date`` cannot be parsed and
         coerce ``price`` (and the fraud label, if present) to numeric.
      2. Add missing-value flags, fill missing categoricals, build ``card_id``
         and extract hour / day-of-week.
      3. Add rolling velocity and time-since-last features per card, phone
         number and merchant.
      4. Zero the velocity features of rows whose entity is UNKNOWN.
      5. Save the result as CSV and print its shape.
    """
    # Local import: only needed here to make sure the output folder exists.
    from pathlib import Path

    print(">> Loading data...")
    df = pd.read_csv(RAW_PATH, dtype=DTYPE_MAP, low_memory=False)

    # Fix date format; rows without a usable timestamp cannot be ordered
    df["payment_date"] = pd.to_datetime(df["payment_date"], errors="coerce")
    df = df.dropna(subset=["payment_date"]).reset_index(drop=True)

    # Convert numeric columns (unparseable prices become 0.0)
    df["price"] = pd.to_numeric(df["price"], errors="coerce").fillna(0.0).astype(np.float32)

    if "is_fraud_transaction" in df.columns:
        df["is_fraud_transaction"] = pd.to_numeric(df["is_fraud_transaction"], errors="coerce").fillna(0).astype(int)

    # --- 1. Preprocessing ---
    print(">> Preprocessing...")

    # Create a flag for missing card types before filling them
    if "card_type" in df.columns:
        df["is_card_type_missing"] = df["card_type"].isna().astype(np.int8)
        df["card_type"] = df["card_type"].fillna("FOREIGN")

    # Fill missing values for categorical columns
    cat_cols = ["card_association", "card_family", "merchant_id", "payment_source_id"]
    for c in cat_cols:
        if c in df.columns:
            df[c] = df[c].fillna(UNKNOWN).astype("string")

    # Handle missing buyer info: keep a flag, then fill with the placeholder
    for col, flag in (("buyer_gsm", "is_gsm_missing"), ("buyer_email", "is_email_missing")):
        if col in df.columns:
            df[flag] = df[col].isna().astype(np.int8)
            df[col] = df[col].fillna(UNKNOWN).astype("string")

    # Create a Card ID (combine bin + last4). The concatenation is <NA> when
    # either part is missing; use the same placeholder as the other entity
    # keys so the grouped velocity step never sees a missing key.
    df["card_id"] = df["bin_number"].astype("string") + "_" + df["last_four_digits"].astype("string")
    df["card_id"] = df["card_id"].fillna(UNKNOWN)

    # Extract time features
    df["txn_hour"] = df["payment_date"].dt.hour.astype(np.int16)
    df["txn_dow"] = df["payment_date"].dt.dayofweek.astype(np.int16)

    # --- 2. Feature Engineering (Velocity) ---
    print(">> Calculating Velocity Features...")

    # A. Card Velocity (5min, 1h, 24h)
    df = _add_entity_features(df, "card_id", ("5min", "1h", "24h"))

    # B. GSM Velocity (User behavior)
    if "buyer_gsm" in df.columns:
        df = _add_entity_features(df, "buyer_gsm", ("1h", "24h"))

    # C. Merchant Velocity (Store attacks)
    if "merchant_id" in df.columns:
        df = _add_entity_features(df, "merchant_id", ("1h", "24h"))

    # --- 3. Post-Processing ---
    # Set velocity features to 0 if the entity is UNKNOWN (remove noise):
    # all UNKNOWN rows were grouped together, so their stats are meaningless.
    feature_tags = ("_cnt_", "_sum_", "_avg_", "_ratio_", "_tsl_")
    for entity in ["card_id", "buyer_gsm", "merchant_id"]:
        if entity in df.columns:
            mask = df[entity] == UNKNOWN
            cols = [c for c in df.columns if c.startswith(f"{entity}_") and
                    any(tag in c for tag in feature_tags)]
            if cols:
                df.loc[mask, cols] = 0

    # Save final dataset (create the target folder on first run)
    Path(OUT_PATH).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(OUT_PATH, index=False)
    print(f">> Processed data saved: {OUT_PATH}")
    print(f">> Final Shape: {df.shape}")

# Run the pipeline only when this file is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

>> Loading data...
>> Preprocessing...
>> Calculating Velocity Features...
>> Processed data saved: ../data/processed/iyzico_featured_leakfree.csv
>> Final Shape: (3120010, 70)
