In [3]:
import pandas as pd
import numpy as np

# Load the raw case-study export (read_csv already returns a fresh frame).
df = pd.read_csv("dataset_case_study.csv")

# Parse date (day-level); values that cannot be parsed become NaT.
df["creation_date"] = pd.to_datetime(df["creation_date"], errors="coerce")

# Built as a single chained assign so the column lineage reads top to bottom.
# Later entries may reference columns created by earlier entries.
df = df.assign(
    # ---- Label (Option A) ----
    # auth_result is used ONLY to derive the target, never as a feature.
    label_fraud=lambda d: d["auth_result"].eq("FRAUD").astype(int),

    # ---- Core engineered features (rebuild) ----
    is_zero_stay=lambda d: d["length_of_stay"].eq(0).astype(int),
    # Floor the stay at one night so the per-night price never divides by zero.
    length_of_stay_adj=lambda d: d["length_of_stay"].clip(lower=1),
    price_per_night=lambda d: d["price_euro"] / d["length_of_stay_adj"],

    # Pairwise country disagreement flags (1 = the two countries differ).
    # NOTE(review): under pandas comparison semantics a missing country
    # compares as "different" - confirm that is the intended treatment.
    mismatch_booker_vs_issuing=lambda d: d["booker_country"].ne(d["card_issuing_country"]).astype(int),
    mismatch_booker_vs_hotel=lambda d: d["booker_country"].ne(d["hotel_country"]).astype(int),
    mismatch_issuing_vs_hotel=lambda d: d["card_issuing_country"].ne(d["hotel_country"]).astype(int),

    # 0..3: number of country pairs that disagree.
    geo_mismatch_score=lambda d: (
        d["mismatch_booker_vs_issuing"]
        .add(d["mismatch_booker_vs_hotel"])
        .add(d["mismatch_issuing_vs_hotel"])
    ),

    # Optional: existing policy segments, usable as FEATURES or kept analysis-only later.
    seg_auto_challenge_1=lambda d: (
        d["is_zero_stay"].eq(1) & d["payment_method"].eq("american express")
    ).astype(int),
    seg_auto_challenge_2=lambda d: (
        d["geo_mismatch_score"].eq(3)
        & d["payment_method"].isin(["american express", "klarna"])
    ).astype(int),
)

# Quick sanity: row count, base fraud rate, and a summary of three key features.
n_rows = len(df)
fraud_rate = df["label_fraud"].mean()
print("Rows:", n_rows)
print("Fraud rate:", fraud_rate)

sanity_cols = ["geo_mismatch_score", "is_zero_stay", "price_per_night"]
df[sanity_cols].describe(include="all")


Rows: 79557
Fraud rate: 0.010432771472026345


Unnamed: 0,geo_mismatch_score,is_zero_stay,price_per_night
count,79557.0,79557.0,79557.0
mean,0.925236,0.004249,263.291271
std,1.079273,0.065043,390.112731
min,0.0,0.0,8.551429
25%,0.0,0.0,127.125833
50%,0.0,0.0,182.552857
75%,2.0,0.0,261.909286
max,3.0,1.0,4996.82


<div class="alert alert-warning">
    <strong>Note (Leakage control):</strong>
    <br/>
    We do <strong>not</strong> use <code>auth_result</code> as a feature. It is only used to create the label (<code>label_fraud</code>).
    All engineered features are derived from booking attributes available at booking time.
</div>


In [4]:
# - Entity-risk features (daily context) -
# These are strong fraud signals: high activity + identity diversity per entity/day.

def add_entity_day_features(
    frame: pd.DataFrame,
    entity_col: str,
    prefix: str,
    distinct_cols: dict,
    date_col: str = "creation_date",
    tx_col: str = "transaction_id",
) -> pd.DataFrame:
    """Append per-(day, entity) activity and diversity features.

    For every row, the values are computed over all rows that share the same
    ``date_col`` and ``entity_col`` and are broadcast back to row level.

    Parameters
    ----------
    frame : pd.DataFrame
        Booking-level table containing ``date_col``, ``entity_col``,
        ``tx_col`` and every source column named in ``distinct_cols``.
    entity_col : str
        Column identifying the entity (e.g. ``"ip_id"``).
    prefix : str
        Prefix for the generated column names (e.g. ``"ip"``).
    distinct_cols : dict
        Maps the label used in the output column name to the source column
        whose distinct values are counted, e.g. ``{"cards": "credit_card_id"}``
        produces ``<prefix>_distinct_cards_per_day``. Insertion order defines
        the order of the appended columns.
    date_col : str
        Column holding the day used as the grouping window.
    tx_col : str
        Column counted to obtain the number of transactions.

    Returns
    -------
    pd.DataFrame
        A new frame with ``<prefix>_tx_per_day`` followed by one
        ``<prefix>_distinct_<label>_per_day`` column per entry of
        ``distinct_cols``. The input frame is not modified.
    """
    # Build the grouping once and reuse it for every aggregate of this entity.
    # NOTE(review): groupby uses its default NA handling, so rows with a missing
    # date or entity id are presumably left without a group (NaN features) - confirm.
    per_day = frame.groupby([date_col, entity_col])

    new_cols = {f"{prefix}_tx_per_day": per_day[tx_col].transform("count")}
    for label, source_col in distinct_cols.items():
        new_cols[f"{prefix}_distinct_{label}_per_day"] = per_day[source_col].transform("nunique")

    return frame.assign(**new_cols)


# (entity column, column-name prefix, {label: column whose distinct values are counted})
# Order matters: it fixes the order of the columns written to the feature table.
ENTITY_DAY_SPECS = [
    # IP-day features
    ("ip_id", "ip", {"cards": "credit_card_id", "emails": "email_id", "hotels": "hotel_id"}),
    # Email-day features
    ("email_id", "email", {"cards": "credit_card_id", "ips": "ip_id", "hotels": "hotel_id"}),
    # Card-day features
    ("credit_card_id", "card", {"ips": "ip_id", "emails": "email_id", "hotels": "hotel_id"}),
]

for entity_col, prefix, distinct_cols in ENTITY_DAY_SPECS:
    df = add_entity_day_features(df, entity_col, prefix, distinct_cols)

df[[
    "ip_tx_per_day","ip_distinct_cards_per_day","email_tx_per_day","email_distinct_cards_per_day",
    "card_tx_per_day","card_distinct_ips_per_day"
]].describe().T


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ip_tx_per_day,79557.0,6.206808,34.925544,1.0,1.0,1.0,1.0,309.0
ip_distinct_cards_per_day,79557.0,1.546577,4.632661,1.0,1.0,1.0,1.0,52.0
email_tx_per_day,79557.0,2.511721,11.395966,1.0,1.0,1.0,1.0,117.0
email_distinct_cards_per_day,79557.0,1.074362,0.540297,1.0,1.0,1.0,1.0,5.0
card_tx_per_day,79557.0,3.934801,21.949926,1.0,1.0,1.0,1.0,221.0
card_distinct_ips_per_day,79557.0,1.037181,0.270149,1.0,1.0,1.0,1.0,3.0


<div class="alert alert-warning">
    <strong>Note (Day-level timestamps):</strong>
    <br/>
    Because timestamps are day-level only, entity “velocity” is computed per day.
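    Each value is aggregated over the whole calendar day, so a booking's features may include transactions that occurred later on that same day (intra-day look-ahead). In a real-time system we would compute these in streaming with minute/hour windows.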
    For this case study, day-level entity features are still highly indicative of attack behavior (bursts + identity churn).
</div>


In [6]:
import os

# Make sure the feature output folder exists; a no-op when it is already there.
FEATURES_DIR = "outputs/features"
os.makedirs(FEATURES_DIR, exist_ok=True)

In [7]:
# Persist the engineered feature table for downstream notebooks; the row index is not stored.
features_path = "outputs/features/features.parquet"
df.to_parquet(features_path, index=False)