
# 🚀 Hackathon Demo — Real‑time Fraud Detection (History‑Aware)

**Agenda**
1. Load a tiny sample of the dataset (pipe‑delimited).
2. Build *history‑aware* features (velocity, seen‑before, time since last, distances…).
3. Train a baseline classifier (RandomForest).
4. Pick an operating **threshold** from the Precision‑Recall curve (the point that maximizes F1; the class weights already tilt the model toward recall).
5. Evaluate: Accuracy / Precision / Recall / F1 + Confusion Matrix.
6. Save `model.pkl` + `features.json` + `threshold.json` for the API/streamer.
7. (Optional) Live predict: call local FastAPI `/predict` or run a one‑off prediction.

> **Tip:** Keep the CSV small in the notebook (e.g., 50k rows) so it runs snappy for the demo. Train the full model offline.


In [1]:

# %% Imports
import os, json, math, sqlite3, statistics, datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, confusion_matrix, precision_recall_curve,
    average_precision_score
)

# Widen pandas' console rendering so wide feature frames stay readable
pd.options.display.max_columns = 120
pd.options.display.width = 160

# Input / output locations (DATA_PATH can be overridden from the environment)
DATA_PATH = os.environ.get("DATA_PATH", "transactions.csv")  # pipe-delimited
MODEL_PATH, FEATURES_PATH, THRESHOLD_PATH = "model.pkl", "features.json", "threshold.json"

# Endpoint used by the optional live API demo
FASTAPI_URL = os.environ.get("FASTAPI_URL", "http://127.0.0.1:8000/predict?store=0")


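> **Note:** the captured run of this cell failed with `ModuleNotFoundError: No module named 'matplotlib'`. Install the plotting dependency (`pip install matplotlib`) before running the notebook.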


## 1) Load a small, time‑ordered sample
> For a smooth demo, read up to ~50k rows and **sort by `unix_time`** so our features only see the *past*.


In [None]:

# %% Load a small sample and sort by time
use_rows = int(os.getenv("NB_DEMO_ROWS", "50000"))  # tweak live if needed
df = pd.read_csv(DATA_PATH, sep="|", nrows=use_rows)

# Ensure proper dtypes (unparseable values become NaN)
for col in ["amt", "lat", "long", "merch_lat", "merch_long", "city_pop", "unix_time"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# Sort by time
if "unix_time" in df.columns:
    # A row without a timestamp cannot be placed on the timeline, so it can
    # neither receive "past only" features nor serve as history for other rows.
    missing_time = df["unix_time"].isna()
    if missing_time.any():
        print(f"Dropping {int(missing_time.sum())} rows without a valid unix_time")
        df = df.loc[~missing_time]
    # Stable sort: rows sharing a timestamp keep their file order, so reruns
    # produce the same history features.
    df = df.sort_values("unix_time", kind="stable").reset_index(drop=True)

df.head(3)



## 2) Feature engineering (history‑aware, lightweight for demo)

This cell implements a lightweight feature builder meant for demos. Base columns are derived row by row with small helper functions; history features are computed per `cc_num` by walking each card's transactions in time order, with window lookups limited to the preceding 24h.


In [None]:

# %% Feature helpers (fast-ish, pandas-based)

def fast_age(dob_str, at_unix):
    """Return the age in whole years at a given UTC timestamp.

    Args:
        dob_str: Date of birth; only the first 10 characters are read and
            they must look like ``YYYY-MM-DD``.
        at_unix: Unix timestamp in seconds (anything ``int()`` accepts).

    Returns:
        The age in years, clamped to 0 for birth dates in the future, or -1
        when either argument cannot be parsed.
    """
    try:
        y, m, d = map(int, str(dob_str)[:10].split("-"))
        # utcfromtimestamp() is deprecated; an aware UTC datetime yields the
        # same calendar fields regardless of the machine's local timezone.
        t = dt.datetime.fromtimestamp(int(at_unix), tz=dt.timezone.utc)
    except (TypeError, ValueError, OverflowError, OSError):
        return -1
    # Subtract one year when this year's birthday has not happened yet.
    age = t.year - y - ((t.month, t.day) < (m, d))
    return max(0, age)

def hour_from_unix(unix_time):
    """Return the UTC hour of day (0..23) for a Unix timestamp in seconds.

    Returns -1 when the value cannot be converted to a timestamp.
    """
    try:
        # utcfromtimestamp() is deprecated; use an aware UTC datetime instead.
        return dt.datetime.fromtimestamp(int(unix_time), tz=dt.timezone.utc).hour
    except (TypeError, ValueError, OverflowError, OSError):
        return -1

def dow_from_unix(unix_time):
    """Return the UTC day of week for a Unix timestamp in seconds.

    The result follows ``datetime.weekday()``: 0 is Monday, 6 is Sunday.
    Returns -1 when the value cannot be converted to a timestamp.
    """
    try:
        # utcfromtimestamp() is deprecated; use an aware UTC datetime instead.
        return dt.datetime.fromtimestamp(int(unix_time), tz=dt.timezone.utc).weekday()
    except (TypeError, ValueError, OverflowError, OSError):
        return -1

def is_night(h):
    """Return 1 when hour ``h`` is at or before 6 or at or after 22, else 0."""
    if h <= 6:
        return 1
    if h >= 22:
        return 1
    return 0

def haversine_km(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Args:
        lat1, lon1: Latitude / longitude of the first point, in degrees.
        lat2, lon2: Latitude / longitude of the second point, in degrees.

    Returns:
        The haversine distance on a sphere of radius 6371 km, or NaN when an
        argument cannot be converted to ``float`` (NaN inputs also give NaN).
    """
    try:
        phi1 = math.radians(float(lat1))
        phi2 = math.radians(float(lat2))
        dphi = math.radians(float(lat2) - float(lat1))
        dlmb = math.radians(float(lon2) - float(lon1))
    except (TypeError, ValueError):
        return np.nan

    R = 6371.0
    a = math.sin(dphi / 2) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlmb / 2) ** 2
    # Rounding can push `a` a hair outside [0, 1] (e.g. near-antipodal points),
    # which would make sqrt/asin raise. Plain comparisons are used instead of
    # min/max so that a NaN `a` stays NaN and propagates to the result.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0
    return 2 * R * math.asin(math.sqrt(a))

# Build base columns
def add_base_columns(df):
    """Return a copy of ``df`` with the per-transaction (history-free) features.

    Adds ``age``, ``hour``, ``dow``, ``is_night``, ``log_amt``, ``gender_M``
    and ``gender_F``. The coordinate / population columns (``lat``, ``long``,
    ``merch_lat``, ``merch_long``, ``city_pop``) are created as 0.0 when
    absent and have their missing values filled with 0.0.

    ``unix_time`` and ``amt`` are required. ``dob`` and ``gender`` are
    optional: without ``dob`` every ``age`` is -1, and without ``gender``
    both gender flags are 0.

    Args:
        df: Transactions frame; it is not modified.

    Returns:
        A new DataFrame with the extra columns.
    """
    df = df.copy()

    # df.get("dob", "") returned a plain str when the column was absent, which
    # made zip() yield nothing and the assignment fail on a length mismatch.
    dob_values = df["dob"] if "dob" in df.columns else [""] * len(df)
    df["age"] = [fast_age(d, u) for d, u in zip(dob_values, df["unix_time"])]
    df["hour"] = [hour_from_unix(u) for u in df["unix_time"]]
    df["dow"] = [dow_from_unix(u) for u in df["unix_time"]]
    df["is_night"] = df["hour"].apply(is_night)
    df["log_amt"] = np.log1p(df["amt"].fillna(0.0))

    for col in ["lat", "long", "merch_lat", "merch_long", "city_pop"]:
        if col not in df.columns:
            df[col] = 0.0
        df[col] = df[col].fillna(0.0)

    # simple gender one-hot
    if "gender" in df.columns:
        g = df["gender"].astype(str).str.upper()
        df["gender_M"] = (g == "M").astype(int)
        df["gender_F"] = (g == "F").astype(int)
    else:
        # Same reason as for "dob": a str fallback has no .astype().
        df["gender_M"] = 0
        df["gender_F"] = 0
    return df

# History features per cc_num using last 24h window
def _card_history(times, amts, merch, cats, stop_lat, stop_lon):
    """Compute the history features for one card's time-ordered transactions.

    All arguments are equal-length arrays for a single card, ordered by time.
    Returns a float array with one row per transaction and one column per
    history feature, in the column order used by ``add_history_features``.
    """
    day = 24 * 3600
    no_prior = 10**9  # sentinel "seconds since" when there is no earlier event
    rows = []
    start = 0
    for i in range(len(times)):
        t = times[i]
        # Advance the window start so that [start, i) holds only the rows
        # from the preceding 24h (strictly earlier positions).
        while start < i and times[start] < t - day:
            start += 1
        w_times = times[start:i]
        w_merch = merch[start:i]

        # Velocity counts over nested look-back windows
        in_15m = w_times >= t - 15 * 60
        v60 = np.count_nonzero(w_times >= t - 60)
        v5 = np.count_nonzero(w_times >= t - 5 * 60)
        v15 = np.count_nonzero(in_15m)
        v1h = np.count_nonzero(w_times >= t - 60 * 60)

        # Distinct merchants / categories in the last 15 minutes
        uniq_merch = len(set(w_merch[in_15m]))
        uniq_cat = len(set(cats[start:i][in_15m]))

        # Same merchant inside the 24h window: seen flag + time since that hit
        same = np.flatnonzero(w_merch == merch[i])
        if same.size:
            seen_before = 1.0
            tslm = t - times[start + same[-1]]
        else:
            seen_before = 0.0
            tslm = no_prior

        # Time since the card's previous transaction (may predate the window)
        tsl = t - times[i - 1] if i > 0 else no_prior

        # Distance from the previous transaction's stop to the current one
        if i > 0:
            dist = haversine_km(stop_lat[i - 1], stop_lon[i - 1], stop_lat[i], stop_lon[i])
            if np.isnan(dist):
                dist = 0.0
        else:
            dist = 0.0

        # Amount profile over the 24h window
        past_amts = amts[start:i]
        if past_amts.size:
            mean24 = float(past_amts.mean())
            std24 = float(past_amts.std(ddof=0))
        else:
            mean24, std24 = 0.0, 0.0
        delta = float(amts[i] - mean24)
        z = float(delta / std24) if std24 > 0 else 0.0

        rows.append((v60, v5, v15, v1h, uniq_merch, uniq_cat, seen_before, dist,
                     tsl, tslm, mean24, std24, delta, z))
    return np.asarray(rows, dtype=float)


# History features per cc_num using last 24h window
def add_history_features(df):
    """Return a copy of ``df`` with per-card, past-only history features.

    For every transaction the features are derived from the same card's
    earlier transactions: velocity counts (60s / 5m / 15m / 1h), distinct
    merchants and categories in the last 15 minutes, whether the merchant was
    seen in the last 24h, seconds since the previous transaction and since
    the previous same-merchant transaction (``10**9`` when there is none),
    and the 24h amount mean / std / delta / z-score.

    Required columns: ``cc_num``, ``unix_time``, ``amt``, ``merchant``,
    ``category``, ``lat``, ``long``, ``merch_lat``, ``merch_long``.

    Each card's rows are ordered by ``unix_time`` (stable) inside the
    function, so the result does not depend on the frame being pre-sorted.
    Rows with a missing ``cc_num`` or ``unix_time`` keep 0.0 for every
    history feature and are not used as history for other rows.

    Args:
        df: Transactions frame; it is not modified.

    Returns:
        A new DataFrame with the history feature columns (float).
    """
    df = df.copy()
    cols = ["velocity_60s","velocity_5m","velocity_15m","velocity_1h",
            "unique_merchants_15m","unique_categories_15m",
            "seen_merchant_before","user_merchant_dist_km",
            "time_since_last_s","time_since_last_merchant_s",
            "user_mean_amt_24h","user_std_amt_24h","user_amt_delta","amt_z_user"]

    # Results are accumulated here and written back once; writing row by row
    # through df.loc dominated the runtime of the original loop.
    n_rows = len(df)
    feats = np.zeros((n_rows, len(cols)), dtype=float)

    if n_rows:
        # Pull every input column once as a plain array.
        times_all = df["unix_time"].to_numpy()
        has_time = ~pd.isna(times_all)
        amts_all = df["amt"].fillna(0.0).to_numpy(dtype=float)
        merch_all = df["merchant"].astype(str).to_numpy()
        cat_all = df["category"].astype(str).to_numpy()
        lat_user = df["lat"].to_numpy(dtype=float)
        lon_user = df["long"].to_numpy(dtype=float)
        lat_m = df["merch_lat"].to_numpy(dtype=float)
        lon_m = df["merch_long"].to_numpy(dtype=float)
        # Merchant coordinates, falling back to the cardholder's when missing.
        stop_lat = np.where(np.isnan(lat_m), lat_user, lat_m)
        stop_lon = np.where(np.isnan(lon_m), lon_user, lon_m)

        # Group row *positions* by card; rows with a missing cc_num fall out
        # of the grouping, as they did with df.groupby("cc_num").
        row_pos = pd.Series(np.arange(n_rows))
        for _, grp in row_pos.groupby(df["cc_num"].to_numpy(), sort=False):
            pos = grp.to_numpy()
            pos = pos[has_time[pos]]
            if pos.size == 0:
                continue
            # Stable order by time so "earlier position" always means "past".
            pos = pos[np.argsort(times_all[pos], kind="stable")]
            feats[pos] = _card_history(
                times_all[pos].astype(np.int64), amts_all[pos],
                merch_all[pos], cat_all[pos], stop_lat[pos], stop_lon[pos],
            )

    # NOTE(review): despite its name, user_merchant_dist_km holds the distance
    # from the previous transaction's stop to the current one. The original
    # code also computed a cardholder->merchant distance but never used it.
    # The stored value is kept as it was so the trained model keeps matching
    # the serving-side features -- confirm which one is intended.
    for k, c in enumerate(cols):
        df[c] = feats[:, k]
    return df

# Column order of the model's input matrix (also written to features.json)
FEATURE_ORDER = [
    # per-transaction basics
    "age",
    "log_amt",
    "hour",
    "dow",
    "is_night",
    # location / population
    "city_pop",
    "lat",
    "long",
    "merch_lat",
    "merch_long",
    # velocity counts
    "velocity_60s",
    "velocity_5m",
    "velocity_15m",
    "velocity_1h",
    # recent variety
    "unique_merchants_15m",
    "unique_categories_15m",
    # merchant familiarity / distance
    "seen_merchant_before",
    "user_merchant_dist_km",
    # recency
    "time_since_last_s",
    "time_since_last_merchant_s",
    # 24h amount profile
    "user_mean_amt_24h",
    "user_std_amt_24h",
    "user_amt_delta",
    "amt_z_user",
    # gender one-hot
    "gender_M",
    "gender_F",
]



### Build the feature matrix


In [None]:

# %% Build features
base = add_base_columns(df)
feat_df = add_history_features(base)

# Ensure target is numeric 0/1
if "is_fraud" not in feat_df.columns:
    raise ValueError("Column 'is_fraud' not found.")

label_map = {"1":1,"0":0,"true":1,"false":0,"t":1,"f":0,"yes":1,"no":0}
label_text = feat_df["is_fraud"].astype(str).str.strip().str.lower()
y = label_text.map(label_map)

# A label column read as float (e.g. because it has gaps) stringifies as
# "1.0"/"0.0" and misses the text map above. Without this numeric fallback
# every such label was silently filled with 0, erasing all fraud cases.
numeric_label = pd.to_numeric(label_text, errors="coerce").map({1.0: 1, 0.0: 0})
y = y.fillna(numeric_label)

n_unlabeled = int(y.isna().sum())
if n_unlabeled:
    print(f"Warning: {n_unlabeled} rows have an unrecognized 'is_fraud' value; treating them as 0.")
y = y.fillna(0).astype(int)

X = feat_df[FEATURE_ORDER].fillna(0.0).astype(float).values
X.shape, y.shape



## 3) Train / Evaluate (80/20) and choose threshold from PR‑curve


In [None]:

# %% Split
# Stratify only when both classes are present in y.
n_fraud = int(y.sum())
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42,
    stratify=y if 0 < n_fraud < len(y) else None,
)

# Fail with a clear message instead of an IndexError on predict_proba below.
if len(np.unique(y_train)) < 2:
    raise ValueError(
        "Training split contains a single class; load more rows (NB_DEMO_ROWS) "
        "so the sample includes both fraud and non-fraud transactions."
    )

# Model (heavier positive class weight to lift recall)
model = RandomForestClassifier(
    n_estimators=160,
    max_depth=None,
    min_samples_leaf=1,
    n_jobs=-1,
    random_state=42,
    class_weight={0: 1.0, 1: 5.0},
    oob_score=True,  # keep out-of-bag probabilities for threshold selection
)
model.fit(X_train, y_train)

# predict_proba column that belongs to the fraud class (label 1)
fraud_col = int(np.flatnonzero(model.classes_ == 1)[0])

# Predictions
y_prob = model.predict_proba(X_test)[:, fraud_col]
y_pred_50 = (y_prob >= 0.50).astype(int)

print("=== Metrics @ 0.50 ===")
print(classification_report(y_test, y_pred_50, digits=4))
print("Confusion matrix @ 0.50:")
print(confusion_matrix(y_test, y_pred_50))

# Threshold suggestion (maximize F1), chosen on out-of-bag training
# probabilities. Tuning it on the test split and then scoring that same
# split would make the "best threshold" metrics below optimistic.
oob_prob = model.oob_decision_function_[:, fraud_col]
has_oob = ~np.isnan(oob_prob)  # NaN = row was never out-of-bag
oob_prec, oob_rec, oob_thr = precision_recall_curve(np.asarray(y_train)[has_oob], oob_prob[has_oob])
oob_f1 = (2*oob_prec*oob_rec) / (oob_prec + oob_rec + 1e-12)
oob_best = int(np.argmax(oob_f1))
# The final PR point has no threshold attached; fall back to 0.5 there.
best_thr = float(oob_thr[oob_best]) if oob_best < len(oob_thr) else 0.5
print(f"Suggested threshold (max F1): {best_thr:.3f}")
print(f"PR-AUC: {average_precision_score(y_test, y_prob):.4f}")

# Test-set PR curve; best_idx marks where the chosen threshold lands on it
prec, rec, thr = precision_recall_curve(y_test, y_prob)
f1 = (2*prec*rec) / (prec + rec + 1e-12)
best_idx = int(np.searchsorted(thr, best_thr, side="left"))

# Plot PR
plt.figure(figsize=(6,4))
plt.plot(rec, prec, lw=2)
plt.scatter(rec[best_idx], prec[best_idx], s=40)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.grid(True)
plt.show()

# Metrics at suggested threshold
y_pred_best = (y_prob >= best_thr).astype(int)
print("=== Metrics @ best threshold ===")
print(classification_report(y_test, y_pred_best, digits=4))
print("Confusion matrix @ best threshold:")
print(confusion_matrix(y_test, y_pred_best))



## 4) Save artifacts (model, features, threshold)


In [None]:

import joblib, json

# Fitted estimator
joblib.dump(model, MODEL_PATH)

# Feature column order, as a JSON list
with open(FEATURES_PATH, "w") as features_file:
    features_file.write(json.dumps(FEATURE_ORDER, indent=2))

# Chosen operating threshold, as {"threshold": <float>}
threshold_payload = {"threshold": float(best_thr)}
with open(THRESHOLD_PATH, "w") as threshold_file:
    threshold_file.write(json.dumps(threshold_payload, indent=2))

print("Saved:", MODEL_PATH, FEATURES_PATH, THRESHOLD_PATH)



## 5) (Optional) Live predict via FastAPI

Start your API in a terminal:

```bash
uvicorn fraud_api:app --host 127.0.0.1 --port 8000
```

Then run this cell to send one transaction.


In [None]:

# %% Live predict demo (optional)
import requests, json as _json

if os.environ.get("ENABLE_LIVE_DEMO", "0") == "1":
    # Send the middle row of the loaded sample. NaN is not valid JSON, so
    # missing values go out as null, and numpy scalars are unwrapped to
    # plain Python values the JSON encoder understands.
    sample_row = df.iloc[len(df)//2]
    sample_tx = {
        key: (None if pd.isna(val) else (val.item() if hasattr(val, "item") else val))
        for key, val in sample_row.items()
    }
    try:
        r = requests.post(FASTAPI_URL, json=sample_tx, timeout=5)
        print("HTTP", r.status_code)
        print(r.text[:500])
        try:
            print(_json.dumps(r.json(), indent=2))
        except ValueError:
            # Body is not JSON; the raw text printed above is all there is.
            pass
    except Exception as e:
        # Demo boundary: report the failure instead of breaking the notebook.
        print("Live demo skipped / failed:", e)
else:
    print("Set ENABLE_LIVE_DEMO=1 to call the API from the notebook.")



## Appendix — Feature list
```
- age
- log_amt
- hour
- dow
- is_night
- city_pop
- lat
- long
- merch_lat
- merch_long
- velocity_60s
- velocity_5m
- velocity_15m
- velocity_1h
- unique_merchants_15m
- unique_categories_15m
- seen_merchant_before
- user_merchant_dist_km
- time_since_last_s
- time_since_last_merchant_s
- user_mean_amt_24h
- user_std_amt_24h
- user_amt_delta
- amt_z_user
- gender_M
- gender_F
```