In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest
from sklearn.metrics import average_precision_score, precision_recall_curve

# Use seaborn's "whitegrid" theme for the plots drawn in this notebook.
sns.set_theme(style="whitegrid")

In [None]:
# Demo dataset location; the path is relative, so it resolves against the working directory.
path = Path('data/transactions.csv')

if path.exists():
    df = pd.read_csv(path)
else:
    # Point the reader at the generator script instead of surfacing a bare read error.
    raise FileNotFoundError(f"Missing {path}. Run: python scripts/generate_demo_datasets.py (from repo root)")

# Preview the first rows as the cell output.
df.head()

## Quick EDA

In [None]:
# Summarise the model features plus the label; include='all' reports on every
# selected column regardless of dtype.
eda_cols = ['amount', 'hour', 'channel', 'country', 'device', 'is_fraud']
df[eda_cols].describe(include='all')

In [None]:
# Mean of the label column; this is the fraud share assuming 0/1 (or boolean) labels.
fraud_labels = df['is_fraud']
fraud_rate = fraud_labels.mean()
print(f'Rows: {len(df):,}  Fraud rate: {fraud_rate:.2%}')

## Model: Isolation Forest
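We train an unsupervised anomaly detector, then evaluate against the provided `is_fraud` label (for demo purposes). The label is never passed to `fit`, but it is used below to set the `contamination` prior, and the scores are evaluated on the same rows the model was trained on — so treat the metrics as illustrative rather than as an out-of-sample estimate.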

In [None]:
# Feature groups: numeric columns are standardised, categorical columns are one-hot encoded.
feature_cols_num = ['amount', 'hour']
feature_cols_cat = ['channel', 'country', 'device']
X = df[feature_cols_num + feature_cols_cat]
y = df['is_fraud'].astype(int)

numeric_steps = Pipeline([('scaler', StandardScaler())])
# handle_unknown='ignore': categories not seen during fit do not raise at transform time.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
pre = ColumnTransformer(
    transformers=[
        ('num', numeric_steps, feature_cols_num),
        ('cat', categorical_encoder, feature_cols_cat),
    ]
)

# Expected anomaly share: the observed fraud rate, clamped to [0.001, 0.10].
contamination = float(np.clip(y.mean(), 0.001, 0.10))
model = IsolationForest(
    n_estimators=300,
    contamination=contamination,
    random_state=42,
    n_jobs=-1,
)

# Preprocessing and detector chained into one estimator; shown as the cell output.
pipe = Pipeline(steps=[('pre', pre), ('model', model)])
pipe

In [None]:
# Fit preprocessing and the IsolationForest on the feature frame; `y` is not passed to fit.
pipe.fit(X)

# IsolationForest: lower scores are more anomalous; invert for 'fraud score' where higher is more suspicious.
# Pipeline.score_samples runs the fitted preprocessing steps and then the final
# estimator's score_samples, so there is no need to reach into named_steps by hand.
raw_score = pipe.score_samples(X)
fraud_score = -raw_score

# Scored on the same rows the model was fitted on, against the provided labels.
ap = average_precision_score(y_true=y, y_score=fraud_score)
print(f'Average precision (PR AUC): {ap:.4f}')

In [None]:
# Precision/recall pairs across all score thresholds (the thresholds themselves are not needed).
precision, recall, _ = precision_recall_curve(y, fraud_score)

# Explicit figure/axes interface rather than the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(recall, precision)
ax.set(
    title='Precision-Recall curve (Isolation Forest)',
    xlabel='Recall',
    ylabel='Precision',
    ylim=(0, 1),
)
plt.show()

## Investigate top anomalies

In [None]:
# Attach the score on a new frame so `df` itself is left untouched.
out = df.assign(fraud_score=fraud_score)

# The 25 highest-scoring (most suspicious) rows, shown with the columns useful for review.
review_cols = [
    'transaction_id', 'timestamp', 'amount', 'hour',
    'channel', 'country', 'device', 'is_fraud', 'fraud_score',
]
top = out.sort_values(by='fraud_score', ascending=False).head(25)
top[review_cols]