# 04 - CarMeg SanDiego Hunt
Identify the villain across ARFI's data using name matching, risk scores, and behavioral fingerprinting.

In [None]:
import sys
sys.path.insert(0, "..")
import pandas as pd
import numpy as np
from pathlib import Path

In [None]:
# Load every processed parquet file into a dict keyed by the file's stem.
data_dir = Path("../data/processed")
datasets = {}
for parquet_path in data_dir.glob("*.parquet"):
    datasets[parquet_path.stem] = pd.read_parquet(parquet_path)

# Composite risk scores are optional: fall back to an empty frame when the
# CSV has not been produced yet.
output_dir = Path("../output")
risk_scores_path = output_dir / "risk_scores.csv"
if risk_scores_path.exists():
    composite = pd.read_csv(risk_scores_path)
else:
    composite = pd.DataFrame()
print(f"Risk scores loaded: {composite.shape}")

## Strategy 1: Name Search
Case-insensitive substring match for 'CarMeg', 'SanDiego', 'Carmen', 'San Diego' (plus the shorter fragments 'carme' and 'diego') across all name-like columns.

In [None]:
# Case-insensitive substring search for CarMeg-like names in every column
# whose header looks like it holds a name.
name_patterns = ["carmeg", "sandiego", "carmen", "san diego", "carme", "diego"]
name_keywords = ["name", "first", "last", "owner", "customer_name", "member"]
suspects_name = []


def _is_name_column(column):
    """Return True when the column header contains one of the name keywords."""
    return any(keyword in column.lower() for keyword in name_keywords)


for ds_name, frame in datasets.items():
    name_cols = list(filter(_is_name_column, frame.columns))
    for col in name_cols:
        # Compare on lower-cased text so the match ignores case.
        lowered_values = frame[col].astype(str).str.lower()
        for pattern in name_patterns:
            hit_mask = lowered_values.str.contains(pattern, na=False)
            matches = frame[hit_mask]
            match_count = len(matches)
            if match_count == 0:
                continue
            print(f"[{ds_name}] '{pattern}' found in column '{col}': {match_count} match(es)")
            suspects_name.append(matches)

if not suspects_name:
    print("No name matches found - CarMeg may be using aliases")
else:
    # Overlapping patterns can return the same row several times; keep one copy.
    suspects_df = pd.concat(suspects_name).drop_duplicates()
    print(f"\nTotal name-based suspects: {len(suspects_df)}")
    display(suspects_df.head(20))

## Strategy 2: Top Risk Scores
Examine accounts with highest composite fraud risk.

In [None]:
import ast


def _count_detectors(value):
    """Return how many detectors a ``triggered_detectors`` cell lists.

    Strings are parsed as Python literals (e.g. "['a', 'b']"), lists are
    counted directly, and any other value counts as zero.
    """
    if isinstance(value, str):
        # SECURITY: this text comes from a CSV on disk. ast.literal_eval only
        # accepts literals, so it cannot execute code the way eval() could.
        return len(ast.literal_eval(value))
    if isinstance(value, list):
        return len(value)
    return 0


# Summarize the composite risk tiers and surface accounts hit by several detectors.
if len(composite) > 0:
    critical = composite[composite["risk_tier"] == "CRITICAL"]
    high = composite[composite["risk_tier"] == "HIGH"]
    print(f"CRITICAL accounts: {len(critical)}")
    print(f"HIGH accounts: {len(high)}")

    # Accounts flagged by multiple detectors are most suspicious
    if "triggered_detectors" in composite.columns:
        composite["detector_count"] = composite["triggered_detectors"].apply(_count_detectors)
        multi_flag = composite[composite["detector_count"] >= 2].sort_values("composite_score", ascending=False)
        print(f"\nAccounts flagged by 2+ detectors: {len(multi_flag)}")
        display(multi_flag.head(20))

## Strategy 3: Behavioral Fingerprint
CarMeg uses social engineering, regulatory awareness, and targets slow-to-notice clients.
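Look for accounts flagged by BOTH structuring AND account takeover (regulatory knowledge + social engineering). The cell below counts the accounts scoring above 0.5 in two or more detectors and prints the first ten with the detectors that flagged each.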

In [None]:
# Read each per-detector CSV from the output directory, keyed by detector name.
detector_files = list(output_dir.glob("detector_*.csv"))
detector_results = {}
for csv_path in detector_files:
    detector_name = csv_path.stem.replace("detector_", "")
    results = pd.read_csv(csv_path)
    detector_results[detector_name] = results
    print(f"Loaded {detector_name}: {len(results)} rows")

# Cross-reference the detectors: which accounts score high in more than one?
if detector_results:
    from collections import Counter

    # For every detector exposing both columns, collect accounts scoring above 0.5.
    required_cols = {"risk_score", "account_id"}
    high_risk_per_detector = {}
    for detector_name, results in detector_results.items():
        if not required_cols.issubset(results.columns):
            continue
        above_threshold = results["risk_score"] > 0.5
        high_risk_per_detector[detector_name] = set(results.loc[above_threshold, "account_id"].tolist())

    # Each detector contributes an account at most once, so the count per
    # account equals the number of detectors that flagged it.
    all_flagged = [acct for flagged in high_risk_per_detector.values() for acct in flagged]
    overlap = Counter(all_flagged)
    carmeg_candidates = [acct for acct, hits in overlap.most_common() if hits >= 2]
    print(f"\nAccounts flagged by 2+ detectors (score > 0.5): {len(carmeg_candidates)}")
    for acct in carmeg_candidates[:10]:
        detectors_hit = [
            detector_name
            for detector_name, flagged in high_risk_per_detector.items()
            if acct in flagged
        ]
        print(f"  {acct}: flagged by {detectors_hit}")

## Strategy 4: Timeline Reconstruction
Once CarMeg is identified, build a complete narrative.

In [None]:
# Set the CarMeg account ID here after identification
CARMEG_ACCOUNT_ID = None  # <-- Fill in after analysis

# Compare against None / "" explicitly so a legitimate falsy ID such as 0 is
# not mistaken for "not set yet".
if CARMEG_ACCOUNT_ID is not None and CARMEG_ACCOUNT_ID != "":
    # Prefer a dataset whose name contains "trans"; otherwise use the first one loaded.
    txn_key = next((k for k in datasets if "trans" in k.lower()), list(datasets.keys())[0] if datasets else None)
    if txn_key is None:
        print("No datasets loaded - cannot build a timeline")
    else:
        txn = datasets[txn_key]
        # First column whose header mentions both "account" and "id".
        acct_col = next((c for c in txn.columns if "account" in c.lower() and "id" in c.lower()), None)
        if acct_col is None:
            print(f"No account ID column found in dataset '{txn_key}' - cannot build a timeline")
        else:
            # Resolve the date/amount columns once and reuse them for sorting and plotting.
            date_col = next((c for c in txn.columns if "date" in c.lower()), None)
            amt_col = next((c for c in txn.columns if "amount" in c.lower()), None)

            # Sort by the date column when there is one; otherwise by the first column.
            sort_col = date_col if date_col is not None else txn.columns[0]
            carmeg_txns = txn[txn[acct_col] == CARMEG_ACCOUNT_ID].sort_values(sort_col)
            print(f"CarMeg's transactions: {len(carmeg_txns)}")
            display(carmeg_txns)

            from src.visualize import plot_account_timeline
            if date_col and amt_col:
                fig = plot_account_timeline(txn, CARMEG_ACCOUNT_ID, date_col=date_col, amount_col=amt_col,
                                           save_path="../output/figures/carmeg_timeline.png")
            else:
                print(f"Dataset '{txn_key}' lacks a date or amount column - skipping timeline plot")
else:
    print("Set CARMEG_ACCOUNT_ID above after running strategies 1-3")

## CarMeg Profile Summary
*(Fill in after analysis)*
- **Identity**: 
- **Accounts involved**: 
- **Fraud types used**: 
- **Total estimated losses**: 
- **Timeline**: 
- **Modus operandi**: 