# 02 - Exploratory Data Analysis
Explore transaction patterns, identify fraud signals, and understand ARFI's data.

In [None]:
import sys
# Put the parent directory first on the import path so the local `src`
# package imported below can be resolved (assumes the notebook is run from
# a subdirectory of the project root - TODO confirm).
sys.path.insert(0, "..")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from src.visualize import plot_amount_histogram, plot_volume_by_time
# Apply seaborn's "whitegrid" theme to the plots drawn after this point.
sns.set_theme(style="whitegrid")

In [None]:
# Load every processed Parquet file into `datasets`, keyed by file stem.
data_dir = Path("../data/processed")
# Path.glob yields entries in arbitrary, filesystem-dependent order; sort so
# the dict order (and any "first dataset" fallback downstream) is reproducible.
parquet_files = sorted(data_dir.glob("*.parquet"))
datasets = {f.stem: pd.read_parquet(f) for f in parquet_files}
print(f"Loaded {len(datasets)} processed dataset(s)")
if not datasets:
    # Make the empty case explicit instead of silently printing nothing.
    print(f"  No .parquet files found in {data_dir.resolve()}")
for name, df in datasets.items():
    print(f"  {name}: {df.shape}")

## Transaction Amount Distributions
Look for clustering of amounts just below $10K, which can indicate structuring (splitting transactions to stay under a reporting threshold).

In [None]:
# Pick the main transaction dataset: the first one whose name contains
# "trans", otherwise the first dataset loaded (adjust key name as needed).
txn_key = next(
    (k for k in datasets if "trans" in k.lower()),
    next(iter(datasets), None),
)
# Bound up front so the name exists even when no dataset was loaded.
amt_col = None
if txn_key:
    txn = datasets[txn_key]
    # Identify the amount column: first column whose name contains "amount".
    amt_col = next((c for c in txn.columns if "amount" in c.lower()), None)
    if amt_col:
        fig = plot_amount_histogram(txn, amount_col=amt_col)
        plt.show()
        # Zoom into the $7K-$12K range. Absolute values are used so that
        # negative amounts are counted as well; both bounds are inclusive.
        abs_amount = txn[amt_col].abs()
        near_threshold = txn[abs_amount.between(7000, 12000)]
        # Guard the percentage against an empty transaction table.
        pct_near = len(near_threshold) / len(txn) * 100 if len(txn) else 0.0
        print(f"Transactions in $7K-$12K range: {len(near_threshold)} ({pct_near:.1f}%)")
    else:
        print("No amount column found - skipping amount distribution")

## Volume by Day / Hour / Channel

In [None]:
if txn_key:
    # The first column whose name mentions "date" or "time" is used as the
    # timestamp for the volume plot.
    date_col = next((c for c in txn.columns if "date" in c.lower() or "time" in c.lower()), None)
    if date_col:
        fig = plot_volume_by_time(txn, date_col=date_col)
        plt.show()
    else:
        print("No date/time column found - skipping volume-over-time plot")
    # Channel breakdown if available. It does not need a timestamp, so it
    # runs whether or not a date column was found.
    chan_col = next((c for c in txn.columns if "channel" in c.lower()), None)
    if chan_col:
        print("\nTransaction volume by channel:")
        print(txn[chan_col].value_counts())

## Customer Account Profiles
Accounts per customer, dormancy patterns.

In [None]:
if txn_key:
    # Heuristic column detection: take the first column whose lowercased
    # name contains the expected tokens.
    acct_col = next(
        (name for name in txn.columns if all(tok in name.lower() for tok in ("account", "id"))),
        None,
    )
    cust_col = next(
        (name for name in txn.columns if any(tok in name.lower() for tok in ("customer", "member"))),
        None,
    )
    if acct_col:
        # Per-account transaction count, plus the summed amount when an
        # amount column was identified earlier in the notebook.
        agg_spec = {"txn_count": pd.NamedAgg(column=acct_col, aggfunc="count")}
        if amt_col:
            agg_spec[f"total_{amt_col}"] = pd.NamedAgg(column=amt_col, aggfunc="sum")
        acct_summary = txn.groupby(acct_col).agg(**agg_spec)
        print("Account activity summary:")
        print(acct_summary.describe())
        if cust_col:
            # Number of distinct accounts seen for each customer/member.
            accts_per_customer = txn.groupby(cust_col)[acct_col].nunique()
            print(f"\nAccounts per customer: mean={accts_per_customer.mean():.1f}, max={accts_per_customer.max()}")
            print(accts_per_customer.value_counts().head(10))

## Inter-Account Transfer Network
Build a directed graph of account-to-account transfers and look for short cycles, a possible sign of kiting.

In [None]:
import networkx as nx

if txn_key:
    # Destination side of a transfer. acct_col itself is excluded so that a
    # column such as "dest_account_id" can never be both source and target.
    dest_col = next(
        (
            c
            for c in txn.columns
            if c != acct_col
            and any(tok in c.lower() for tok in ("dest", "recipient", "target"))
        ),
        None,
    )
    if acct_col and dest_col:
        # Only rows with both endpoints present describe a transfer edge.
        transfers = txn[[acct_col, dest_col]].dropna()
        # A DiGraph keeps one edge per (source, target) pair, so repeated
        # transfers between the same two accounts collapse into one edge.
        G = nx.from_pandas_edgelist(transfers, source=acct_col, target=dest_col, create_using=nx.DiGraph)
        print(f"Transfer network: {G.number_of_nodes()} accounts, {G.number_of_edges()} edges")
        # NOTE(review): `length_bound` needs a recent networkx release
        # (3.1 or later, per the networkx docs) - confirm the pinned version.
        cycles = list(nx.simple_cycles(G, length_bound=6))
        print(f"Cycles found (length <= 6): {len(cycles)}")
        if cycles:
            print("Sample cycles:", cycles[:5])
    elif not acct_col:
        # Report the column that is actually missing.
        print("No account ID column found - skipping network analysis")
    else:
        print("No destination account column found - skipping network analysis")

## Key Observations
- [ ] Structuring signal strength
- [ ] Dominant transaction channels
- [ ] Dormancy patterns
- [ ] Kiting network presence
- [ ] Anomalous accounts to investigate