In [None]:
# One collective end-to-end demo

def demo_all(raw_path="data/raw/raw.csv", plot_dir="data/out/plot/plots"):
    """Run the whole demo pipeline once and hand back every intermediate artefact.

    Steps, in order: load the raw CSV, engineer the typed table, build the
    daily panel, build the interval frame and its monthly sign-off trend,
    run the quick EDA overview, print both summaries, and produce the
    monthly-trend plots.

    Parameters
    ----------
    raw_path : str or pathlib.Path, optional
        Location of the raw CSV passed to ``load_raw``. Defaults to the path
        this demo previously hard-coded.
    plot_dir : str or pathlib.Path, optional
        Output location passed (as a string) to
        ``plot_pg_signoff_monthly_trends``. Defaults to the path this demo
        previously hard-coded.

    Returns
    -------
    dict
        Keys ``"raw"``, ``"typed"``, ``"daily"``, ``"backlog"``, ``"events"``,
        ``"di"``, ``"trend"``, ``"overview"``, ``"trend_all"`` and ``"plots"``.
        ``"backlog"`` holds the backlog series returned by
        ``build_daily_panel``.
    """
    from pathlib import Path

    import pandas as pd

    from preprocessing import load_raw, engineer
    from time_series import build_daily_panel
    from interval_analysis import IntervalAnalysis, plot_pg_signoff_monthly_trends
    from eda_opg import EDAConfig, OPGInvestigationEDA

    # Path() accepts both str and Path, so either form of raw_path works.
    raw, colmap = load_raw(Path(raw_path))
    typed = engineer(raw, colmap)

    # The separate build_backlog_series() result that used to be computed here
    # was never read afterwards; the backlog series used downstream (and
    # returned) is the one produced by build_daily_panel.
    daily, backlog_ts, events = build_daily_panel(typed)

    di = IntervalAnalysis.build_interval_frame(typed, backlog_series=backlog_ts)
    trend = IntervalAnalysis.monthly_trend(
        di, metric="days_to_pg_signoff", agg="median", by=["case_type"]
    ).copy()
    # NOTE(review): assumes "yyyymm" holds strings such as "2024-01" so that
    # appending "-01" yields a parseable first-of-month date — confirm.
    trend["month"] = pd.to_datetime(trend["yyyymm"] + "-01")

    cfg = EDAConfig(
        id_col="case_id",
        date_received="dt_received_inv",
        date_allocated="dt_alloc_invest",
        date_signed_off="dt_pg_signoff",
    )
    eda = OPGInvestigationEDA(typed, cfg)
    overview = eda.quick_overview()

    print("=== EDA OVERVIEW ===")
    print(overview)
    print("\n=== INTERVAL TREND HEAD ===")
    print(trend.head())

    # Produce the monthly sign-off trend plots for the interval frame.
    results = plot_pg_signoff_monthly_trends(di, str(plot_dir))

    return {
        "raw": raw,
        "typed": typed,
        "daily": daily,
        "backlog": backlog_ts,
        "events": events,
        "di": di,
        "trend": trend,
        "overview": overview,
        "trend_all": results["trend_all"],
        "plots": results["plots"],
    }



# NOTE(review): this import rebinds the name `demo_all` to whatever the
# `demo_pipeline` module exports, so a `demo_all` defined earlier in this cell
# is not the one executed below — confirm that is intended.
from demo_pipeline import demo_all
# Run the demo and keep its return value for inspection in later cells.
outputs = demo_all()



In [None]:

# demo_eda.py
# Small, self-contained demo that exercises key methods on synthetic OPG-like data.

import numpy as np  # numerical work (corr, quantiles)
import pandas as pd  # core dataframe operations
from eda_opg import EDAConfig, OPGInvestigationEDA

# ----- 1) Create a small synthetic dataset for demonstration -----
rng = np.random.default_rng(42)  # fixed seed so the synthetic data is reproducible
n = 2000

# Base dates: each case is received 0-299 days after the start date
start = pd.Timestamp("2024-01-01")
recv_dates = start + pd.to_timedelta(rng.integers(0, 300, size=n), unit="D")

# Allocation occurs for ~85% within 1-30 days; else censored (NaT)
alloc_delays = rng.integers(1, 31, size=n)
allocated_mask = rng.random(size=n) < 0.85
alloc_dates = pd.Series(recv_dates) + pd.to_timedelta(alloc_delays, unit="D")
alloc_dates = alloc_dates.where(allocated_mask, pd.NaT)

# Sign-off for ~70% within 20-120 days from received; else censored
signoff_delays = rng.integers(20, 121, size=n)
so_mask = rng.random(size=n) < 0.70
signoff_dates = pd.Series(recv_dates) + pd.to_timedelta(signoff_delays, unit="D")
signoff_dates = signoff_dates.where(so_mask, pd.NaT)

# Categorical fields
case_types = rng.choice(["LPA", "Deputyship", "Other"], size=n, p=[0.6, 0.3, 0.1])
risk_band = rng.choice(["Low", "Medium", "High"], size=n, p=[0.5, 0.35, 0.15])
teams = rng.choice(["Team A", "Team B", "Team C"], size=n, p=[0.4, 0.4, 0.2])
region = rng.choice(["North", "Midlands", "South"], size=n)

# Daily ops fields
investigators_on_duty = rng.integers(8, 20, size=n)  # rough proxy
allocations = rng.integers(0, 25, size=n)  # allocated on that day
backlog = np.maximum(
    0, 500 + rng.normal(0, 60, size=n).astype(int)
)  # evolving backlog proxy

# Target: legal review ~5%, with higher odds for High risk and longer allocation delay
# We'll simulate it based on logits to mimic a real signal.
# The allocation delay in days is taken straight from the simulated inputs:
# allocated cases use their drawn delay, censored (unallocated) cases count as 0.
# np.nan_to_num cannot be used on the date difference for this, because it only
# rewrites floating-point NaN/inf and leaves timedelta NaT values untouched.
alloc_delay_days = np.where(allocated_mask, alloc_delays, 0).astype(float)
base_logit = -3.0 + 0.02 * alloc_delay_days
risk_bump = np.select(
    [risk_band == "High", risk_band == "Medium"], [1.2, 0.4], default=0.0
)
logit = base_logit + risk_bump
prob = 1 / (1 + np.exp(-logit))  # logistic transform of the logit
legal_review = (rng.random(size=n) < prob).astype(int)

# Assemble DataFrame
df = pd.DataFrame(
    {
        "id": np.arange(1, n + 1),
        "date_received_opg": recv_dates,
        "date_allocated_investigator": alloc_dates,
        "date_pg_signoff": signoff_dates,
        "case_type": case_types,
        "risk_band": risk_band,
        "team": teams,
        "region": region,
        "investigators_on_duty": investigators_on_duty,
        "allocations": allocations,
        "backlog": backlog,
        "legal_review": legal_review,
    }
)

# ----- 2) Configure columns and instantiate the EDA toolkit -----
# Tell EDAConfig which columns of the synthetic frame play which role.
cfg = EDAConfig(
    **{
        # identifier and the three lifecycle dates
        "id_col": "id",
        "date_received": "date_received_opg",
        "date_allocated": "date_allocated_investigator",
        "date_signed_off": "date_pg_signoff",
        # binary target simulated for the synthetic data
        "target_col": "legal_review",
        # feature lists; days_to_alloc / days_to_signoff are not columns of df
        # (presumably derived by the toolkit from the dates — confirm)
        "numeric_cols": [
            "days_to_alloc",
            "days_to_signoff",
            "investigators_on_duty",
            "allocations",
            "backlog",
        ],
        "categorical_cols": ["case_type", "risk_band", "team", "region"],
        # time index and grouping columns
        "time_index_col": "date_received_opg",
        "team_col": "team",
        "risk_col": "risk_band",
        "case_type_col": "case_type",
    }
)

# Bind the synthetic frame to its column configuration.
eda = OPGInvestigationEDA(df, cfg)

# ----- 3) Run a few core EDA tasks (print or log these in practice) -----
# Each section prints a heading followed by the result of one toolkit method.
print("\n== QUICK OVERVIEW ==")
print(eda.quick_overview())

print("\n== MISSINGNESS ==")
print(eda.missingness_matrix().head(10))
print(
    "Missing 'days_to_signoff' vs target:\n", eda.missing_vs_target("days_to_signoff")
)

print("\n== OUTLIERS (days_to_signoff) ==")
print(eda.iqr_outliers("days_to_signoff"))

print("\n== CATEGORICAL SUMMARY (case_type × risk_band) ==")
summary = eda.group_summary(
    by=["case_type", "risk_band"],
    metrics={
        "n": ("id", "count"),
        "legal_rate": ("legal_review", "mean"),
        # "med_alloc" is the median allocation delay, so it aggregates
        # days_to_alloc (it previously pointed at days_to_signoff, which did
        # not match the metric's name).
        # NOTE(review): days_to_alloc is listed in numeric_cols but is not a
        # column of the input frame; presumably the toolkit derives it the same
        # way as days_to_signoff — confirm.
        "med_alloc": ("days_to_alloc", "median"),
    },
)
print(summary.head(12))

print("\n== NUMERIC CORRELATIONS (Spearman) ==")
print(eda.numeric_correlations("spearman"))

print("\n== REDUNDANCY DROP LIST (|r|>0.9) ==")
print(eda.redundancy_drop_list())

print("\n== CLASS IMBALANCE ==")
print(eda.imbalance_summary())

print("\n== LEAKAGE SCAN ==")
# Keywords handed to the leakage scan.
print(eda.leakage_scan(["post", "signed", "decision", "outcome"]))

print("\n== INTERACTION: risk_band × binned days_to_signoff -> legal_review rate ==")
print(eda.binned_interaction_rate("days_to_signoff", "risk_band"))

print("\n== RESAMPLED TIME SERIES (daily) ==")
# Output column name -> (source column, aggregation) pairs for the resample.
ts = eda.resample_time_series(
    {
        "backlog": ("backlog", "last"),
        "inv_mean": ("investigators_on_duty", "mean"),
    }
)
print(ts.tail())

print("\n== LAG CORRELATIONS: backlog vs inv_mean ==")
print(eda.lag_correlations(ts["backlog"], ts["inv_mean"]))

print("\n== KM QUANTILES by risk_band (signoff) ==")
# NOTE(review): "event_signed_off" is not a column of the input frame;
# presumably the toolkit derives it from the sign-off date — confirm.
print(eda.km_quantiles_by_group("days_to_signoff", "event_signed_off", "risk_band"))

print("\n== MONTHLY KPIs by team ==")
print(eda.monthly_kpis().head(12))



# Shell escape: run the `demo_eda` module as a script in a separate Python process.
!python -m demo_eda


In [None]:
# demo_pipeline
# run a preprocessing + time-series demo

from pathlib import Path

from preprocessing import load_raw, engineer
from time_series import (
    build_event_log,
    build_wip_series,
    build_backlog_series,
    build_daily_panel,
    summarise_daily_panel,
)

# 1) Read the raw extract, then derive the typed table from it
raw_path = Path("data/raw/raw.csv")  # adjust if needed
raw, colmap = load_raw(raw_path)
typed = engineer(raw, colmap)

# 2) Time-series artefacts, each built from the typed table
events = build_event_log(typed)
wip = build_wip_series(typed)
backlog = build_backlog_series(typed)

# Daily panel plus its companion backlog and event series.
# start / end / holidays are passed explicitly as None.
daily, backlog_ts, events_ts = build_daily_panel(
    typed,
    backlog_freq="W-FRI",
    pad_days=14,
    exclude_weekends=True,
    holidays=None,
    start=None,
    end=None,
)

# 3) Summarise the panel by date and team: once as-is, once with freq="W-FRI"
team_daily = summarise_daily_panel(daily, by=["date", "team"])
team_weekly = summarise_daily_panel(daily, freq="W-FRI", by=["date", "team"])

# Show the first rows of both summaries, one after the other.
print(team_daily.head(), team_weekly.head(), sep="\n")



In [None]:
# Shell escape: run the `demo_pipeline` module as a script in a separate Python process.
!python -m demo_pipeline


In [None]:
# IntervalAnalysis demo

from interval_analysis import IntervalAnalysis

# 1) Interval frame, built from the typed table and the backlog series that
#    the pipeline cell produced (`typed`, `backlog_ts` must already exist).
#    NOTE(review): backlog_series presumably needs ['date', 'backlog'] style
#    columns — confirm against build_interval_frame.
di = IntervalAnalysis.build_interval_frame(
    typed, bank_holidays=None, backlog_series=backlog_ts
)

print(di.head())

# 2) Median days to PG sign-off per month, split by case_type
trend = IntervalAnalysis.monthly_trend(
    di, by=["case_type"], agg="median", metric="days_to_pg_signoff"
)
print(trend.head())

# 3) Volatility score of days to PG sign-off per team, using freq="W"
vol = IntervalAnalysis.volatility_score(
    di, by=["team"], freq="W", metric="days_to_pg_signoff"
)
print(vol.head())


In [None]:
# run an EDA demo on real data

from eda_opg import EDAConfig, OPGInvestigationEDA

# Column configuration for running the toolkit on the interval frame `di`.
cfg = EDAConfig(
    # identifier and lifecycle dates
    id_col="case_id",
    date_received="dt_received_inv",  # or dt_received_opg, depending on your engineered columns
    date_allocated="dt_alloc_invest",
    date_signed_off="dt_pg_signoff",
    # no target / risk column configured here
    target_col=None,  # or "legal_review" if you have it
    risk_col=None,
    # grouping columns and time index
    team_col="team",
    case_type_col="case_type",
    time_index_col="date",  # 'date' from IntervalAnalysis.build_interval_frame
    # feature lists
    categorical_cols=["team", "case_type", "concern_type"],
    numeric_cols=["days_to_pg_signoff", "wip_load", "backlog"],
)

# `di` must already exist from the IntervalAnalysis demo cell.
eda = OPGInvestigationEDA(di, cfg)

# Heading first, then the corresponding report.
print("== QUICK OVERVIEW ==")
print(eda.quick_overview())

print("\n== MISSINGNESS ==")
print(eda.missingness_matrix().head())

print("\n== MONTHLY KPIs ==")
print(eda.monthly_kpis().head())

# You can extend this with the other methods: imbalance_summary, leakage_scan, km_quantiles_by_group, etc.