# Deputyship Data Extraction

In [None]:
!pip install pydbtools

In [None]:
import pydbtools
import os
import calendar
import shutil
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import logging
from dateutil.relativedelta import relativedelta
# --- Warning and logging configuration for the notebook ---
# `warnings` is imported here, directly above its only use in this cell.
import warnings
# Ignore any warning whose message matches "divide by zero encountered in log".
# (The original note attributes these to statsmodels AIC/BIC calculations;
# statsmodels is not imported in the cells visible here — TODO confirm source.)
warnings.filterwarnings("ignore", message=".*divide by zero encountered in log.*")
# Optional logging format, left disabled: uncomment to configure INFO-level output.
#logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Silence every logging call of severity CRITICAL and below, i.e. all the
# logging.debug/info/warning calls made by the helper functions in this notebook.
logging.disable(logging.CRITICAL)

def parse_month(month_str: str) -> datetime:
    """Convert a 'YYYY-MM' string into a datetime.

    Surrounding whitespace is removed first, then any leading/trailing single
    or double quote characters, and the remainder is parsed with the '%Y-%m'
    format.
    """
    cleaned = month_str.strip().strip("'\"")
    parsed = datetime.strptime(cleaned, "%Y-%m")
    logging.debug(f"parse_month: parsed '{month_str}' to {parsed}")
    return parsed


# def clear_directory(path):
#     logging.info(f"clear_directory: clearing path {path}")
#     for filename in os.listdir(path):
#         file_path = os.path.join(path, filename)
#         try:
#             if os.path.isfile(file_path) or os.path.islink(file_path):
#                 os.unlink(file_path)
#                 logging.debug(f"Deleted file {file_path}")
#             elif os.path.isdir(file_path):
#                 shutil.rmtree(file_path)
#                 logging.debug(f"Deleted directory {file_path}")
#         except Exception as e:
#             logging.error(f"Failed to delete {file_path}. Reason: {e}")
def clear_directory(path: str):
    """Remove every entry inside `path`, leaving `path` itself in place.

    Regular files and symbolic links are unlinked; sub-directories are removed
    recursively. If `path` is not an existing directory the call does nothing.

    Deletion is best-effort: an entry that cannot be removed (an OSError from
    the filesystem) is reported with a warning and the remaining entries are
    still processed.
    """
    if not os.path.isdir(path):
        logging.debug(f"clear_directory: '{path}' is not a directory, skipping.")
        return
    # os.listdir returns a complete list up front, so deleting entries while
    # looping over it is safe.
    for filename in os.listdir(path):
        file_path = os.path.join(path, filename)
        try:
            # Links are unlinked (never followed), so a symlink pointing at a
            # directory removes only the link, not the target tree.
            if os.path.islink(file_path) or os.path.isfile(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except OSError as e:
            # Only filesystem errors are expected here; anything else is a
            # programming error and should surface instead of being logged away.
            logging.warning(f"Failed to delete {file_path}: {e}")

def fetch_cases_for_date(run_date: str) -> pd.DataFrame:
    """Run the (abbreviated) cases query and return the result as a DataFrame.

    The query text in this cell is a placeholder, so `run_date` is only used in
    the log messages. A later cell in this notebook defines
    `fetch_cases_for_date` again with the full SQL.
    """
    logging.info(f"fetch_cases_for_date: fetching data for {run_date}")
    sql = "..."  # trimmed for brevity
    cases = pydbtools.read_sql_query(sql)
    logging.info(f"fetch_cases_for_date: returned {len(cases)} rows for {run_date}")
    return cases


def generate_month_list(start_month: str, end_month: str):
    """List the month-start datetimes from start_month to end_month inclusive.

    Both arguments are 'YYYY-MM' strings handled by `parse_month`.
    Raises ValueError when start_month is later than end_month.
    """
    logging.info(f"generate_month_list: from {start_month} to {end_month}")
    first = parse_month(start_month)
    last = parse_month(end_month)
    if first > last:
        raise ValueError(f"Start month ({start_month}) is after end month ({end_month})")

    # Both bounds are first-of-month values, so the number of steps is simply
    # the whole-month distance between them.
    span = (last.year - first.year) * 12 + (last.month - first.month)
    collected = []
    for offset in range(span + 1):
        month_start = first + relativedelta(months=offset)
        collected.append(month_start)
        logging.debug(f"Added month {month_start}")
    logging.info(f"Generated {len(collected)} months")
    return collected


def last_day_of_month(dt: datetime) -> str:
    """Format the final calendar day of dt's month as 'YYYY-MM-DD'."""
    _, days_in_month = calendar.monthrange(dt.year, dt.month)
    month_end = dt.replace(day=days_in_month)
    result = month_end.strftime("%Y-%m-%d")
    logging.debug(f"last_day_of_month: for {dt} result {result}")
    return result

In [None]:

def fetch_cases_for_date(run_date: str) -> pd.DataFrame:
    """
    Fetch the cases snapshot and active fee reductions for one export date.

    The query reads the `opg_sirius_prod` tables restricted to rows whose
    `glueexporteddate` equals `run_date`, keeps cases whose order status is
    OPEN, ACTIVE or DUPLICATE, and left-joins the latest non-deleted fee
    reduction (highest id per finance client) that is in force on `run_date`.
    It is executed with pydbtools.read_sql_query, which returns a pandas
    DataFrame.

    Parameters:
        run_date: snapshot date as 'YYYY-MM-DD'.

    Returns:
        One row per matching case, ordered by order date.

    Raises:
        ValueError: if `run_date` is not a valid 'YYYY-MM-DD' date. The value
            is interpolated into the SQL text, so it is validated and
            normalised first and nothing but a plain ISO date can reach the
            query.
    """
    # Validate before building the SQL: only digits and hyphens survive this.
    snapshot_date = datetime.strptime(run_date, "%Y-%m-%d").date().isoformat()

    # NOTE(review): age_in_years uses FLOOR for the negative check but ROUND
    # for the returned value, and measures age at p.createddate rather than at
    # the snapshot date — confirm both are intended.
    query = f"""
    WITH active_fee_reductions AS (
      SELECT
        fc.client_id,
        SUBSTRING(fr.type,1,1) || LOWER(SUBSTRING(fr.type,2)) AS type,
        DATE(fr.startdate) AS startdate,
        DATE(fr.enddate)   AS enddate,
        fc.payment_method
      FROM opg_sirius_prod.fee_reduction fr
      JOIN opg_sirius_prod.finance_client fc
        ON fc.id = fr.finance_client_id
       AND fc.glueexporteddate = DATE('{snapshot_date}')
      JOIN (
        SELECT
          MAX(id)           AS id,
          finance_client_id
        FROM opg_sirius_prod.fee_reduction
        WHERE enddate           >= DATE('{snapshot_date}')
          AND startdate         <= DATE('{snapshot_date}')
          AND deleted            = FALSE
          AND glueexporteddate   = DATE('{snapshot_date}')
        GROUP BY finance_client_id
      ) latest ON latest.id = fr.id
      WHERE fr.glueexporteddate = DATE('{snapshot_date}')
    )
    SELECT
      c.glueexporteddate,
      c.caserecnumber            AS casenumber,
      c.uid                      AS siriusid,
      (
        SELECT supervisionlevel
        FROM opg_sirius_prod.supervision_level_log sll
        WHERE sll.order_id         = c.id
          AND sll.glueexporteddate = DATE('{snapshot_date}')
        ORDER BY sll.appliesfrom DESC
        LIMIT 1
      ) AS casesupervisionlevel,
      p.risk_score               AS CREC,
      c.casesubtype              AS orderType,
      c.orderdate                AS ordermadedate,
      c.orderstatus              AS orderStatus,
      afr.type                   AS feereductiontype,
      p.dob,
      CASE
        WHEN FLOOR(DATE_DIFF('day', p.dob, p.createddate) / 365.25) < 0 THEN 0
        ELSE ROUND(DATE_DIFF('day', p.dob, p.createddate) / 365.25)
      END AS age_in_years
    FROM opg_sirius_prod.persons p
    JOIN opg_sirius_prod.cases c
      ON p.id                   = c.client_id
     AND c.glueexporteddate     = DATE('{snapshot_date}')
    LEFT JOIN active_fee_reductions afr
      ON afr.client_id          = p.id
    WHERE c.orderstatus IN ('OPEN','ACTIVE','DUPLICATE')
      AND p.glueexporteddate     = DATE('{snapshot_date}')
    ORDER BY c.orderdate;
    """
    return pydbtools.read_sql_query(query)


def generate_month_list(start_month: str, end_month: str):
    """
    Build the inclusive sequence of month-start datetimes between two
    'YYYY-MM' strings. Raises ValueError if start_month is after end_month.
    """
    first, last = parse_month(start_month), parse_month(end_month)
    if first > last:
        raise ValueError(f"Start month ({start_month}) is after end month ({end_month})")

    # Both bounds fall on the first of a month, so the list length is the
    # whole-month distance plus one.
    month_count = (last.year - first.year) * 12 + (last.month - first.month) + 1
    return [first + relativedelta(months=offset) for offset in range(month_count)]

def last_day_of_month(dt: datetime) -> str:
    """Give the month-end date of dt's month, formatted as 'YYYY-MM-DD'."""
    _, days_in_month = calendar.monthrange(dt.year, dt.month)
    month_end = dt.replace(day=days_in_month)
    return month_end.strftime("%Y-%m-%d")
        
def export_monthly_reports(first_month: str, last_month: str, output_base="output") -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Export the month-end case snapshot for every month in a range.

    For each month from first_month to last_month (inclusive) the month-end
    snapshot is fetched with `fetch_cases_for_date`, tagged with a "month"
    column ('YYYY-MM'), written to '<output_base>/<YYYY-MM>/cases_<YYYY-MM>.csv'
    and added as a sheet of 'cases_<first>_to_<last>.xlsx'. All months are then
    concatenated into 'all_cases_<first>_to_<last>.csv' and a per-year summary
    is printed.

    WARNING: everything already inside `output_base` is deleted (via
    `clear_directory`) before any file is written.

    Parameters:
        first_month, last_month: 'YYYY-MM' strings; surrounding whitespace and
            quote characters are stripped.
        output_base: directory that receives all output files.

    Returns:
        (combined_df, summary_df). combined_df holds every fetched row plus the
        "month" and "year" columns. summary_df has one row per year and a final
        "all" row, with `total_orders` (row count) and `total_people` (number
        of distinct `casenumber` values).

    Raises:
        ValueError: propagated from `generate_month_list` when first_month is
            after last_month (or from month parsing on malformed input).
    """
    # Clean inputs
    clean_first = first_month.strip().strip("'\"")
    clean_last = last_month.strip().strip("'\"")

    # Generate all months
    months = generate_month_list(clean_first, clean_last)
    if not months:
        print("No months in range; nothing to do.")
        return pd.DataFrame(), pd.DataFrame()

    # Prepare output directory, then empty it (the directory, not the Excel path)
    os.makedirs(output_base, exist_ok=True)
    clear_directory(output_base)

    excel_filename = f"cases_{clean_first}_to_{clean_last}.xlsx"
    excel_path = os.path.join(output_base, excel_filename)

    # Each month's DataFrame, kept for the combined CSV and the summary
    all_months = []

    # Create Excel workbook and write each month's sheet
    with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
        for dt in months:
            month_tag = dt.strftime("%Y-%m")
            run_date = last_day_of_month(dt)

            # Fetch data for this month-end
            df = fetch_cases_for_date(run_date)

            # Tag the DataFrame with its month, then collect it
            df["month"] = month_tag
            all_months.append(df)

            # Save CSV for this month
            month_folder = os.path.join(output_base, month_tag)
            os.makedirs(month_folder, exist_ok=True)
            csv_path = os.path.join(month_folder, f"cases_{month_tag}.csv")
            df.to_csv(csv_path, index=False)

            # Add to Excel workbook
            df.to_excel(writer, sheet_name=month_tag, index=False)

            print(f"→ Saved CSV for {month_tag}: {csv_path}")

    # The workbook is only written to disk when the writer context closes, so
    # report success after the `with` block rather than inside it.
    print(f"→ Combined Excel workbook saved at: {excel_path}")

    # After all sheets are written, concatenate & export one big CSV
    if all_months:
        combined_df = pd.concat(all_months, ignore_index=True)
        combined_csv_path = os.path.join(
            output_base,
            f"all_cases_{clean_first}_to_{clean_last}.csv"
        )
        combined_df.to_csv(combined_csv_path, index=False)
        print(f"→ Combined CSV for all months saved at: {combined_csv_path}")
    else:
        combined_df = pd.DataFrame()

    # ---- Summary section ----
    if not combined_df.empty:
        # Extract year from the month tag (added after the CSV export, so the
        # "year" column appears only in the returned frame, not in the file)
        combined_df["year"] = pd.to_datetime(combined_df["month"], format="%Y-%m").dt.year

        # Annual summary
        annual = (
            combined_df
            .groupby("year")
            .agg(
                total_orders=("casenumber", "size"),
                total_people=("casenumber", "nunique")
            )
            .reset_index()
        )

        # Overall summary across the entire period
        overall = pd.DataFrame([{
            "year": "all",
            "total_orders": combined_df.shape[0],
            "total_people": combined_df["casenumber"].nunique()
        }])

        # Combine for easy comparison
        summary_df = pd.concat([annual, overall], ignore_index=True)

        # Print to console
        print("\n=== Orders & Unique-People Summary ===")
        print(summary_df.to_string(index=False))
    else:
        summary_df = pd.DataFrame()
        print("No data to summarise.")

    return combined_df, summary_df

In [None]:
def calculate_monthly_active_cases(
    df: pd.DataFrame,
    first_month: str,
    last_month: str,
    output_base="output"
) -> tuple[list[pd.DataFrame], pd.DataFrame, pd.DataFrame]:
    """
    Aggregate unique casenumber counts by order type for each month in a range.

    For each month between first_month and last_month (inclusive) a copy of
    `df` is tagged with that month ('YYYY-MM' in a "month" column, replacing
    any existing one) and the number of distinct `casenumber` values per
    `ordertype` is counted. No data is fetched here and no order-status filter
    is applied: `df` is used as supplied.

    NOTE(review): the same rows of `df` are counted for every month. If `df`
    already carries per-month data (e.g. a "month" column from a multi-month
    export), the caller presumably wants it filtered per month — confirm.

    Writes 'monthly_active_cases_<first>_to_<last>.csv' under output_base and
    prints yearly order and unique-case counts.

    Parameters:
        df: case rows; must contain `casenumber` and `ordertype` columns.
            The frame is not modified.
        first_month, last_month: 'YYYY-MM' strings; surrounding whitespace and
            quote characters are stripped.
        output_base: directory that receives the CSV.

    Returns:
        (active_data, result_df, summary_df):
          - active_data: list with one month-tagged copy of `df` per month
          - result_df: month/ordertype aggregates (`active_case_count`)
          - summary_df: per-year `order_count` and `unique_cases`

    Raises:
        ValueError: propagated from `generate_month_list` when first_month is
            after last_month.
    """
    # Clean inputs
    clean_first = first_month.strip().strip("'\"")
    clean_last = last_month.strip().strip("'\"")

    # Generate all months
    months = generate_month_list(clean_first, clean_last)
    if not months:
        print("No months in range; nothing to do.")
        # Same arity as the normal return so callers can always unpack three values.
        return [], pd.DataFrame(), pd.DataFrame()

    # Prepare output directory
    os.makedirs(output_base, exist_ok=True)

    # Lists to collect each month's summary and month-tagged data
    summaries = []
    active_data = []

    for dt in months:
        month_tag = dt.strftime("%Y-%m")

        # Work on a copy: tagging `df` itself would mutate the caller's frame
        # and make every element of `active_data` the same object, all ending
        # up with the last month's tag.
        df_active = df.copy()
        df_active["month"] = month_tag

        # Accumulate the tagged rows for the yearly summary
        active_data.append(df_active)

        # Aggregate unique casenumbers per order type
        if df_active.empty:
            summaries.append(
                pd.DataFrame([{
                    "month": month_tag,
                    "orderType": None,
                    "active_case_count": 0
                }])
            )
        else:
            summary = (
                df_active
                .groupby(["month", "ordertype"], observed=False)["casenumber"]
                .nunique()
                .reset_index(name="active_case_count")
            )
            summaries.append(summary)

        print(f"→ Aggregated ACTIVE cases for {month_tag}")

    # Combine monthly summaries
    result_df = pd.concat(summaries, ignore_index=True)

    # Write out CSV
    out_csv = os.path.join(
        output_base,
        f"monthly_active_cases_{clean_first}_to_{clean_last}.csv"
    )
    result_df.to_csv(out_csv, index=False)
    print(f"→ Monthly ACTIVE cases CSV saved at: {out_csv}")

    # ---- Yearly summary ----
    if active_data:
        combined_active = pd.concat(active_data, ignore_index=True)
        combined_active["year"] = pd.to_datetime(
            combined_active["month"], format="%Y-%m"
        ).dt.year

        # Total rows (orders) per year
        orders_year = (
            combined_active
            .groupby("year")
            .size()
            .reset_index(name="order_count")
        )

        # Distinct casenumbers per year
        people_year = (
            combined_active
            .groupby("year")["casenumber"]
            .nunique()
            .reset_index(name="unique_cases")
        )

        summary_df = orders_year.merge(people_year, on="year")

        print("\n=== Yearly Active Orders & Unique Cases ===")
        print(summary_df.to_string(index=False))
    else:
        summary_df = pd.DataFrame()
        print("No ACTIVE data to summarise.")

    return active_data, result_df, summary_df


In [None]:
def calculate_monthly_flow(
    first_month: str,
    last_month: str,
    output_base="output"
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Compute month-on-month entries and exits of casenumbers.

    For each month from first_month to last_month (inclusive) the month-end
    snapshot is fetched with `fetch_cases_for_date` and reduced to its set of
    distinct casenumbers. Each month is then compared with the previous one to
    count how many casenumbers entered, how many exited, and how many were
    present. The first month has no predecessor and is dropped from the output.

    Writes 'monthly_flow_<first>_to_<last>.csv' under output_base and prints a
    yearly summary.

    Parameters:
        first_month, last_month: 'YYYY-MM' strings; surrounding whitespace and
            quote characters are stripped.
        output_base: directory that receives the CSV.

    Returns:
        (flow_df, summary_df):
          - flow_df: one row per month after the first, with `active_count`,
            `entered`, `exited` and a derived `year` column
          - summary_df: per year, the number of distinct casenumbers that
            entered, exited, or were present in any month of that year

    Raises:
        ValueError: propagated from `generate_month_list` when first_month is
            after last_month.
    """
    # Clean inputs
    clean_first = first_month.strip().strip("'\"")
    clean_last  = last_month.strip().strip("'\"")

    # Generate month list
    months = generate_month_list(clean_first, clean_last)
    if not months:
        print("No months in range; nothing to do.")
        return pd.DataFrame(), pd.DataFrame()

    # Snapshot the distinct casenumbers per month
    snapshots = {}
    for dt in months:
        tag = dt.strftime("%Y-%m")

        df = fetch_cases_for_date(last_day_of_month(dt))

        snapshots[tag] = set(df["casenumber"].unique())
        print(f"→ Snapshot for {tag}: {len(snapshots[tag])} active cases")

    # Build flow records; 'YYYY-MM' tags sort chronologically as plain strings
    flow_records = []
    entered_sets = {}
    exited_sets  = {}
    prev_tag = None

    for tag in sorted(snapshots):
        current = snapshots[tag]

        if prev_tag is None:
            entered = current
            exited  = set()
        else:
            prev = snapshots[prev_tag]
            entered = current - prev
            exited  = prev - current

        entered_sets[tag] = entered
        exited_sets[tag]  = exited

        flow_records.append({
            "month":        tag,
            "active_count": len(current),
            "entered":      len(entered),
            "exited":       len(exited)
        })
        prev_tag = tag

    # Remove the first record: with no previous month it would report the whole
    # snapshot as "entered" and zero as "exited". Take an explicit copy so the
    # "year" column below is added to an independent frame, not to a slice.
    flow_df = pd.DataFrame(flow_records).iloc[1:].copy()

    os.makedirs(output_base, exist_ok=True)
    out_csv = os.path.join(output_base, f"monthly_flow_{clean_first}_to_{clean_last}.csv")
    flow_df.to_csv(out_csv, index=False)
    print(f"→ Monthly flow CSV saved at: {out_csv}")

    # Yearly summary (the "year" column is added after the CSV is written)
    flow_df["year"] = pd.to_datetime(flow_df["month"], format="%Y-%m").dt.year
    summary_records = []

    for year, group in flow_df.groupby("year"):
        months_in_year = group["month"].tolist()

        # Union across the year's months so a casenumber is counted once per year
        unique_entered = len(set().union(*(entered_sets[m] for m in months_in_year)))
        unique_exited  = len(set().union(*(exited_sets[m]  for m in months_in_year)))
        unique_active  = len(set().union(*(snapshots[m]     for m in months_in_year)))

        summary_records.append({
            "year":            year,
            "entered_people":  unique_entered,
            "exited_people":   unique_exited,
            "active_clients":  unique_active
        })

    summary_df = pd.DataFrame(summary_records)
    print("\n=== Yearly Flow & Active Summary ===")
    print(summary_df.to_string(index=False))

    return flow_df, summary_df


In [None]:
def calculate_yearonyear_flows_and_age_rates(
    first_month: str,
    last_month: str,
    output_base: str = "output",
    redistribute_unknown_age: bool = False,
    age_bins: tuple = None,
    age_labels: tuple = None
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Compute year-on-year entries/exits of casenumbers and per-age rates.

    Every month in the range whose 12-month-earlier month is also inside the
    range is compared with that earlier month: both month-end snapshots are
    fetched with `fetch_cases_for_date`, and casenumbers present only in the
    current snapshot count as entered, those present only in the earlier one as
    exited. Rows are bucketed by `age_in_years`; ages that are missing or fall
    outside the bins go to an "Unknown" group.

    Writes 'flows_<first>_to_<last>.csv' and 'ages_<first>_to_<last>.csv' under
    output_base and prints yearly and monthly summaries.

    NOTE(review): the per-age "clients" figures are row counts, identical to
    the "orders" figures, and `active_current` / `active_orders` are counts of
    distinct casenumbers — confirm these are the intended definitions.

    Parameters:
        first_month, last_month: 'YYYY-MM' strings; surrounding whitespace and
            quote characters are stripped for the output file names.
        output_base: directory that receives the CSV files.
        redistribute_unknown_age: when True, each "Unknown" count is spread
            over the known age groups in proportion to their counts. Shares
            are rounded independently, so totals may shift slightly.
        age_bins: bin edges for pd.cut (left-closed intervals). Defaults to
            one-year bins.
        age_labels: one label per bin; an "Unknown" entry is optional and is
            always placed last. Ignored unless age_bins is also given.

    Returns:
        (flows_df, ages_df, yearly_df, monthly_df). All four are empty (with
        their columns present) when the range is too short for any month to
        have a comparison month.

    Raises:
        ValueError: propagated from `generate_month_list` when first_month is
            after last_month.
    """
    logging.info(f"Calculating year-on-year flows and age rates from "
                 f"{first_month} to {last_month}, redistribute_unknown_age={redistribute_unknown_age}")
    os.makedirs(output_base, exist_ok=True)

    # Clean inputs for the file names, as the other export functions do
    clean_first = first_month.strip().strip("'\"")
    clean_last = last_month.strip().strip("'\"")

    # Default 1-year bins if none supplied
    if age_bins is None or age_labels is None:
        age_bins   = list(range(0, 107))
        age_labels = [str(a) for a in age_bins[:-1]]
    # Labels may arrive as a tuple and may or may not include "Unknown";
    # normalise to a list of real labels with "Unknown" as the final bucket.
    cut_labels = [lbl for lbl in age_labels if lbl != "Unknown"]
    age_labels = cut_labels + ["Unknown"]

    def rows_per_age(frame: pd.DataFrame) -> pd.Series:
        """Row count per age group, in `age_labels` order, zero-filled."""
        return (
            frame.groupby("age_group")["casenumber"]
            .size()
            .reindex(age_labels, fill_value=0)
        )

    # Storage
    flow_records = []
    age_records = []
    snapshots_cur  = {}
    entered_sets   = {}
    exited_sets    = {}

    range_start = parse_month(first_month)

    for dt in generate_month_list(first_month, last_month):
        # Skip months whose comparison month lies before the requested range
        prev_dt = dt - relativedelta(years=1)
        if prev_dt < range_start:
            continue

        tag = dt.strftime("%Y-%m")
        logging.info(f"Processing month {tag}")

        df_cur  = fetch_cases_for_date(last_day_of_month(dt))
        df_prev = fetch_cases_for_date(last_day_of_month(prev_dt))

        set_cur, set_prev = set(df_cur["casenumber"]), set(df_prev["casenumber"])
        snapshots_cur[tag] = set_cur
        entered_sets[tag]  = set_cur - set_prev
        exited_sets[tag]   = set_prev - set_cur

        # Rows belonging to entered / exited casenumbers (copies, so the
        # age_group column below is not written into the fetched frames)
        df_base = df_prev.copy()
        df_in   = df_cur[df_cur["casenumber"].isin(entered_sets[tag])].copy()
        df_term = df_prev[df_prev["casenumber"].isin(exited_sets[tag])].copy()

        flow_records.append({
            "month":                tag,
            "active_current":       len(set_cur),
            "active_previous":      len(set_prev),
            "entered_orders":       len(df_in),
            "entered_clients":      len(entered_sets[tag]),
            "exited_orders":        len(df_term),
            "exited_clients":       len(exited_sets[tag]),
        })

        for d in (df_base, df_in, df_term):
            d["age_group"] = pd.cut(
                d["age_in_years"], bins=age_bins,
                labels=cut_labels, right=False, include_lowest=True
            ).astype(object).fillna("Unknown")

        # Counts per age group. "clients" and "orders" are both row counts;
        # each is its own Series because redistribution modifies them in place.
        base_clients = rows_per_age(df_base)
        in_clients   = rows_per_age(df_in)
        term_clients = rows_per_age(df_term)
        base_orders  = rows_per_age(df_base)
        in_orders    = rows_per_age(df_in)
        term_orders  = rows_per_age(df_term)

        # Redistribute unknowns if requested
        if redistribute_unknown_age:
            for series in (in_orders, term_orders, base_orders,
                           in_clients, term_clients, base_clients):
                unknown = series["Unknown"]
                known = series.drop("Unknown")
                known_total = known.sum()
                # With no known ages there are no proportions to follow (and
                # dividing would give NaN), so leave the Unknown count as is.
                if unknown > 0 and known_total > 0:
                    alloc = (known / known_total * unknown).round().astype(int)
                    series.loc[known.index] += alloc
                    series["Unknown"] = 0

        # Record per age group
        for grp in age_labels:
            active = base_clients[grp]
            ords   = base_orders[grp]
            ent_o  = in_orders[grp]
            ent_c  = in_clients[grp]
            term_o = term_orders[grp]
            term_c = term_clients[grp]
            rate_c = (term_c / active) if active else 0
            rate_o = (term_o / ords) if ords else 0

            age_records.append({
                "month":                 tag,
                "age_group":             grp,
                "active_orders_age":     ords,
                "active_clients_age":    active,
                "entered_orders_age":    ent_o,
                "entered_clients_age":   ent_c,
                "exited_orders_age":     term_o,
                "exited_clients_age":    term_c,
                "termination_rate_clients": rate_c,
                "termination_rate_orders":  rate_o
            })

    # Build DataFrames with explicit columns so they keep their schema even
    # when no month qualified.
    flow_columns = [
        "month", "active_current", "active_previous",
        "entered_orders", "entered_clients", "exited_orders", "exited_clients",
    ]
    age_columns = [
        "month", "age_group", "active_orders_age", "active_clients_age",
        "entered_orders_age", "entered_clients_age",
        "exited_orders_age", "exited_clients_age",
        "termination_rate_clients", "termination_rate_orders",
    ]
    summary_value_columns = [
        "active_orders", "active_clients", "entered_orders",
        "entered_clients", "exited_orders", "exited_clients",
    ]
    flows_df = pd.DataFrame(flow_records, columns=flow_columns)
    ages_df  = pd.DataFrame(age_records, columns=age_columns)

    # Save CSVs
    flows_df.to_csv(os.path.join(output_base, f"flows_{clean_first}_to_{clean_last}.csv"), index=False)
    ages_df.to_csv(os.path.join(output_base, f"ages_{clean_first}_to_{clean_last}.csv"), index=False)

    if not flow_records:
        # Happens when the range spans fewer than 13 months: no month has its
        # 12-month-earlier comparison month inside the range.
        print("No month in range has a comparison month 12 months earlier; nothing to summarise.")
        flows_df["year"] = pd.Series(dtype="int64")
        yearly_df = pd.DataFrame(columns=["year"] + summary_value_columns)
        monthly_df = pd.DataFrame(columns=["month"] + summary_value_columns)
        return flows_df, ages_df, yearly_df, monthly_df

    # Yearly summary (the "year" column is added after the CSV is written)
    flows_df["year"] = pd.to_datetime(flows_df["month"], format="%Y-%m").dt.year
    year_recs = []
    for yr, g in flows_df.groupby("year"):
        year_recs.append({
            "year":             yr,
            "active_orders":    g["active_current"].sum(),
            # Unions across the year's months count each casenumber once per year
            "active_clients":   len(set().union(*(snapshots_cur[m] for m in g["month"]))),
            "entered_orders":   g["entered_orders"].sum(),
            "entered_clients":  len(set().union(*(entered_sets[m]   for m in g["month"]))),
            "exited_orders":    g["exited_orders"].sum(),
            "exited_clients":   len(set().union(*(exited_sets[m]    for m in g["month"])))
        })
    yearly_df = pd.DataFrame(year_recs)
    print("\n=== Yearly Summary ===")
    print(yearly_df.to_string(index=False))

    # Monthly summary: the per-month figures are already columns of flows_df
    # (the *_clients values equal the sizes of the stored sets).
    monthly_df = pd.DataFrame({
        "month":           flows_df["month"],
        "active_orders":   flows_df["active_current"],
        "active_clients":  flows_df["active_current"],
        "entered_orders":  flows_df["entered_orders"],
        "entered_clients": flows_df["entered_clients"],
        "exited_orders":   flows_df["exited_orders"],
        "exited_clients":  flows_df["exited_clients"],
    })
    print("\n=== Monthly Summary ===")
    print(monthly_df.to_string(index=False))

    return flows_df, ages_df, yearly_df, monthly_df


In [None]:
# Append current and forecasted tables
def get_combined_age_deputyship_table(tbl1, tbl2):
    """Stack tbl1 on top of tbl2 and return the result with a fresh 0..n-1 index."""
    tables = [tbl1, tbl2]
    return pd.concat(tables, ignore_index=True)

In [None]:
def calculate_yearonyear_flows_and_age_rates(
    first_month: str,
    last_month: str,
    output_base: str = "output",
    redistribute_unknown_age: bool = False,
    age_bins: tuple = None,
    age_labels: tuple = None
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Calculate year-on-year flows (entries/exits) and age-specific rates.

    For every month in [first_month, last_month] whose same month one year
    earlier is not before `first_month`, the month-end snapshot is compared with
    the prior-year month-end snapshot:

      * entered = casenumbers in the current snapshot but not the prior one
      * exited  = casenumbers in the prior snapshot but not the current one
      * base    = the whole prior-year snapshot (denominator of the rates)

    Rows are grouped by age using `pd.cut` on the `age_in_years` column; missing
    or out-of-range ages are labelled "Unknown". When `redistribute_unknown_age`
    is True, each Unknown row is reassigned to a concrete age group using integer
    allocations proportional to the known composition (Hamilton apportionment),
    and the row's 'age_group' value is updated accordingly. When it is False,
    Unknown rows are simply not represented in the per-age output.

    Args:
        first_month: Start of the window, 'YYYY-MM'.
        last_month: End of the window, 'YYYY-MM'.
        output_base: Directory the two CSV outputs are written to (created if
            missing).
        redistribute_unknown_age: Impute "Unknown" age groups as described above.
        age_bins: Bin edges for `pd.cut` (left-closed bins). Only honoured when
            `age_labels` is also given; if either is None both fall back to the
            defaults.
        age_labels: Labels for the bins; an "Unknown" entry, if present, is
            ignored.

    Returns:
        (flows_df, ages_df, summary_df, monthly_summary_df):
          - flows_df: one row per processed month with snapshot sizes and
            entered/exited counts, plus a 'year' column.
          - ages_df: one row per (month, age_group) with base counts, entries,
            terminations, termination rate and retention rate.
          - summary_df: per calendar year, the number of distinct casenumbers
            entered / exited / active across that year's processed months.
          - monthly_summary_df: the same three counts per processed month.
        All four frames are empty (but carry their columns) when no month in the
        window has a prior-year comparison, i.e. the window is shorter than
        13 months.
    """

    # --- Local import for numerical helpers (keeps dependency scope clear)
    import numpy as np

    # --- Log the operation and key parameters for traceability
    logging.info(
        f"Calculating year-on-year flows and age rates from {first_month} to {last_month}, "
        f"redistribute_unknown_age={redistribute_unknown_age}"
    )

    # --- Ensure the output directory exists to avoid file write errors later
    os.makedirs(output_base, exist_ok=True)

    # --- Establish age bins and labels.
    # If the caller didn't supply BOTH bins and labels, build the defaults:
    # edges 0, 1, ..., 106 give the one-year bins [0,1), [1,2), ..., [105,106),
    # labelled '0'..'105'. An age of 106 or more is therefore out of range and
    # ends up as "Unknown".
    # NOTE(review): an earlier comment described the default as "0–106 inclusive";
    # if that is the intent the edges need to be range(0, 108) — confirm.
    if age_bins is None or age_labels is None:
        age_bins = list(range(0, 107))
        age_labels = [str(a) for a in age_bins[:-1]]

    # --- Work with labels that do NOT include 'Unknown' for the final outputs
    labels_for_cut = [lbl for lbl in age_labels if lbl != "Unknown"]  # Used by pd.cut
    final_age_labels = labels_for_cut[:]                               # Output index order

    # --- Column layouts, fixed up front so that empty results keep their schema
    flow_columns = [
        "month", "active_count_current", "active_count_previous", "entered", "exited"
    ]
    age_rate_columns = [
        "month", "age_group", "active_count", "active_orders_age", "active_clients_age",
        "entered", "terminations", "termination_rate", "retention_rate"
    ]
    people_columns = ["entered_people", "exited_people", "active_clients"]

    # --- Helper: cut ages into groups and set 'Unknown' for out-of-range/missing ages
    def _assign_age_groups_inplace(df: pd.DataFrame) -> None:
        """Add/overwrite df['age_group'] using bins/labels; out-of-range/missing -> 'Unknown'."""
        df["age_group"] = pd.cut(
            df["age_in_years"],               # Source age column (assumed present)
            bins=age_bins,                    # Bin edges
            labels=labels_for_cut,            # Only concrete labels
            right=False,                      # Left-closed, right-open bins
            include_lowest=True
        )
        # Convert NaN categories (missing/out-of-range) into the literal string "Unknown"
        df["age_group"] = df["age_group"].astype(object).where(df["age_group"].notna(), "Unknown")

    # --- Helper: proportional integer allocation (Hamilton apportionment)
    def _proportional_integer_allocation(known_counts: pd.Series, unknown_total: int) -> pd.Series:
        """
        Allocate 'unknown_total' integer units across the final age labels
        proportionally to known_counts (or uniformly if all counts are zero).
        Returns a Series of integer allocations indexed by final_age_labels.
        """
        # Align to the final labels; labels without any known cases weigh zero
        known_counts = known_counts.reindex(final_age_labels, fill_value=0)
        total_known = known_counts.sum()

        if total_known == 0:
            # No known information at all: uniform shares across the age groups
            raw = pd.Series(np.full(len(final_age_labels), unknown_total / max(len(final_age_labels), 1.0)),
                            index=final_age_labels, dtype=float)
        else:
            # Proportional shares: each group's fraction times unknown_total
            raw = (known_counts / total_known) * unknown_total

        # Whole units first; flooring leaves 'remainder' units still to hand out
        base = np.floor(raw).astype(int)
        remainder = int(unknown_total - base.sum())
        frac = raw - base

        # Deterministic tie-break: largest fractional part first, then label ascending
        order = sorted(final_age_labels, key=lambda x: (-frac.loc[x], x))

        # One extra unit for each of the top 'remainder' labels
        for i in range(remainder):
            base.loc[order[i]] += 1

        return base.reindex(final_age_labels).astype(int)

    # --- Helper: impute unknown ages row-wise, deterministically, in-place
    def _impute_unknowns_inplace(df: pd.DataFrame, id_col: str = "casenumber") -> None:
        """
        Reassign the 'Unknown' rows of df['age_group'] to concrete age groups.

        Allocation weights come from df's own composition: the number of unique
        {id_col} values per known age group. Rows are assigned in sorted index
        order, so the result is reproducible.
        """
        unknown_idx = df.index[df["age_group"] == "Unknown"]
        if len(unknown_idx) == 0:
            return  # Nothing to impute

        # Weights from the known rows only (Unknown excluded to avoid circularity)
        known_unique = (
            df.loc[df["age_group"] != "Unknown", ["age_group", id_col]]
              .drop_duplicates()
              .groupby("age_group", observed=False)[id_col]
              .nunique()
              .reindex(final_age_labels, fill_value=0)
        )

        # How many Unknown rows each age group receives
        allocations = _proportional_integer_allocation(known_unique, len(unknown_idx))

        # Walk the sorted Unknown rows and hand out consecutive blocks per label
        unknown_idx_sorted = sorted(unknown_idx.tolist())
        cursor = 0
        for lbl in final_age_labels:
            k = int(allocations.get(lbl, 0))
            if k > 0:
                take = unknown_idx_sorted[cursor: cursor + k]
                df.loc[take, "age_group"] = lbl
                cursor += k

        # Safety net: anything still Unknown goes to the first label
        if (df["age_group"] == "Unknown").any():
            leftovers = df.index[df["age_group"] == "Unknown"]
            fallback = final_age_labels[0] if final_age_labels else "0"
            df.loc[leftovers, "age_group"] = fallback

    # --- Helper: per-age-group aggregate of casenumber, aligned to the final labels
    def _casenumbers_by_age(df: pd.DataFrame, how: str) -> pd.Series:
        """Return nunique ('nunique') or row count ('count') of casenumber per age group."""
        grouped = df.groupby("age_group", observed=False)["casenumber"]
        per_group = grouped.nunique() if how == "nunique" else grouped.count()
        return per_group.reindex(final_age_labels, fill_value=0)

    # --- Containers to accumulate per-month analytics
    flow_records = []         # Overall monthly counts (active, entered, exited)
    age_rate_records = []     # Per-month x age-group counts and rates
    snapshots_cur = {}        # month tag -> set of active casenumbers (current month)
    snapshots_prev = {}       # month tag -> set of active casenumbers (prev-year same month)
    entered_sets = {}         # month tag -> set of casenumbers entered
    exited_sets = {}          # month tag -> set of casenumbers exited

    # The window start never changes, so parse it once rather than per month
    window_start = parse_month(first_month)

    # --- Iterate each month in the requested window
    for dt in generate_month_list(first_month, last_month):
        # Same month one year earlier, for the YoY comparison
        prev_dt = dt - relativedelta(years=1)

        # Skip early months that don't have a prior-year comparison within the window
        if prev_dt < window_start:
            continue

        tag = dt.strftime("%Y-%m")
        logging.info(f"Processing month {tag}")

        # Month-end snapshots for the current and the prior-year month
        df_cur = fetch_cases_for_date(last_day_of_month(dt))
        df_prev = fetch_cases_for_date(last_day_of_month(prev_dt))

        # ID sets for set arithmetic; kept for the people-level summaries below
        set_cur = set(df_cur["casenumber"])
        set_prev = set(df_prev["casenumber"])
        snapshots_cur[tag] = set_cur
        snapshots_prev[tag] = set_prev

        # Entrants are in current but not in previous; exits are the opposite
        entered = set_cur - set_prev
        exited = set_prev - set_cur
        entered_sets[tag] = entered
        exited_sets[tag] = exited

        flow_records.append({
            "month":                 tag,
            "active_count_current":  len(set_cur),
            "active_count_previous": len(set_prev),
            "entered":               len(entered),
            "exited":                len(exited)
        })

        # Three frames for the age analysis (copies, so the snapshots stay untouched):
        #  - df_term: those who exited (rows from last year's snapshot)
        #  - df_in:   those who entered (rows from this year's snapshot)
        #  - df_base: the base population (last year's snapshot)
        df_term = df_prev[df_prev["casenumber"].isin(exited)].copy()
        df_in   = df_cur[df_cur["casenumber"].isin(entered)].copy()
        df_base = df_prev.copy()

        for frame in (df_term, df_in, df_base):
            _assign_age_groups_inplace(frame)

        # Optionally replace 'Unknown' with concrete age groups; each frame is
        # imputed from its own known composition
        if redistribute_unknown_age:
            for frame in (df_term, df_in, df_base):
                _impute_unknowns_inplace(frame)

        # Unique people per age group
        in_counts   = _casenumbers_by_age(df_in, "nunique")
        term_counts = _casenumbers_by_age(df_term, "nunique")
        base_counts = _casenumbers_by_age(df_base, "nunique")
        # Orders (row counts) in the base — differs from people if a person has several rows
        order_counts = _casenumbers_by_age(df_base, "count")

        # --- One age-rate row per age group for this month
        for grp in final_age_labels:
            active     = int(base_counts[grp])                        # Unique people in base
            orders_age = int(order_counts[grp])                       # Rows in base
            term       = int(term_counts[grp])                        # Exits (unique people)
            ent        = int(in_counts[grp])                          # Entries (unique people)
            rate       = round(term / active, 4) if active else 0.0   # 0.0 when the base is empty
            retention  = 1 - rate                                     # rate is never negative

            age_rate_records.append({
                "month":              tag,
                "age_group":          grp,
                "active_count":       active,
                "active_orders_age":  orders_age,
                "active_clients_age": active,   # Same value as active_count, kept for continuity
                "entered":            ent,
                "terminations":       term,
                "termination_rate":   rate,
                "retention_rate":     retention
            })

    if not flow_records:
        logging.warning(
            f"No month between {first_month} and {last_month} has a prior-year month inside "
            f"the window; returning empty results"
        )

    # --- Convert accumulated lists to DataFrames. Explicit columns keep the schema
    # when nothing was processed (otherwise flows_df['month'] below would raise KeyError).
    flows_df = pd.DataFrame(flow_records, columns=flow_columns)        # Month-level flows
    ages_df  = pd.DataFrame(age_rate_records, columns=age_rate_columns)  # Month x age-group metrics

    # --- Persist the outputs for reproducibility/auditing
    flows_df.to_csv(
        os.path.join(output_base, f"yearonyear_flows_{first_month}_to_{last_month}.csv"),
        index=False
    )
    ages_df.to_csv(
        os.path.join(output_base, f"termination_and_entry_rates_by_age_{first_month}_to_{last_month}.csv"),
        index=False
    )

    # --- Yearly people-level summary (distinct people across the year's months)
    # Note: 'year' is added after the CSV is written, so it is not part of the file.
    flows_df["year"] = pd.to_datetime(flows_df["month"], format="%Y-%m").dt.year
    summary_records = []

    for year, grp in flows_df.groupby("year"):
        months_in_year = grp["month"].tolist()
        summary_records.append({
            "year":           year,
            "entered_people": len(set().union(*(entered_sets[m] for m in months_in_year))),
            "exited_people":  len(set().union(*(exited_sets[m] for m in months_in_year))),
            "active_clients": len(set().union(*(snapshots_cur[m] for m in months_in_year)))
        })

    summary_df = pd.DataFrame(summary_records, columns=["year"] + people_columns)

    # --- Console print for a quick glance
    print("\n=== Yearly Summary: Orders & Clients ===")
    print(summary_df.to_string(index=False))

    # --- Monthly people-level summary using the cached sets
    monthly_records = [
        {
            "month":           m,
            "entered_people":  len(entered_sets[m]),    # Unique entrants that month
            "exited_people":   len(exited_sets[m]),     # Unique exits that month
            "active_clients":  len(snapshots_cur[m])    # Unique active that month
        }
        for m in flows_df["month"]
    ]
    monthly_summary_df = pd.DataFrame(monthly_records, columns=["month"] + people_columns)

    print("\n=== Monthly Summary: Orders & Clients ===")
    print(monthly_summary_df.to_string(index=False))

    logging.info("Completed calculation of year-on-year flows and age rates")

    # --- The four primary outputs: flows, age metrics, yearly and monthly people summaries
    return flows_df, ages_df, summary_df, monthly_summary_df


In [None]:
def stop_flow_forecast(
    ages_df: pd.DataFrame,
    periods: int = 12
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    "Stop-flow" forecast of active caseloads per age group.

    Starting from the levels in the most recent historical month, each forecast
    month rolls the previous month forward with the flows recorded twelve
    months earlier, floored at zero:

      active_t = max(0, active_{t-1} + entered_{t-12} - terminated_{t-12})

    The same entered/terminations adjustment is applied to all three tracked
    levels (active_count, active_orders_age, active_clients_age). If the month
    twelve months earlier has no row for an age group in `ages_df`, the flows are
    taken as zero, so the level is carried forward unchanged.

    Args:
        ages_df: Historical per-(month, age_group) table with columns 'month'
            (column or index level), 'age_group', 'active_count',
            'active_orders_age', 'active_clients_age', 'entered' and
            'terminations'. It is not modified.
        periods: Number of months to forecast after the last historical month.
            Zero or a negative value yields empty forecast tables.

    Returns:
        - per_age_df : month, age_group, active_forecast,
                       active_orders_age_fc, active_clients_age_fc
        - monthly    : month, total_active_orders, total_active_clients, year
        - yearly     : year, yearly_active_clients
                       (sum of the monthly client totals within each year)
        - df         : a copy of the input with 'month' as a datetime column

    Raises:
        ValueError: if `ages_df` has neither a 'month' column nor a 'month'
            index level.
    """
    # Function-scope import so the cell does not depend on notebook import order
    from dateutil.relativedelta import relativedelta

    # Work on a copy so the caller's frame is left untouched
    df = ages_df.copy()

    # Ensure 'month' is a column; if it's an index level, reset it, otherwise error
    if 'month' not in df.columns:
        if 'month' in df.index.names:
            df = df.reset_index()
        else:
            raise ValueError("Input ages_df must have a 'month' column or index level")

    df['month'] = pd.to_datetime(df['month'])

    # Forecast months follow directly after the most recent historical month
    last_hist = df['month'].max()
    fc_months = [last_hist + relativedelta(months=i) for i in range(1, periods + 1)]

    # Starting levels per age group, taken from the last historical month
    base = df[df['month'] == last_hist]
    prev_active  = base.set_index('age_group')['active_count'].to_dict()
    prev_orders  = base.set_index('age_group')['active_orders_age'].to_dict()
    prev_clients = base.set_index('age_group')['active_clients_age'].to_dict()

    # Age groups in order of first appearance; computed once for all months
    age_groups = df['age_group'].unique()

    # (month, age_group) -> (entered, terminations), built once instead of
    # filtering the whole frame for every month/age pair. Rows with a missing key
    # can never match a lookup, and for duplicate keys the first row wins.
    keyed = (
        df.dropna(subset=['month', 'age_group'])
          .drop_duplicates(subset=['month', 'age_group'], keep='first')
    )
    lagged_flows = {
        (mth, age): (ent, term)
        for mth, age, ent, term in zip(
            keyed['month'], keyed['age_group'], keyed['entered'], keyed['terminations']
        )
    }

    records = []  # One dict per (forecast month, age group)

    for m in fc_months:
        logging.debug("stop_flow_forecast: forecasting month %s", m)

        # Flows come from the same month one year earlier
        lag = m - relativedelta(years=1)

        for age in age_groups:
            # Zero flows when the lagged month has no row for this age group
            entered_raw, term_raw = lagged_flows.get((lag, age), (0, 0))
            entered = int(entered_raw)
            term    = int(term_raw)

            # previous level + entered - terminated, never below zero
            a_fc = max(0, prev_active.get(age, 0) + entered - term)
            o_fc = max(0, prev_orders.get(age, 0) + entered - term)
            c_fc = max(0, prev_clients.get(age, 0) + entered - term)

            records.append({
                'month':                 m,
                'age_group':             age,
                'active_forecast':       a_fc,
                'active_orders_age_fc':  o_fc,
                'active_clients_age_fc': c_fc
            })

            # This month's forecast is next month's starting level
            prev_active[age]  = a_fc
            prev_orders[age]  = o_fc
            prev_clients[age] = c_fc

    per_age_df = pd.DataFrame(
        records,
        columns=['month', 'age_group', 'active_forecast',
                 'active_orders_age_fc', 'active_clients_age_fc']
    )

    # Nothing to forecast (periods <= 0 or no age groups): return empty tables
    # with the usual columns instead of failing in the aggregations below
    if per_age_df.empty:
        monthly = pd.DataFrame(
            columns=['month', 'total_active_orders', 'total_active_clients', 'year']
        )
        yearly = pd.DataFrame(columns=['year', 'yearly_active_clients'])
        return per_age_df, monthly, yearly, df

    # Totals across age groups for each forecast month
    monthly = (
        per_age_df
        .groupby('month')
        .agg(
            total_active_orders=('active_orders_age_fc', 'sum'),
            total_active_clients=('active_clients_age_fc', 'sum')
        )
        .reset_index()
    )
    monthly['year'] = monthly['month'].dt.year

    # Per calendar year: sum of the monthly client totals (orders are not summarised)
    yearly = (
        monthly
        .groupby('year')
        .agg(
            yearly_active_clients=('total_active_clients', 'sum')
        )
        .reset_index()
    )

    return per_age_df, monthly, yearly, df


# Visualisation: Plotting age-specific active caseloads, termination rate, and new deputyships over time

In [None]:
# -----------------------------------------------
# Visualisation & Insight Analysis (append below)
# -----------------------------------------------
def visualize_and_analyze_deputyship_forecasts(
    combined_df: pd.DataFrame,
    output_dir: str = "output",
    hist_last_month: "pd.Timestamp|str|None" = None,
    top_k: int = 8,
    axis_start_month: "pd.Timestamp|str|None" = None,   # <-- add this
) -> dict:
    """
    Visualize historical + forecasted active caseloads (clients & orders) and produce key insights.

    Inputs
    ------
    combined_df : DataFrame with at least:
        - 'month' (datetime or string)
        - 'age' (string or int age group)
        - 'active_caseloads_clients' (or fallbacks: 'active_clients_age', 'active_clients_age_fc', 'active_forecast')
        - 'active_caseloads_orders'  (or fallbacks: 'active_orders_age',  'active_orders_age_fc',  'active_forecast')
    output_dir : where to save PNGs and insights markdown.
    hist_last_month : last historical month (for a vertical cutoff line). If None, no cutoff line is drawn.
    top_k : how many age groups to highlight in stacked area and top-movers charts.

    Returns
    -------
    A small dict of computed summary metrics for programmatic use.
    """

    os.makedirs(output_dir, exist_ok=True)

    df = combined_df.copy()

    # --- Ensure 'month' is datetime (handles '2025-07' and 'Jul-25' style strings)
    if not np.issubdtype(df['month'].dtype, np.datetime64):
        # try robust parsing; attempt two common formats
        try:
            df['month'] = pd.to_datetime(df['month'])
        except Exception:
            df['month'] = pd.to_datetime(df['month'], format="%b-%y")

    # --- Standardise column names via safe coalescing (guards against earlier rename differences)
    def coalesce_col(frame, candidates, new_name):
        for c in candidates:
            if c in frame.columns:
                frame.rename(columns={c: new_name}, inplace=True)
                return new_name
        # if none exist, create an empty numeric column
        frame[new_name] = 0
        return new_name

    clients_col = coalesce_col(
        df,
        ["active_caseloads_clients", "active_clients_age", "active_clients_age_fc"],
        "active_caseloads_clients"
    )
    orders_col = coalesce_col(
        df,
        ["active_caseloads_orders", "active_orders_age", "active_orders_age_fc"],
        "active_caseloads_orders"
    )

    # --- NEW: parse / infer axis_start_month safely ---
    if axis_start_month is not None and not isinstance(axis_start_month, pd.Timestamp):
        axis_start_month = pd.to_datetime(axis_start_month)

    if axis_start_month is None:
        # Auto-start at first non-zero month to avoid leading zeros
        totals_all = (
            df.groupby('month')[[clients_col, orders_col]]
              .sum(min_count=1)
              .fillna(0)
        )
        nz = totals_all.sum(axis=1) > 0
        axis_start_month = nz.index[nz.argmax()] if nz.any() else df['month'].min()

    
    # --- Tidy 'age' to string and keep a stable order
    if 'age' not in df.columns:
        # Fallback if upstream still calls the column 'age_group'
        if 'age_group' in df.columns:
            df.rename(columns={'age_group': 'age'}, inplace=True)
        else:
            raise ValueError("Expected an 'age' (or 'age_group') column in combined_df.")
    df['age'] = df['age'].astype(str)

    # --- Sort for plotting
    df = df.sort_values(['month', 'age']).reset_index(drop=True)

    # --- Identify last historical month (line on charts) if provided as str
    if hist_last_month is not None and not isinstance(hist_last_month, pd.Timestamp):
        hist_last_month = pd.to_datetime(hist_last_month)

    # # ========== 1) Total caseloads over time (line) ==========
    # totals = (
    #     df.groupby('month', as_index=False)
    #       .agg(total_clients=(clients_col, 'sum'),
    #            total_orders=(orders_col, 'sum'))
    # )

    # # Plot
    # plt.figure(figsize=(11, 6))
    # plt.plot(totals['month'], totals['total_clients'], label='Total active clients')
    # if df[orders_col].sum() > 0:
    #     plt.plot(totals['month'], totals['total_orders'], label='Total active orders')
    # if hist_last_month is not None:
    #     plt.axvline(hist_last_month, linestyle='--', linewidth=1, label='Last historical month')
    # plt.title('Active caseloads over time: clients vs orders')
    # plt.xlabel('Month'); plt.ylabel('Count'); plt.legend(); plt.tight_layout()
    # path_total = os.path.join(output_dir, "01_totals_clients_orders.png")
    # plt.savefig(path_total, dpi=180); plt.close()

    # ========== 7) Total caseloads over time (line + 95% CIs) ==========
    
    totals = (
        df.groupby('month', as_index=False)
          .agg(total_clients=(clients_col, 'sum'),
               total_orders=(orders_col, 'sum'))
    )
    
    # Optional: start x-axis at axis_start_month if your function has that param
    if 'axis_start_month' in locals() and axis_start_month is not None:
        if not isinstance(axis_start_month, pd.Timestamp):
            axis_start_month = pd.to_datetime(axis_start_month)
        totals = totals[totals['month'] >= axis_start_month]
    
    # def _poisson_ci(series: pd.Series, z: float = 1.96):
    #     """95% CI for counts via Poisson approx: x ± z*sqrt(x), floored at 0."""
    #     x = series.to_numpy(dtype=float)
    #     sd = np.sqrt(np.clip(x, 0, None))
    #     lower = np.maximum(0, x - z * sd)
    #     upper = x + z * sd
    #     return lower, upper

    def _poisson_ci(series: pd.Series, z: float = 1.96, phi: float = 1.0):
        """95% CI for counts via Poisson approx: x ± z*sqrt(x), floored at 0."""
        x = series.to_numpy(dtype=float)
        sd = np.sqrt(phi * np.clip(x, 0, None))  # phi=1 → Poisson; phi>1 → over-dispersed
        lower = np.maximum(0, x - z * sd)
        upper = x + z * sd
        return lower, upper

    # Clients CI
    c_lo, c_hi = _poisson_ci(totals['total_clients'])
    
    # Orders CI (only if any orders exist)
    has_orders = (df[orders_col].sum() > 0)
    if has_orders:
        o_lo, o_hi = _poisson_ci(totals['total_orders'])
    
    plt.figure(figsize=(11, 6))
    
    # Plot CIs first so lines sit on top
    plt.fill_between(totals['month'], c_lo, c_hi, alpha=0.2, label='95% CI (clients)', zorder=1)
    if has_orders:
        plt.fill_between(totals['month'], o_lo, o_hi, alpha=0.15, label='95% CI (orders)', zorder=1)
    
    # Now the lines
    plt.plot(totals['month'], totals['total_clients'], label='Total active clients', zorder=2)
    if has_orders:
        plt.plot(totals['month'], totals['total_orders'], label='Total active orders', zorder=2)
    
    # Historical cutoff marker
    if hist_last_month is not None:
        plt.axvline(hist_last_month, linestyle='--', linewidth=1, label='Last historical month')
    
    plt.title('Active caseloads over time: clients vs orders (with 95% CIs)')
    plt.xlabel('Month'); plt.ylabel('Count'); plt.legend(); plt.tight_layout()
    
    path_total = os.path.join(output_dir, "01_totals_clients_orders.png")
    plt.savefig(path_total, dpi=180); plt.close()

    # # ========== 2) Stacked area by age (clients, top_k) ==========
    # # pick top_k age groups by average presence across period
    # top_ages = (
    #     df.groupby('age', as_index=False)[clients_col].mean()
    #       .sort_values(clients_col, ascending=False)['age']
    #       .head(top_k)
    #       .tolist()
    # )
    # df_area = df.copy()
    # df_area['age_area'] = np.where(df_area['age'].isin(top_ages), df_area['age'], 'Other')

    # area_wide = (
    #     df_area.groupby(['month', 'age_area'], as_index=False)[clients_col].sum()
    #            .pivot(index='month', columns='age_area', values=clients_col)
    #            .fillna(0)
    # )

    # plt.figure(figsize=(11, 6))
    # # stack in deterministic order: top ages (descending by latest), then Other if present
    # ordered_cols = [c for c in top_ages if c in area_wide.columns]
    # if 'Other' in area_wide.columns:
    #     ordered_cols = ordered_cols + ['Other']
    # plt.stackplot(area_wide.index, area_wide[ordered_cols].T, labels=ordered_cols)
    # if hist_last_month is not None:
    #     plt.axvline(hist_last_month, linestyle='--', linewidth=1, label='Last historical month')
    # plt.title(f'Active clients by age (stacked), top {top_k} groups')
    # plt.xlabel('Month'); plt.ylabel('Active clients'); plt.legend(loc='upper left'); plt.tight_layout()
    # path_area = os.path.join(output_dir, "02_clients_stacked_area_by_age.png")
    # plt.savefig(path_area, dpi=180); plt.close()


    # ===== 2) Stacked area by age (clients) — 5-year bands from 0, axis starts at start_month, CI band =====
    # axis_start_month can be a string (e.g., "2022-07") or Timestamp; if None, keep all months
    if axis_start_month is not None and not isinstance(axis_start_month, pd.Timestamp):
        axis_start_month = pd.to_datetime(axis_start_month)
    
    # Filter from axis_start_month forward to avoid plotting pre-start zeros
    df_band = df.copy()
    if axis_start_month is not None:
        df_band = df_band[df_band['month'] >= axis_start_month]
    
    # Parse ages to integers (robust to labels like "70-74" by taking the first number)
    def _to_int_age(x):
        s = str(x)
        num = ''.join(ch for ch in s.split('-')[0] if ch.isdigit())
        try:
            return int(num)
        except Exception:
            return np.nan
    
    df_band['age_int'] = df_band['age'].apply(_to_int_age)
    df_band = df_band[df_band['age_int'].notna()].copy()
    df_band['age_int'] = df_band['age_int'].astype(int)
    
    # Map to 5-year bands (START AT 0), cap high ages so the final band is 105–109 (covers 105+)
    def _band_label(a, width=5, cap=109):
        """Return the "lo-hi" label of the `width`-year band containing age `a`.

        The age is converted to int and clamped to the range [0, cap] before
        banding; both bounds are zero-padded to at least two digits.
        """
        clamped = int(a)
        if clamped > cap:
            clamped = cap
        if clamped < 0:
            clamped = 0
        lower = clamped - clamped % width   # round down to the band start
        upper = lower + (width - 1)
        return "{:02d}-{:02d}".format(lower, upper)
    
    df_band['age_band'] = df_band['age_int'].apply(_band_label)
    
    # Build a COMPLETE ordered list of bands from 0 up to the cap (ensures we start at 00-04)
    _width = 5
    _cap   = 109
    full_bands = [f"{lo:02d}-{lo+_width-1:02d}" for lo in range(0, _cap + 1, _width)]  # 00-04, 05-09, ..., 105-109
    
    # Wide table: month x 5-year band (sum clients per band and month), then reindex to include all bands from 0
    area_wide = (
        df_band.groupby(['month', 'age_band'], as_index=False)[clients_col].sum()
               .pivot(index='month', columns='age_band', values=clients_col)
               .reindex(columns=full_bands, fill_value=0)   # <-- force presence from 00-04 upward
               .fillna(0)
    )
    
    plt.figure(figsize=(11, 6))
    # Plot in strict ascending band order from 00-04 upwards
    plt.stackplot(area_wide.index, area_wide[full_bands].T, labels=full_bands)
    
    # Vertical line for last historical month if provided
    if hist_last_month is not None:
        plt.axvline(hist_last_month, linestyle='--', linewidth=1, label='Last historical month')
    
    # ---- Uncertainty band (95% Poisson CI) around the total clients series ----
    totals_band = area_wide.sum(axis=1)
    lower = np.maximum(0, totals_band - 1.96 * np.sqrt(np.clip(totals_band, a_min=0, a_max=None)))
    upper = totals_band + 1.96 * np.sqrt(np.clip(totals_band, a_min=0, a_max=None))
    plt.fill_between(area_wide.index, lower, upper, alpha=0.2, label='95% CI (total)')
    
    plt.title('Active clients by age (5-year bands from 0)')
    plt.xlabel('Month'); plt.ylabel('Active clients')
    plt.legend(loc='upper left', ncol=3)  # more columns so the legend fits many bands
    plt.tight_layout()
    
    path_area = os.path.join(output_dir, "02_clients_stacked_area_5yr_bands_from0.png")
    plt.savefig(path_area, dpi=180); plt.close()


    # ========== 3) Heatmap (clients) — 5-year age bands x month ==========

    # 1) Parse 'age' to an integer (handles "70" or "70-74" by taking the left number)
    def _to_int_age(x):
        """Return the leading integer of an age label, or NaN if there is none.

        Only the text before the first '-' is considered ("70-74" -> 70), and
        within it only the first unbroken run of digits is used, so a
        float-like value such as 70.0 yields 70 rather than 700.
        """
        left = str(x).split('-')[0]
        run = []
        for ch in left:
            if ch.isdigit():
                run.append(ch)
            elif run:
                # Stop at the first non-digit that follows the number
                break
        try:
            return int(''.join(run))
        except ValueError:
            # Nothing numeric to parse (e.g. "Unknown" or "nan")
            return np.nan
    
    df_heat = df.copy()
    df_heat['age_int'] = df_heat['age'].apply(_to_int_age)
    df_heat = df_heat[df_heat['age_int'].notna()].copy()
    df_heat['age_int'] = df_heat['age_int'].astype(int)
    
    # 2) Map to 5-year bands (cap at 109 so final band is 105–109 catching 105+)
    def _band_label(a, width=5, cap=109):
        """Map age `a` to the label of its `width`-year band, e.g. 72 -> "70-74".

        Ages are cast to int, limited to at most `cap` and at least 0, and the
        band bounds are rendered zero-padded to two or more digits.
        """
        age = int(a)
        age = cap if age > cap else age
        age = 0 if age < 0 else age
        band_start = width * (age // width)
        band_end = band_start + width - 1
        return "%02d-%02d" % (band_start, band_end)
    
    df_heat['age_band'] = df_heat['age_int'].apply(_band_label)
    
    # 3) Build band x month matrix (sum of clients per band/month)
    heat = (
        df_heat.pivot_table(index='age_band', columns='month', values=clients_col, aggfunc='sum')
               .fillna(0)
    )
    
    # 4) Sort age bands numerically by their lower bound
    heat = heat.reindex(sorted(heat.index, key=lambda s: int(s.split('-')[0])))
    
    # 5) Plot
    plt.figure(figsize=(12, 7))
    plt.imshow(heat.values, aspect='auto', interpolation='nearest')
    plt.colorbar(label='Active clients')
    
    # y-axis: band labels
    plt.yticks(ticks=np.arange(len(heat.index)), labels=heat.index)
    
    # x-axis: month labels (downsample to ~12 ticks for readability)
    x_idx = np.arange(len(heat.columns))
    step = max(1, len(heat.columns)//12)
    plt.xticks(
        ticks=x_idx[::step],
        labels=[m.strftime('%Y-%m') for m in heat.columns][::step],
        rotation=45, ha='right'
    )
    
    plt.title('Heatmap: Active clients by 5-year age band and month')
    plt.tight_layout()
    path_heat = os.path.join(output_dir, "03_heatmap_clients_ageband_month.png")
    plt.savefig(path_heat, dpi=180); plt.close()


    # ========== 4) Top movers across forecast horizon (delta by age) ==========
    # Determine anchor months for delta
    # If we have a historical cutoff, compare last hist vs last overall; else earliest vs latest.
    if hist_last_month is not None and (df['month'] <= hist_last_month).any():
        m0 = df.loc[df['month'] <= hist_last_month, 'month'].max()
    else:
        m0 = df['month'].min()
    m1 = df['month'].max()

    # snap0 = df[df['month'] == m0].groupby('age', as_index=False)[clients_col, orders_col].sum()
    # snap1 = df[df['month'] == m1].groupby('age', as_index=False)[clients_col, orders_col].sum()

    cols_for_delta = list(dict.fromkeys([clients_col, orders_col]))  # de-dup, keep order
    snap0 = df[df['month'] == m0].groupby('age', as_index=False)[cols_for_delta].sum()
    snap1 = df[df['month'] == m1].groupby('age', as_index=False)[cols_for_delta].sum()

    delta = snap1.merge(snap0, on='age', suffixes=('_end', '_start'), how='outer').fillna(0)
    delta['delta_clients'] = delta[f'{clients_col}_end'] - delta[f'{clients_col}_start']
    delta['delta_orders']  = delta[f'{orders_col}_end']  - delta[f'{orders_col}_start']

    # Top increases & decreases for clients
    inc_clients = delta.sort_values('delta_clients', ascending=False).head(top_k)
    dec_clients = delta.sort_values('delta_clients', ascending=True).head(top_k)

    # Plot clients movers (bar)
    plt.figure(figsize=(11, 6))
    plt.bar(inc_clients['age'], inc_clients['delta_clients'], label='Increases')
    plt.bar(dec_clients['age'], dec_clients['delta_clients'], label='Decreases')
    plt.title(f'Top movers by age (clients): {m0:%Y-%m} → {m1:%Y-%m}')
    plt.xlabel('Age'); plt.ylabel('Δ Active clients'); plt.legend(); plt.tight_layout()
    path_movers_clients = os.path.join(output_dir, "04_top_movers_clients.png")
    plt.savefig(path_movers_clients, dpi=180); plt.close()

    # If orders present, do the same
    if df[orders_col].sum() > 0:
        inc_orders = delta.sort_values('delta_orders', ascending=False).head(top_k)
        dec_orders = delta.sort_values('delta_orders', ascending=True).head(top_k)

        plt.figure(figsize=(11, 6))
        plt.bar(inc_orders['age'], inc_orders['delta_orders'], label='Increases')
        plt.bar(dec_orders['age'], dec_orders['delta_orders'], label='Decreases')
        plt.title(f'Top movers by age (orders): {m0:%Y-%m} → {m1:%Y-%m}')
        plt.xlabel('Age'); plt.ylabel('Δ Active orders'); plt.legend(); plt.tight_layout()
        path_movers_orders = os.path.join(output_dir, "05_top_movers_orders.png")
        plt.savefig(path_movers_orders, dpi=180); plt.close()
    else:
        path_movers_orders = None

    # ========== 5) Ratios & peaks ==========
    # Ratio: orders per 100 clients (where clients > 0)
    totals['orders_per_100_clients'] = np.where(
        totals['total_clients'] > 0,
        totals['total_orders'] * 100.0 / totals['total_clients'],
        np.nan
    )

    plt.figure(figsize=(11, 5))
    plt.plot(totals['month'], totals['orders_per_100_clients'])
    if hist_last_month is not None:
        plt.axvline(hist_last_month, linestyle='--', linewidth=1, label='Last historical month')
    plt.title('Orders per 100 clients (level & trend)')
    plt.xlabel('Month'); plt.ylabel('Orders per 100 clients') 
    if hist_last_month is not None: plt.legend()
    plt.tight_layout()
    path_ratio = os.path.join(output_dir, "06_orders_per_100_clients.png")
    plt.savefig(path_ratio, dpi=180); plt.close()

    # Peak months
    peak_clients_idx = totals['total_clients'].idxmax()
    peak_orders_idx  = totals['total_orders'].idxmax() if df[orders_col].sum() > 0 else None
    peak_clients_month = totals.loc[peak_clients_idx, 'month']
    peak_orders_month  = totals.loc[peak_orders_idx,  'month'] if peak_orders_idx is not None else None

    # ========== 6) Insights (markdown) ==========
    hist_total_clients = totals.loc[totals['month'] == m0, 'total_clients'].sum() if (totals['month'] == m0).any() else np.nan
    end_total_clients  = totals.loc[totals['month'] == m1, 'total_clients'].sum()
    hist_total_orders  = totals.loc[totals['month'] == m0, 'total_orders' ].sum() if (totals['month'] == m0).any() else np.nan
    end_total_orders   = totals.loc[totals['month'] == m1, 'total_orders' ].sum()

    abs_change_clients = end_total_clients - hist_total_clients if pd.notna(hist_total_clients) else np.nan
    pct_change_clients = (abs_change_clients / hist_total_clients * 100.0) if pd.notna(hist_total_clients) and hist_total_clients else np.nan

    abs_change_orders  = end_total_orders - hist_total_orders if pd.notna(hist_total_orders) else np.nan
    pct_change_orders  = (abs_change_orders / hist_total_orders * 100.0) if pd.notna(hist_total_orders) and hist_total_orders else np.nan

    # Contribution of top movers (clients)
    movers_clients = delta[['age', 'delta_clients']].sort_values('delta_clients', ascending=False)
    pos_sum = movers_clients[movers_clients['delta_clients'] > 0]['delta_clients'].sum()
    top_contrib = movers_clients.head(top_k)['delta_clients'].sum()
    share_topk = (top_contrib / pos_sum * 100.0) if pos_sum else np.nan

    insights_lines = [
        "# Deputyship caseload forecast — key insights",
        f"- **Horizon compared:** {m0:%Y-%m} → {m1:%Y-%m}",
        f"- **Total clients:** {int(end_total_clients):,} at end; change = {int(abs_change_clients):,} ({pct_change_clients:0.1f}%)" if pd.notna(pct_change_clients) else f"- **Total clients (end):** {int(end_total_clients):,}",
        f"- **Total orders:** {int(end_total_orders):,} at end; change = {int(abs_change_orders):,} ({pct_change_orders:0.1f}%)" if pd.notna(pct_change_orders) else f"- **Total orders (end):** {int(end_total_orders):,}",
        f"- **Peak clients month:** {peak_clients_month:%Y-%m} (value: {int(totals.loc[peak_clients_idx,'total_clients']):,})",
    ]
    if peak_orders_month is not None:
        insights_lines.append(f"- **Peak orders month:** {peak_orders_month:%Y-%m} (value: {int(totals.loc[peak_orders_idx,'total_orders']):,})")
    if pd.notna(share_topk):
        insights_lines.append(f"- **Top {top_k} age groups account for ~{share_topk:0.1f}% of the positive change in clients.**")

    # List the top 5 client growers & decliners
    top_incr = movers_clients.head(5)
    top_decl = movers_clients.tail(5).sort_values('delta_clients')
    insights_lines.append("\n**Top 5 age increases (clients):** " + ", ".join(f"{a} (+{int(d):,})" for a, d in zip(top_incr['age'], top_incr['delta_clients'])))
    insights_lines.append("**Top 5 age decreases (clients):** " + ", ".join(f"{a} ({int(d):,})" for a, d in zip(top_decl['age'], top_decl['delta_clients'])))

    # Write markdown
    insights_path = os.path.join(output_dir, "00_forecast_insights.md")
    with open(insights_path, "w", encoding="utf-8") as f:
        f.write("\n".join(insights_lines))

    # Console echo for quick read
    print("\n".join(insights_lines))
    print(f"\nSaved charts:\n- {path_total}\n- {path_area}\n- {path_heat}\n- {path_movers_clients}\n"
          f"{'- ' + path_movers_orders if path_movers_orders else ''}\n- {path_ratio}\nInsights → {insights_path}")

    # Return a small metrics dict if you want to log/store programmatically
    return {
        "horizon_start": m0,
        "horizon_end": m1,
        "end_total_clients": int(end_total_clients),
        "end_total_orders":  int(end_total_orders),
        "abs_change_clients": int(abs_change_clients) if pd.notna(abs_change_clients) else None,
        "pct_change_clients": float(pct_change_clients) if pd.notna(pct_change_clients) else None,
        "abs_change_orders":  int(abs_change_orders) if pd.notna(abs_change_orders) else None,
        "pct_change_orders":  float(pct_change_orders) if pd.notna(pct_change_orders) else None,
        "peak_clients_month": peak_clients_month,
        "peak_orders_month":  peak_orders_month
    }




# Running the Deputyship forecasting model

In [None]:
# Running the Deputyship forecasting model
#
# Driver cell: exports the monthly reports, derives active-case tables and
# year-on-year flows by age, runs the stop-flow forecast, merges history and
# forecast into one table, writes it to CSV and produces the charts/insights.

if __name__ == "__main__":
    start_year = 2022
    end_year = 2025
    start_month = "2022-11"
    end_month = "2025-11"
    output_base = "output"

    # Prepare output directory
    os.makedirs(output_base, exist_ok=True)
    # Clear the output directory itself (not an individual file path) so
    # artefacts from earlier runs do not linger next to the new ones
    clear_directory(output_base)

    combined_df, summary_df = export_monthly_reports(start_month, end_month)
    print(combined_df)

    active_df, monthly_df, yearly_summary = calculate_monthly_active_cases(
        combined_df, start_month, end_month, output_base=output_base
    )
    print(monthly_df)
    print(active_df)

    # Calculate historical flows and age rates
    # (summary_df is rebound here; the export summary above is no longer needed)
    final_df, ages_df, summary_df, monthly_summary_df = calculate_yearonyear_flows_and_age_rates(
         start_month, end_month,
         redistribute_unknown_age=True)

    print(summary_df)
    print(monthly_summary_df)
    print(ages_df)
    print(final_df)

    # Compute the stop-flow forecast for 12 periods ahead
    per_age_df, monthly, yearly, df = stop_flow_forecast(ages_df, periods=12)

    print(per_age_df)
    print(monthly)
    print(yearly)
    print(df)

    # Combine the historical data and forecasts: bring both tables onto the
    # same column names first. rename() returns a new frame, so ages_df and
    # per_age_df themselves are left untouched.
    current_age_specific_deputyship_agg = ages_df.rename(
        columns={
            'age_group': 'age',
            'entered': 'new_deputyships',
            'terminations': 'terminated',
            'active_clients_age': 'active_caseloads_clients',
            'active_orders_age':  'active_caseloads_orders',
        }
    )

    forecasted_age_specific_deputyship_agg = per_age_df.rename(
        columns={
            'age_group': 'age',
            'active_clients_age_fc': 'active_caseloads_clients',
            'active_orders_age_fc': 'active_caseloads_orders',
        }
    )

    # Final forecast and actuals
    combined_table = get_combined_age_deputyship_table(
        current_age_specific_deputyship_agg,
        forecasted_age_specific_deputyship_agg,
    )

    output_columns = [
        'month', 'age',
        'active_caseloads_clients', 'active_caseloads_orders',
        'new_deputyships', 'terminated',
    ]
    # .copy() makes this an independent frame, so the month reformatting below
    # is not a chained assignment on a slice of combined_table
    final_deputyship_historical_forecasts = combined_table[output_columns].copy()
    final_deputyship_historical_forecasts['month'] = (
        pd.to_datetime(final_deputyship_historical_forecasts['month']).dt.strftime("%b-%y")
    )

    # Save in CSV (inside the same output directory prepared above)
    final_deputyship_historical_forecasts.to_csv(
        os.path.join(output_base, f"final_deputyship_historical_forecasts_{start_year}_{end_year}.csv")
    )
    print(final_deputyship_historical_forecasts)

    # Use end_month as the last historical month to draw a vertical line on charts
    # NOTE(review): 'month' is passed here as "%b-%y" text (e.g. "Nov-22");
    # confirm the visualiser parses it with an explicit format, since a
    # two-digit year is ambiguous to generic date parsers.
    _ = visualize_and_analyze_deputyship_forecasts(
        final_deputyship_historical_forecasts,
        output_dir=output_base,
        hist_last_month=pd.to_datetime(end_month),
        axis_start_month=start_month,   # <-- important
        top_k=10
    )

In [None]:
# Notebook display cell: show the per-age table that
# calculate_yearonyear_flows_and_age_rates returned in the driver cell.
ages_df

In [None]:
# Visualisation: plot the forecast active caseload of every age group over time

def _first_present_column(frame, candidates):
    """Return the first name in `candidates` that is a column of `frame`.

    Raises KeyError listing the available columns when none is present.
    """
    for name in candidates:
        if name in frame.columns:
            return name
    raise KeyError(
        f"None of {list(candidates)} found in columns {list(frame.columns)}"
    )


def _age_legend_label(age):
    """Format an age-group key for the legend; non-numeric keys are shown as-is."""
    try:
        return f"{int(age)} yrs"
    except (TypeError, ValueError, OverflowError):
        return str(age)


# Active caseloads by age group (history + forecast).
# The legacy column name is tried first; the driver cell now selects
# 'active_caseloads_clients' from combined_table, so that is the fallback.
active_pivot = combined_table.pivot(
    index='month',
    columns='age',
    values=_first_present_column(
        combined_table, ('active_caseloads', 'active_caseloads_clients')
    ),
)

# Pivot for plotting: month on x-axis, each age a line.
# Same legacy-then-current lookup for the forecast value column.
pivot = per_age_df.pivot(
    index='month',
    columns='age_group',
    values=_first_present_column(
        per_age_df, ('active_forecast', 'active_clients_age_fc')
    ),
)
pivot.index = pd.to_datetime(pivot.index)

# Plot
fig, ax = plt.subplots(figsize=(10, 6))
for age, series in pivot.items():
    ax.plot(
        series.index, series.values,
        label=_age_legend_label(age),
        marker='o',
        linewidth=2,
        alpha=0.8
    )
ax.set_xlabel('Month')
ax.set_ylabel('Forecast active caseload')
ax.set_title('Forecast active caseload by age group')
# Many age groups -> small multi-column legend placed outside the axes
ax.legend(title='Age', fontsize='x-small', ncol=4,
          loc='upper left', bbox_to_anchor=(1.01, 1.0))
fig.tight_layout()

In [None]:
# # Forecasting Active Caseloads:
# def stop_flow_forecast(ages_df: pd.DataFrame, periods: int = 12) -> pd.DataFrame:
#     """
#     “Stop‐flow” forecast:
#       active_t  = active_{t–1} + entered_{t–12} – terminated_{t–12}
#     for each age_group, recursively, starting from the last historical month.
    
#     Returns a DataFrame with columns:
#       month (datetime), age_group, active_forecast (int)
#     """
#     # 1) Copy and ensure month is a column
#     df = ages_df.copy()
#     if 'month' not in df.columns:
#         # If month is in the index, pull it out
#         if 'month' in df.index.names:
#             df = df.reset_index()
#         else:
#             raise ValueError("Input ages_df must have a 'month' column or index level")
    
#     # 2) Convert month to datetime flexibly
#     df['month'] = pd.to_datetime(df['month'])
    
#     # 3) Find the last historical month
#     last_hist = df['month'].max()
#     #print(f"last_hist: {last_hist}")
    
#     # 4) Build the list of future months (1..periods ahead)
#     first_fc = last_hist + relativedelta(months=1)
#     #print(f"first_fc: {first_fc}")
    
#     fc_months = [first_fc + relativedelta(months=i) for i in range(periods)]
#     #print(f"fc_months: {fc_months}")
    
#     # 5) Seed the "previous active" from last historical month
#     prev_active = (
#         df[df['month'] == last_hist]
#         .set_index('age_group')['active_count']
#         .to_dict()
#     )
#     #print(f"prev_active: {prev_active}")
#     records = []
#     age_groups = df['age_group'].unique()
    
#     for m in fc_months:
#         lag = m - relativedelta(years=1)
#         #print(f"lag: {lag}")
        
#         for age in age_groups:
#             a_prev = prev_active.get(age, 0)
#             #print(f"a_prev: {a_prev}")
            
#             # grab last year's entered/terminated for this age & lag month
#             row = df[(df['month'] == lag) & (df['age_group'] == age)]
#             #print(f"row: {row}")
            
#             entered = int(row['entered'].iloc[0]) if not row.empty else 0
#             #print(f"entered: {entered}")
            
#             term    = int(row['terminations'].iloc[0]) if not row.empty else 0
#             #print(f"term: {term}")
            
            
#             # stop‐flow formula
#             a_fc = a_prev + entered - term if (a_prev + entered) > term else 0
#             #print(f"a_fc = {a_prev} + {entered} - {term}: {a_prev + entered - term}")
            
#             #print(f"a_fc,: {a_fc}")
            
#             records.append({
#                 'month':           m,
#                 'age_group':       age,
#                 'active_forecast': a_fc
#             })
#             prev_active[age] = a_fc  # update for next iteration
#             #print(f"age: {age}")
#             #print(f"prev_active[age]: {a_fc}")
            
#     return pd.DataFrame(records)


In [None]:
# def calculate_yearonyear_flows_and_age_rates(
#     first_month: str,
#     last_month: str,
#     output_base: str = "output",
#     redistribute_unknown_age: bool = False,
#     age_bins: tuple = None,
#     age_labels: tuple = None
# ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
#     # Log a message about the operation being performed, with some parameters
#     logging.info(f"Calculating year-on-year flows and age rates from {first_month} to {last_month}, redistribute_unknown_age={redistribute_unknown_age}")

#     # Make the output directory if it doesn't exist
#     os.makedirs(output_base, exist_ok=True)

#     # If age bins or labels aren't given, make defaults: 0-106 years, each as a group
#     if age_bins is None or age_labels is None:
#         age_bins   = list(range(0, 107))
#         age_labels = [str(a) for a in age_bins[:-1]]

#     # Prepare lists and dicts to store our results as we process each month
#     flow_records = []        # To store overall monthly entry/exit counts
#     age_rate_records = []    # To store age-specific rates and counts per month
#     snapshots_cur = {}       # Dict to store sets of active cases for each month
#     snapshots_prev = {}      # Dict to store sets of active cases a year earlier
#     entered_sets   = {}      # Dict to store who entered in each month
#     exited_sets    = {}      # Dict to store who exited in each month

#     # For every month between first and last month
#     for dt in generate_month_list(first_month, last_month):
#         # Find the month one year earlier
#         prev_dt = dt - relativedelta(years=1)
#         # If previous year’s month is before the first month, skip
#         if prev_dt < parse_month(first_month):
#             continue

#         # Create a string tag for this month (e.g., '2023-05')
#         tag = dt.strftime("%Y-%m")
#         logging.info(f"Processing month {tag}")

#         # Load all active cases at the end of the current and previous year’s month
#         df_cur  = fetch_cases_for_date(last_day_of_month(dt))
#         df_prev = fetch_cases_for_date(last_day_of_month(prev_dt))

#         # Get sets of case numbers for current and previous months
#         set_cur  = set(df_cur["casenumber"])
#         set_prev = set(df_prev["casenumber"])

#         # Save these sets for later reference
#         snapshots_cur[tag]  = set_cur
#         snapshots_prev[tag] = set_prev

#         # Determine who entered: present this year, not last year
#         entered = set_cur  - set_prev
#         # Determine who exited: present last year, not this year
#         exited  = set_prev - set_cur

#         # Store entered/exited sets for summary stats later
#         entered_sets[tag] = entered
#         exited_sets[tag]  = exited

#         # Save summary counts for this month
#         flow_records.append({
#             "month":                 tag,
#             "active_count_current":  len(set_cur),
#             "active_count_previous": len(set_prev),
#             "entered":               len(entered),
#             "exited":                len(exited)
#         })

#         # DataFrames for people who exited, entered, and the whole base for age analysis
#         df_term = df_prev[df_prev["casenumber"].isin(exited)].copy()
#         df_in   = df_cur [df_cur ["casenumber"].isin(entered)].copy()
#         df_base = df_prev.copy()

#         # add an “Unknown” group if it isn’t already in your age labels:
#         if age_bins is None or age_labels is None:
#             age_bins   = list(range(0, 107))
#             age_labels = [str(a) for a in age_bins[:-1]]
#         # Add "Unknown" group
#         if "Unknown" not in age_labels:
#             age_labels = list(age_labels) + ["Unknown"]
            
#         # Assign an age group (e.g. '70', '71',...) to each person in each DataFrame
#         for df_ in (df_term, df_in, df_base):
#             df_["age_group"] = pd.cut(
#                 df_["age_in_years"],
#                 bins=age_bins,
#                 labels=age_labels[:-1],  # exclude Unknown for pd.cut, will assign after
#                 right=False,
#                 include_lowest=True
#             )
#             df_["age_group"] = df_["age_group"].astype(object).where(
#                 df_["age_group"].notna(), "Unknown"
#             )


#         # Count total rows vs. those with valid age_group
#         print("All records in base:", len(df_base))
#         print("Records with age_group assigned:", df_base['age_group'].notna().sum())

#         # Count number of people entered per age group
#         in_counts   = df_in.groupby("age_group", observed=False)["casenumber"]\
#                              .nunique().reindex(age_labels, fill_value=0)        # Count number of people exited per age group
#         term_counts = df_term.groupby("age_group", observed=False)["casenumber"]\
#                              .nunique().reindex(age_labels, fill_value=0)        # Count number of people in base per age group
#         base_counts = df_base.groupby("age_group", observed=False)["casenumber"]\
#                              .nunique().reindex(age_labels, fill_value=0)
#         # #base_counts = df_base.groupby("age_group", observed=False)["casenumber"]\
                             
#         # # Count number of orders in base per age group (same as above unless multiple orders per client)
#         # order_counts = df_base.groupby("age_group", observed=False)["casenumber"]\
#         #                       .count().reindex(age_labels, fill_value=0)

#         # 1. Count number of people entered per age group
#         in_order_counts   = df_in.groupby("age_group", observed=False)["casenumber"].count().reindex(age_labels, fill_value=0)
#         # 2. Count number of people exited per age group
#         term_order_counts = df_term.groupby("age_group", observed=False)["casenumber"].count().reindex(age_labels, fill_value=0)
#         # 3. Count number of people in base per age group
#         # base_counts = df_base.groupby("age_group", observed=False)["casenumber"].count().reindex(age_labels, fill_value=0)
#         # 4. Orders (usually same as base unless orders/client > 1)
#         order_counts = df_base.groupby("age_group", observed=False)["casenumber"].count().reindex(age_labels, fill_value=0)
    
#         if redistribute_unknown_age:
#             # --- Redistribute unknown "entered" ---
#             unknown_in = df_in["age_group"].isna().sum()
#             if unknown_in > 0:
#                 total_known_in = in_counts.loc[age_labels[:-1]].sum()  # exclude Unknown group itself
#                 props_in = in_counts.loc[age_labels[:-1]] / total_known_in if total_known_in > 0 else pd.Series(1/len(age_labels[:-1]), index=age_labels[:-1])
#                 alloc_in = (props_in * unknown_in).round().astype(int)
#                 in_counts.loc[age_labels[:-1]] += alloc_in
#                 in_counts.loc["Unknown"] = 0  # Set to zero after redistribution
    
#             # --- Redistribute unknown "terminations" ---
#             unknown_term = df_term["age_group"].isna().sum()
#             if unknown_term > 0:
#                 total_known_term = term_counts.loc[age_labels[:-1]].sum()
#                 props_term = term_counts.loc[age_labels[:-1]] / total_known_term if total_known_term > 0 else pd.Series(1/len(age_labels[:-1]), index=age_labels[:-1])
#                 alloc_term = (props_term * unknown_term).round().astype(int)
#                 term_counts.loc[age_labels[:-1]] += alloc_term
#                 term_counts.loc["Unknown"] = 0
        
#             # --- Redistribute unknown "base" (active) ---
#             unknown_base = df_base["age_group"].isna().sum()
#             if unknown_base > 0:
#                 total_known_base = base_counts.loc[age_labels[:-1]].sum()
#                 props_base = base_counts.loc[age_labels[:-1]] / total_known_base if total_known_base > 0 else pd.Series(1/len(age_labels[:-1]), index=age_labels[:-1])
#                 alloc_base = (props_base * unknown_base).round().astype(int)
#                 base_counts.loc[age_labels[:-1]] += alloc_base
#                 base_counts.loc["Unknown"] = 0
        
#             # --- (Optional) Orders, if you want similar logic applied ---
#             unknown_orders = df_base["age_group"].isna().sum()
#             if unknown_orders > 0:
#                 total_known_orders = order_counts.loc[age_labels[:-1]].sum()
#                 props_orders = order_counts.loc[age_labels[:-1]] / total_known_orders if total_known_orders > 0 else pd.Series(1/len(age_labels[:-1]), index=age_labels[:-1])
#                 alloc_orders = (props_orders * unknown_orders).round().astype(int)
#                 order_counts.loc[age_labels[:-1]] += alloc_orders
#                 order_counts.loc["Unknown"] = 0



#         # # Find out how many entrants have unknown age
#         # unknown = df_in["age_group"].isna().sum()
#         # # If redistribution is on, and there are unknowns, share them proportionally across age groups
#         # if redistribute_unknown_age and unknown > 0:
#         #     total_known = in_counts.sum()
#         #     # Compute the proportion for each age group
#         #     props = in_counts / total_known if total_known>0 else pd.Series(1/len(age_labels), 
#         #                                                                     index=age_labels)
#         #     # Allocate the unknowns proportionally
#         #     alloc = (props * unknown).round().astype(int)
#         #     # Add allocated unknowns to each age group count
#         #     in_counts += alloc



#         # For each age group, calculate stats and append them to the age_rate_records
#         for grp in age_labels:
#             active       = int(base_counts[grp])
#             orders_age   = int(order_counts[grp])
#             clients_age  = active
#             term         = int(term_counts[grp])
#             ent          = int(in_counts[grp])
#             rate         = round(term/active,4) if active else 0.0      # Avoid division by zero
#             retention    = 1 - rate if rate>=0 else 1.0

#             age_rate_records.append({
#                 "month":              tag,
#                 "age_group":          grp,
#                 "active_count":       active,
#                 "active_orders_age":  orders_age,
#                 "active_clients_age": clients_age,
#                 "entered":            ent,
#                 "terminations":       term,
#                 "termination_rate":   rate,
#                 "retention_rate":     retention
#             })

#     # Convert flow and age stats into DataFrames (tables)
#     flows_df = pd.DataFrame(flow_records)
#     ages_df  = pd.DataFrame(age_rate_records)

#     # Save the DataFrames as CSV files for later use
#     flows_df.to_csv(
#         os.path.join(output_base, f"yearonyear_flows_{first_month}_to_{last_month}.csv"),
#         index=False
#     )
#     ages_df.to_csv(
#         os.path.join(output_base, f"termination_and_entry_rates_by_age_{first_month}_to_{last_month}.csv"),
#         index=False
#     )

#     # --- Yearly summary ---
#     # Add a 'year' column for grouping
#     flows_df["year"] = pd.to_datetime(flows_df["month"], format="%Y-%m").dt.year
#     summary_records = []
#     # For each year, summarize all the flows and unique people across months
#     for year, grp in flows_df.groupby("year"):
#         months_in_year  = grp["month"].tolist()
#         entered_orders  = grp["entered"].sum()
#         exited_orders   = grp["exited"].sum()
#         active_orders   = grp["active_count_current"].sum()
#         entered_people  = len(set().union(*(entered_sets[m] for m in months_in_year)))
#         exited_people   = len(set().union(*(exited_sets[m]  for m in months_in_year)))
#         active_clients  = len(set().union(*(snapshots_cur[m]   for m in months_in_year)))

#         summary_records.append({
#             "year":           year,
#             #"entered_orders": entered_orders,     # (Commented out)
#             "entered_people": entered_people,
#             #"exited_orders":  exited_orders,      # (Commented out)
#             "exited_people":  exited_people,
#             #"active_orders":  active_orders,      # (Commented out)
#             "active_clients": active_clients
#         })

#     summary_df = pd.DataFrame(summary_records)
#     print("\n=== Yearly Summary: Orders & Clients ===")
#     print(summary_df.to_string(index=False))

#     # --- Monthly summary ---
#     monthly_records = []
#     for idx, row in flows_df.iterrows():
#         m = row["month"]
#         monthly_records.append({
#             "month":           m,
#             #"entered_orders":  row["entered"],   # (Commented out)
#             "entered_people":  len(entered_sets[m]),
#             #"exited_orders":   row["exited"],    # (Commented out)
#             "exited_people":   len(exited_sets[m]),
#             #"active_orders":   row["active_count_current"],    # (Commented out)
#             "active_clients":  len(snapshots_cur[m])
#         })

#     monthly_summary_df = pd.DataFrame(monthly_records)
#     print("\n=== Monthly Summary: Orders & Clients ===")
#     print(monthly_summary_df.to_string(index=False))
 
#     logging.info("Completed calculation of year-on-year flows and age rates")

#     # Return all four tables: flows, ages, yearly and monthly summaries
#     return flows_df, ages_df, summary_df, monthly_summary_df


In [None]:

# if __name__ == "__main__":
#     start_year = 2022
#     end_year = 2025
#     start_month = "2022-06"
#     end_month = "2025-06"

#     # Calculate historical flows and age rates
#     final_df, ages_df, summary_df, monthly_summary_df = calculate_yearonyear_flows_and_age_rates(
#          start_month, end_month,
#          redistribute_unknown_age=True
#      )


#     # Compute stop-flow forecast (NOTE: originally described as 2-year, but the call below uses periods=12 — confirm intended horizon)
#     sf_fc = stop_flow_forecast(ages_df, periods=12)
    
#     # # Pivot for plotting: month on x‐axis, each age a line
#     # pivot = sf_fc.pivot(index='month', columns='age_group', values='active_forecast')
#     # pivot.index = pd.to_datetime(pivot.index)
    
#     # Plot
#     # fig, ax = plt.subplots(figsize=(10, 6))
#     # for age, series in pivot.items():
#     #     ax.plot(
#     #         series.index, series.values,
#     #         label=f"{int(age)} yrs",
#     #         marker='o',
#     #         linewidth=2,
#     #         alpha=0.8
#     #     )
    
#     current_age_specific_deputyship_agg = ages_df.copy()
#     current_age_specific_deputyship_agg = current_age_specific_deputyship_agg.rename(
#         columns={
#             'age_group': 'age',
#             'active_count': 'active_caseloads',
#             'entered': 'new_deputyships',
#             'terminations': 'terminated'
#         }
#     )
    
    
#     forecasted_age_specific_deputyship_agg = sf_fc.copy()
#     forecasted_age_specific_deputyship_agg = forecasted_age_specific_deputyship_agg.rename(
#         columns={
#             'age_group': 'age',
#             'active_forecast': 'active_caseloads'
#         }
#     )
#     #forecasted_age_specific_deputyship_agg['month'] = pd.to_datetime(forecasted_age_specific_deputyship_agg['month'], format='%Y-%m')
#     #current_age_specific_deputyship_agg['month'] = pd.to_datetime(forecasted_age_specific_deputyship_agg['month'], format='%Y-%m')
#     # Final forecast and actuals
#     combined_table = get_combined_age_deputyship_table(current_age_specific_deputyship_agg, forecasted_age_specific_deputyship_agg)
#     # Ensure 'month' is datetime
#     #combined_table['month'] = pd.to_datetime(combined_table['month'], format='%Y-%m')
#     final_deputyship_historical_forecasts = combined_table
#     final_deputyship_historical_forecasts['month'] = pd.to_datetime(final_deputyship_historical_forecasts['month']).dt.strftime("%b-%y")
    
#     # Save in CSV
#     final_deputyship_historical_forecasts.to_csv(f"output/final_deputyship_historical_forecasts_{start_year}_{end_year}.csv")
#     # Plotting age-specific active caseloads, termination rate, and new deputyships over time
    

    
#     # # Active Caseloads by Age Group
#     # active_pivot = combined_table.pivot(index='month', columns='age', values='active_caseloads')

