In [None]:

!pip install pydbtools
!pip install --force numpy==1.24

import os
import calendar
import shutil
from datetime import datetime,timedelta
import pydbtools
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import pandas as pd
import boto3
import numpy as np
import matplotlib.pyplot as plt
import logging
import botocore
import s3fs
from dateutil.relativedelta import relativedelta

# suppress that specific pandas RuntimeWarning
import warnings
warnings.filterwarnings(
    "ignore",
    category=RuntimeWarning,
    message=".*invalid value encountered in cast.*"
)

# Ensure pydbtools is available
try:
    import pydbtools
except ImportError:
    raise ImportError("The 'pydbtools' package is required. Install it with: pip install pydbtools")

In [None]:


def clear_directory(path):
    """
    Empty the directory at `path` without removing the directory itself.

    Regular files and symbolic links directly under `path` are unlinked;
    sub-directories are removed recursively. A failure on a single entry is
    reported with print() and the remaining entries are still processed.
    os.listdir() runs outside the try block, so a missing or non-directory
    `path` raises to the caller.
    """
    for entry_name in os.listdir(path):
        target = os.path.join(path, entry_name)
        try:
            if os.path.isfile(target) or os.path.islink(target):
                remove = os.unlink       # plain file or symbolic link
            elif os.path.isdir(target):
                remove = shutil.rmtree   # directory and everything below it
            else:
                continue                 # neither kind: left untouched
            remove(target)
        except Exception as e:
            print(f'Failed to delete {target}. Reason: {e}')


def fetch_cases_for_date(run_date: str) -> pd.DataFrame:
    """
    Fetch all cases & their fee reductions for the given run_date (YYYY-MM-DD)
    using pydbtools.read_sql_query, which returns a pandas DataFrame.

    Only cases whose orderstatus is OPEN, ACTIVE or DUPLICATE are selected, and
    every table is restricted to the export partition
    glueexporteddate = run_date. Rows are ordered by the case order date.

    Args:
        run_date: snapshot date as 'YYYY-MM-DD'.

    Returns:
        DataFrame with one row per selected case row.
        NOTE(review): the other functions in this notebook read the columns in
        lower case (e.g. "casenumber", "ordertype") - confirm against the
        query engine's column naming.

    Raises:
        ValueError: if run_date is not a valid 'YYYY-MM-DD' date.
    """
    # run_date is interpolated into the SQL text several times, so it is
    # validated and normalised first: anything that is not exactly a calendar
    # date is rejected before a query string is ever built.
    try:
        run_date = datetime.strptime(str(run_date).strip(), "%Y-%m-%d").date().isoformat()
    except ValueError as exc:
        raise ValueError(
            f"run_date must be a valid 'YYYY-MM-DD' date, got {run_date!r}"
        ) from exc

    query = f"""
    WITH active_fee_reductions AS (
      SELECT
        fc.client_id,
        SUBSTRING(fr.type,1,1) || LOWER(SUBSTRING(fr.type,2)) AS type,
        DATE(fr.startdate) AS startdate,
        DATE(fr.enddate)   AS enddate,
        fc.payment_method
      FROM opg_sirius_prod.fee_reduction fr
      JOIN opg_sirius_prod.finance_client fc
        ON fc.id = fr.finance_client_id
       AND fc.glueexporteddate = DATE('{run_date}')
      JOIN (
        SELECT
          MAX(id)           AS id,
          finance_client_id
        FROM opg_sirius_prod.fee_reduction
        WHERE enddate           >= DATE('{run_date}')
          AND startdate         <= DATE('{run_date}')
          AND deleted            = FALSE
          AND glueexporteddate   = DATE('{run_date}')
        GROUP BY finance_client_id
      ) latest ON latest.id = fr.id
      WHERE fr.glueexporteddate = DATE('{run_date}')
    )
    SELECT
      c.glueexporteddate,
      c.caserecnumber            AS casenumber,
      c.uid                      AS siriusid,
      (
        SELECT supervisionlevel
        FROM opg_sirius_prod.supervision_level_log sll
        WHERE sll.order_id         = c.id
          AND sll.glueexporteddate = DATE('{run_date}')
        ORDER BY sll.appliesfrom DESC
        LIMIT 1
      ) AS casesupervisionlevel,
      p.risk_score               AS CREC,
      c.casesubtype              AS orderType,
      c.orderdate                AS ordermadedate,
      c.orderstatus              AS orderStatus,
      afr.type                   AS feereductiontype,
      p.dob,
      CASE
        WHEN FLOOR(DATE_DIFF('day', p.dob, p.createddate) / 365.25) < 0 THEN 0
        ELSE ROUND(DATE_DIFF('day', p.dob, p.createddate) / 365.25)
      END AS age_in_years
    FROM opg_sirius_prod.persons p
    JOIN opg_sirius_prod.cases c
      ON p.id                   = c.client_id
     AND c.glueexporteddate     = DATE('{run_date}')
    LEFT JOIN active_fee_reductions afr
      ON afr.client_id          = p.id
    WHERE c.orderstatus IN ('OPEN','ACTIVE','DUPLICATE')
      AND p.glueexporteddate     = DATE('{run_date}')
    ORDER BY c.orderdate;
    """
    return pydbtools.read_sql_query(query)


def parse_month(month_str: str) -> datetime:
    """
    Parse a 'YYYY-MM' string into a datetime.

    Surrounding whitespace is removed first, then any leading/trailing single
    or double quote characters, so input such as " '2023-05' " is accepted.
    """
    return datetime.strptime(month_str.strip().strip("'\""), "%Y-%m")


def generate_month_list(start_month: str, end_month: str):
    """
    Build the list of month-start datetimes covering start_month..end_month.

    Both bounds are 'YYYY-MM' strings parsed with parse_month() and both are
    included in the result. Raises ValueError when start_month is later than
    end_month.
    """
    first = parse_month(start_month)
    last = parse_month(end_month)
    if first > last:
        raise ValueError(f"Start month ({start_month}) is after end month ({end_month})")

    # Whole months between the two bounds; 0 when they are the same month.
    span = (last.year - first.year) * 12 + (last.month - first.month)
    return [first + relativedelta(months=offset) for offset in range(span + 1)]

def last_day_of_month(dt: datetime) -> str:
    """Format the final calendar day of the month containing `dt` as 'YYYY-MM-DD'."""
    # monthrange() returns (weekday of the 1st, number of days in the month).
    _first_weekday, days_in_month = calendar.monthrange(dt.year, dt.month)
    month_end = dt.replace(day=days_in_month)
    return format(month_end, "%Y-%m-%d")
        
def export_monthly_reports(first_month: str, last_month: str, output_base="output"):
    """
    Export one case snapshot per month as CSV files plus a combined workbook.

    For every month from first_month to last_month (inclusive) the month-end
    snapshot is fetched with fetch_cases_for_date() and written to:
      * <output_base>/<YYYY-MM>/cases_<YYYY-MM>.csv
      * a sheet named <YYYY-MM> in <output_base>/cases_<first>_to_<last>.xlsx
    All snapshots, each tagged with a 'month' column, are also concatenated
    into <output_base>/all_cases_<first>_to_<last>.csv.

    Args:
        first_month: first month as 'YYYY-MM' (surrounding quotes/whitespace
            are stripped).
        last_month: last month as 'YYYY-MM', inclusive.
        output_base: directory that receives all output; created if missing.

    Returns:
        None.

    Raises:
        ValueError: propagated from generate_month_list() when a month cannot
            be parsed or first_month is after last_month.
    """
    # Clean inputs
    clean_first = first_month.strip().strip("'\"")
    clean_last = last_month.strip().strip("'\"")

    # Generate all months
    months = generate_month_list(clean_first, clean_last)
    if not months:
        print("No months in range; nothing to do.")
        return

    # Prepare output directory
    os.makedirs(output_base, exist_ok=True)
    excel_filename = f"cases_{clean_first}_to_{clean_last}.xlsx"
    excel_path = os.path.join(output_base, excel_filename)

    # Remove a workbook left over from an earlier run. The previous code passed
    # this *file* path to clear_directory(), whose os.listdir() call raised
    # before anything was exported.
    if os.path.isfile(excel_path):
        os.remove(excel_path)

    # List to accumulate each month's DataFrame
    all_months = []

    # Create Excel workbook and write each month's sheet
    with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
        for dt in months:
            month_tag = dt.strftime("%Y-%m")
            run_date = last_day_of_month(dt)

            # Fetch data for this month-end
            df = fetch_cases_for_date(run_date)

            # Tag the DataFrame with its month, then collect it
            df["month"] = month_tag
            all_months.append(df)

            # Save CSV for this month
            month_folder = os.path.join(output_base, month_tag)
            os.makedirs(month_folder, exist_ok=True)
            csv_path = os.path.join(month_folder, f"cases_{month_tag}.csv")
            df.to_csv(csv_path, index=False)

            # Add to Excel workbook
            df.to_excel(writer, sheet_name=month_tag, index=False)

            print(f"→ Saved CSV for {month_tag}: {csv_path}")

    # The workbook is only written to disk when the writer closes, so the
    # confirmation is printed after the with-block.
    print(f"→ Combined Excel workbook saved at: {excel_path}")

    # After all sheets are written, concatenate & export one big CSV
    if all_months:
        combined_df = pd.concat(all_months, ignore_index=True)
        combined_csv_path = os.path.join(
            output_base,
            f"all_cases_{clean_first}_to_{clean_last}.csv"
        )
        combined_df.to_csv(combined_csv_path, index=False)
        print(f"→ Combined CSV for all months saved at: {combined_csv_path}")
        

def calculate_monthly_active_cases(first_month: str, last_month: str, output_base="output") -> pd.DataFrame:
    """
    Count unique ACTIVE case numbers per order type for each month-end.

    For each month between first_month and last_month (inclusive) the month-end
    snapshot is fetched, filtered to rows whose orderstatus is "ACTIVE", and
    aggregated to the number of unique casenumber values per ordertype.
    A month with no ACTIVE rows contributes a single row with ordertype None
    and a count of 0.

    Writes 'monthly_active_cases_<first>_to_<last>.csv' under output_base.

    Args:
        first_month: first month as 'YYYY-MM' (quotes/whitespace stripped).
        last_month: last month as 'YYYY-MM', inclusive.
        output_base: output directory; created if missing.

    Returns:
        DataFrame with columns month, ordertype, active_case_count.

    Raises:
        ValueError: propagated from generate_month_list() for unparseable
            months or when first_month is after last_month.
    """
    # Clean inputs
    clean_first = first_month.strip().strip("'\"")
    clean_last = last_month.strip().strip("'\"")

    # Generate all months
    months = generate_month_list(clean_first, clean_last)
    if not months:
        print("No months in range; nothing to do.")
        return pd.DataFrame()

    # Prepare output directory
    os.makedirs(output_base, exist_ok=True)

    # List to collect each month's summary
    summaries = []

    for dt in months:
        month_tag = dt.strftime("%Y-%m")
        run_date = last_day_of_month(dt)

        # Fetch data for this month-end
        df = fetch_cases_for_date(run_date)

        # Keep only ACTIVE cases
        df_active = df[df["orderstatus"] == "ACTIVE"].copy()
        if df_active.empty:
            # Still record a zero count for completeness. The key must be
            # "ordertype" (lower case), matching the grouped summaries below;
            # otherwise the concatenated result gets two separate columns.
            summaries.append(
                pd.DataFrame([{"month": month_tag, "ordertype": None, "active_case_count": 0}])
            )
            continue

        # Tag with month for grouping
        df_active["month"] = month_tag

        # Aggregate unique casenumbers per ordertype
        summary = (
            df_active
            .groupby(["month", "ordertype"])["casenumber"]
            .nunique()
            .reset_index(name="active_case_count")
        )
        summaries.append(summary)

        print(f"→ Aggregated ACTIVE cases for {month_tag}")

    # Combine all month summaries
    result_df = pd.concat(summaries, ignore_index=True)

    # Fill any missing ordertype/month combinations with zeros if you want the
    # full matrix:
    # pivot = result_df.pivot_table(index="month", columns="ordertype",
    #                               values="active_case_count", fill_value=0).reset_index()

    # Write out CSV
    out_csv = os.path.join(
        output_base,
        f"monthly_active_cases_{clean_first}_to_{clean_last}.csv"
    )
    result_df.to_csv(out_csv, index=False)
    print(f"→ Monthly ACTIVE cases CSV saved at: {out_csv}")

    return result_df

# Forecasting Active Caseloads:

## entered: count of cases newly appearing in the active caseload each month

## exited: count of cases that dropped out since the prior month

def calculate_monthly_flow(first_month: str, last_month: str, output_base="output") -> pd.DataFrame:
    """
    Month-on-month entries to and exits from the fetched caseload.

    For each month from first_month to last_month (inclusive) the set of unique
    casenumber values returned by fetch_cases_for_date() for the month-end is
    recorded. Each month is then compared with the month before it:
      * entered - case numbers present now but not in the previous month
      * exited  - case numbers present in the previous month but not now
    The first month has no predecessor, so all of its cases count as entered
    and none as exited.

    Writes 'monthly_flow_<first>_to_<last>.csv' under output_base and returns
    the DataFrame with columns month, entered, exited.
    """
    # Normalise the month arguments (also used in the output file name).
    clean_first = first_month.strip().strip("'\"")
    clean_last  = last_month.strip().strip("'\"")

    months = generate_month_list(clean_first, clean_last)
    if not months:
        print("No months in range; nothing to do.")
        return pd.DataFrame()

    # One set of case numbers per "YYYY-MM" tag.
    snapshots = {}
    for month_start in months:
        tag = month_start.strftime("%Y-%m")
        month_end = last_day_of_month(month_start)
        cases = fetch_cases_for_date(month_end)
        snapshots[tag] = set(cases["casenumber"].unique())
        print(f"→ Snapshot for {tag}: {len(snapshots[tag])} active cases")

    # Walk the tags in sorted order, comparing each with its predecessor.
    ordered_tags = sorted(snapshots)
    flow_records = []
    for position, tag in enumerate(ordered_tags):
        present = snapshots[tag]
        if position == 0:
            n_entered, n_exited = len(present), 0
        else:
            earlier = snapshots[ordered_tags[position - 1]]
            n_entered = len(present - earlier)
            n_exited = len(earlier - present)
        flow_records.append({"month": tag, "entered": n_entered, "exited": n_exited})

    # Persist and return.
    flow_df = pd.DataFrame(flow_records)
    os.makedirs(output_base, exist_ok=True)
    out_csv = os.path.join(output_base, f"monthly_flow_{clean_first}_to_{clean_last}.csv")
    flow_df.to_csv(out_csv, index=False)
    print(f"→ Monthly flow CSV saved at: {out_csv}")

    return flow_df


def _allocate_by_largest_remainder(shares: pd.Series, total: int) -> pd.Series:
    """Split the integer `total` across `shares` so the integer parts sum to `total`."""
    exact = shares * total
    parts = (exact // 1).astype(int)          # floor of every exact allocation
    leftover = max(int(total) - int(parts.sum()), 0)
    if leftover:
        # Hand the remaining units to the largest fractional parts; the stable
        # sort keeps ties in index order so the result is deterministic.
        fractions = exact - parts
        winners = fractions.sort_values(ascending=False, kind="stable").index[:leftover]
        parts.loc[list(winners)] += 1
    return parts


def calculate_yearonyear_flows_and_age_rates(
    first_month: str,
    last_month: str,
    output_base: str = "output",
    redistribute_unknown_age: bool = False,
    age_bins: tuple = None,
    age_labels: tuple = None
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Year-on-year caseload flows and per-age-group entry/termination figures.

    For every month in first_month..last_month whose snapshot twelve months
    earlier also lies inside the range, the month-end snapshot is compared with
    the snapshot one year before:
      * entered - case numbers in the current snapshot but not the prior one
      * exited  - case numbers in the prior snapshot but not the current one
    Per age group the function records the prior-year base count, the entries,
    the terminations and termination_rate = terminations / base (0.0 when the
    base is empty). Months in the first year of the range are skipped because
    they have no prior-year snapshot inside the range.

    Args:
        first_month: first month as 'YYYY-MM' (quotes/whitespace stripped).
        last_month: last month as 'YYYY-MM', inclusive.
        output_base: output directory; created if missing.
            NOTE(review): every existing entry in this directory is deleted
            before the two CSVs are written (original behaviour, kept as is).
        redistribute_unknown_age: when True, entered cases without an age
            group are spread over the age groups in proportion to the
            known-age entries (evenly when there are none), using
            largest-remainder rounding so the redistributed total is kept.
        age_bins: bin edges passed to pd.cut (left-closed intervals).
        age_labels: labels for the bins. If either age_bins or age_labels is
            None, both default to 1-year bins for ages 0-120.

    Returns:
        (flows_df, ages_df). flows_df has columns month, entered, exited;
        ages_df has columns month, age_group, active_count, entered,
        terminations, termination_rate. Both are empty, and nothing is written
        or deleted, when no month in the range has a prior-year snapshot.

    Raises:
        ValueError: propagated from generate_month_list() for unparseable
            months or when first_month is after last_month.
    """
    os.makedirs(output_base, exist_ok=True)

    # Same input clean-up as the other report functions, so file names never
    # carry stray quotes or whitespace.
    clean_first = first_month.strip().strip("'\"")
    clean_last = last_month.strip().strip("'\"")

    # Default bins: 1-year ages 0-120
    if age_bins is None or age_labels is None:
        age_bins = list(range(0, 122))
        age_labels = [str(a) for a in age_bins[:-1]]

    months = generate_month_list(clean_first, clean_last)
    range_start = parse_month(clean_first)   # loop-invariant, parsed once
    flow_records = []
    age_rate_records = []

    # File-name suffix. It is set once any month had unknown ages
    # redistributed and is never reset, so it describes the whole run rather
    # than only the last month processed.
    missing = ""

    for dt in months:
        prev_dt = dt - relativedelta(years=1)
        if prev_dt < range_start:
            continue

        tag = dt.strftime("%Y-%m")
        # fetch current and prior snapshots
        df_cur = fetch_cases_for_date(last_day_of_month(dt))
        df_prev = fetch_cases_for_date(last_day_of_month(prev_dt))

        set_cur = set(df_cur["casenumber"])
        set_prev = set(df_prev["casenumber"])
        entered_set = set_cur - set_prev
        exited_set  = set_prev - set_cur

        # record overall flows
        flow_records.append({
            "month": tag,
            "entered": len(entered_set),
            "exited": len(exited_set)
        })

        # slice data for age groups
        df_term = df_prev[df_prev["casenumber"].isin(exited_set)].copy()
        df_in  = df_cur [df_cur ["casenumber"].isin(entered_set)].copy()
        df_base= df_prev.copy()

        # assign age_group
        for df_ in (df_term, df_in, df_base):
            df_["age_group"] = pd.cut(
                df_["age_in_years"],
                bins=age_bins,
                labels=age_labels,
                right=False,
                include_lowest=True
            )

        # count known-age inflows
        in_counts = df_in.groupby("age_group", observed=False)["casenumber"].nunique().reindex(age_labels, fill_value=0)
        # Unknown-age inflows are counted as unique case numbers, the same
        # unit as in_counts (a row count would over-count cases with several rows).
        unknown = df_in.loc[df_in["age_group"].isna(), "casenumber"].nunique()
        # Always computed so it is defined for the summary printed after the loop.
        total_known = in_counts.sum()

        if redistribute_unknown_age and unknown > 0:
            missing = "redistributed_missing"
            # compute proportions
            if total_known > 0:
                props = in_counts / total_known
            else:
                props = pd.Series(1 / len(in_counts), index=in_counts.index)
            in_counts = in_counts + _allocate_by_largest_remainder(props, unknown)

        ter_counts  = df_term.groupby("age_group", observed=False)["casenumber"].nunique().reindex(age_labels, fill_value=0)
        base_counts = df_base.groupby("age_group", observed=False)["casenumber"].nunique().reindex(age_labels, fill_value=0)

        # compile age-rate records
        for grp in age_labels:
            active = int(base_counts[grp])
            term   = int(ter_counts[grp])
            ent    = int(in_counts[grp])
            rate   = round(term/active, 4) if active else 0.0
            age_rate_records.append({
                "month": tag,
                "age_group": grp,
                "active_count": active,
                "entered": ent,
                "terminations": term,
                "termination_rate": rate
            })

    if not flow_records:
        # Nothing was processed: the loop-scoped summaries do not exist and
        # there is nothing worth wiping the output directory for.
        print("No month in range has a prior-year snapshot inside the range; nothing to do.")
        return (
            pd.DataFrame(columns=["month", "entered", "exited"]),
            pd.DataFrame(columns=[
                "month", "age_group", "active_count",
                "entered", "terminations", "termination_rate",
            ]),
        )

    # Diagnostics for the last month processed.
    print(f"count known-age inflows: {in_counts}")
    print(f"count unknown-age inflows: {unknown}")
    print(f"termination counts: {ter_counts}")
    print(f"base counts: {base_counts}")
    print(f"total known-age inflows: {total_known}")

    flows_df = pd.DataFrame(flow_records)
    ages_df  = pd.DataFrame(age_rate_records)

    # NOTE(review): this deletes everything already in output_base, including
    # files written by the other report functions - confirm this is intended.
    clear_directory(os.path.join(output_base))
    # write outputs
    flows_df.to_csv(os.path.join(output_base, f"yearonyear_flows_{clean_first}_to_{clean_last}_{missing}.csv"), index=False)
    ages_df .to_csv(os.path.join(output_base, f"termination_and_entry_rates_by_age_{clean_first}_to_{clean_last}_{missing}.csv"), index=False)

    print(f"→ Saved flows to {output_base}/yearonyear_flows_{clean_first}_to_{clean_last}_{missing}.csv")
    print(f"→ Saved age rates to {output_base}/termination_and_entry_rates_by_age_{clean_first}_to_{clean_last}_{missing}.csv")

    return flows_df, ages_df




In [None]:
# # Suppose entry_counts_by_age is from 2023-2025
# age_distribution = entry_counts_by_age / entry_counts_by_age.sum()
# print(age_distribution)

# # Distribute forecasted totals (one row per month)
# forecast_by_age = pd.DataFrame({
#     "month": forecast.index,
#     **{age: forecast.values * prop for age, prop in age_distribution.items()}
# })
# print(forecast_by_age)

In [None]:
# from statsmodels.tsa.holtwinters import ExponentialSmoothing

# flows_df["month"] = pd.to_datetime(flows_df["month"])
# monthly_new_orders = flows_df.set_index("month")["entered"].resample("M").sum()

# model = ExponentialSmoothing(monthly_new_orders, trend="add", seasonal="add", seasonal_periods=12)
# fit = model.fit()
# forecast = fit.forecast(12)  # Predict 12 months into 2025


In [None]:
# if __name__ == "__main__":
#     first_month = input("Enter first month (YYYY-MM): ")
#     last_month  = input("Enter last month  (YYYY-MM): ")

    # export_monthly_reports(first_month, last_month)
    # active_summary = calculate_monthly_active_cases(first_month, last_month)
    # flow_summary  = calculate_monthly_flow(first_month, last_month)

    # # now do month‐to‐month style year‐on‐year deputyship flows:
    
    # # compute year-on-year flows + age rates across all months
    # yoy_flows, age_rates = calculate_yearonyear_flows_and_age_rates(first_month, last_month)

    # # after export_monthly_reports("2023-05", "2025-05")
    # print("\nYear-on-year deputyship flows:")
    # print(yoy_flows)
    # # for each month and age‐group, the number of entered (new deputyships) alongside terminations and termination rates:
    # print("\nAge-specific termination rates over time:")
    # print(age_rates)





# # 1) Run without redistribution
# flows1, rates1 = calculate_yearonyear_flows_and_age_rates("2023-04","2025-04", "out1")

# # 2) Run with redistribution
# flows2, rates2 = calculate_yearonyear_flows_and_age_rates("2023-04","2025-04", "out2", redistribute_unknown_age=True)

In [None]:
# def forecast_monthly_new_orders(
#     flows_df: pd.DataFrame,
#     periods: int = 12,
#     freq: str = 'M'
# ) -> pd.Series:
#     """
#     Forecast total monthly new orders using Holt-Winters.
#     Expects flows_df with columns ['month','entered'].
#     Returns a pd.Series indexed by forecast months.
#     """
#     ts = flows_df.copy()
#     ts['month'] = pd.to_datetime(ts['month'])
#     ts = ts.set_index('month').resample(freq)['entered'].sum()

#     model = ExponentialSmoothing(ts, trend='add', seasonal='add', seasonal_periods=12)
#     fit = model.fit(optimized=True)
#     forecast = fit.forecast(periods)
#     return forecast


# def distribute_forecast_by_age(
#     forecast: pd.Series,
#     age_rates_df: pd.DataFrame,
#     weight_years = (2023, 2025)
# ) -> pd.DataFrame:
#     """
#     Distribute forecast totals by age group using average historical proportions.
#     """
#     # filter historical entries
#     hist = age_rates_df.copy()
#     hist['month'] = pd.to_datetime(hist['month'])
#     hist = hist[hist['month'].dt.year.between(*weight_years)]

#     # compute average distribution
#     dist = (
#         hist.groupby("age_group", observed=False)['entered'].sum()
#         / hist['entered'].sum()
#     )

#     # build DataFrame
#     rows = []
#     for month, total in forecast.items():
#         for age, prop in dist.items():
#             rows.append({'month': month.strftime('%Y-%m'), 'age_group': age, 'forecasted_new': round(total * prop)})
#     return pd.DataFrame(rows)


# def plot_forecast_and_distribution(
#     forecast: pd.Series,
#     forecast_by_age: pd.DataFrame
# ):
#     """
#     Generates two charts:
#       1. Total new orders forecast
#       2. Stacked area by age group
#     """
#     # 1. Total forecast
#     plt.figure()
#     forecast.plot(marker='o')
#     plt.title('Monthly New Orders Forecast for 2026')
#     plt.ylabel('New Orders')
#     plt.xlabel('Month')
#     plt.tight_layout()
#     plt.show()

#     # 2. Stacked area chart
#     pivot = forecast_by_age.pivot(index='month', columns='age_group', values='forecasted_new')
#     pivot.index = pd.to_datetime(pivot.index)
#     plt.figure()
#     pivot.plot.area()
#     plt.title('Forecasted New Orders by Age Group')
#     plt.ylabel('New Orders')
#     plt.xlabel('Month')
#     plt.tight_layout()
#     plt.show()


In [None]:
# flows, ages = calculate_yearonyear_flows_and_age_rates("2023-01", "2025-06", redistribute_unknown_age=True)
# forecast = forecast_monthly_new_orders(flows, periods=12)
# by_age = distribute_forecast_by_age(forecast, ages, weight_years=(2023,2025))
# plot_forecast_and_distribution(forecast, by_age)


In [None]:

# 1) EXISTING HELPERS:
#flows_df, ages_df = calculate_yearonyear_flows_and_age_rates("2023-01", "2025-06", redistribute_unknown_age=True)

# 2) NEW: Forecast with prediction intervals
def forecast_with_pi(
    flows_df: pd.DataFrame,
    periods: int = 12,
    alpha: float = 0.05,
    seasonal_periods: int = 12
) -> pd.DataFrame:
    """
    Forecast monthly new orders with (1 - alpha) prediction intervals.

    The 'entered' column of flows_df is summed per month-end and fitted with
    Holt-Winters exponential smoothing: additive trend plus additive
    seasonality when at least two full seasonal cycles are available,
    otherwise additive trend only.

    The interval is mean +/- z * sigma, where sigma is the standard deviation
    of the in-sample residuals and z is the standard-normal quantile for
    1 - alpha/2 (about 1.96 for alpha=0.05). This is a constant-width normal
    approximation; it does not widen with the forecast horizon.

    Args:
        flows_df: DataFrame with columns 'month' (parseable by
            pd.to_datetime) and 'entered'.
        periods: number of months to forecast.
        alpha: significance level; the interval covers 1 - alpha.
        seasonal_periods: season length in months; falsy disables seasonality.

    Returns:
        DataFrame with columns ['mean','lower','upper'] indexed by the
        forecast period ends.

    Raises:
        ValueError: if alpha is not strictly between 0 and 1.
    """
    from statistics import NormalDist  # stdlib; local import keeps the cell self-contained

    if not 0 < alpha < 1:
        raise ValueError(f"alpha must be strictly between 0 and 1, got {alpha!r}")

    ts = flows_df.set_index(pd.to_datetime(flows_df['month']))['entered'].resample('ME').sum()
    nobs = len(ts)
    # Determine model configuration
    if seasonal_periods and nobs >= 2 * seasonal_periods:
        model = ExponentialSmoothing(ts, trend='add', seasonal='add', seasonal_periods=seasonal_periods)
    else:
        model = ExponentialSmoothing(ts, trend='add', seasonal=None)
    fit = model.fit(optimized=True)

    mu = fit.forecast(periods)
    resid = fit.resid.dropna()
    sigma = resid.std()
    # z must be a unit-free normal quantile. The previous code used an
    # empirical residual quantile (already in data units) and then multiplied
    # it by sigma, so the interval width scaled with the residual variance.
    z = NormalDist().inv_cdf(1 - alpha / 2)
    lower = mu - z * sigma
    upper = mu + z * sigma
    return pd.DataFrame({'mean': mu, 'lower': lower, 'upper': upper})


def distribute_pi_by_age(
    pi_df: pd.DataFrame,
    ages_df: pd.DataFrame,
    weight_years: tuple[int, int] = (2023, 2025)
) -> pd.DataFrame:
    """
    Split each forecast row across age groups using historical entry shares.

    The share of an age group is its total 'entered' count divided by the
    total 'entered' count of all groups, taken over the rows of ages_df whose
    month falls in a year between weight_years[0] and weight_years[1]
    (inclusive). Every row of pi_df (mean, lower, upper) is multiplied by each
    share, giving one output row per (forecast month, age group).

    ages_df is not modified. Returns a DataFrame with columns month
    ('YYYY-MM'), age_group, mean, lower, upper.
    """
    history = ages_df.copy()
    history['month'] = pd.to_datetime(history['month'])
    in_window = history['month'].dt.year.between(*weight_years)
    history = history[in_window]

    entered_by_age = history.groupby("age_group", observed=False)['entered'].sum()
    shares = entered_by_age / history['entered'].sum()

    records = [
        {
            'month': period.strftime('%Y-%m'),
            'age_group': group,
            'mean':   forecast['mean'] * share,
            'lower':  forecast['lower'] * share,
            'upper':  forecast['upper'] * share,
        }
        for period, forecast in pi_df.iterrows()
        for group, share in shares.items()
    ]
    return pd.DataFrame(records)


def plot_total_forecast(pi_df: pd.DataFrame):
    """
    Plot the forecast mean as a marked line over a shaded lower/upper band.

    Expects pi_df to have 'mean', 'lower' and 'upper' columns; its index is
    used for the x axis. Returns the matplotlib Figure.
    """
    fig, ax = plt.subplots()
    periods = pi_df.index
    ax.fill_between(periods, pi_df['lower'], pi_df['upper'], alpha=0.3)
    ax.plot(periods, pi_df['mean'], marker='o')
    ax.set_title('Monthly New Orders Forecast (95% PI)')
    ax.set_xlabel('Month')
    ax.set_ylabel('New Orders')
    fig.tight_layout()
    return fig


def plot_age_forecast(pi_age_df: pd.DataFrame):
    """
    Draw a stacked area chart of the forecast mean per age group.

    pi_age_df is pivoted to one row per 'month' and one column per
    'age_group' holding 'mean'; the month labels are converted to datetimes
    for the x axis. Returns the matplotlib Figure.
    """
    mean_by_age = pi_age_df.pivot(index='month', columns='age_group', values='mean')
    mean_by_age.index = pd.to_datetime(mean_by_age.index)

    fig, ax = plt.subplots()
    mean_by_age.plot.area(ax=ax)
    ax.set(
        title='Forecasted New Orders by Age Group (Mean)',
        xlabel='Month',
        ylabel='New Orders',
    )
    fig.tight_layout()
    return fig
    

if __name__ == "__main__":
    # End-to-end run: build the year-on-year flow and per-age tables, forecast
    # the monthly entries, split the forecast by age group, then plot both.
    # calculate_yearonyear_flows_and_age_rates skips months whose prior-year
    # snapshot would fall before the first month, so for this range the first
    # month with a flow record is 2024-01.
    flows, ages = calculate_yearonyear_flows_and_age_rates(
        "2023-01", "2025-06",
        redistribute_unknown_age=True
    )

    # 12-month forecast with prediction-interval bounds, then the same figures
    # split by age group.
    # NOTE(review): weight_years=(2022,2024) differs from the function default
    # (2023, 2025) and leaves the 2025 months out of the age shares - confirm
    # this is intended.
    pi     = forecast_with_pi(flows, periods=12)
    pi_age = distribute_pi_by_age(pi, ages, weight_years=(2022,2024))

    # Both helpers return the Figure they drew.
    fig1 = plot_total_forecast(pi)
    fig2 = plot_age_forecast(pi_age)



In [None]:

# export_monthly_reports("2023-04","2023-05")

# active_summary = calculate_monthly_active_cases("2023-04","2023-05")
# print(active_summary.head())

# flow_summary  = calculate_monthly_flow("2023-04","2023-05")
# print(flow_summary.head())


In [None]:
!pip install pydbtools
# !pip install --force numpy==1.24
#!pip install pydbtools
# !pip install --force numpy==1.24

import os
import calendar
import shutil
from datetime import datetime,timedelta
import pydbtools
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import pandas as pd

import boto3
import numpy as np
import matplotlib.pyplot as plt
import logging
import botocore
import s3fs
from dateutil.relativedelta import relativedelta

In [None]:


def parse_month(month_str: str) -> datetime:
    """
    Convert a 'YYYY-MM' string to a datetime.

    Leading/trailing whitespace is dropped first, followed by any surrounding
    single or double quote characters.
    """
    unquoted = month_str.strip().strip("'\"")
    parsed = datetime.strptime(unquoted, "%Y-%m")
    return parsed


def clear_directory(path):
    """
    Empty the directory at `path` without removing the directory itself.

    Regular files and symbolic links directly under `path` are unlinked;
    sub-directories are removed recursively. A failure on a single entry is
    reported with print() and the remaining entries are still processed.
    os.listdir() runs outside the try block, so a missing or non-directory
    `path` raises to the caller.
    """
    for entry_name in os.listdir(path):
        target = os.path.join(path, entry_name)
        try:
            if os.path.isfile(target) or os.path.islink(target):
                remove = os.unlink       # plain file or symbolic link
            elif os.path.isdir(target):
                remove = shutil.rmtree   # directory and everything below it
            else:
                continue                 # neither kind: left untouched
            remove(target)
        except Exception as e:
            print(f'Failed to delete {target}. Reason: {e}')


def fetch_cases_for_date(run_date: str) -> pd.DataFrame:
    """
    Fetch all cases & their fee reductions for the given run_date (YYYY-MM-DD)
    using pydbtools.read_sql_query, which returns a pandas DataFrame.

    Only cases whose orderstatus is OPEN, ACTIVE or DUPLICATE are selected, and
    every table is restricted to the export partition
    glueexporteddate = run_date. Rows are ordered by the case order date.

    Args:
        run_date: snapshot date as 'YYYY-MM-DD'.

    Returns:
        DataFrame with one row per selected case row.
        NOTE(review): the other functions in this notebook read the columns in
        lower case (e.g. "casenumber", "ordertype") - confirm against the
        query engine's column naming.

    Raises:
        ValueError: if run_date is not a valid 'YYYY-MM-DD' date.
    """
    # run_date is interpolated into the SQL text several times, so it is
    # validated and normalised first: anything that is not exactly a calendar
    # date is rejected before a query string is ever built.
    try:
        run_date = datetime.strptime(str(run_date).strip(), "%Y-%m-%d").date().isoformat()
    except ValueError as exc:
        raise ValueError(
            f"run_date must be a valid 'YYYY-MM-DD' date, got {run_date!r}"
        ) from exc

    query = f"""
    WITH active_fee_reductions AS (
      SELECT
        fc.client_id,
        SUBSTRING(fr.type,1,1) || LOWER(SUBSTRING(fr.type,2)) AS type,
        DATE(fr.startdate) AS startdate,
        DATE(fr.enddate)   AS enddate,
        fc.payment_method
      FROM opg_sirius_prod.fee_reduction fr
      JOIN opg_sirius_prod.finance_client fc
        ON fc.id = fr.finance_client_id
       AND fc.glueexporteddate = DATE('{run_date}')
      JOIN (
        SELECT
          MAX(id)           AS id,
          finance_client_id
        FROM opg_sirius_prod.fee_reduction
        WHERE enddate           >= DATE('{run_date}')
          AND startdate         <= DATE('{run_date}')
          AND deleted            = FALSE
          AND glueexporteddate   = DATE('{run_date}')
        GROUP BY finance_client_id
      ) latest ON latest.id = fr.id
      WHERE fr.glueexporteddate = DATE('{run_date}')
    )
    SELECT
      c.glueexporteddate,
      c.caserecnumber            AS casenumber,
      c.uid                      AS siriusid,
      (
        SELECT supervisionlevel
        FROM opg_sirius_prod.supervision_level_log sll
        WHERE sll.order_id         = c.id
          AND sll.glueexporteddate = DATE('{run_date}')
        ORDER BY sll.appliesfrom DESC
        LIMIT 1
      ) AS casesupervisionlevel,
      p.risk_score               AS CREC,
      c.casesubtype              AS orderType,
      c.orderdate                AS ordermadedate,
      c.orderstatus              AS orderStatus,
      afr.type                   AS feereductiontype,
      p.dob,
      CASE
        WHEN FLOOR(DATE_DIFF('day', p.dob, p.createddate) / 365.25) < 0 THEN 0
        ELSE ROUND(DATE_DIFF('day', p.dob, p.createddate) / 365.25)
      END AS age_in_years
    FROM opg_sirius_prod.persons p
    JOIN opg_sirius_prod.cases c
      ON p.id                   = c.client_id
     AND c.glueexporteddate     = DATE('{run_date}')
    LEFT JOIN active_fee_reductions afr
      ON afr.client_id          = p.id
    WHERE c.orderstatus IN ('OPEN','ACTIVE','DUPLICATE')
      AND p.glueexporteddate     = DATE('{run_date}')
    ORDER BY c.orderdate;
    """
    return pydbtools.read_sql_query(query)


def generate_month_list(start_month: str, end_month: str):
    """
    Build the list of month-start datetimes covering start_month..end_month.

    Both bounds are 'YYYY-MM' strings parsed with parse_month() and both are
    included in the result. Raises ValueError when start_month is later than
    end_month.
    """
    first = parse_month(start_month)
    last = parse_month(end_month)
    if first > last:
        raise ValueError(f"Start month ({start_month}) is after end month ({end_month})")

    # Whole months between the two bounds; 0 when they are the same month.
    span = (last.year - first.year) * 12 + (last.month - first.month)
    return [first + relativedelta(months=offset) for offset in range(span + 1)]

def last_day_of_month(dt: datetime) -> str:
    """Format the final calendar day of the month containing `dt` as 'YYYY-MM-DD'."""
    # monthrange() returns (weekday of the 1st, number of days in the month).
    _first_weekday, days_in_month = calendar.monthrange(dt.year, dt.month)
    month_end = dt.replace(day=days_in_month)
    return format(month_end, "%Y-%m-%d")
        
def export_monthly_reports(first_month: str, last_month: str, output_base="output"):
    """
    Export one case snapshot per month as CSV files plus a combined workbook.

    For every month from first_month to last_month (inclusive) the month-end
    snapshot is fetched with fetch_cases_for_date() and written to:
      * <output_base>/<YYYY-MM>/cases_<YYYY-MM>.csv
      * a sheet named <YYYY-MM> in <output_base>/cases_<first>_to_<last>.xlsx
    All snapshots, each tagged with a 'month' column, are also concatenated
    into <output_base>/all_cases_<first>_to_<last>.csv.

    Args:
        first_month: first month as 'YYYY-MM' (surrounding quotes/whitespace
            are stripped).
        last_month: last month as 'YYYY-MM', inclusive.
        output_base: directory that receives all output; created if missing.

    Returns:
        None.

    Raises:
        ValueError: propagated from generate_month_list() when a month cannot
            be parsed or first_month is after last_month.
    """
    # Clean inputs
    clean_first = first_month.strip().strip("'\"")
    clean_last = last_month.strip().strip("'\"")

    # Generate all months
    months = generate_month_list(clean_first, clean_last)
    if not months:
        print("No months in range; nothing to do.")
        return

    # Prepare output directory
    os.makedirs(output_base, exist_ok=True)
    excel_filename = f"cases_{clean_first}_to_{clean_last}.xlsx"
    excel_path = os.path.join(output_base, excel_filename)

    # Remove a workbook left over from an earlier run. The previous code passed
    # this *file* path to clear_directory(), whose os.listdir() call raised
    # before anything was exported.
    if os.path.isfile(excel_path):
        os.remove(excel_path)

    # List to accumulate each month's DataFrame
    all_months = []

    # Create Excel workbook and write each month's sheet
    with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
        for dt in months:
            month_tag = dt.strftime("%Y-%m")
            run_date = last_day_of_month(dt)

            # Fetch data for this month-end
            df = fetch_cases_for_date(run_date)

            # Tag the DataFrame with its month, then collect it
            df["month"] = month_tag
            all_months.append(df)

            # Save CSV for this month
            month_folder = os.path.join(output_base, month_tag)
            os.makedirs(month_folder, exist_ok=True)
            csv_path = os.path.join(month_folder, f"cases_{month_tag}.csv")
            df.to_csv(csv_path, index=False)

            # Add to Excel workbook
            df.to_excel(writer, sheet_name=month_tag, index=False)

            print(f"→ Saved CSV for {month_tag}: {csv_path}")

    # The workbook is only written to disk when the writer closes, so the
    # confirmation is printed after the with-block.
    print(f"→ Combined Excel workbook saved at: {excel_path}")

    # After all sheets are written, concatenate & export one big CSV
    if all_months:
        combined_df = pd.concat(all_months, ignore_index=True)
        combined_csv_path = os.path.join(
            output_base,
            f"all_cases_{clean_first}_to_{clean_last}.csv"
        )
        combined_df.to_csv(combined_csv_path, index=False)
        print(f"→ Combined CSV for all months saved at: {combined_csv_path}")
        

def calculate_monthly_active_cases(first_month: str, last_month: str, output_base="output") -> pd.DataFrame:
    """
    Count unique ACTIVE case numbers per order type for each month-end.

    For each month between first_month and last_month (inclusive) the month-end
    snapshot is fetched, filtered to rows whose orderstatus is "ACTIVE", and
    aggregated to the number of unique casenumber values per ordertype.
    A month with no ACTIVE rows contributes a single row with ordertype None
    and a count of 0.

    Writes 'monthly_active_cases_<first>_to_<last>.csv' under output_base.

    Args:
        first_month: first month as 'YYYY-MM' (quotes/whitespace stripped).
        last_month: last month as 'YYYY-MM', inclusive.
        output_base: output directory; created if missing.

    Returns:
        DataFrame with columns month, ordertype, active_case_count.

    Raises:
        ValueError: propagated from generate_month_list() for unparseable
            months or when first_month is after last_month.
    """
    # Clean inputs
    clean_first = first_month.strip().strip("'\"")
    clean_last = last_month.strip().strip("'\"")

    # Generate all months
    months = generate_month_list(clean_first, clean_last)
    if not months:
        print("No months in range; nothing to do.")
        return pd.DataFrame()

    # Prepare output directory
    os.makedirs(output_base, exist_ok=True)

    # List to collect each month's summary
    summaries = []

    for dt in months:
        month_tag = dt.strftime("%Y-%m")
        run_date = last_day_of_month(dt)

        # Fetch data for this month-end
        df = fetch_cases_for_date(run_date)

        # Keep only ACTIVE cases
        df_active = df[df["orderstatus"] == "ACTIVE"].copy()
        if df_active.empty:
            # Still record a zero count for completeness. The key must be
            # "ordertype" (lower case), matching the grouped summaries below;
            # otherwise the concatenated result gets two separate columns.
            summaries.append(
                pd.DataFrame([{"month": month_tag, "ordertype": None, "active_case_count": 0}])
            )
            continue

        # Tag with month for grouping
        df_active["month"] = month_tag

        # Aggregate unique casenumbers per ordertype
        summary = (
            df_active
            .groupby(["month", "ordertype"])["casenumber"]
            .nunique()
            .reset_index(name="active_case_count")
        )
        summaries.append(summary)

        print(f"→ Aggregated ACTIVE cases for {month_tag}")

    # Combine all month summaries
    result_df = pd.concat(summaries, ignore_index=True)

    # Fill any missing ordertype/month combinations with zeros if you want the
    # full matrix:
    # pivot = result_df.pivot_table(index="month", columns="ordertype",
    #                               values="active_case_count", fill_value=0).reset_index()

    # Write out CSV
    out_csv = os.path.join(
        output_base,
        f"monthly_active_cases_{clean_first}_to_{clean_last}.csv"
    )
    result_df.to_csv(out_csv, index=False)
    print(f"→ Monthly ACTIVE cases CSV saved at: {out_csv}")

    return result_df

# Forecasting Active Caseloads:

## entered: count of cases newly appearing in the active caseload each month

## exited: count of cases that dropped out since the prior month

def calculate_monthly_flow(
    first_month: str,
    last_month: str,
    output_base="output",
    active_statuses=("ACTIVE",),
) -> pd.DataFrame:
    """
    Count month-on-month entries to and exits from the active caseload.

    For each month from first_month to last_month (inclusive) the month-end
    snapshot from fetch_cases_for_date is reduced to a set of casenumbers,
    and each set is compared with the previous month's set.

    Parameters
    ----------
    first_month, last_month : str
        'YYYY-MM'; surrounding whitespace and quotes are stripped.
    output_base : str
        Directory that receives 'monthly_flow_<first>_to_<last>.csv'.
    active_statuses : iterable of str, str, or None
        Order statuses that count as "active". The default ("ACTIVE",)
        matches the filter used by calculate_monthly_active_cases. Pass None
        to keep every status the query returns (the previous behaviour).

    Returns
    -------
    pd.DataFrame
        Columns 'month', 'entered', 'exited', one row per month.
    """
    # Clean inputs
    clean_first = first_month.strip().strip("'\"")
    clean_last  = last_month.strip().strip("'\"")

    # Generate all months
    months = generate_month_list(clean_first, clean_last)
    if not months:
        print("No months in range; nothing to do.")
        return pd.DataFrame()

    # A bare string would be rejected by Series.isin, so wrap it.
    if isinstance(active_statuses, str):
        active_statuses = (active_statuses,)

    # Prepare snapshots dict: { "YYYY-MM": set_of_casenumbers }
    snapshots = {}
    for dt in months:
        month_tag = dt.strftime("%Y-%m")
        df = fetch_cases_for_date(last_day_of_month(dt))
        if active_statuses is not None:
            # The query also returns non-ACTIVE statuses; without this filter
            # the "active" flow would include them.
            df = df[df["orderstatus"].isin(list(active_statuses))]
        snapshots[month_tag] = set(df["casenumber"].unique())
        print(f"→ Snapshot for {month_tag}: {len(snapshots[month_tag])} active cases")

    # Build flow records; 'YYYY-MM' tags sort chronologically as strings.
    flow_records = []
    prev_month = None
    for month in sorted(snapshots):
        current = snapshots[month]
        if prev_month is None:
            # First month has nothing to compare against: the whole snapshot
            # is reported as "entered" and nothing as "exited".
            # NOTE(review): this baseline row is a stock, not a monthly inflow;
            # consider excluding it before fitting a time-series model.
            flow_records.append({
                "month": month,
                "entered": len(current),
                "exited":  0
            })
        else:
            prev = snapshots[prev_month]
            flow_records.append({
                "month":  month,
                "entered": len(current - prev),
                "exited":  len(prev - current)
            })
        prev_month = month

    # Create DataFrame and write CSV
    flow_df = pd.DataFrame(flow_records)
    os.makedirs(output_base, exist_ok=True)
    out_csv = os.path.join(
        output_base,
        f"monthly_flow_{clean_first}_to_{clean_last}.csv"
    )
    flow_df.to_csv(out_csv, index=False)
    print(f"→ Monthly flow CSV saved at: {out_csv}")

    return flow_df



def calculate_yearonyear_flows_and_age_rates(
    first_month: str,
    last_month: str,
    output_base: str = "output",
    redistribute_unknown_age: bool = False,
    age_bins: tuple = None,
    age_labels: tuple = None
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Compare each month-end snapshot with the snapshot taken 12 months earlier.

    Months whose year-earlier counterpart falls before first_month are skipped.
    For every remaining month the function records:
      * flows: cases present now but not a year earlier ("entered") and cases
        present a year earlier but not now ("exited");
      * per age group: unique cases in the year-earlier snapshot
        ("active_count"), entries, terminations, and
        terminations / active_count ("termination_rate", 0.0 when the base is 0).

    Age grouping
    - Default (age_bins or age_labels omitted): one-year bins labelled '0'..'119'.
    - Custom: pass both age_bins and age_labels for coarser bins.
    Bins are left-closed; ages outside the bins get no age group.

    redistribute_unknown_age
        When True, entering cases without an age group are spread over the
        groups in proportion to the known entries (evenly if none are known).
        Only entry counts are adjusted; because each share is rounded, the
        redistributed total may differ slightly from the unknown count.

    Returns (flows_df, ages_df) and writes
    'yearonyear_flows_<first>_to_<last>.csv' and
    'termination_and_entry_rates_by_age_<first>_to_<last>.csv' to output_base.
    """
    os.makedirs(output_base, exist_ok=True)

    # Same input cleaning as the other month-range functions, so stray quotes
    # never end up in the output file names.
    clean_first = first_month.strip().strip("'\"")
    clean_last  = last_month.strip().strip("'\"")

    if age_bins is None or age_labels is None:
        age_bins = list(range(0, 121))
        age_labels = [str(a) for a in age_bins[:-1]]

    months = generate_month_list(clean_first, clean_last)
    range_start = parse_month(clean_first)  # loop-invariant
    flow_records = []
    age_rate_records = []

    for dt in months:
        prev_dt = dt - relativedelta(years=1)
        if prev_dt < range_start:
            continue

        tag = dt.strftime("%Y-%m")
        # NOTE(review): each month-end is queried twice over a multi-year run
        # (once as "current", once as "previous"); caching would halve the queries.
        df_cur = fetch_cases_for_date(last_day_of_month(dt))
        df_prev = fetch_cases_for_date(last_day_of_month(prev_dt))

        set_cur = set(df_cur["casenumber"])
        set_prev = set(df_prev["casenumber"])
        entered_set = set_cur - set_prev
        exited_set  = set_prev - set_cur

        flow_records.append({"month": tag, "entered": len(entered_set), "exited": len(exited_set)})

        df_term = df_prev[df_prev["casenumber"].isin(exited_set)].copy()
        df_in   = df_cur [df_cur ["casenumber"].isin(entered_set)].copy()
        df_base = df_prev.copy()

        for df_ in (df_term, df_in, df_base):
            df_["age_group"] = pd.cut(
                df_["age_in_years"], bins=age_bins, labels=age_labels, right=False, include_lowest=True
            )

        in_counts = df_in.groupby("age_group", observed=False)["casenumber"].nunique().reindex(age_labels, fill_value=0)
        # Count unknown-age entries as unique cases, the same unit as in_counts;
        # counting rows would over-weight cases that appear on several rows.
        unknown = df_in.loc[df_in["age_group"].isna(), "casenumber"].nunique()
        if redistribute_unknown_age and unknown > 0:
            total_known = in_counts.sum()
            props = (in_counts / total_known) if total_known > 0 else pd.Series(1/len(age_labels), index=age_labels)
            in_counts += (props * unknown).round().astype(int)

        ter_counts  = df_term.groupby("age_group", observed=False)["casenumber"].nunique().reindex(age_labels, fill_value=0)
        base_counts = df_base.groupby("age_group", observed=False)["casenumber"].nunique().reindex(age_labels, fill_value=0)

        for grp in age_labels:
            active = int(base_counts[grp])
            term   = int(ter_counts[grp])
            ent    = int(in_counts[grp])
            rate   = round(term/active, 4) if active else 0.0
            age_rate_records.append({
                "month": tag,
                "age_group": grp,
                "active_count": active,
                "entered": ent,
                "terminations": term,
                "termination_rate": rate
            })

    flows_df = pd.DataFrame(flow_records)
    ages_df  = pd.DataFrame(age_rate_records)

    flows_df.to_csv(os.path.join(output_base, f"yearonyear_flows_{clean_first}_to_{clean_last}.csv"), index=False)
    ages_df .to_csv(os.path.join(output_base, f"termination_and_entry_rates_by_age_{clean_first}_to_{clean_last}.csv"), index=False)

    return flows_df, ages_df


def forecast_with_pi(
    flows_df: pd.DataFrame,
    periods: int = 12,
    alpha: float = 0.05,
    seasonal_periods: int = 12
) -> pd.DataFrame:
    """
    Forecast monthly new orders with (1 - alpha) prediction intervals.

    The 'entered' column of flows_df is summed per month-end and fitted with
    Holt-Winters exponential smoothing (additive trend, additive seasonality).
    If there are fewer than two full seasonal cycles of observations, or
    seasonal_periods is falsy, a trend-only model is used instead.

    The interval is mean +/- z * sigma, where sigma is the standard deviation
    of the in-sample residuals and z is the standard-normal critical value for
    alpha (about 1.96 for the default 0.05). The width is therefore the same
    for every horizon step.

    Parameters
    ----------
    flows_df : DataFrame with columns 'month' (parseable by pd.to_datetime)
        and 'entered'.
    periods : number of months to forecast.
    alpha : significance level, strictly between 0 and 1.
    seasonal_periods : season length in months.

    Returns
    -------
    DataFrame indexed by forecast date with columns 'mean', 'lower', 'upper'.

    Raises
    ------
    ValueError
        If alpha is not strictly between 0 and 1.
    """
    from statistics import NormalDist  # stdlib; imported here so the block is self-contained

    if not 0 < alpha < 1:
        raise ValueError(f"alpha must be strictly between 0 and 1, got {alpha!r}")

    ts = flows_df.set_index(pd.to_datetime(flows_df['month']))['entered'].resample('ME').sum()
    nobs = len(ts)
    if seasonal_periods and nobs >= 2 * seasonal_periods:
        model = ExponentialSmoothing(ts, trend='add', seasonal='add', seasonal_periods=seasonal_periods)
    else:
        model = ExponentialSmoothing(ts, trend='add', seasonal=None)
    fit = model.fit(optimized=True)
    mu = fit.forecast(periods)

    sigma = fit.resid.dropna().std()
    # z must be the dimensionless normal critical value. The earlier code used
    # an empirical residual quantile (already in data units) and multiplied it
    # by sigma again, which scaled the interval incorrectly.
    z = NormalDist().inv_cdf(1 - alpha / 2)
    half_width = z * sigma
    return pd.DataFrame({'mean': mu, 'lower': mu - half_width, 'upper': mu + half_width})


def distribute_pi_by_age(
    pi_df: pd.DataFrame,
    ages_df: pd.DataFrame,
    weight_years: tuple[int, int] # = (start_year, end_year)
) -> pd.DataFrame:
    """
    Split each forecast row across age groups using historical entry shares.

    The share of an age group is its total 'entered' over the months of
    ages_df whose year lies within weight_years (inclusive), divided by the
    total 'entered' over the same months. Every row of pi_df (indexed by
    forecast date) is multiplied by each share in turn.

    Returns a DataFrame with columns 'month' ('YYYY-MM'), 'age_group',
    'mean', 'lower', 'upper', ordered by forecast date and then age group.
    ages_df is not modified.
    """
    history = ages_df.assign(month=pd.to_datetime(ages_df['month']))
    in_window = history['month'].dt.year.between(*weight_years)
    history = history[in_window]

    entries_by_age = history.groupby('age_group')['entered'].sum()
    shares = entries_by_age / history['entered'].sum()

    allocated = [
        {
            'month': stamp.strftime('%Y-%m'),
            'age_group': group,
            'mean':   forecast['mean'] * share,
            'lower':  forecast['lower'] * share,
            'upper':  forecast['upper'] * share,
        }
        for stamp, forecast in pi_df.iterrows()
        for group, share in shares.items()
    ]
    return pd.DataFrame(allocated)


def forecast_age_specific_active(
    ages_df: pd.DataFrame,
    periods: int = 12,
    seasonal_periods: int = 12
) -> pd.DataFrame:
    """
    Project the active caseload per age group from forecast entries and
    forecast termination rates.

    - Each age group's 'entered' (monthly sum) and 'termination_rate'
      (monthly mean) series is fitted with Holt-Winters; seasonality is
      included only when at least two full seasonal cycles are available.
    - Both series are forecast `periods` steps ahead.
    - Starting from the 'active_count' of the latest month in ages_df, the
      projection iterates: active[t] = active[t-1] * (1 - term_rate[t]) + entered[t].

    Returns a DataFrame with columns 'month' ('YYYY-MM'), 'age_group',
    'active_forecast'.

    NOTE(review): if ages_df comes from calculate_yearonyear_flows_and_age_rates,
    'entered' and 'termination_rate' are measured against a snapshot 12 months
    earlier, yet they are applied here at every monthly step. Confirm the
    intended units.
    """
    def _holt_winters_forecast(series):
        # Seasonal model only when two full cycles of non-missing data exist.
        if len(series.dropna()) >= 2 * seasonal_periods:
            model = ExponentialSmoothing(series, trend='add', seasonal='add', seasonal_periods=seasonal_periods)
        else:
            model = ExponentialSmoothing(series, trend='add', seasonal=None)
        return model.fit(optimized=True).forecast(periods)

    # Historical series, one column per age group
    history = ages_df.copy()
    history['month'] = pd.to_datetime(history['month'])
    entered_ts = history.pivot(index='month', columns='age_group', values='entered')
    termrate_ts = history.pivot(index='month', columns='age_group', values='termination_rate')
    age_groups = entered_ts.columns

    entered_fc = {}
    termrate_fc = {}
    for age in age_groups:
        entered_fc[age] = _holt_winters_forecast(entered_ts[age].resample('ME').sum())
        termrate_fc[age] = _holt_winters_forecast(termrate_ts[age].resample('ME').mean())

    # Seed the recursion with the latest month's active counts
    latest_rows = history[history['month'] == history['month'].max()]
    running_active = latest_rows.set_index('age_group')['active_count'].astype(float)

    first_forecast_month = entered_ts.index.max() + pd.offsets.MonthBegin(1)
    forecast_months = pd.date_range(start=first_forecast_month, periods=periods, freq='ME')

    projected = []
    for step, month_end in enumerate(forecast_months):
        label = month_end.strftime('%Y-%m')
        for age in age_groups:
            inflow = entered_fc[age].iloc[step]
            exit_rate = termrate_fc[age].iloc[step]
            new_level = running_active[age] * (1 - exit_rate) + inflow
            projected.append({
                'month': label,
                'age_group': age,
                'active_forecast': new_level
            })
            running_active[age] = new_level

    return pd.DataFrame(projected)


def plot_total_forecast(pi_df: pd.DataFrame):
    """
    Plot the forecast mean as a marked line over a shaded lower/upper band.

    pi_df supplies the x values through its index and the y values through
    its 'mean', 'lower' and 'upper' columns. Returns the new figure.
    """
    fig = plt.figure()
    ax = fig.gca()
    dates = pi_df.index
    ax.fill_between(dates, pi_df['lower'], pi_df['upper'], alpha=0.3)
    ax.plot(dates, pi_df['mean'], marker='o')
    ax.set_title('Monthly New Orders Forecast (95% PI)')
    ax.set_xlabel('Month')
    ax.set_ylabel('New Orders')
    fig.tight_layout()
    return fig


def plot_age_forecast(pi_age_df: pd.DataFrame):
    """
    Draw a stacked area chart of the forecast mean per age group.

    pi_age_df must have 'month', 'age_group' and 'mean' columns; it is pivoted
    to one column per age group with a datetime index. Returns the figure.
    """
    mean_by_age = pi_age_df.pivot(index='month', columns='age_group', values='mean')
    mean_by_age.index = pd.to_datetime(mean_by_age.index)

    fig, ax = plt.subplots()
    mean_by_age.plot(kind='area', ax=ax)
    ax.set_title('Forecasted New Orders by Age Group (Mean)')
    ax.set_xlabel('Month')
    ax.set_ylabel('New Orders')
    fig.tight_layout()
    return fig



def forecast_age_specific_active_with_components(
    ages_df: pd.DataFrame,
    periods: int = 12,
    seasonal_periods: int = 12
) -> pd.DataFrame:
    """
    Forecast, per age group, entries, termination rate, retention rate and
    the resulting active caseload.

    Each age group's 'entered' (monthly sum) and 'termination_rate' (monthly
    mean) series is fitted with Holt-Winters (additive trend; additive
    seasonality only with at least two full seasonal cycles of data) and
    forecast `periods` months ahead. Starting from the latest month's
    'active_count', the caseload is projected as
    active[t] = active[t-1] * retention[t] + entered[t], with
    retention = 1 - term_rate.

    Returns a DataFrame with columns
    ['month', 'age_group', 'entered_forecast', 'term_rate_forecast',
     'retention_rate', 'active_forecast'].

    NOTE(review): the forecast rates are not bounded, so an extrapolated
    termination rate can leave [0, 1]. Confirm whether clipping is wanted.
    """
    def _holt_winters_forecast(series):
        # Seasonal model only when two full cycles of non-missing data exist.
        if len(series.dropna()) >= 2 * seasonal_periods:
            model = ExponentialSmoothing(series, trend='add', seasonal='add', seasonal_periods=seasonal_periods)
        else:
            model = ExponentialSmoothing(series, trend='add', seasonal=None)
        return model.fit(optimized=True).forecast(periods)

    hist = ages_df.copy()
    hist['month'] = pd.to_datetime(hist['month'])
    entered_ts = hist.pivot(index='month', columns='age_group', values='entered')
    termrate_ts = hist.pivot(index='month', columns='age_group', values='termination_rate')

    entered_fc = {}
    termrate_fc = {}
    for age in entered_ts.columns:
        entered_fc[age] = _holt_winters_forecast(entered_ts[age].resample('ME').sum())
        termrate_fc[age] = _holt_winters_forecast(termrate_ts[age].resample('ME').mean())

    last = hist[hist['month'] == hist['month'].max()].set_index('age_group')['active_count'].astype(float)
    active_prev = last.copy()

    # 'ME' (month-end) matches the resample calls above; the old 'M' alias is
    # deprecated in the pandas versions that accept 'ME'.
    months = pd.date_range(
        start=entered_ts.index.max() + pd.offsets.MonthBegin(1),
        periods=periods, freq='ME'
    )

    records = []
    for i, mo in enumerate(months):
        for age in entered_ts.columns:
            ent = entered_fc[age].iloc[i]
            tr = termrate_fc[age].iloc[i]
            ret = 1 - tr
            act = active_prev[age] * ret + ent
            records.append({
                'month': mo.strftime('%Y-%m'),
                'age_group': age,
                'entered_forecast': ent,
                'term_rate_forecast': tr,
                'retention_rate': ret,
                'active_forecast': act
            })
            # The projected level becomes next month's starting point.
            active_prev[age] = act
    return pd.DataFrame(records)




if __name__ == "__main__":
    # Analysis window. The years are derived from the month strings so the two
    # cannot drift apart when the window is edited.
    start_month = "2022-01"
    end_month = "2024-12"
    start_year = int(start_month[:4])
    end_year = int(end_month[:4])

    # Calculate historical flows and age rates
    flows, ages = calculate_yearonyear_flows_and_age_rates(
        start_month, end_month,
        redistribute_unknown_age=True
    )

    # Forecast total new orders with prediction intervals
    pi = forecast_with_pi(flows, periods=12)
    # Distribute PI by age
    pi_age = distribute_pi_by_age(pi, ages, weight_years=(start_year, end_year))
    # Forecast active caseload by age
    active_proj = forecast_age_specific_active(ages, periods=12)

    # Component forecasts: entries, termination rate, retention, active
    comp_df = forecast_age_specific_active_with_components(ages, periods=12)

    # Sort by numeric age group, then month. assign() builds a new frame so
    # comp_df itself is left as returned by the forecast.
    comp_df_sorted = (
        comp_df
        .assign(age_group=comp_df['age_group'].astype(int))
        .sort_values(['age_group', 'month'])
    )

    # Print the sorted table
    print(comp_df_sorted.to_string(index=False))


In [None]:
# Sort by month, then numeric age group. assign() builds a new frame, so
# comp_df is not modified through an alias.
comp_df_sorted = (
    comp_df
    .assign(age_group=comp_df['age_group'].astype(int))
    .sort_values(['month', 'age_group'])
)
print(comp_df_sorted.to_string(index=False))

# index=False matches the other CSV exports in this notebook; the pre-sort row
# index carries no information.
os.makedirs("output", exist_ok=True)
comp_df_sorted.to_csv(f"output/comp_df_sorted_{start_year}_{end_year}.csv", index=False)

In [None]:

# Time Series Forecasting
# For forecasting we need the number of cases that actually flow onto the active caseload each month
# and the number that actually leave it, i.e. the ACTIVE cases going in and out of the system.
# This requires a clear definition of the active caseload, from which we can
# deduce the new cases added each month and the cases that have left.
# The simplest way to do that, without having to define inputs (new cases) and outputs (terminations) separately,
# is to take a snapshot of the active cases each month (using glueexporteddate to select the monthly active caseload,
#                                                       which is already implemented in the code below)
# and compare it with the previous month's snapshot, so that we can see which cases have flowed on and which have left.


# A simple 12-month projection of how many cases are expected to enter and leave the active caseload each month.

# Indexing: converts the month strings into a month-start DatetimeIndex (asfreq('MS')).
# Model: uses Holt-Winters ExponentialSmoothing with an additive trend and a seasonal component (period=12).
# Forecast: model.forecast(12) gives the next 12 monthly points.
# Output: a DataFrame with columns month, entered_forecast, exited_forecast, saved to CSV.



def forecast_monthly_flow(flow_df: pd.DataFrame,
                          first_month: str,
                          last_month: str,
                          output_base="output",
                          periods: int = 12,
                          seasonal_periods: int = 12) -> pd.DataFrame:
    """
    Forecast monthly entries and exits with Holt-Winters exponential smoothing.

    Two models are fitted on flow_df (columns ['month', 'entered', 'exited'],
    'month' as 'YYYY-MM'), both with an additive trend:
      * 'entered' uses multiplicative seasonality;
      * 'exited' uses additive seasonality.
    When a series has fewer than two full seasonal cycles of non-missing
    values, or seasonal_periods is falsy, a trend-only model is fitted
    instead, as forecast_with_pi does.

    Forecasts are floored at zero and rounded to the nearest integer, because
    a count can be neither negative nor fractional.

    Parameters
    ----------
    flow_df : historical flow table.
    first_month, last_month : 'YYYY-MM' strings used only in the output file
        name (surrounding whitespace and quotes are stripped).
    output_base : directory that receives
        'forecast_flow_<first>_to_<last>_next<periods>.csv'.
    periods : number of months to forecast (default 12).
    seasonal_periods : season length in months (default 12).

    Returns
    -------
    DataFrame with columns 'month', 'entered_forecast', 'exited_forecast'.

    NOTE(review): multiplicative seasonality requires strictly positive data;
    confirm 'entered' can never be zero in a month.
    """
    # Clean inputs
    clean_first = first_month.strip().strip("'\"")
    clean_last  = last_month.strip().strip("'\"")
    os.makedirs(output_base, exist_ok=True)

    # Month-start DatetimeIndex; asfreq exposes any missing month as NaN.
    df = flow_df.copy()
    df['month_dt'] = pd.to_datetime(df['month'], format='%Y-%m')
    df = df.set_index('month_dt').asfreq('MS')

    def _fit_and_forecast(series, seasonal):
        # Seasonality cannot be estimated reliably from less than two cycles.
        if seasonal_periods and len(series.dropna()) >= 2 * seasonal_periods:
            model = ExponentialSmoothing(series,
                                         trend='add',
                                         seasonal=seasonal,
                                         seasonal_periods=seasonal_periods)
        else:
            model = ExponentialSmoothing(series, trend='add', seasonal=None)
        return model.fit().forecast(periods)

    f_entered = _fit_and_forecast(df['entered'], 'mul')
    f_exited  = _fit_and_forecast(df['exited'], 'add')

    # Floor at zero rather than taking abs(): a forecast of -40 means "no
    # cases", not 40 cases. Round instead of truncating to avoid a downward bias.
    forecast_df = pd.DataFrame({
        'month':            f_entered.index.strftime('%Y-%m'),
        'entered_forecast': f_entered.clip(lower=0).round().astype(int).values,
        'exited_forecast':  f_exited.clip(lower=0).round().astype(int).values,
    })

    # write to CSV
    out_csv = os.path.join(
        output_base,
        f"forecast_flow_{clean_first}_to_{clean_last}_next{periods}.csv"
    )
    forecast_df.to_csv(out_csv, index=False)
    print(f"→ Forecast CSV for next {periods} months saved at: {out_csv}")

    return forecast_df


# Forecast entries and exits for the next 12 months
forecast_summary = forecast_monthly_flow(flow_summary, first_month, last_month)
print("\nForecasted flow for next 12 months:")
print(forecast_summary)

# flow_sum is the same object as forecast_summary, so the column changes
# below are visible through both names.
flow_sum = forecast_summary

# Work with real datetimes on both sides of the join
flow_sum['month'] = pd.to_datetime(flow_sum['month'], format='%Y-%m')
active_summary['month'] = pd.to_datetime(active_summary['month'], format='%Y-%m')

# Same month one year before each forecast month
flow_sum['lookup_month'] = flow_sum['month'] - pd.DateOffset(years=1)

# One active-case total per month (active_summary may hold several rows per month)
active_agg = active_summary.groupby('month', as_index=False)['active_case_count'].sum()

# Left-join the previous-year totals onto the forecast rows, then drop the
# join helpers. Forecast months without a previous-year total keep NaN.
flow_with_active = (
    flow_sum
    .merge(
        active_agg,
        left_on='lookup_month',
        right_on='month',
        how='left',
        suffixes=('', '_prev'),
    )
    .rename(columns={'active_case_count': 'active_case_count_prev_year'})
    .drop(columns=['month_prev', 'lookup_month'])
)

# forecasted active caseload = previous-year active count + forecast entries - forecast exits
# NOTE(review): this adds a single month's net flow to a stock observed 12
# months earlier; confirm that is the intended projection.
flow_with_active['forecasted_active_caseload'] = (
    flow_with_active['active_case_count_prev_year']
    + flow_with_active['entered_forecast']
    - flow_with_active['exited_forecast']
)

print("\nForecasted flow with active cases from the previous year for next 12 months:")
print(flow_with_active)
    
    


In [None]:
!pip install pydbtools
# !pip install --force numpy==1.24
#!pip install pydbtools
# !pip install --force numpy==1.24

import os
import calendar
import shutil
from datetime import datetime,timedelta
import pydbtools
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import pandas as pd

import boto3
import numpy as np
import matplotlib.pyplot as plt
import logging
import botocore
import s3fs
from dateutil.relativedelta import relativedelta


def parse_month(month_str: str) -> datetime:
    """Turn a 'YYYY-MM' string into a datetime, ignoring surrounding whitespace and quote characters."""
    unpadded = month_str.strip()
    unquoted = unpadded.strip("'\"")
    return datetime.strptime(unquoted, "%Y-%m")


def clear_directory(path):
    """
    Delete everything directly inside `path`, keeping `path` itself.

    Files and symbolic links are unlinked; real sub-directories are removed
    recursively. A failure on one entry is printed and the loop continues
    with the next entry.
    """
    for entry in os.listdir(path):
        target = os.path.join(path, entry)
        try:
            if os.path.isdir(target) and not os.path.islink(target):
                # Real directory: remove it with all of its contents
                shutil.rmtree(target)
            elif os.path.isfile(target) or os.path.islink(target):
                # Regular file or symbolic link
                os.unlink(target)
        except Exception as err:
            print(f'Failed to delete {target}. Reason: {err}')


def fetch_cases_for_date(run_date: str) -> pd.DataFrame:
    """
    Fetch all cases and their fee reductions for one export date.

    Every table in the query is restricted to rows whose glueexporteddate
    equals run_date, and only cases with order status OPEN, ACTIVE or
    DUPLICATE are returned. age_in_years is derived from the person's dob and
    createddate; results below 0 or above 120 are mapped to 121.

    Parameters
    ----------
    run_date : str
        Export date as 'YYYY-MM-DD'. Date-like objects exposing strftime are
        also accepted.

    Returns
    -------
    pd.DataFrame
        Result of pydbtools.read_sql_query.

    Raises
    ------
    ValueError
        If run_date is a string that is not a valid 'YYYY-MM-DD' date. The
        value is interpolated into the SQL text, so it is validated and
        normalised first and nothing but a date can reach the query.
    """
    if isinstance(run_date, str):
        run_date = datetime.strptime(run_date.strip(), "%Y-%m-%d").strftime("%Y-%m-%d")
    else:
        run_date = run_date.strftime("%Y-%m-%d")

    query = f"""
    WITH active_fee_reductions AS (
      SELECT
        fc.client_id,
        SUBSTRING(fr.type,1,1) || LOWER(SUBSTRING(fr.type,2)) AS type,
        DATE(fr.startdate) AS startdate,
        DATE(fr.enddate)   AS enddate,
        fc.payment_method
      FROM opg_sirius_prod.fee_reduction fr
      JOIN opg_sirius_prod.finance_client fc
        ON fc.id = fr.finance_client_id
       AND fc.glueexporteddate = DATE('{run_date}')
      JOIN (
        SELECT
          MAX(id)           AS id,
          finance_client_id
        FROM opg_sirius_prod.fee_reduction
        WHERE enddate           >= DATE('{run_date}')
          AND startdate         <= DATE('{run_date}')
          AND deleted            = FALSE
          AND glueexporteddate   = DATE('{run_date}')
        GROUP BY finance_client_id
      ) latest ON latest.id = fr.id
      WHERE fr.glueexporteddate = DATE('{run_date}')
    )
    SELECT
      c.glueexporteddate,
      c.caserecnumber            AS casenumber,
      c.uid                      AS siriusid,
      (
        SELECT supervisionlevel
        FROM opg_sirius_prod.supervision_level_log sll
        WHERE sll.order_id         = c.id
          AND sll.glueexporteddate = DATE('{run_date}')
        ORDER BY sll.appliesfrom DESC
        LIMIT 1
      ) AS casesupervisionlevel,
      p.risk_score               AS CREC,
      c.casesubtype              AS orderType,
      c.orderdate                AS ordermadedate,
      c.orderstatus              AS orderStatus,
      afr.type                   AS feereductiontype,
      p.dob,
      CASE
        WHEN FLOOR(DATE_DIFF('day', p.dob, p.createddate) / 365.25) < 0 THEN 121
        WHEN FLOOR(DATE_DIFF('day', p.dob, p.createddate) / 365.25) > 120 THEN 121
        ELSE ROUND(DATE_DIFF('day', p.dob, p.createddate) / 365.25)
      END AS age_in_years
    FROM opg_sirius_prod.persons p
    JOIN opg_sirius_prod.cases c
      ON p.id                   = c.client_id
     AND c.glueexporteddate     = DATE('{run_date}')
    LEFT JOIN active_fee_reductions afr
      ON afr.client_id          = p.id
    WHERE c.orderstatus IN ('OPEN','ACTIVE','DUPLICATE')
      AND p.glueexporteddate     = DATE('{run_date}')
    ORDER BY c.orderdate;
    """
    return pydbtools.read_sql_query(query)


def generate_month_list(start_month: str, end_month: str):
    """
    Build the list of month-start datetimes covering start_month through
    end_month, both included.

    Both arguments are parsed with parse_month. A ValueError is raised when
    start_month is later than end_month.
    """
    first = parse_month(start_month)
    last = parse_month(end_month)
    if first > last:
        raise ValueError(f"Start month ({start_month}) is after end month ({end_month})")

    # Whole months between the two endpoints
    span = (last.year - first.year) * 12 + (last.month - first.month)
    return [first + relativedelta(months=offset) for offset in range(span + 1)]

def last_day_of_month(dt: datetime) -> str:
    """Format the final calendar day of dt's month as 'YYYY-MM-DD'."""
    _first_weekday, days_in_month = calendar.monthrange(dt.year, dt.month)
    month_end = dt.replace(day=days_in_month)
    return format(month_end, "%Y-%m-%d")
        
def export_monthly_reports(first_month: str, last_month: str, output_base="output"):
    """
    Export the month-end case snapshot for every month in the range.

    For each month from first_month to last_month (inclusive, 'YYYY-MM') the
    snapshot from fetch_cases_for_date is written to:
      * '<output_base>/<YYYY-MM>/cases_<YYYY-MM>.csv';
      * a sheet named '<YYYY-MM>' in
        '<output_base>/cases_<first>_to_<last>.xlsx'.
    Finally all months, each tagged with a 'month' column, are concatenated
    into '<output_base>/all_cases_<first>_to_<last>.csv'.

    Returns None.
    """
    # Clean inputs
    clean_first = first_month.strip().strip("'\"")
    clean_last = last_month.strip().strip("'\"")

    # Generate all months
    months = generate_month_list(clean_first, clean_last)
    if not months:
        print("No months in range; nothing to do.")
        return

    # Prepare output directory
    os.makedirs(output_base, exist_ok=True)
    excel_filename = f"cases_{clean_first}_to_{clean_last}.xlsx"
    excel_path = os.path.join(output_base, excel_filename)
    # Remove a stale workbook from an earlier run. excel_path is a file, so
    # clear_directory (which lists a directory) would raise on it.
    if os.path.isfile(excel_path):
        os.remove(excel_path)

    # List to accumulate each month's DataFrame
    all_months = []

    # Create Excel workbook and write each month's sheet
    with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
        for dt in months:
            month_tag = dt.strftime("%Y-%m")
            run_date = last_day_of_month(dt)

            # Fetch data for this month-end
            df = fetch_cases_for_date(run_date)

            # Tag the DataFrame with its month, then collect it
            df["month"] = month_tag
            all_months.append(df)

            # Save CSV for this month
            month_folder = os.path.join(output_base, month_tag)
            os.makedirs(month_folder, exist_ok=True)
            csv_path = os.path.join(month_folder, f"cases_{month_tag}.csv")
            df.to_csv(csv_path, index=False)

            # Add to Excel workbook
            df.to_excel(writer, sheet_name=month_tag, index=False)

            print(f"→ Saved CSV for {month_tag}: {csv_path}")

    # The workbook is written to disk when the writer closes, so report it
    # only after leaving the with-block.
    print(f"→ Combined Excel workbook saved at: {excel_path}")

    # After all sheets are written, concatenate & export one big CSV
    if all_months:
        combined_df = pd.concat(all_months, ignore_index=True)
        combined_csv_path = os.path.join(
            output_base,
            f"all_cases_{clean_first}_to_{clean_last}.csv"
        )
        combined_df.to_csv(combined_csv_path, index=False)
        print(f"→ Combined CSV for all months saved at: {combined_csv_path}")
        

def calculate_monthly_active_cases(first_month: str, last_month: str, output_base="output") -> pd.DataFrame:
    """
    Count unique ACTIVE cases per order type for every month in the range.

    For each month between first_month and last_month (inclusive, 'YYYY-MM')
    the month-end snapshot is fetched, filtered to orderstatus == "ACTIVE",
    and aggregated to the number of unique casenumbers per ordertype.
    A month with no ACTIVE cases contributes a single row with ordertype None
    and a count of 0.

    Writes 'monthly_active_cases_<first>_to_<last>.csv' under output_base and
    returns a DataFrame with columns 'month', 'ordertype', 'active_case_count'.
    """
    # Clean inputs
    clean_first = first_month.strip().strip("'\"")
    clean_last = last_month.strip().strip("'\"")

    # Generate all months
    months = generate_month_list(clean_first, clean_last)
    if not months:
        print("No months in range; nothing to do.")
        return pd.DataFrame()

    # Prepare output directory
    os.makedirs(output_base, exist_ok=True)

    # List to collect each month's summary
    summaries = []

    for dt in months:
        month_tag = dt.strftime("%Y-%m")
        run_date = last_day_of_month(dt)

        # Fetch data for this month-end
        df = fetch_cases_for_date(run_date)

        # Keep only ACTIVE cases
        df_active = df[df["orderstatus"] == "ACTIVE"].copy()
        if df_active.empty:
            # Still record a zero count for completeness. The key must be
            # "ordertype", the same column the groupby below produces;
            # otherwise the concat would create a second "orderType" column.
            summaries.append(
                pd.DataFrame([{"month": month_tag, "ordertype": None, "active_case_count": 0}])
            )
            continue

        # Tag with month for grouping
        df_active["month"] = month_tag

        # Aggregate unique casenumbers per ordertype
        summary = (
            df_active
            .groupby(["month", "ordertype"])["casenumber"]
            .nunique()
            .reset_index(name="active_case_count")
        )
        summaries.append(summary)

        print(f"→ Aggregated ACTIVE cases for {month_tag}")

    # Combine all month summaries
    result_df = pd.concat(summaries, ignore_index=True)

    # Fill any missing ordertypes/months with zeros if you want full matrix:
    # pivot = result_df.pivot_table(index="month", columns="ordertype",
    #                               values="active_case_count", fill_value=0).reset_index()

    # Write out CSV
    out_csv = os.path.join(
        output_base,
        f"monthly_active_cases_{clean_first}_to_{clean_last}.csv"
    )
    result_df.to_csv(out_csv, index=False)
    print(f"→ Monthly ACTIVE cases CSV saved at: {out_csv}")

    return result_df

# Forecasting Active Caseloads:

## entered: count of cases newly appearing in the active caseload each month

## exited: count of cases that dropped out since the prior month

def calculate_monthly_flow(
    first_month: str,
    last_month: str,
    output_base="output",
    active_statuses=("ACTIVE",),
) -> pd.DataFrame:
    """
    Count month-on-month entries to and exits from the active caseload.

    For each month from first_month to last_month (inclusive) the month-end
    snapshot from fetch_cases_for_date is reduced to a set of casenumbers,
    and each set is compared with the previous month's set.

    Parameters
    ----------
    first_month, last_month : str
        'YYYY-MM'; surrounding whitespace and quotes are stripped.
    output_base : str
        Directory that receives 'monthly_flow_<first>_to_<last>.csv'.
    active_statuses : iterable of str, str, or None
        Order statuses that count as "active". The default ("ACTIVE",)
        matches the filter used by calculate_monthly_active_cases. Pass None
        to keep every status the query returns (the previous behaviour).

    Returns
    -------
    pd.DataFrame
        Columns 'month', 'entered', 'exited', one row per month.
    """
    # Clean inputs
    clean_first = first_month.strip().strip("'\"")
    clean_last  = last_month.strip().strip("'\"")

    # Generate all months
    months = generate_month_list(clean_first, clean_last)
    if not months:
        print("No months in range; nothing to do.")
        return pd.DataFrame()

    # A bare string would be rejected by Series.isin, so wrap it.
    if isinstance(active_statuses, str):
        active_statuses = (active_statuses,)

    # Prepare snapshots dict: { "YYYY-MM": set_of_casenumbers }
    snapshots = {}
    for dt in months:
        month_tag = dt.strftime("%Y-%m")
        df = fetch_cases_for_date(last_day_of_month(dt))
        if active_statuses is not None:
            # The query also returns non-ACTIVE statuses; without this filter
            # the "active" flow would include them.
            df = df[df["orderstatus"].isin(list(active_statuses))]
        snapshots[month_tag] = set(df["casenumber"].unique())
        print(f"→ Snapshot for {month_tag}: {len(snapshots[month_tag])} active cases")

    # Build flow records; 'YYYY-MM' tags sort chronologically as strings.
    flow_records = []
    prev_month = None
    for month in sorted(snapshots):
        current = snapshots[month]
        if prev_month is None:
            # First month has nothing to compare against: the whole snapshot
            # is reported as "entered" and nothing as "exited".
            # NOTE(review): this baseline row is a stock, not a monthly inflow;
            # consider excluding it before fitting a time-series model.
            flow_records.append({
                "month": month,
                "entered": len(current),
                "exited":  0
            })
        else:
            prev = snapshots[prev_month]
            flow_records.append({
                "month":  month,
                "entered": len(current - prev),
                "exited":  len(prev - current)
            })
        prev_month = month

    # Create DataFrame and write CSV
    flow_df = pd.DataFrame(flow_records)
    os.makedirs(output_base, exist_ok=True)
    out_csv = os.path.join(
        output_base,
        f"monthly_flow_{clean_first}_to_{clean_last}.csv"
    )
    flow_df.to_csv(out_csv, index=False)
    print(f"→ Monthly flow CSV saved at: {out_csv}")

    return flow_df



def calculate_yearonyear_flows_and_age_rates(
    first_month: str,
    last_month: str,
    output_base: str = "output",
    redistribute_unknown_age: bool = False,
    age_bins: tuple = None,
    age_labels: tuple = None
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Compare each month-end snapshot with the snapshot one year earlier.

    A month is processed only when the same month one year before is not
    earlier than first_month. For each processed month:
      - entered = casenumbers in the current snapshot but not the prior-year one
      - exited  = casenumbers in the prior-year snapshot but not the current one
      - per age group: unique prior-year cases (active_count), unique entrants,
        unique terminations and termination_rate = terminations / active_count
        (0.0 when active_count is 0).

    Args:
        first_month, last_month: 'YYYY-MM' bounds (inclusive).
        output_base: directory the two CSVs are written to (created if missing).
        redistribute_unknown_age: when True, entrants whose age falls outside
            the bins are spread over the known age groups in proportion to the
            known entrants (uniformly if there are none).
        age_bins, age_labels: custom bin edges and labels for pd.cut. When
            either is None, one-year bins 0-119 are used.

    Returns:
        (flows_df, ages_df). Both are also written to output_base as
        'yearonyear_flows_<first>_to_<last>.csv' and
        'termination_and_entry_rates_by_age_<first>_to_<last>.csv'.
    """
    os.makedirs(output_base, exist_ok=True)

    if age_bins is None or age_labels is None:
        age_bins = list(range(0, 121))
        age_labels = [str(a) for a in age_bins[:-1]]

    def count_by_age(frame):
        # Unique casenumbers per age group, with every label present.
        return (
            frame.groupby("age_group", observed=False)["casenumber"]
            .nunique()
            .reindex(age_labels, fill_value=0)
        )

    months = generate_month_list(first_month, last_month)
    # Loop-invariant lower bound: parse it once instead of once per month.
    range_start = parse_month(first_month)
    flow_records = []
    age_rate_records = []

    for dt in months:
        prev_dt = dt - relativedelta(years=1)
        if prev_dt < range_start:
            continue

        tag = dt.strftime("%Y-%m")
        df_cur = fetch_cases_for_date(last_day_of_month(dt))
        df_prev = fetch_cases_for_date(last_day_of_month(prev_dt))

        set_cur = set(df_cur["casenumber"])
        set_prev = set(df_prev["casenumber"])
        entered_set = set_cur - set_prev
        exited_set = set_prev - set_cur

        flow_records.append({"month": tag, "entered": len(entered_set), "exited": len(exited_set)})

        df_term = df_prev[df_prev["casenumber"].isin(exited_set)].copy()
        df_in = df_cur[df_cur["casenumber"].isin(entered_set)].copy()
        df_base = df_prev.copy()

        for df_ in (df_term, df_in, df_base):
            df_["age_group"] = pd.cut(
                df_["age_in_years"], bins=age_bins, labels=age_labels, right=False, include_lowest=True
            )

        in_counts = count_by_age(df_in)
        # Count unknown-age entrants as unique cases, matching the nunique()
        # basis of in_counts; counting rows would over-allocate whenever a
        # casenumber appears on more than one row.
        unknown = df_in.loc[df_in["age_group"].isna(), "casenumber"].nunique()
        if redistribute_unknown_age and unknown > 0:
            total_known = in_counts.sum()
            props = (in_counts / total_known) if total_known > 0 else pd.Series(1 / len(age_labels), index=age_labels)
            in_counts = in_counts + (props * unknown).round().astype(int)

        ter_counts = count_by_age(df_term)
        base_counts = count_by_age(df_base)

        for grp in age_labels:
            active = int(base_counts[grp])
            term = int(ter_counts[grp])
            ent = int(in_counts[grp])
            rate = round(term / active, 4) if active else 0.0
            age_rate_records.append({
                "month": tag,
                "age_group": grp,
                "active_count": active,
                "entered": ent,
                "terminations": term,
                "termination_rate": rate
            })

    flows_df = pd.DataFrame(flow_records)
    ages_df = pd.DataFrame(age_rate_records)

    flows_df.to_csv(os.path.join(output_base, f"yearonyear_flows_{first_month}_to_{last_month}.csv"), index=False)
    ages_df.to_csv(os.path.join(output_base, f"termination_and_entry_rates_by_age_{first_month}_to_{last_month}.csv"), index=False)

    return flows_df, ages_df


def forecast_with_pi(
    flows_df: pd.DataFrame,
    periods: int = 1,
    alpha: float = 0.05,
    seasonal_periods: int = 1
) -> pd.DataFrame:
    """
    Forecast monthly new orders with (1 - alpha) prediction intervals.

    The 'entered' column of flows_df is resampled to month-end totals and
    fitted with additive-trend exponential smoothing. An additive seasonal
    component is added only when seasonal_periods > 1 and there are at least
    two full seasons of observations; otherwise the model is trend-only.

    The interval is a normal approximation: mean +/- z * sigma, where sigma is
    the standard deviation of the in-sample residuals and z is the standard
    normal quantile at 1 - alpha/2 (about 1.96 for alpha = 0.05). Its width is
    the same for every forecast step.

    Args:
        flows_df: frame with 'month' (parseable by pd.to_datetime) and 'entered'.
        periods: number of months to forecast.
        alpha: two-sided significance level; must lie strictly between 0 and 1.
        seasonal_periods: season length in months; values <= 1 disable seasonality.

    Returns:
        DataFrame indexed by forecast month with columns 'mean', 'lower', 'upper'.
    """
    # Standard library only; imported locally so the function does not depend
    # on another cell's import block.
    from statistics import NormalDist

    ts = flows_df.set_index(pd.to_datetime(flows_df['month']))['entered'].resample('ME').sum()
    nobs = len(ts)

    # A seasonal model needs a season longer than one period. Without the
    # "> 1" guard the default seasonal_periods=1 would request a seasonal fit
    # that ExponentialSmoothing refuses.
    use_seasonal = bool(seasonal_periods) and seasonal_periods > 1 and nobs >= 2 * seasonal_periods
    if use_seasonal:
        model = ExponentialSmoothing(ts, trend='add', seasonal='add', seasonal_periods=seasonal_periods)
    else:
        model = ExponentialSmoothing(ts, trend='add', seasonal=None)
    fit = model.fit(optimized=True)
    mu = fit.forecast(periods)

    sigma = fit.resid.dropna().std()
    # z must be a unit-free normal quantile. Using a residual quantile here
    # would multiply two quantities that are both in data units.
    z = NormalDist().inv_cdf(1 - alpha / 2)
    lower = mu - z * sigma
    upper = mu + z * sigma
    return pd.DataFrame({'mean': mu, 'lower': lower, 'upper': upper})


def distribute_pi_by_age(
    pi_df: pd.DataFrame,
    ages_df: pd.DataFrame,
    weight_years: tuple[int, int] # = (start_year, end_year)
) -> pd.DataFrame:
    """
    Split each forecast row across age groups using historical entry shares.

    The share of an age group is its summed 'entered' over the months whose
    calendar year lies within weight_years (inclusive), divided by the total
    'entered' over the same months. Every row of pi_df ('mean', 'lower',
    'upper', indexed by timestamp) is multiplied by each share.

    Returns one row per (forecast month, age group) with columns
    'month' ('YYYY-MM'), 'age_group', 'mean', 'lower', 'upper'.
    """
    window = ages_df.assign(month=pd.to_datetime(ages_df['month']))
    in_window = window['month'].dt.year.between(*weight_years)
    window = window[in_window]

    entries_per_age = window.groupby('age_group')['entered'].sum()
    shares = entries_per_age / window['entered'].sum()

    allocated = [
        {
            'month': stamp.strftime('%Y-%m'),
            'age_group': group,
            'mean':   forecast['mean'] * share,
            'lower':  forecast['lower'] * share,
            'upper':  forecast['upper'] * share
        }
        for stamp, forecast in pi_df.iterrows()
        for group, share in shares.items()
    ]
    return pd.DataFrame(allocated)


def forecast_age_specific_active(
    ages_df: pd.DataFrame,
    periods: int = 12,
    seasonal_periods: int = 12
) -> pd.DataFrame:
    """
    Forecast active caseload by age from separate entry and termination-rate forecasts.

    - Fits additive-trend exponential smoothing to each age_group's 'entered'
      and 'termination_rate' series (additive seasonality only when
      seasonal_periods > 1 and at least two full seasons are available).
    - Forecasts both series for `periods` months.
    - Iteratively projects: active[t] = active[t-1] * retention[t] + entered[t],
      where retention = 1 - term_rate limited to the range [0, 1].

    Args:
        ages_df: frame with 'month', 'age_group', 'entered', 'termination_rate'
            and 'active_count'; (month, age_group) pairs must be unique.
        periods: number of months to project.
        seasonal_periods: season length in months.

    Returns:
        DataFrame with columns 'month' ('YYYY-MM'), 'age_group', 'active_forecast'.
    """
    hist = ages_df.copy()
    hist['month'] = pd.to_datetime(hist['month'])
    entered_ts = hist.pivot(index='month', columns='age_group', values='entered')
    termrate_ts = hist.pivot(index='month', columns='age_group', values='termination_rate')

    def _project(series):
        # Fit one monthly series and return its `periods`-step forecast.
        seasonal_ok = (
            bool(seasonal_periods)
            and seasonal_periods > 1  # ExponentialSmoothing rejects a season of length <= 1
            and len(series.dropna()) >= 2 * seasonal_periods
        )
        if seasonal_ok:
            model = ExponentialSmoothing(series, trend='add', seasonal='add', seasonal_periods=seasonal_periods)
        else:
            model = ExponentialSmoothing(series, trend='add', seasonal=None)
        return model.fit(optimized=True).forecast(periods)

    entered_fc = {}
    termrate_fc = {}
    for age in entered_ts.columns:
        entered_fc[age] = _project(entered_ts[age].resample('ME').sum())
        termrate_fc[age] = _project(termrate_ts[age].resample('ME').mean())

    # Starting point: active counts recorded for the latest historical month.
    last = hist[hist['month'] == hist['month'].max()].set_index('age_group')['active_count']
    active_prev = last.astype(float)

    records = []
    months = pd.date_range(
        start=entered_ts.index.max() + pd.offsets.MonthBegin(1),
        periods=periods, freq='ME'
    )

    for i, month in enumerate(months):
        for age in entered_ts.columns:
            ent = entered_fc[age].iloc[i]
            tr = termrate_fc[age].iloc[i]
            # A linear trend can push the forecast rate outside [0, 1]. Keep
            # retention within [0, 1] so the caseload cannot go negative; a
            # negative or NaN rate falls back to full retention.
            ret = 1 - min(tr, 1.0) if tr >= 0 else 1.0
            act = active_prev[age] * ret + ent
            records.append({
                'month': month.strftime('%Y-%m'),
                'age_group': age,
                'active_forecast': act
            })
            active_prev[age] = act

    return pd.DataFrame(records)




def forecast_age_specific_active_with_components(
    ages_df: pd.DataFrame,
    periods: int = 12,
    seasonal_periods: int = 12
) -> pd.DataFrame:
    """
    Forecast active caseload by age and expose the components used.

    Entries and termination rates are forecast per age group with
    additive-trend exponential smoothing (additive seasonality only when
    seasonal_periods > 1 and at least two full seasons are available). The
    caseload is rolled forward as active[t] = active[t-1] * retention + entered,
    with retention = 1 - term_rate limited to the range [0, 1].

    Returns ['month','age_group','entered_forecast','term_rate_forecast','retention_rate','active_forecast'].
    'term_rate_forecast' is the raw model output; 'retention_rate' is the
    bounded value actually applied.
    """
    hist = ages_df.copy()
    hist['month'] = pd.to_datetime(hist['month'])
    entered_ts = hist.pivot(index='month', columns='age_group', values='entered')
    termrate_ts = hist.pivot(index='month', columns='age_group', values='termination_rate')

    def _project(series):
        # Fit one monthly series and return its `periods`-step forecast.
        seasonal_ok = (
            bool(seasonal_periods)
            and seasonal_periods > 1  # ExponentialSmoothing rejects a season of length <= 1
            and len(series.dropna()) >= 2 * seasonal_periods
        )
        if seasonal_ok:
            model = ExponentialSmoothing(series, trend='add', seasonal='add', seasonal_periods=seasonal_periods)
        else:
            model = ExponentialSmoothing(series, trend='add', seasonal=None)
        return model.fit(optimized=True).forecast(periods)

    entered_fc = {}
    termrate_fc = {}
    for age in entered_ts.columns:
        entered_fc[age] = _project(entered_ts[age].resample('ME').sum())
        termrate_fc[age] = _project(termrate_ts[age].resample('ME').mean())

    last = hist[hist['month'] == hist['month'].max()].set_index('age_group')['active_count'].astype(float)
    active_prev = last.copy()
    records = []
    # 'ME' (month end) matches the resample alias above; the older 'M' alias
    # is deprecated in the pandas versions that accept 'ME'.
    months = pd.date_range(
        start=entered_ts.index.max() + pd.offsets.MonthBegin(1),
        periods=periods, freq='ME'
    )
    for i, mo in enumerate(months):
        for age in entered_ts.columns:
            ent = entered_fc[age].iloc[i]
            tr = termrate_fc[age].iloc[i]
            # Bound retention to [0, 1]: a forecast rate above 1 would
            # otherwise yield a negative caseload; a negative or NaN rate
            # falls back to full retention.
            ret = 1 - min(tr, 1.0) if tr >= 0 else 1.0
            act = active_prev[age] * ret + ent
            records.append({
                'month': mo.strftime('%Y-%m'),
                'age_group': age,
                'entered_forecast': ent,
                'term_rate_forecast': tr,
                'retention_rate': ret,
                'active_forecast': act
            })
            active_prev[age] = act
    return pd.DataFrame(records)





In [None]:
def forecast_age_specific_naive(
    ages_df: pd.DataFrame,
    periods: int = 12
) -> pd.DataFrame:
    """
    Naive age-specific forecast using last year's values:
    - For each forecast month, use the same month one year ago to pull:
        * base_active = active_count_{t-12}
        * entries_naive = entered_{t-12}
        * term_rate_naive = termination_rate_{t-12}
    - Compute retention_rate = 1 - term_rate_naive, limited to the range [0, 1]
    - Forecast active_naive = base_active * retention_rate + entries_naive

    A reference month with no history for an age group contributes zeros, so
    forecast months more than a year past the last historical month are 0.

    Args:
        ages_df: frame with 'month' ('YYYY-MM'), 'age_group', 'active_count',
            'entered' and 'termination_rate'.
        periods: number of months to forecast after the last historical month.

    Returns ['month','age_group','entries_naive','term_rate_naive','retention_rate','active_naive'] for each age_group and forecast month.
    """
    hist = ages_df.copy()
    hist['month_dt'] = pd.to_datetime(hist['month'], format='%Y-%m')

    # Single pass: (age_group, month) -> (active_count, entered, termination_rate).
    lookup = {
        (age, month): (active, entered, rate)
        for age, month, active, entered, rate in zip(
            hist['age_group'],
            hist['month_dt'],
            hist['active_count'],
            hist['entered'],
            hist['termination_rate'],
        )
    }

    # Forecast months start the month after the last historical one.
    last_month = hist['month_dt'].max()
    months = pd.date_range(start=last_month + pd.offsets.MonthBegin(1), periods=periods, freq='MS')
    age_groups = hist['age_group'].unique()

    records = []
    for mo in months:
        ref = mo - pd.DateOffset(years=1)  # same calendar month, one year earlier
        for age in age_groups:
            base_active, entries_naive, tr = lookup.get((age, ref), (0, 0, 0.0))
            # Keep retention within [0, 1]; a negative or NaN rate means full retention.
            retention = 1 - min(tr, 1.0) if tr >= 0 else 1.0
            act_naive = base_active * retention + entries_naive
            records.append({
                'month': mo.strftime('%Y-%m'),
                'age_group': age,
                'entries_naive': entries_naive,
                'term_rate_naive': tr,
                'retention_rate': retention,
                'active_naive': act_naive
            })
    return pd.DataFrame(records)

# if __name__ == "__main__":
#     start_year = 2023
#     end_year = 2025
#     start_month = "2023-06"
#     end_month = "2025-06"

#     # Calculate historical flows and age rates
#     flows, ages = calculate_yearonyear_flows_and_age_rates(
#         start_month, end_month,
#         redistribute_unknown_age=True
#     )

    # # Forecast total new orders with prediction intervals
    # pi = forecast_with_pi(flows, periods=12)
    # # Distribute PI by age
    # pi_age = distribute_pi_by_age(pi, ages, weight_years=(start_year, end_year))
    # Forecast active caseload by age
    # active_proj = forecast_age_specific_active(ages, periods=12)

    # # Component forecasts: entries, termination rate, retention, active
    # comp_df = forecast_age_specific_active_with_components(ages, periods=12)

    # # Convert age_group to numeric and sort by age_group then month
    # comp_df['age_group'] = comp_df['age_group'].astype(int)
    # comp_df_sorted = comp_df.sort_values(['month', 'age_group'])

    # Print the sorted table
    # print(comp_df_sorted.to_string(index=False))


In [None]:
#!pip install pydbtools

import os
import calendar
import shutil
from datetime import datetime, timedelta
import pydbtools
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import pandas as pd
import boto3
import numpy as np
import matplotlib.pyplot as plt
import logging
import botocore
import s3fs
from dateutil.relativedelta import relativedelta
from matplotlib.dates import MonthLocator, DateFormatter
# Warning filters and logging switches for this cell
import warnings
# Suppress statsmodels AIC/BIC divide-by-zero runtime warnings
warnings.filterwarnings("ignore", message=".*divide by zero encountered in log.*")
# Kept for reference: timestamped INFO-level logging configuration (currently off).
#logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Disable logging output: this silences every level up to and including
# CRITICAL, so the logging.info/debug/error calls in the functions below
# (including deletion failures reported by clear_directory) emit nothing.
logging.disable(logging.CRITICAL)

def parse_month(month_str: str) -> datetime:
    """Convert a 'YYYY-MM' string (optionally padded or quoted) to a datetime on the 1st of that month."""
    unpadded = month_str.strip()
    unquoted = unpadded.strip("'\"")
    parsed = datetime.strptime(unquoted, "%Y-%m")
    logging.debug(f"parse_month: parsed '{month_str}' to {parsed}")
    return parsed


def clear_directory(path):
    """
    Delete everything inside the directory `path`, leaving the directory itself.

    Files and symbolic links are unlinked; sub-directories are removed
    recursively. A failure on one entry is logged and the remaining entries
    are still processed. Listing `path` itself is not guarded, so a missing
    or non-directory path raises from os.listdir.
    """
    logging.info(f"clear_directory: clearing path {path}")
    for entry_name in os.listdir(path):
        target = os.path.join(path, entry_name)
        try:
            is_file_or_link = os.path.isfile(target) or os.path.islink(target)
            if is_file_or_link:
                os.unlink(target)
                logging.debug(f"Deleted file {target}")
                continue
            if os.path.isdir(target):
                shutil.rmtree(target)
                logging.debug(f"Deleted directory {target}")
        except Exception as e:
            logging.error(f"Failed to delete {target}. Reason: {e}")


def fetch_cases_for_date(run_date: str) -> pd.DataFrame:
    """
    Logging variant of the case fetch with the SQL text elided.

    The query here is the literal placeholder "...", not the real statement.
    This definition is replaced by the later `fetch_cases_for_date` in the
    same cell, which carries the full SQL.
    """
    logging.info(f"fetch_cases_for_date: fetching data for {run_date}")
    sql = "..."  # trimmed for brevity
    cases = pydbtools.read_sql_query(sql)
    logging.info(f"fetch_cases_for_date: returned {len(cases)} rows for {run_date}")
    return cases


def generate_month_list(start_month: str, end_month: str):
    """
    List the first-of-month datetimes from start_month to end_month inclusive.

    Both arguments are 'YYYY-MM' strings parsed by parse_month. Raises
    ValueError when start_month is later than end_month.
    """
    logging.info(f"generate_month_list: from {start_month} to {end_month}")
    first = parse_month(start_month)
    final = parse_month(end_month)
    if first > final:
        raise ValueError(f"Start month ({start_month}) is after end month ({end_month})")

    # Whole months separating the two bounds; both fall on the 1st.
    span = (final.year - first.year) * 12 + (final.month - first.month)
    months = []
    for offset in range(span + 1):
        stamp = first + relativedelta(months=offset)
        months.append(stamp)
        logging.debug(f"Added month {stamp}")
    logging.info(f"Generated {len(months)} months")
    return months


def last_day_of_month(dt: datetime) -> str:
    """Return the final calendar day of dt's month, formatted 'YYYY-MM-DD'."""
    _first_weekday, days_in_month = calendar.monthrange(dt.year, dt.month)
    month_end = dt.replace(day=days_in_month)
    result = month_end.strftime("%Y-%m-%d")
    logging.debug(f"last_day_of_month: for {dt} result {result}")
    return result



def fetch_cases_for_date(run_date: str) -> pd.DataFrame:
    """
    Fetch all cases & their fee reductions for the given run_date (YYYY-MM-DD)
    using pydbtools.read_sql_query, which returns a pandas DataFrame.

    run_date is interpolated into the SQL as DATE('<run_date>'). It is used as
    the glueexporteddate filter on every table read, and as the reference date
    for fee reductions (startdate <= run_date <= enddate, not deleted, highest
    id per finance client).

    Only cases whose orderstatus is 'OPEN', 'ACTIVE' or 'DUPLICATE' are
    selected, ordered by orderdate. age_in_years is derived from the days
    between p.dob and p.createddate divided by 365.25; when the floored value
    is below 0 or above 120 the column is set to 121.

    Args:
        run_date: snapshot date formatted 'YYYY-MM-DD'.

    Returns:
        The DataFrame produced by pydbtools.read_sql_query for this query.
    """
    # NOTE(review): run_date is spliced into the SQL text without validation;
    # visible callers pass the output of last_day_of_month - confirm no caller
    # forwards user-supplied text.
    # NOTE(review): the age bounds are tested with FLOOR but the returned value
    # uses ROUND, and age is measured at p.createddate rather than run_date -
    # confirm both are intended.
    # NOTE(review): callers index the result with lowercase names such as
    # "orderstatus" and "ordertype"; presumably the engine lowercases the
    # mixed-case aliases below - verify.
    query = f"""
    WITH active_fee_reductions AS (
      SELECT
        fc.client_id,
        SUBSTRING(fr.type,1,1) || LOWER(SUBSTRING(fr.type,2)) AS type,
        DATE(fr.startdate) AS startdate,
        DATE(fr.enddate)   AS enddate,
        fc.payment_method
      FROM opg_sirius_prod.fee_reduction fr
      JOIN opg_sirius_prod.finance_client fc
        ON fc.id = fr.finance_client_id
       AND fc.glueexporteddate = DATE('{run_date}')
      JOIN (
        SELECT
          MAX(id)           AS id,
          finance_client_id
        FROM opg_sirius_prod.fee_reduction
        WHERE enddate           >= DATE('{run_date}')
          AND startdate         <= DATE('{run_date}')
          AND deleted            = FALSE
          AND glueexporteddate   = DATE('{run_date}')
        GROUP BY finance_client_id
      ) latest ON latest.id = fr.id
      WHERE fr.glueexporteddate = DATE('{run_date}')
    )
    SELECT
      c.glueexporteddate,
      c.caserecnumber            AS casenumber,
      c.uid                      AS siriusid,
      (
        SELECT supervisionlevel
        FROM opg_sirius_prod.supervision_level_log sll
        WHERE sll.order_id         = c.id
          AND sll.glueexporteddate = DATE('{run_date}')
        ORDER BY sll.appliesfrom DESC
        LIMIT 1
      ) AS casesupervisionlevel,
      p.risk_score               AS CREC,
      c.casesubtype              AS orderType,
      c.orderdate                AS ordermadedate,
      c.orderstatus              AS orderStatus,
      afr.type                   AS feereductiontype,
      p.dob,
      CASE
        WHEN FLOOR(DATE_DIFF('day', p.dob, p.createddate) / 365.25) < 0 THEN 121
        WHEN FLOOR(DATE_DIFF('day', p.dob, p.createddate) / 365.25) > 120 THEN 121
        ELSE ROUND(DATE_DIFF('day', p.dob, p.createddate) / 365.25)
      END AS age_in_years
    FROM opg_sirius_prod.persons p
    JOIN opg_sirius_prod.cases c
      ON p.id                   = c.client_id
     AND c.glueexporteddate     = DATE('{run_date}')
    LEFT JOIN active_fee_reductions afr
      ON afr.client_id          = p.id
    WHERE c.orderstatus IN ('OPEN','ACTIVE','DUPLICATE')
      AND p.glueexporteddate     = DATE('{run_date}')
    ORDER BY c.orderdate;
    """
    return pydbtools.read_sql_query(query)


def generate_month_list(start_month: str, end_month: str):
    """
    Build the inclusive sequence of month-start datetimes between two
    'YYYY-MM' strings. Raises ValueError if start_month falls after end_month.
    """
    lower = parse_month(start_month)
    upper = parse_month(end_month)
    if lower > upper:
        raise ValueError(f"Start month ({start_month}) is after end month ({end_month})")

    one_month = relativedelta(months=1)
    month_starts = []
    cursor = lower
    while True:
        if cursor > upper:
            break
        month_starts.append(cursor)
        cursor = cursor + one_month
    return month_starts

def last_day_of_month(dt: datetime) -> str:
    """Format the last date of the month containing dt as 'YYYY-MM-DD'."""
    final_day = calendar.monthrange(dt.year, dt.month)[-1]
    month_end = dt.replace(day=final_day)
    return month_end.strftime("%Y-%m-%d")
        
def export_monthly_reports(first_month: str, last_month: str, output_base="output"):
    """
    Export the month-end case snapshot for every month in the range.

    For each month from first_month to last_month (inclusive, 'YYYY-MM') the
    month-end data is fetched, tagged with a 'month' column and written to:
      - <output_base>/<YYYY-MM>/cases_<YYYY-MM>.csv
      - a sheet named <YYYY-MM> in <output_base>/cases_<first>_to_<last>.xlsx
    After all months are processed, one combined CSV
    <output_base>/all_cases_<first>_to_<last>.csv is written.

    Returns None. Raises ValueError (from generate_month_list) when
    first_month is after last_month.
    """
    # Clean inputs
    clean_first = first_month.strip().strip("'\"")
    clean_last = last_month.strip().strip("'\"")

    # Generate all months
    months = generate_month_list(clean_first, clean_last)
    if not months:
        print("No months in range; nothing to do.")
        return

    # Prepare output directory
    os.makedirs(output_base, exist_ok=True)
    excel_filename = f"cases_{clean_first}_to_{clean_last}.xlsx"
    excel_path = os.path.join(output_base, excel_filename)
    # excel_path names a file, not a directory, so it must not be handed to
    # clear_directory (os.listdir would raise). Remove only a stale workbook.
    if os.path.isfile(excel_path):
        os.remove(excel_path)

    # List to accumulate each month's DataFrame
    all_months = []

    # Create Excel workbook and write each month's sheet
    with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
        for dt in months:
            month_tag = dt.strftime("%Y-%m")
            run_date = last_day_of_month(dt)

            # Fetch data for this month-end
            df = fetch_cases_for_date(run_date)

            # Tag the DataFrame with its month, then collect it
            df["month"] = month_tag
            all_months.append(df)

            # Save CSV for this month
            month_folder = os.path.join(output_base, month_tag)
            os.makedirs(month_folder, exist_ok=True)
            csv_path = os.path.join(month_folder, f"cases_{month_tag}.csv")
            df.to_csv(csv_path, index=False)

            # Add to Excel workbook
            df.to_excel(writer, sheet_name=month_tag, index=False)

            print(f"→ Saved CSV for {month_tag}: {csv_path}")

    # The workbook is only written to disk when the writer context exits, so
    # report success after the with-block rather than inside it.
    print(f"→ Combined Excel workbook saved at: {excel_path}")

    # After all sheets are written, concatenate & export one big CSV
    if all_months:
        combined_df = pd.concat(all_months, ignore_index=True)
        combined_csv_path = os.path.join(
            output_base,
            f"all_cases_{clean_first}_to_{clean_last}.csv"
        )
        combined_df.to_csv(combined_csv_path, index=False)
        print(f"→ Combined CSV for all months saved at: {combined_csv_path}")
        

def calculate_monthly_active_cases(first_month: str, last_month: str, output_base="output") -> pd.DataFrame:
    """
    For each month between first_month and last_month (inclusive), fetch the data,
    filter to ACTIVE cases, then aggregate unique casenumber counts by ordertype.
    Writes a CSV 'monthly_active_cases_<first>_to_<last>.csv' under output_base,
    and returns the aggregated DataFrame.

    The result has columns 'month', 'ordertype' and 'active_case_count'. A
    month with no ACTIVE cases contributes a single row with ordertype None
    and a count of 0.
    """
    # Clean inputs
    clean_first = first_month.strip().strip("'\"")
    clean_last = last_month.strip().strip("'\"")

    # Generate all months
    months = generate_month_list(clean_first, clean_last)
    if not months:
        print("No months in range; nothing to do.")
        return pd.DataFrame()

    # Prepare output directory
    os.makedirs(output_base, exist_ok=True)

    # List to collect each month's summary
    summaries = []

    for dt in months:
        month_tag = dt.strftime("%Y-%m")
        run_date = last_day_of_month(dt)

        # Fetch data for this month-end
        df = fetch_cases_for_date(run_date)

        # Keep only ACTIVE cases
        df_active = df[df["orderstatus"] == "ACTIVE"].copy()
        if df_active.empty:
            # Still record a zero count for completeness. The key must match
            # the grouped column name ("ordertype"); a differently-cased key
            # would create a second, mostly empty column after concat.
            summaries.append(
                pd.DataFrame([{"month": month_tag, "ordertype": None, "active_case_count": 0}])
            )
            continue

        # Tag with month for grouping
        df_active["month"] = month_tag

        # Aggregate unique casenumbers per ordertype
        summary = (
            df_active
            .groupby(["month", "ordertype"])["casenumber"]
            .nunique()
            .reset_index(name="active_case_count")
        )
        summaries.append(summary)

        print(f"→ Aggregated ACTIVE cases for {month_tag}")

    # Combine all month summaries
    result_df = pd.concat(summaries, ignore_index=True)

    # Fill any missing ordertypes/months with zeros if you want full matrix:
    # pivot = result_df.pivot_table(index="month", columns="ordertype",
    #                               values="active_case_count", fill_value=0).reset_index()

    # Write out CSV
    out_csv = os.path.join(
        output_base,
        f"monthly_active_cases_{clean_first}_to_{clean_last}.csv"
    )
    result_df.to_csv(out_csv, index=False)
    print(f"→ Monthly ACTIVE cases CSV saved at: {out_csv}")

    return result_df

# Forecasting Active Caseloads:

## entered: count of cases newly appearing in the active caseload each month

## exited: count of cases that dropped out since the prior month

def calculate_monthly_flow(first_month: str, last_month: str, output_base="output") -> pd.DataFrame:
    """
    Month-over-month flow of casenumbers through the fetched caseload.

    A month-end snapshot of unique casenumbers is taken for every month from
    first_month to last_month (inclusive). Each month is compared with the
    one before it: 'entered' counts casenumbers that are new, 'exited' counts
    those that disappeared. The first month reports its whole snapshot as
    entered and 0 exited.

    The table is saved as 'monthly_flow_<first>_to_<last>.csv' under
    output_base and returned.
    """
    start_tag = first_month.strip().strip("'\"")
    end_tag = last_month.strip().strip("'\"")

    month_starts = generate_month_list(start_tag, end_tag)
    if not month_starts:
        print("No months in range; nothing to do.")
        return pd.DataFrame()

    # "YYYY-MM" -> set of casenumbers present at that month end
    caseload_by_month = {}
    for month_start in month_starts:
        tag = month_start.strftime("%Y-%m")
        month_end = last_day_of_month(month_start)
        cases = fetch_cases_for_date(month_end)
        caseload_by_month[tag] = set(cases["casenumber"].unique())
        print(f"→ Snapshot for {tag}: {len(caseload_by_month[tag])} active cases")

    ordered_tags = sorted(caseload_by_month)

    # Opening month has nothing to compare against: everything counts as entered.
    opening_tag = ordered_tags[0]
    rows = [{
        "month": opening_tag,
        "entered": len(caseload_by_month[opening_tag]),
        "exited":  0
    }]
    # Remaining months are compared pairwise with their predecessor.
    for before_tag, after_tag in zip(ordered_tags, ordered_tags[1:]):
        before = caseload_by_month[before_tag]
        after = caseload_by_month[after_tag]
        rows.append({
            "month":  after_tag,
            "entered": len(after - before),
            "exited":  len(before - after)
        })

    flow_df = pd.DataFrame(rows)
    os.makedirs(output_base, exist_ok=True)
    out_csv = os.path.join(output_base, f"monthly_flow_{start_tag}_to_{end_tag}.csv")
    flow_df.to_csv(out_csv, index=False)
    print(f"→ Monthly flow CSV saved at: {out_csv}")

    return flow_df



def calculate_yearonyear_flows_and_age_rates(
    first_month: str,
    last_month: str,
    output_base: str = "output",
    redistribute_unknown_age: bool = False,
    age_bins: tuple = None,
    age_labels: tuple = None
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Compare each month-end snapshot with the snapshot one year earlier.

    A month is processed only when the same month one year before is not
    earlier than first_month. For each processed month the function records
    how many casenumbers entered and exited relative to the prior-year
    snapshot and, per age group, the unique prior-year cases (active_count),
    unique entrants, unique terminations, termination_rate
    (terminations / active_count, 0.0 when active_count is 0) and
    retention_rate (1 - termination_rate).

    Args:
        first_month, last_month: 'YYYY-MM' bounds (inclusive).
        output_base: directory the two CSVs are written to (created if missing).
        redistribute_unknown_age: when True, entrants whose age falls outside
            the bins are spread over the known age groups in proportion to the
            known entrants (uniformly if there are none).
        age_bins, age_labels: custom bin edges and labels for pd.cut. When
            either is None, one-year bins 0-119 are used.

    Returns:
        (flows_df, ages_df). Both are also written to output_base as
        'yearonyear_flows_<first>_to_<last>.csv' and
        'termination_and_entry_rates_by_age_<first>_to_<last>.csv'.
    """
    logging.info(f"Calculating year-on-year flows and age rates from {first_month} to {last_month}, redistribute_unknown_age={redistribute_unknown_age}")
    os.makedirs(output_base, exist_ok=True)
    if age_bins is None or age_labels is None:
        age_bins = list(range(0, 121))
        age_labels = [str(a) for a in age_bins[:-1]]

    def count_by_age(frame):
        # Unique casenumbers per age group, with every label present.
        return (
            frame.groupby("age_group", observed=False)["casenumber"]
            .nunique()
            .reindex(age_labels, fill_value=0)
        )

    flow_records = []
    age_rate_records = []
    months = generate_month_list(first_month, last_month)
    # Loop-invariant lower bound: parse it once instead of once per month.
    range_start = parse_month(first_month)
    for dt in months:
        prev_dt = dt - relativedelta(years=1)
        if prev_dt < range_start:
            continue
        tag = dt.strftime("%Y-%m")
        logging.info(f"Processing month {tag}")
        df_cur = fetch_cases_for_date(last_day_of_month(dt))
        df_prev = fetch_cases_for_date(last_day_of_month(prev_dt))
        set_cur = set(df_cur["casenumber"])
        set_prev = set(df_prev["casenumber"])
        entered_set = set_cur - set_prev
        exited_set = set_prev - set_cur
        logging.debug(f"Month {tag}: entered={len(entered_set)}, exited={len(exited_set)}")
        flow_records.append({"month": tag, "entered": len(entered_set), "exited": len(exited_set)})
        df_term = df_prev[df_prev["casenumber"].isin(exited_set)].copy()
        df_in = df_cur[df_cur["casenumber"].isin(entered_set)].copy()
        df_base = df_prev.copy()
        for df_ in (df_term, df_in, df_base):
            df_["age_group"] = pd.cut(
                df_["age_in_years"], bins=age_bins, labels=age_labels,
                right=False, include_lowest=True
            )
        in_counts = count_by_age(df_in)
        # Count unknown-age entrants as unique cases, matching the nunique()
        # basis of in_counts; counting rows would over-allocate whenever a
        # casenumber appears on more than one row.
        unknown = df_in.loc[df_in["age_group"].isna(), "casenumber"].nunique()
        logging.debug(f"Unknown age entries: {unknown}")
        if redistribute_unknown_age and unknown > 0:
            total_known = in_counts.sum()
            props = in_counts / total_known if total_known > 0 else pd.Series([1/len(age_labels)]*len(age_labels), index=age_labels)
            alloc = (props * unknown).round().astype(int)
            in_counts = in_counts + alloc
            logging.debug(f"Allocated unknowns: {alloc.to_dict()}")
        ter_counts = count_by_age(df_term)
        base_counts = count_by_age(df_base)
        for grp in age_labels:
            active = int(base_counts[grp])
            term = int(ter_counts[grp])
            ent = int(in_counts[grp])
            rate = round(term/active, 4) if active else 0.0
            retention = 1 - rate if rate >= 0 else 1.0
            age_rate_records.append({
                "month": tag,
                "age_group": grp,
                "active_count": active,
                "entered": ent,
                "terminations": term,
                "termination_rate": rate,
                "retention_rate": retention
            })
    flows_df = pd.DataFrame(flow_records)
    ages_df = pd.DataFrame(age_rate_records)
    flows_df.to_csv(os.path.join(output_base, f"yearonyear_flows_{first_month}_to_{last_month}.csv"), index=False)
    ages_df.to_csv(os.path.join(output_base, f"termination_and_entry_rates_by_age_{first_month}_to_{last_month}.csv"), index=False)
    logging.info("Completed calculation of year-on-year flows and age rates")
    return flows_df, ages_df


# Append current and forecasted tables
def get_combined_age_deputyship_table(tbl1, tbl2):
    """Stack tbl2 beneath tbl1 and renumber the index from 0."""
    return pd.concat((tbl1, tbl2), ignore_index=True)
        
def forecast_with_pi(
    flows_df: pd.DataFrame,
    periods: int = 1,
    alpha: float = 0.05,
    seasonal_periods: int = 1
) -> pd.DataFrame:
    """
    Forecast monthly new orders with (1 - alpha) prediction intervals.

    The 'entered' column of flows_df is resampled to month-end totals and
    fitted with additive-trend exponential smoothing. An additive seasonal
    component is added only when seasonal_periods > 1 and there are at least
    two full seasons of observations; otherwise the model is trend-only.

    The interval is a normal approximation: mean +/- z * sigma, where sigma is
    the standard deviation of the in-sample residuals and z is the standard
    normal quantile at 1 - alpha/2 (about 1.96 for alpha = 0.05). Its width is
    the same for every forecast step.

    Args:
        flows_df: frame with 'month' (parseable by pd.to_datetime) and 'entered'.
        periods: number of months to forecast.
        alpha: two-sided significance level; must lie strictly between 0 and 1.
        seasonal_periods: season length in months; values <= 1 disable seasonality.

    Returns:
        DataFrame indexed by forecast month with columns 'mean', 'lower', 'upper'.
    """
    # Standard library only; imported locally so the function does not depend
    # on another cell's import block.
    from statistics import NormalDist

    ts = flows_df.set_index(pd.to_datetime(flows_df['month']))['entered'].resample('ME').sum()
    nobs = len(ts)

    # A seasonal model needs a season longer than one period. Without the
    # "> 1" guard the default seasonal_periods=1 would request a seasonal fit
    # that ExponentialSmoothing refuses.
    use_seasonal = bool(seasonal_periods) and seasonal_periods > 1 and nobs >= 2 * seasonal_periods
    if use_seasonal:
        model = ExponentialSmoothing(ts, trend='add', seasonal='add', seasonal_periods=seasonal_periods)
    else:
        model = ExponentialSmoothing(ts, trend='add', seasonal=None)
    fit = model.fit(optimized=True)
    mu = fit.forecast(periods)

    sigma = fit.resid.dropna().std()
    # z must be a unit-free normal quantile. Using a residual quantile here
    # would multiply two quantities that are both in data units.
    z = NormalDist().inv_cdf(1 - alpha / 2)
    lower = mu - z * sigma
    upper = mu + z * sigma
    return pd.DataFrame({'mean': mu, 'lower': lower, 'upper': upper})


def distribute_pi_by_age(
    pi_df: pd.DataFrame,
    ages_df: pd.DataFrame,
    weight_years: tuple[int, int] # = (start_year, end_year)
) -> pd.DataFrame:
    """
    Apportion forecast mean and interval bounds to age groups.

    Weights are each age group's share of total 'entered' across the
    historical months whose year is within weight_years (both ends included).
    Each forecast row of pi_df is scaled by every weight, giving one output
    row per (month, age_group) with 'mean', 'lower' and 'upper'.
    """
    weighting = ages_df.copy()
    weighting['month'] = pd.to_datetime(weighting['month'])
    year_of_row = weighting['month'].dt.year
    weighting = weighting[year_of_row.between(*weight_years)]

    total_entered = weighting['entered'].sum()
    weights = weighting.groupby('age_group')['entered'].sum() / total_entered

    out_rows = []
    for when, bounds in pi_df.iterrows():
        month_label = when.strftime('%Y-%m')
        centre = bounds['mean']
        low = bounds['lower']
        high = bounds['upper']
        for group, weight in weights.items():
            out_rows.append({
                'month': month_label,
                'age_group': group,
                'mean':   centre * weight,
                'lower':  low * weight,
                'upper':  high * weight
            })
    return pd.DataFrame(out_rows)


def forecast_age_specific_active(
    ages_df: pd.DataFrame,
    periods: int = 12,
    seasonal_periods: int = 12
) -> pd.DataFrame:
    """
    Project the active caseload per age group from separately forecast
    entries and termination rates.

    For every age group two monthly series are fitted with exponential
    smoothing ('entered' summed per month, 'termination_rate' averaged per
    month) and forecast ``periods`` months ahead. Starting from the
    'active_count' of the latest historical month, the caseload is rolled
    forward as

        active[t] = active[t-1] * (1 - term_rate[t]) + entered[t]

    where a negative forecast termination rate is treated as full retention.

    Returns one row per (forecast month, age group) with columns
    'month' ('YYYY-MM'), 'age_group' and 'active_forecast'.
    """
    history = ages_df.copy()
    history['month'] = pd.to_datetime(history['month'])
    entered_wide = history.pivot(index='month', columns='age_group', values='entered')
    termrate_wide = history.pivot(index='month', columns='age_group', values='termination_rate')
    age_groups = entered_wide.columns

    def _smoothed_forecast(series):
        # Additive trend always; additive seasonality only with at least two
        # full cycles of non-missing observations.
        enough_history = len(series.dropna()) >= 2 * seasonal_periods
        if enough_history:
            model = ExponentialSmoothing(series, trend='add', seasonal='add', seasonal_periods=seasonal_periods)
        else:
            model = ExponentialSmoothing(series, trend='add', seasonal=None)
        return model.fit(optimized=True).forecast(periods)

    entered_fc = {}
    termrate_fc = {}
    for age in age_groups:
        entered_fc[age] = _smoothed_forecast(entered_wide[age].resample('ME').sum())
        termrate_fc[age] = _smoothed_forecast(termrate_wide[age].resample('ME').mean())

    # Seed: active counts of the latest historical month, as floats so the
    # running values can be written back in place.
    latest_month = history['month'].max()
    running_active = (
        history.loc[history['month'] == latest_month]
        .set_index('age_group')['active_count']
        .astype(float)
    )

    forecast_months = pd.date_range(
        start=entered_wide.index.max() + pd.offsets.MonthBegin(1),
        periods=periods, freq='ME'
    )

    rows = []
    for step, month_end in enumerate(forecast_months):
        month_label = month_end.strftime('%Y-%m')
        for age in age_groups:
            new_entries = entered_fc[age].iloc[step]
            term_rate = termrate_fc[age].iloc[step]
            if term_rate >= 0:
                retention = 1 - term_rate
            else:
                retention = 1.0
            projected = running_active[age] * retention + new_entries
            rows.append({
                'month': month_label,
                'age_group': age,
                'active_forecast': projected
            })
            # Becomes the base for this age group's next month.
            running_active[age] = projected

    return pd.DataFrame(rows)




def forecast_age_specific_active_with_components(
    ages_df: pd.DataFrame,
    periods: int = 12,
    seasonal_periods: int = 12
) -> pd.DataFrame:
    """
    Forecast, per age group, entries, termination rate, retention and the
    resulting active caseload.

    Parameters
    ----------
    ages_df : historical table with columns 'month', 'age_group', 'entered',
        'termination_rate' and 'active_count'; one row per (month, age_group).
    periods : number of months to forecast.
    seasonal_periods : seasonal cycle length in months. A seasonal component
        is fitted only when a series has at least two full cycles of
        non-missing observations; otherwise the model is trend-only.

    Returns
    -------
    DataFrame with columns ['month', 'age_group', 'entered_forecast',
    'term_rate_forecast', 'retention_rate', 'active_forecast'], one row per
    (forecast month, age group); 'month' is formatted 'YYYY-MM'.

    Notes
    -----
    retention_rate = 1 - term_rate_forecast, except that a negative forecast
    rate is treated as full retention (1.0). The raw rate is still reported
    in 'term_rate_forecast'.
    active[t] = active[t-1] * retention_rate[t] + entered_forecast[t], seeded
    with the 'active_count' of the latest historical month.
    """
    hist = ages_df.copy()
    hist['month'] = pd.to_datetime(hist['month'])
    entered_ts = hist.pivot(index='month', columns='age_group', values='entered')
    termrate_ts = hist.pivot(index='month', columns='age_group', values='termination_rate')

    def _forecast(series: pd.Series) -> pd.Series:
        # Holt-Winters with additive trend; seasonality only with enough history.
        if len(series.dropna()) >= 2 * seasonal_periods:
            model = ExponentialSmoothing(series, trend='add', seasonal='add', seasonal_periods=seasonal_periods)
        else:
            model = ExponentialSmoothing(series, trend='add', seasonal=None)
        return model.fit(optimized=True).forecast(periods)

    entered_fc = {}
    termrate_fc = {}
    for age in entered_ts.columns:
        entered_fc[age] = _forecast(entered_ts[age].resample('ME').sum())
        termrate_fc[age] = _forecast(termrate_ts[age].resample('ME').mean())

    # Seed the recursion with the latest observed active counts.
    last = hist[hist['month'] == hist['month'].max()].set_index('age_group')['active_count'].astype(float)
    active_prev = last.copy()

    # 'ME' (month end) matches the resampling above; the old 'M' alias is
    # deprecated in pandas and was the only remaining use in this file.
    months = pd.date_range(
        start=entered_ts.index.max() + pd.offsets.MonthBegin(1),
        periods=periods, freq='ME'
    )

    records = []
    for i, mo in enumerate(months):
        for age in entered_ts.columns:
            ent = entered_fc[age].iloc[i]
            tr = termrate_fc[age].iloc[i]
            ret = 1 - tr if tr >= 0 else 1.0
            act = active_prev[age] * ret + ent
            records.append({
                'month': mo.strftime('%Y-%m'),
                'age_group': age,
                'entered_forecast': ent,
                'term_rate_forecast': tr,
                'retention_rate': ret,
                'active_forecast': act
            })
            active_prev[age] = act  # base for this age group's next month
    return pd.DataFrame(records)


def forecast_age_specific_naive(
    ages_df: pd.DataFrame,
    periods: int = 12
) -> pd.DataFrame:
    """
    Naive "same month last year" forecast per age group.

    For each forecast month t and age group, the row observed twelve months
    earlier (t-12) supplies:
        * base_active     = active_count at t-12
        * entries_naive   = entered at t-12
        * term_rate_naive = termination_rate at t-12
        * terminated      = terminations at t-12
    Any value with no historical row at t-12 defaults to 0.

    retention_rate = 1 - term_rate_naive (1.0 when the rate is negative) and
    active_naive = base_active * retention_rate + entries_naive.

    Returns one row per (forecast month, age group) with columns
    ['month', 'age_group', 'entries_naive', 'term_rate_naive',
    'retention_rate', 'active_naive', 'termination_rate', 'terminated'],
    where 'termination_rate' repeats 'term_rate_naive' and 'month' is
    formatted 'YYYY-MM'. ``ages_df['month']`` must parse with '%Y-%m'.
    """
    hist = ages_df.copy()
    hist['month_dt'] = pd.to_datetime(hist['month'], format='%Y-%m')

    # One lookup per measure, keyed by (age_group, month); a later duplicate
    # row overwrites an earlier one.
    keys = list(zip(hist['age_group'], hist['month_dt']))
    entries_map = dict(zip(keys, hist['entered']))
    term_map = dict(zip(keys, hist['termination_rate']))
    active_map = dict(zip(keys, hist['active_count']))
    termination_map = dict(zip(keys, hist['terminations']))

    # Forecast months start the month after the latest historical month.
    latest = hist['month_dt'].max()
    forecast_months = pd.date_range(start=latest + pd.offsets.MonthBegin(1), periods=periods, freq='MS')
    age_groups = hist['age_group'].unique()

    records = []
    for month_start in forecast_months:
        reference = month_start - relativedelta(years=1)
        month_label = month_start.strftime('%Y-%m')
        for age in age_groups:
            key = (age, reference)
            rate = term_map.get(key, 0.0)
            if rate >= 0:
                retention = 1 - rate
            else:
                retention = 1.0
            new_entries = entries_map.get(key, 0)
            records.append({
                'month': month_label,
                'age_group': age,
                'entries_naive': new_entries,
                'term_rate_naive': rate,
                'retention_rate': retention,
                'active_naive': active_map.get(key, 0) * retention + new_entries,
                'termination_rate': rate,
                'terminated': termination_map.get(key, 0.0)
            })
    return pd.DataFrame(records)




def stop_flow_forecast(ages_df: pd.DataFrame, periods: int = 24) -> pd.DataFrame:
    """
    "Stop-flow" forecast:
      active_t  = active_{t-1} + entered_{t-12} - terminated_{t-12}
    for each age_group, recursively, starting from the last historical month.

    Parameters
    ----------
    ages_df : historical table with 'age_group', 'active_count', 'entered'
        and 'terminations' columns, and 'month' either as a column or as an
        index level (anything ``pd.to_datetime`` accepts).
    periods : number of months to forecast.

    Returns
    -------
    DataFrame with columns:
      month (datetime), age_group, active_forecast
    one row per (forecast month, age group). An empty input or
    ``periods <= 0`` yields an empty frame that still has these columns.

    Notes
    -----
    * When the 12-month lag of a forecast month lies beyond the last
      historical month (forecast months 13 and later), the lag is moved back
      in whole years until it falls inside the history, so the latest
      observed flows for that calendar month are reused. Previously those
      months silently used zero flows and the forecast went flat.
    * A lag month with no historical row for an age group contributes zero
      entries and zero terminations; NaN flows are also treated as zero.
    * When several rows share the same (age_group, month), the first is used.

    Raises
    ------
    ValueError : if 'month' is neither a column nor an index level.
    """
    out_columns = ['month', 'age_group', 'active_forecast']

    # 1) Copy and ensure month is a column
    df = ages_df.copy()
    if 'month' not in df.columns:
        # If month is in the index, pull it out
        if 'month' in df.index.names:
            df = df.reset_index()
        else:
            raise ValueError("Input ages_df must have a 'month' column or index level")

    # 2) Convert month to datetime flexibly
    df['month'] = pd.to_datetime(df['month'])
    if df.empty:
        return pd.DataFrame(columns=out_columns)

    # 3) Last historical month and the forecast months that follow it
    last_hist = df['month'].max()
    first_fc = last_hist + pd.DateOffset(months=1)
    fc_months = [first_fc + pd.DateOffset(months=i) for i in range(periods)]

    # 4) Seed the "previous active" from the last historical month
    prev_active = (
        df[df['month'] == last_hist]
        .set_index('age_group')['active_count']
        .to_dict()
    )

    # 5) Build the (age_group, month) -> (entered, terminations) lookup once,
    #    instead of filtering the whole frame for every month/age pair.
    #    setdefault keeps the first row when a key is duplicated.
    flows = {}
    for age, month, entered, term in zip(
        df['age_group'], df['month'], df['entered'], df['terminations']
    ):
        flows.setdefault((age, month), (entered, term))

    def _as_count(value) -> int:
        # Missing flows count as zero rather than crashing int().
        return 0 if pd.isna(value) else int(value)

    one_year = pd.DateOffset(years=1)
    age_groups = df['age_group'].unique()
    records = []

    for m in fc_months:
        lag = m - one_year
        # Beyond the first forecast year the lag is itself a forecast month;
        # step back to the same calendar month inside the history.
        while lag > last_hist:
            lag = lag - one_year

        for age in age_groups:
            a_prev = prev_active.get(age, 0)
            entered, term = flows.get((age, lag), (0, 0))

            # stop-flow formula
            a_fc = a_prev + _as_count(entered) - _as_count(term)

            records.append({
                'month':           m,
                'age_group':       age,
                'active_forecast': a_fc
            })
            prev_active[age] = a_fc  # update for next iteration

    return pd.DataFrame(records, columns=out_columns)


## Age-Specific Forecasting Overview

This module implements two complementary age-specific forecasting approaches:

### 1. Component-Based Holt–Winters Forecast

1. **Historical Series Extraction**

   * For each age group, extract two monthly time series from `ages_df`:

     * **Entries**: Number of new deputyships (`entered`) per month.
     * **Termination rate**: Fraction of cases closing (`termination_rate`) per month.

2. **Fitting Models**

   * If at least two full seasonal cycles are present (≥24 months when `seasonal_periods=12`), fit an **additive trend + seasonality** Holt–Winters model; otherwise, fit a **trend-only** model.
   * Perform this separately for each age group’s entries and termination-rate series.

3. **Forecasting**

   * Forecast **`periods`** months ahead for both series, yielding for each future month *t* and age group *a*:

     * `entered_fc[a][t]` (predicted new entries)
     * `termrate_fc[a][t]` (predicted termination rate)

4. **Iterative Projection**

   * Start with the last observed active count for each age group:
     $\text{active}_{0,a} = \text{active\_count}_{\text{last month},a}$
   * For each forecast month *t* = 1…*periods*:

     $$
     \begin{aligned}
       \text{retention}_{t,a} &= 1 - \text{termrate\_fc}[a][t] \\
       \text{active}_{t,a} &= \text{active}_{t-1,a} \times \text{retention}_{t,a} + \text{entered\_fc}[a][t]
     \end{aligned}
     $$
   * Survivors carry forward (previous active × retention) plus new entries.

### 2. Naïve “Last Year” Benchmark

1. **Reference Lookup**

   * For each forecast month *t*, look back 12 months (*t*−12) to retrieve:

     * `base_active`: observed active count
     * `entries_naive`: observed new entries
     * `term_rate_naive`: observed termination rate

2. **Compute Retention**
   $\text{retention\_naive} = 1 - \text{term\_rate\_naive}$

3. **Project Forward**
   $\text{active\_naive}_t = \text{base\_active} \times \text{retention\_naive} + \text{entries\_naive}$

This simple year-over-year carry-forward forecast serves as a baseline to evaluate the performance of the more sophisticated Holt–Winters component model.

---

## Why These Methods?

* **Component models** capture both **trends** and **seasonal patterns** separately for inflows and outflows by age.
* The **naïve model** provides a straightforward baseline: “repeat what happened same month last year.”

By comparing both, you can quantify the added value of trend + seasonality modeling over simple repetition.


In [None]:

if __name__ == "__main__":
    # Reporting window. start_year/end_year feed the output file name later in
    # this cell; within this cell start_month/end_month are referenced only by
    # the disabled historical-flow calculation below.
    start_year = 2023
    end_year = 2025
    start_month = "2023-06"
    end_month = "2025-06"

    # --- Disabled: recompute historical flows and the Holt-Winters forecasts ---
    # # Calculate historical flows and age rates
    # flows, ages = calculate_yearonyear_flows_and_age_rates(
    #     start_month, end_month,
    #     redistribute_unknown_age=True
    # )
    # ages_df = ages

    
    # # Forecast total new orders with prediction intervals
    # pi = forecast_with_pi(flows, periods=12)
    # # Distribute PI by age
    # pi_age = distribute_pi_by_age(pi, ages, weight_years=(start_year, end_year))
    # Forecast active caseload by age
    #active_proj = forecast_age_specific_active(ages, periods=12)

    # Component forecasts: entries, termination rate, retention, active
    #comp_df = forecast_age_specific_active_with_components(ages, periods=12)

    # Convert age_group to numeric and sort by age_group then month
    #comp_df['age_group'] = comp_df['age_group'].astype(int)
    #comp_df_sorted = comp_df.sort_values(['age_group', 'month'])
    #comp_df_sorted.to_csv(f"output/comp_df_sorted_{start_year}_{end_year}.csv")
    # Print the sorted table
    #print(comp_df_sorted.to_string(index=False))


    # 1) Compute 2-year (24-month) stop-flow forecast
    # NOTE(review): within this cell `ages_df` is assigned only in the
    # commented-out block above, so this line presumably relies on a value left
    # in the kernel by an earlier cell - confirm it survives Restart & Run All.
    sf_fc = stop_flow_forecast(ages_df, periods=24)
    
    # 2) Pivot for plotting: month on x-axis, each age a line
    pivot = sf_fc.pivot(index='month', columns='age_group', values='active_forecast')
    pivot.index = pd.to_datetime(pivot.index)
    
    # 3) Plot one line per age group. Only the lines are drawn here; the title,
    #    tick formatting and legend for this `ax` are applied in a later cell.
    fig, ax = plt.subplots(figsize=(10, 6))
    for age, series in pivot.items():
        ax.plot(
            series.index, series.values,
            label=f"{int(age)} yrs",
            marker='o',
            linewidth=2,
            alpha=0.8
        )
    
    # Historical actuals, relabelled to the shared reporting column names.
    current_age_specific_deputyship_agg = ages_df.copy().rename(
        columns={
            'age_group': 'age',
            'active_count': 'active_caseloads',
            'entered': 'new_deputyships',
            'terminations': 'terminated'
        }
    )
    
    # Disabled alternative: take the forecast rows from the Holt-Winters
    # component forecast (comp_df_sorted) instead of the stop-flow forecast.
    # forecasted_age_specific_deputyship_agg = comp_df_sorted.copy()
    # forecasted_age_specific_deputyship_agg = forecasted_age_specific_deputyship_agg.rename(
    #     columns={
    #         'age_group': 'age',
    #         'active_forecast': 'active_caseloads',
    #         'entered_forecast': 'new_deputyships',
    #         'term_rate_forecast': 'termination_rate',
    #         'terminated': 'terminated'
    #     }
    # )
    
    # Stop-flow forecast rows, relabelled the same way. This table only carries
    # month, age and active caseloads.
    forecasted_age_specific_deputyship_agg = sf_fc.copy().rename(
        columns={
            'age_group': 'age',
            'active_forecast': 'active_caseloads'
        }
    )
    
    # Final table: historical actuals followed by the forecast rows
    combined_table = get_combined_age_deputyship_table(current_age_specific_deputyship_agg, forecasted_age_specific_deputyship_agg)
    # Create the target folder if needed so the export also works on a fresh
    # checkout / fresh kernel instead of failing on a missing directory.
    os.makedirs("output", exist_ok=True)
    combined_table.to_csv(f"output/combined_deputyship_{start_year}_{end_year}.csv")
    # Plotting age-specific active caseloads, termination rate, and new deputyships over time
    
    # Ensure 'month' is datetime (done after the CSV export, so the file keeps
    # the month values exactly as they were combined)
    combined_table['month'] = pd.to_datetime(combined_table['month'], format='%Y-%m')
    
    # Active Caseloads by Age Group
    active_pivot = combined_table.pivot(index='month', columns='age', values='active_caseloads')
    # fig1, ax1 = plt.subplots()
    # active_pivot.plot(ax=ax1)
    # ax1.set_title('Active Caseloads by Age Group Over Time')
    # ax1.set_xlabel('Month')
    # ax1.set_ylabel('Active Caseloads')
    # plt.tight_layout()
    # plt.show()
    
    # # Termination Rate by Age Group
    # term_pivot = combined_table.pivot(index='month', columns='age', values='termination_rate')
    # fig2, ax2 = plt.subplots()
    # term_pivot.plot(ax=ax2)
    # ax2.set_title('Termination Rate by Age Group Over Time')
    # ax2.set_xlabel('Month')
    # ax2.set_ylabel('Termination Rate')
    # plt.tight_layout()
    # plt.show()
    
    # # New Deputyships by Age Group
    # new_pivot = combined_table.pivot(index='month', columns='age', values='new_deputyships')
    # fig3, ax3 = plt.subplots()
    # new_pivot.plot(ax=ax3)
    # ax3.set_title('New Deputyships by Age Group Over Time')
    # ax3.set_xlabel('Month')
    # ax3.set_ylabel('New Deputyships')
    # plt.tight_layout()
    # plt.show()


In [None]:
# Show the last rows of the combined table (call the method; without the
# parentheses the cell only displays the bound method object).
combined_table.tail()

In [None]:




# Finish the stop-flow figure (`fig`, `ax`) created in the forecasting cell above.
# Imported here because this cell runs before the later cells that import
# these names; without it a fresh-kernel run fails with NameError.
from matplotlib.dates import MonthLocator, DateFormatter

# x-axis: every 2 months, formatted "Apr 25"
ax.xaxis.set_major_locator(MonthLocator(interval=2))
ax.xaxis.set_major_formatter(DateFormatter('%b %y'))
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

ax.set_title('Stop‐Flow Active Caseload Forecast by Age', fontsize=14, fontweight='bold')
ax.set_xlabel('Month', fontsize=12, fontweight='bold')
ax.set_ylabel('Active Caseloads (forecast)', fontsize=12, fontweight='bold')

# Legend outside
ax.legend(
    title='Age Group',
    bbox_to_anchor=(1.02, 1),
    loc='upper left',
    frameon=False
)

ax.grid(True, linestyle='--', alpha=0.4)
plt.tight_layout()
plt.show()


In [None]:
from matplotlib.ticker import MultipleLocator
from matplotlib.ticker import MaxNLocator

# Define exactly which ages to show
ages_to_plot = [10, 20, 30, 40, 50, 60, 70]

# Filter your combined table
filtered = combined_table[combined_table['age'].isin(ages_to_plot)]

# Pivot so that age is the index (x-axis) and every month is its own series
active_age_pivot = filtered.pivot(
    index='age',
    columns='month',
    values='active_caseloads'
)

# One colour per month, sampled evenly from the colormap
months = active_age_pivot.columns
cmap = plt.get_cmap('tab20')                     # 20 distinct colors
colors = cmap(np.linspace(0, 1, len(months)))     # pick evenly spaced

# Plot
fig, ax = plt.subplots(figsize=(10, 5))
for (month, series), color in zip(active_age_pivot.items(), colors):
    ax.plot(
        active_age_pivot.index,
        series,
        marker='o',
        alpha=0.85,
        label=month.strftime('%b %y'),
        color=color,
        linewidth=2
    )

# X-axis: ticks at exactly the selected ages. The MaxNLocator that used to be
# installed right after this call replaced these fixed ticks, so it is gone.
ax.set_xticks(ages_to_plot)

# Labels & title
ax.set_xlabel('Age (years)', fontsize=12, fontweight='bold')
ax.set_ylabel('Active Caseloads', fontsize=12, fontweight='bold')
ax.set_title('Active Caseloads by Selected Age Groups', fontsize=14)

# Legend outside
ax.legend(
    bbox_to_anchor=(1.02, 1),
    loc='upper left',
    title='Month',
    fontsize=10,
    title_fontsize=11,
    frameon=False   # remove legend border for a cleaner look
)

plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()



In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.dates import MonthLocator, DateFormatter

# Time-series view: active caseloads per month, one line per selected age.
# Reads `combined_table` from the kernel; its 'month' values are passed through
# pd.to_datetime below before being used as the x-axis.

# Define exactly which ages to plot
ages_to_plot = [10, 20, 30, 40, 50, 60, 70]

# Filter your combined table (copy so the slice is independent of combined_table)
filtered = combined_table[combined_table['age'].isin(ages_to_plot)].copy()

# Pivot so that each age group is its own series, with month on the index
active_month_pivot = filtered.pivot(
    index='month',
    columns='age',
    values='active_caseloads'
)

# Ensure the index is datetime for proper formatting
active_month_pivot.index = pd.to_datetime(active_month_pivot.index)

# Choose a qualitative colormap with distinct colors for each age
# NOTE(review): `ages` is rebound here to the pivot's column labels; the
# commented-out code in the forecasting cell uses the same name for the
# historical DataFrame - confirm nothing relies on that after this cell runs.
ages = active_month_pivot.columns.astype(int)
cmap = plt.get_cmap('tab10')
colors = cmap(np.linspace(0, 1, len(ages)))

# Plot: one line per age, coloured in column order
fig, ax = plt.subplots(figsize=(10, 6))
for age, color in zip(ages, colors):
    ax.plot(
        active_month_pivot.index,
        active_month_pivot[age],
        marker='o',
        alpha=0.85,
        label=f"{age} yrs",
        color=color,
        linewidth=2
    )

# X-axis formatting: every 2 months, labels like "Apr 25"
ax.xaxis.set_major_locator(MonthLocator(interval=2))
ax.xaxis.set_major_formatter(DateFormatter('%b %y'))
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Labels & title
ax.set_xlabel('Month', fontsize=12, fontweight='bold')
ax.set_ylabel('Active Caseloads', fontsize=12, fontweight='bold')
ax.set_title('Active Caseloads Forecast by Age Group', fontsize=14)

# Legend outside, keyed by age
ax.legend(
    bbox_to_anchor=(1.02, 1),
    loc='upper left',
    title='Age Group',
    fontsize=10,
    title_fontsize=11,
    frameon=False
)

# Grid and layout
ax.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()

plt.show()


In [None]:
# Active Caseloads across Age Groups for each month (Improved Representation)
# Age is on the x-axis; every month in combined_table becomes one line.
from matplotlib.ticker import MaxNLocator
active_age_pivot = combined_table.pivot(index='age', columns='month', values='active_caseloads')
fig1, ax1 = plt.subplots(figsize=(10, 6))
# Plot each month's series with transparency and thicker lines
# (mo.strftime requires the 'month' values to be datetime-like)
for mo in active_age_pivot.columns:
    ax1.plot(
        active_age_pivot.index,
        active_age_pivot[mo],
        label=mo.strftime('%b %y'),
        alpha=0.7,
        linewidth=2
    )
# Titles and labels with larger fonts
ax1.set_title('Active Caseloads by Age Group', fontsize=18, fontweight='bold')
ax1.set_xlabel('Age Group', fontsize=14)
ax1.set_ylabel('Active Caseloads', fontsize=14)
# Tidy grid for readability
ax1.grid(True, linestyle='--', alpha=0.3)
# X-axis integer ticks only
ax1.xaxis.set_major_locator(MaxNLocator(integer=True))
# Legend outside
ax1.legend(
    bbox_to_anchor=(1.02, 1),
    loc='upper left',
    title='Month',
    fontsize=10,
    title_fontsize=12
)
# Leave the right 15% of the figure free for the outside legend
plt.tight_layout(rect=[0, 0, 0.85, 1])
# Save with a transparent background; written before plt.show()
plt.savefig(
    'output/Active_Caseloads_by_Age_Group.png',
    dpi=150,
    transparent=True
)
plt.show()


In [None]:
# Pivot so age is x-axis and month series are separate lines
# Active Caseloads across Age Groups for each month
active_age_pivot = combined_table.pivot(index='age', columns='month', values='active_caseloads')
fig1, ax1 = plt.subplots()
active_age_pivot.plot(ax=ax1)
ax1.set_title('Active Caseloads by Age Group')
ax1.set_xlabel('Age Group')
ax1.set_ylabel('Active Caseloads')
# Legend entries: one per month, formatted 'Apr 25'
months = active_age_pivot.columns
labels = [dt.strftime('%b %y') for dt in months]
# Move legend just outside the axes, same placement as the two plots below
# (the anchor was 2.02, a full axes width away from the plot)
ax1.legend(labels=labels, bbox_to_anchor=(1.02, 1), loc='upper left', title='Month')
# rect is (left, bottom, right, top) as fractions of the figure; the previous
# [30,30,25,30] was outside that range. Reserve the right 25% for the legend.
plt.tight_layout(rect=[0,0,0.75,1])
plt.savefig("output/Active Caseloads by Age Group.png")
plt.show()

# Termination Rate by Age Group for each month
term_age_pivot = combined_table.pivot(index='age', columns='month', values='termination_rate')
fig2, ax2 = plt.subplots()
term_age_pivot.plot(ax=ax2)
ax2.set_title('Termination Rate by Age Group')
ax2.set_xlabel('Age Group')
ax2.set_ylabel('Termination Rate')
labels = [dt.strftime('%b %y') for dt in term_age_pivot.columns]
ax2.legend(labels=labels, bbox_to_anchor=(1.02, 1), loc='upper left', title='Month')
plt.tight_layout(rect=[0,0,0.75,1])
plt.show()

# New Deputyships by Age Group for each month
new_age_pivot = combined_table.pivot(index='age', columns='month', values='new_deputyships')
fig3, ax3 = plt.subplots()
new_age_pivot.plot(ax=ax3)
ax3.set_title('New Deputyships by Age Group')
ax3.set_xlabel('Age Group')
ax3.set_ylabel('New Deputyships')
labels = [dt.strftime('%b %y') for dt in new_age_pivot.columns]
ax3.legend(labels=labels, bbox_to_anchor=(1.02, 1), loc='upper left', title='Month')
plt.tight_layout(rect=[0,0,0.75,1])
plt.show()


In [None]:


from matplotlib.dates import DateFormatter, MonthLocator

# "Over Time" plots: month on the x-axis, one line per age.
# The pivots previously had age on the index, so the month tick locator and
# formatter below were being applied to an age axis.
def _plot_metric_over_time(pivot, title, y_label):
    """Draw one line per column (age) of ``pivot`` against its datetime index.

    Returns the (figure, axes) pair after showing the plot.
    """
    fig, ax = plt.subplots()
    # ax.plot keeps the x-axis in matplotlib date units, which is what
    # MonthLocator / DateFormatter operate on.
    for age in pivot.columns:
        ax.plot(pivot.index, pivot[age], label=str(age))
    ax.set_title(title)
    ax.set_xlabel('Month')
    ax.set_ylabel(y_label)
    # Show tick every 2 months, formatted as 'Apr 25'
    ax.xaxis.set_major_locator(MonthLocator(interval=2))
    ax.xaxis.set_major_formatter(DateFormatter('%b %y'))
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    # Move legend outside
    ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', title='Age')
    # rect is (left, bottom, right, top) as figure fractions; keep the right
    # 15% free for the legend (the first plot used an out-of-range rect).
    plt.tight_layout(rect=[0, 0, 0.85, 1])
    plt.show()
    return fig, ax


# Plot Active Caseloads by Age Group
active_pivot = combined_table.pivot(index='month', columns='age', values='active_caseloads')
fig1, ax1 = _plot_metric_over_time(
    active_pivot, 'Active Caseloads by Age Group Over Time', 'Active Caseloads'
)

# Plot Termination Rate by Age Group
term_pivot = combined_table.pivot(index='month', columns='age', values='termination_rate')
fig2, ax2 = _plot_metric_over_time(
    term_pivot, 'Termination Rate by Age Group Over Time', 'Termination Rate'
)

# Plot New Deputyships by Age Group
new_pivot = combined_table.pivot(index='month', columns='age', values='new_deputyships')
fig3, ax3 = _plot_metric_over_time(
    new_pivot, 'New Deputyships by Age Group Over Time', 'New Deputyships'
)

