In [None]:
%%capture
import os
from pathlib import Path

import pandas as pd
from dj_notebook import activate

# Runtime configuration is read from environment variables; a missing variable
# raises KeyError here instead of failing later in the notebook.
env_file = os.environ["META_ENV"]
reports_folder = Path(os.environ["META_REPORTS_FOLDER"])
analysis_folder = Path(os.environ["META_ANALYSIS_FOLDER"])
pharmacy_folder = Path(os.environ["META_PHARMACY_FOLDER"])
# NOTE(review): presumably initialises the Django project described by the
# dotenv file so the project imports in later cells work — confirm against
# the dj_notebook documentation.
plus = activate(dotenv_file=env_file)
# Enables the pandas "future.no_silent_downcasting" option for this session.
pd.set_option("future.no_silent_downcasting", True)

In [None]:
from datetime import date

import pdfkit
from edc_pdutils.dataframes import get_subject_visit
from great_tables import md
from scipy.stats import chi2

from meta_analytics.dataframes import GlucoseEndpointsByDate, get_eos_df
from meta_analytics.utils import df_as_great_table

In [None]:
# Accumulates the raw HTML of each rendered table for the final PDF export.
html_data: list = list()
# Visits with an appointment after this date are left out of the analysis.
cutoff_date = date(year=2025, month=3, day=31)

In [None]:
# Load the subject visits and keep only rows whose appointment date falls on
# or before the analysis cutoff date.
df_visit = get_subject_visit("meta_subject.subjectvisit").loc[
    lambda visits: visits["appt_datetime"].dt.date <= cutoff_date
]

In [None]:
# Build the glucose endpoint dataset.
# NOTE(review): `cls` holds an instance, not a class; `run()` presumably
# populates `endpoint_only_df` — confirm against GlucoseEndpointsByDate.
cls = GlucoseEndpointsByDate()
cls.run()
# Take a copy so later cells work on their own frame rather than the one
# held by the instance.
df_endpoint = cls.endpoint_only_df.copy()

In [None]:
def get_df_main(
    df_visit: pd.DataFrame,
    lower_days: float | None = None,
    upper_days: float | None = None,
    offstudy_reasons: list[str] | None = None,
):
    """Assemble per-subject follow-up and endpoint data for a window of days.

    Visits with ``lower_days < followup_days <= upper_days`` and a reason other
    than ``'missed'`` are kept, subjects in the end-of-study exclusion set are
    dropped, and the per-subject maximum of ``baseline_datetime``,
    ``visit_datetime`` and ``followup_days`` is taken. Endpoint columns are then
    left-joined from the module-level ``df_endpoint`` frame, using only rows
    with ``days_to_endpoint > lower_days``.

    Args:
        df_visit: visit-level frame; must provide ``subject_identifier``,
            ``followup_days``, ``reason``, ``baseline_datetime`` and
            ``visit_datetime``.
        lower_days: exclusive lower bound of the window. ``None`` and ``0``
            are both replaced by ``-1`` so that day-0 visits are included.
        upper_days: inclusive upper bound of the window. ``None`` means no
            upper bound.
        offstudy_reasons: off-study reasons that exclude a subject. Defaults
            to the single late-exclusion reason used by this report.

    Returns:
        A tuple ``(df_main, rows, events)`` where ``rows`` is ``len(df_main)``
        and ``events`` is the number of rows with a non-null ``endpoint_label``
        and ``lower_days < days_to_endpoint <= upper_days``.
    """
    if not lower_days:
        lower_days = -1
    if upper_days is None:
        # The declared default must not break the comparisons below.
        upper_days = float("inf")
    if offstudy_reasons is None:
        # Restored from a commented-out assignment: the query below references
        # @offstudy_reasons and failed with an undefined-variable error.
        # NOTE(review): the text (including the unbalanced parenthesis) is
        # copied verbatim and presumably has to match the stored value exactly.
        offstudy_reasons = [
            "Patient fulfilled late exclusion criteria (due to abnormal blood "
            "values or raised blood pressure at enrolment"
        ]

    # NOTE(review): the first comparison is `followup_days < lower_days`, i.e.
    # it selects subjects who went off study *before* the window starts —
    # confirm this is intended rather than a window test.
    df_eos = get_eos_df()
    df_eos_excluded = (
        df_eos.query(
            "followup_days<@lower_days and followup_days<=@upper_days and offstudy_reason.isin(@offstudy_reasons)"
        )
        .copy()
        .reset_index()
    )
    # Anti-join: keep only visits whose subject is not in the excluded set.
    df_visit_final = (
        df_visit.query("@lower_days<followup_days<=@upper_days and reason!='missed'")
        .merge(
            df_eos_excluded[["subject_identifier"]],
            on="subject_identifier",
            how="left",
            suffixes=("", "_y"),
            indicator=True,
        )
        .query("_merge=='left_only'")
        .drop(columns=["_merge"])
    )
    df_main = (
        df_visit_final.groupby(by=["subject_identifier"])[
            ["baseline_datetime", "visit_datetime", "followup_days"]
        ]
        .max()
        .reset_index()
    )

    df_main = df_main.merge(
        df_endpoint.query("days_to_endpoint>@lower_days")[
            ["subject_identifier", "endpoint_label", "endpoint_type", "days_to_endpoint"]
        ],
        how="left",
        on=["subject_identifier"],
    ).reset_index(drop=True)
    if lower_days >= 365.25:
        # For windows starting at one year or later, count follow-up from the
        # start of the window rather than from baseline.
        df_main["followup_days"] = df_main["followup_days"] - lower_days
    df_main["followup_years"] = df_main["followup_days"] / 365.25
    return (
        df_main,
        len(df_main),
        len(
            df_main.query(
                "@lower_days<days_to_endpoint<=@upper_days and endpoint_label.notna()"
            )
        ),
    )


def get_rate_and_ci(events, person_years_total):
    """Return the incidence rate per 1000 person-years with a 95% interval.

    The interval limits are taken from chi-square quantiles with ``2 * events``
    (lower) and ``2 * (events + 1)`` (upper) degrees of freedom.

    Args:
        events: number of observed events (non-negative).
        person_years_total: total person-years at risk.

    Returns:
        A tuple ``(rate, lower_ci, upper_ci)``, each per 1000 person-years.
        All three are NaN when ``person_years_total`` is not positive, since
        no rate can be computed without time at risk.
    """
    if not person_years_total > 0:
        # Covers zero, negative and NaN denominators without dividing by zero.
        nan = float("nan")
        return nan, nan, nan
    if events > 0:
        lower_ci = (chi2.ppf(0.025, 2 * events) / (2 * person_years_total)) * 1000
    else:
        # chi2.ppf is undefined for 0 degrees of freedom (returns NaN); the
        # lower limit for zero observed events is 0.
        lower_ci = 0.0
    upper_ci = (chi2.ppf(0.975, 2 * (events + 1)) / (2 * person_years_total)) * 1000
    return events / person_years_total * 1000, lower_ci, upper_ci


def get_incidence_data(term: str, lower_days: float, upper_days: float):
    """Summarise one follow-up window as a single-entry dict keyed by ``term``.

    The value is a list of the form
    ``[person_years, rows, events, rate, lower_ci, upper_ci]``, built from the
    module-level ``df_visit`` frame for the window
    ``lower_days``..``upper_days``.
    """
    window_df, n_rows, n_events = get_df_main(
        df_visit, lower_days=lower_days, upper_days=upper_days
    )
    total_person_years = window_df.followup_years.sum()
    summary = [total_person_years, n_rows, n_events]
    summary.extend(get_rate_and_ci(n_events, total_person_years))
    return {term: summary}

In [None]:
# One summary entry per follow-up window, in display order. Bounds are in days:
# (label, lower bound, upper bound).
incidence_data = {
    term: summary
    for label, lower, upper in (
        ("total", 0, 10000),
        ("0-1 years", 0, 365.25),
        ("1-2 years", 365.25, 2 * 365.25),
        ("2-3 years", 2 * 365.25, 3 * 365.25),
        ("3+ years", 3 * 365.25, 10 * 365.25),
    )
    for term, summary in get_incidence_data(
        label, lower_days=lower, upper_days=upper
    ).items()
}

In [None]:
# Pivot the per-window summary lists into columns. Each numeric column is
# paired with its position in the summary list; position 1 (the row count)
# is not shown in the table.
data = {
    "label": list(incidence_data),
    **{
        column: [summary[position] for summary in incidence_data.values()]
        for column, position in (
            ("person_years", 0),
            ("failures", 2),
            ("rate", 3),
            ("lower_ci", 4),
            ("upper_ci", 5),
        )
    },
}

df = pd.DataFrame(data=data)

In [None]:
# Columns that are formatted as numbers and centre-aligned.
numeric_columns = ["person_years", "failures", "rate", "lower_ci", "upper_ci"]
# Display label for each dataframe column.
column_labels = {
    "label": "Label",
    "person_years": "Person years",
    "failures": "Failures",
    "rate": "Rate",
    "lower_ci": "Lower",
    "upper_ci": "Upper",
}

gt = df_as_great_table(
    df,
    title="Table 9: Incident Rate per 1000 person years",
    subtitle=md("using randomisation to diabetes/last seen"),
)
gt = (
    gt.fmt_number(columns=numeric_columns, decimals=2)
    .cols_label(column_labels)
    .cols_align(align="left", columns=["label"])
    .cols_align(align="center", columns=numeric_columns)
    .tab_spanner(label="95%CI", columns=["lower_ci", "upper_ci"])
)
gt.show()
# Keep the rendered HTML so the table can be written to the PDF report.
html_data.append(gt.as_raw_html())

In [None]:
# Separate the collected tables with a line break. `<br/>` replaces the
# previous `</BR>`: an end tag on a void element is invalid HTML and only
# rendered as a break through parser error recovery.
raw_html = "<br/>".join(html_data)
# Wrap the fragments in a minimal UTF-8 HTML document for the PDF converter.
raw_html = (
    '<!DOCTYPE html>\n<html lang="en">\n<head>\n<meta charset="utf-8"/>\n</head>\n<body>\n'
    + raw_html
    + "\n</body>\n</html>\n"
)
pdfkit.from_string(raw_html, str(analysis_folder / "incident_rate.pdf"))