In [47]:
%%capture
# uv sync --group develop --group lint --group test --group notebook
import os
from pathlib import Path

import numpy as np
import pandas as pd
from dj_notebook import activate
from django_pandas.io import read_frame

env_file = os.environ["META_ENV"]
reports_folder = Path(os.environ["META_REPORTS_FOLDER"])
analysis_folder = Path(os.environ["META_ANALYSIS_FOLDER"])
pharmacy_folder = Path(os.environ["META_PHARMACY_FOLDER"])
plus = activate(dotenv_file=env_file)
pd.set_option('future.no_silent_downcasting', True)

In [48]:
from collections.abc import Callable
from datetime import date

import pdfkit
from edc_appointment.analytics import get_appointment_df
from edc_appointment.constants import (  # noqa
    CANCELLED_APPT,
    MISSED_APPT,
    NEW_APPT,
    ONTIME_APPT,
    SCHEDULED_APPT,
    UNSCHEDULED_APPT,
)
from edc_constants.constants import YES  # noqa
from edc_pdutils.dataframes import get_subject_visit
from edc_visit_schedule.models import SubjectScheduleHistory
from great_tables import loc, md, style
from scipy.stats import chi2

from meta_analytics.dataframes import (
    GlucoseEndpointsByDate,
    get_eos_df,
    get_glucose_df,
    get_screening_df,
)
from meta_analytics.utils import df_as_great_table, df_as_great_table2
from meta_consent.models import SubjectConsentV1Ext
from meta_prn.models import LossToFollowup
from meta_visit_schedule.constants import (
    MONTH15,
    MONTH18,
    MONTH21,
    MONTH27,
    MONTH30,
    MONTH33,
    MONTH39,
)

In [49]:
html_data = []
data_download_date = date(2025, 9, 6)
cutoff_date = date(2025, 9, 6)
end_of_trial_date = date(2026, 5, 31)
document_title = f"<h2>Monitoring Report: {cutoff_date.strftime('%B %Y')}</h2><h5>Data Download: {data_download_date.strftime('%d %B %Y')} using cutoff date of {cutoff_date.strftime('%d %B %Y')}</h5>"
study_title = 'META3 - Metformin treatment for diabetes prevention in Africa'
pdf_filename = f"monitoring_report_{cutoff_date.strftime('%Y%m%d')}.pdf"


In [50]:
# 105-30-0288-5 should also be late excluded based on the haemoglobin 4.8 presented at baseline

df_visit = get_subject_visit("meta_subject.subjectvisit")
df_visit_1691 = df_visit.copy()

late_exlusion_offstudy_reasons = [
    'Patient fulfilled late exclusion criteria (due to abnormal blood values or raised blood pressure at enrolment']
df_eos = get_eos_df()
df_eos_1691 = df_eos.copy()
df_eos_excluded = (
    df_eos
    .query("offstudy_reason.isin(@late_exlusion_offstudy_reasons)")
    .copy()
    .reset_index()
)
df_visit = (
    df_visit
    .merge(df_eos_excluded[["subject_identifier", "offstudy_datetime", "offstudy_reason"]], on="subject_identifier",
           how="left", indicator=True)
    .query("_merge=='left_only'")
    .drop(columns=["_merge"])
)

df_visit = df_visit[df_visit.appt_datetime.dt.date <= cutoff_date]

df_appointments = get_appointment_df()
df_appointments["site_id"] = df_appointments.site_id.astype(str)
df_appointments_1691 = df_appointments.copy()
df_appointments = (
    df_appointments
    .merge(df_eos_excluded[["subject_identifier", "offstudy_datetime", "offstudy_reason"]], on="subject_identifier",
           how="left", indicator=True)
    .query("_merge=='left_only'")
    .drop(columns=["_merge"])
)

cls = GlucoseEndpointsByDate()
cls.run()
df_endpoint = cls.endpoint_only_df.copy()
df_glucose = get_glucose_df()
# df_glucose_fbg = get_glucose_fbg_df()
# df_glucose = pd.concat([df_glucose, df_glucose_fbg])


enrolled = df_visit.copy()
enrolled["site_id"] = enrolled["site_id"].astype(str)
enrolled_pivot = (
    enrolled
    .query("visit_code==1000.0").groupby(["site_id"])
    .size()
    .reset_index()
    .pivot_table(columns="site_id", values=0, observed=True)
)
enrolled_pivot.columns.name = ""
enrolled_pivot["total"] = enrolled_pivot[["10", "20", "30", "40", "60"]].sum(axis=1)

In [51]:
# before late exclusion
df_visit_orig = df_visit_1691[df_visit_1691.appt_datetime.dt.date <= cutoff_date]
enrolled_1691 = df_visit_1691.copy()
enrolled_1691["site_id"] = enrolled_1691["site_id"].astype(str)
enrolled_1691_pivot = (
    enrolled_1691
    .query("visit_code==1000.0").groupby(["site_id"])
    .size()
    .reset_index()
    .pivot_table(columns="site_id", values=0, observed=True)
)
enrolled_1691_pivot.columns.name = ""
enrolled_1691_pivot["total"] = enrolled_1691_pivot[["10", "20", "30", "40", "60"]].sum(axis=1)

# df_eos_1691
# df_appointments_1691



In [52]:
column_headers = {"label": "Label", "visit_code": "Visit code", "10": "Hindu Mandal", "20": "Amana", "30": "Temeke",
                  "40": "Mwananyamala", "60": "Mnazi Moja", "total": "Total"}
column_headers_with_str = {"label": "Label", "10_str": "Hindu Mandal", "20_str": "Amana", "30_str": "Temeke",
                           "40_str": "Mwananyamala", "60_str": "Mnazi Moja", "total_str": "Total"}

In [53]:
# Table 1a Visits completed to date

df_tbl1 = df_visit[(df_visit.visit_code_sequence == 0) & (df_visit.appt_timing == ONTIME_APPT) & ~(
    df_visit.appt_status.isin([NEW_APPT, CANCELLED_APPT]))].groupby(
    by=["visit_code", "site_id"]).size().to_frame().reset_index()

df_tbl1.columns = ["visit_code", "site_id", "visits"]
df1 = df_tbl1.pivot(index="visit_code", columns="site_id", values="visits").reset_index()
df1.columns.name = None
df1.columns = ['visit_code', "10", "20", "30", "40", "60"]
df1['total'] = df1[['10', '20', '30', '40', '60']].sum(axis=1)
df1.fillna(0, inplace=True)
df_attended = df1.copy().reset_index(drop=True)
df_attended = df_attended.fillna(0.0)

In [54]:
gt = df_as_great_table(
    df_attended[["visit_code", "10", "20", "30", "40", "60", "total"]],
    title="Table 1a: Visits completed to date"
)
gt = (
    gt
    .cols_label({k: v for k, v in column_headers.items() if k != "label"})
    .cols_align(align="center", columns=["10", "20", "30", "40", "60", "total"])
    .cols_align(align="left", columns=["visit_code"])
    .data_color(
        columns=["visit_code"],
        palette=["lavender", "thistle"],
        domain=[2000, 5000],
        na_color="white"
    )
    .tab_source_note(
        source_note=(
            "Excludes visit reports submitted for participants "
            "eventually withdrawn on late exclusion criteria."
        )
    )
)
html_data.append(gt.as_raw_html())
gt.show()

Table 1a: Visits completed to date,Table 1a: Visits completed to date,Table 1a: Visits completed to date,Table 1a: Visits completed to date,Table 1a: Visits completed to date,Table 1a: Visits completed to date,Table 1a: Visits completed to date
Visit code,Hindu Mandal,Amana,Temeke,Mwananyamala,Mnazi Moja,Total
1000.0,177.0,374.0,328.0,527.0,224.0,1630.0
1005.0,157.0,355.0,304.0,476.0,213.0,1505.0
1010.0,170.0,358.0,304.0,490.0,212.0,1534.0
1030.0,170.0,358.0,297.0,472.0,212.0,1509.0
1060.0,169.0,353.0,285.0,460.0,211.0,1478.0
1090.0,169.0,337.0,269.0,436.0,204.0,1415.0
1120.0,160.0,341.0,259.0,432.0,199.0,1391.0
1150.0,161.0,307.0,242.0,415.0,181.0,1306.0
1180.0,157.0,311.0,238.0,414.0,178.0,1298.0
1210.0,148.0,287.0,223.0,396.0,172.0,1226.0


In [55]:
# Table 1b Total scheduled appointments
df_appt_pivot = (
    df_appointments.query("appt_reason==@SCHEDULED_APPT")
    .merge(df_eos_excluded[["subject_identifier"]], on="subject_identifier", how="left", indicator=True)
    .query("_merge=='left_only'")
    .drop(columns=["_merge"])
    .reset_index(drop=True)
    .groupby(["visit_code", "site_id"])
    .size()
    .to_frame()
    .reset_index()
    .pivot(index="visit_code", columns="site_id", values=0)
    .reset_index()
    .fillna(0)
)

df_appt_pivot["total"] = df_appt_pivot.iloc[:, 1:].sum(axis=1)
df_appt_pivot.columns.name = None
gt = df_as_great_table(
    df_appt_pivot,
    title="Table 1b: Total appointments",
    subtitle="Total possible appointments not including unscheduled appointments"

)
gt = (
    gt
    .cols_label({k: v for k, v in column_headers.items() if k != "label"})
    .cols_align(align="center", columns=["10", "20", "30", "40", "60", "total"])
    .cols_align(align="left", columns=["visit_code"])
    .data_color(
        columns=["visit_code"],
        palette=["lavender", "thistle"],
        domain=[2000, 5000],
        na_color="white"
    )
)
html_data.append(gt.as_raw_html())
gt.show()

Table 1b: Total appointments,Table 1b: Total appointments,Table 1b: Total appointments,Table 1b: Total appointments,Table 1b: Total appointments,Table 1b: Total appointments,Table 1b: Total appointments
Total possible appointments not including unscheduled appointments,Total possible appointments not including unscheduled appointments,Total possible appointments not including unscheduled appointments,Total possible appointments not including unscheduled appointments,Total possible appointments not including unscheduled appointments,Total possible appointments not including unscheduled appointments,Total possible appointments not including unscheduled appointments
Visit code,Hindu Mandal,Amana,Temeke,Mwananyamala,Mnazi Moja,Total
1000.0,177,374,328,527,224,1630
1005.0,177,374,328,527,224,1630
1010.0,177,372,327,527,223,1626
1030.0,176,372,325,526,223,1622
1060.0,176,366,324,523,220,1609
1090.0,171,364,322,522,215,1594
1120.0,169,363,321,513,210,1576
1150.0,164,355,319,505,200,1543
1180.0,162,352,318,501,196,1529
1210.0,160,350,316,494,193,1513


In [56]:
# Table 1c Past scheduled appointments -- no information provided
df_appt_pivot = (
    df_appointments.query(
        "appt_datetime<@cutoff_date and appt_reason==@SCHEDULED_APPT and appt_timing==@ONTIME_APPT and appt_status.isin([@NEW_APPT])")
    .merge(df_eos_excluded[["subject_identifier"]], on="subject_identifier", how="left", indicator=True)
    .query("_merge=='left_only'")
    .drop(columns=["_merge"])
    .reset_index(drop=True)
    .groupby(["visit_code", "site_id"])
    .size()
    .to_frame()
    .reset_index()
    .pivot(index="visit_code", columns="site_id", values=0)
    .reset_index()
    .fillna(0)
)
df_appt_pivot["total"] = df_appt_pivot.iloc[:, 1:].sum(axis=1)
df_appt_pivot.columns.name = None
gt = df_as_great_table(
    df_appt_pivot,
    title="Table 1c: Past appointments not attended/not reported",
    subtitle="Expected by now but no information provided by site",
)
gt = (
    gt
    .cols_label({k: v for k, v in column_headers.items() if k != "label"})
    .cols_align(align="center", columns=["10", "20", "30", "40", "60", "total"])
    .cols_align(align="left", columns=["visit_code"])
    .data_color(
        columns=["visit_code"],
        palette=["lavender", "thistle"],
        domain=[2000, 5000],
        na_color="white"
    )
    .tab_source_note(source_note=f"Scheduled appointment date is before {cutoff_date.strftime('%d %B %Y')}.")
)
html_data.append(gt.as_raw_html())
gt.show()

Table 1c: Past appointments not attended/not reported,Table 1c: Past appointments not attended/not reported,Table 1c: Past appointments not attended/not reported,Table 1c: Past appointments not attended/not reported,Table 1c: Past appointments not attended/not reported,Table 1c: Past appointments not attended/not reported,Table 1c: Past appointments not attended/not reported
Expected by now but no information provided by site,Expected by now but no information provided by site,Expected by now but no information provided by site,Expected by now but no information provided by site,Expected by now but no information provided by site,Expected by now but no information provided by site,Expected by now but no information provided by site
Visit code,Hindu Mandal,Amana,Temeke,Mwananyamala,Mnazi Moja,Total
1005.0,0.0,1.0,1.0,0.0,0.0,2.0
1010.0,0.0,2.0,1.0,1.0,0.0,4.0
1030.0,1.0,6.0,1.0,2.0,3.0,13.0
1060.0,4.0,5.0,0.0,1.0,6.0,16.0
1090.0,1.0,6.0,0.0,5.0,6.0,18.0
1120.0,1.0,12.0,2.0,0.0,7.0,22.0
1150.0,2.0,16.0,5.0,3.0,10.0,36.0
1180.0,2.0,22.0,20.0,5.0,9.0,58.0
1210.0,4.0,31.0,37.0,10.0,10.0,92.0
1240.0,2.0,29.0,43.0,10.0,20.0,104.0


In [57]:
# Table 1d Unscheduled appointments
df_appt = (
    df_appointments.query("appt_reason==@UNSCHEDULED_APPT and appt_timing==@ONTIME_APPT and appt_status!=@NEW_APPT")
    .merge(df_eos_excluded[["subject_identifier"]], on="subject_identifier", how="left", indicator=True)
    .query("_merge=='left_only'")
    .drop(columns=["_merge"])
    .reset_index(drop=True)
    .copy()
    .reset_index(drop=True)
)
df_appt['visit_code'] = df_appt['visit_code'].astype(int)
df_appt['visit_code'] = df_appt['visit_code'].astype(str)

subjects_with_unscheduled = df_appt.subject_identifier.nunique()

df_appt_pivot = (
    df_appt
    .groupby(["visit_code", "site_id"])
    .size()
    .to_frame()
    .reset_index()
    .pivot(index="visit_code", columns="site_id", values=0)
    .reset_index()
    .fillna(0)
)
df_appt_pivot["total"] = df_appt_pivot.iloc[:, 1:].sum(axis=1)
df_appt_pivot.columns.name = None
df_appt_pivot[["10", "20", "30", "40", "60", "total"]] = df_appt_pivot[["10", "20", "30", "40", "60", "total"]].astype(
    'float64')

# add totals row
sum_row = df_appt_pivot.select_dtypes(include='float64').sum()
sum_row['visit_code'] = 'Total'
sum_row_df = pd.DataFrame(sum_row).T
df_appt_pivot = pd.concat([df_appt_pivot, sum_row_df], axis=0).reset_index(drop=True)

gt = df_as_great_table(
    df_appt_pivot,
    title="Table 1d: Unscheduled appointments",
    subtitle="Appointments with sequence>0 grouped by visit code",
)
gt = (
    gt
    .cols_label({k: v for k, v in column_headers.items() if k != "label"})
    .cols_align(align="center", columns=["10", "20", "30", "40", "60", "total"])
    .cols_align(align="left", columns=["visit_code"])
    .data_color(
        columns=["visit_code"],
        palette=["lavender", "thistle"],
        domain=[2000, 5000],
        na_color="white"
    )
    .fmt_number(columns=["10", "20", "30", "40", "60", "total"], decimals=0)
    .tab_source_note(source_note=f"{subjects_with_unscheduled} participants had at least one unscheduled appointment.")
)
html_data.append(gt.as_raw_html())
gt.show()

Table 1d: Unscheduled appointments,Table 1d: Unscheduled appointments,Table 1d: Unscheduled appointments,Table 1d: Unscheduled appointments,Table 1d: Unscheduled appointments,Table 1d: Unscheduled appointments,Table 1d: Unscheduled appointments
Appointments with sequence>0 grouped by visit code,Appointments with sequence>0 grouped by visit code,Appointments with sequence>0 grouped by visit code,Appointments with sequence>0 grouped by visit code,Appointments with sequence>0 grouped by visit code,Appointments with sequence>0 grouped by visit code,Appointments with sequence>0 grouped by visit code
Visit code,Hindu Mandal,Amana,Temeke,Mwananyamala,Mnazi Moja,Total
1000,1,11,2,1,0,15
1005,0,1,2,3,1,7
1010,6,5,2,0,1,14
1030,9,13,9,12,1,44
1060,5,16,13,8,3,45
1090,6,7,9,8,2,32
1120,6,25,7,7,1,46
1150,1,9,4,6,0,20
1180,10,15,3,11,5,44
1210,3,4,1,4,1,13


In [58]:
# Table 1e Future scheduled appointments
df_appt_pivot = (
    df_appointments.query(
        "@cutoff_date<=appt_datetime<@end_of_trial_date and appt_reason==@SCHEDULED_APPT and appt_timing==@ONTIME_APPT and appt_status.isin([@NEW_APPT])")
    .merge(df_eos_excluded[["subject_identifier"]], on="subject_identifier", how="left", indicator=True)
    .query("_merge=='left_only'")
    .drop(columns=["_merge"])
    .reset_index(drop=True)
    .groupby(["visit_code", "site_id"])
    .size()
    .to_frame()
    .reset_index()
    .pivot(index="visit_code", columns="site_id", values=0)
    .reset_index()
    .fillna(0)
)
df_appt_pivot["total"] = df_appt_pivot.iloc[:, 1:].sum(axis=1)
df_appt_pivot.columns.name = None
gt = df_as_great_table(
    df_appt_pivot,
    title="Table 1e: Future appointments",
)
gt = (
    gt
    .cols_label({k: v for k, v in column_headers.items() if k != "label"})
    .cols_align(align="center", columns=["10", "20", "30", "40", "60", "total"])
    .cols_align(align="left", columns=["visit_code"])
    .data_color(
        columns=["visit_code"],
        palette=["lavender", "thistle"],
        domain=[2000, 5000],
        na_color="white"
    )
    .fmt_number(columns=["10", "20", "30", "40", "60", "total"], decimals=0)
    .tab_source_note(
        source_note=f"Scheduled appointment date is on or after {cutoff_date.strftime('%d %B %Y')} and before {end_of_trial_date.strftime('%d %B %Y')}.")
)
html_data.append(gt.as_raw_html())
gt.show()

Table 1e: Future appointments,Table 1e: Future appointments,Table 1e: Future appointments,Table 1e: Future appointments,Table 1e: Future appointments,Table 1e: Future appointments,Table 1e: Future appointments
Visit code,Hindu Mandal,Amana,Temeke,Mwananyamala,Mnazi Moja,Total
1210.0,3,3,2,6,3,17
1240.0,20,20,23,28,19,110
1270.0,27,36,43,54,48,208
1300.0,35,55,73,100,70,333
1330.0,47,83,94,148,92,464
1360.0,76,153,140,226,115,710
1390.0,16,39,39,51,17,162
1420.0,28,85,63,113,17,306
1450.0,37,100,73,126,15,351
1480.0,23,77,48,67,0,215


In [59]:
# Table 2 Visits Missed to Date as % of Visits Attended + Visits Missed
subject_count = (
    df_visit
    .merge(df_eos_excluded[["subject_identifier"]], on="subject_identifier", how="left", indicator=True)
    .query("_merge=='left_only'")
    .drop(columns=["_merge"])
    .reset_index(drop=True)
    .query("visit_code_sequence==0 and appt_timing==@MISSED_APPT and ~appt_status.isin([@NEW_APPT, @CANCELLED_APPT])")
).subject_identifier.nunique()
df_tbl = (
    df_visit[(df_visit.visit_code_sequence == 0) & (df_visit.appt_timing == MISSED_APPT) & ~(
        df_visit.appt_status.isin([NEW_APPT, CANCELLED_APPT]))]
    .merge(df_eos_excluded[["subject_identifier"]], on="subject_identifier", how="left", indicator=True)
    .query("_merge=='left_only'")
    .drop(columns=["_merge"])
    .reset_index(drop=True)
    .groupby(by=["visit_code", "site_id"])
    .size()
    .to_frame()
    .reset_index()
)
df_tbl.columns = ["visit_code", "site_id", "visits"]
df_tbl_pivot = df_tbl.pivot(index="visit_code", columns="site_id", values="visits").reset_index()
df_tbl_pivot.columns.name = None
df_tbl_pivot.columns = ['visit_code', "10", "20", "30", "40", "60"]
df_tbl_pivot['total'] = df_tbl_pivot[['10', '20', '30', '40', '60']].sum(axis=1)
df_missed = (
    df_tbl_pivot
    .fillna(0)
    .copy()
    .set_index(["visit_code"])
)

df_attended_display = df_attended.copy()
df_attended_display = (
    df_attended_display
    .set_index(["visit_code"])
)

attended_and_missed = df_attended_display + df_missed
attended_and_missed = (
    attended_and_missed
    .fillna(0)
    .reset_index()
    .set_index(["visit_code"])
)

attended_and_missed_perc = df_missed / attended_and_missed
attended_and_missed_perc = (
    attended_and_missed_perc
    .fillna(0)
    .reset_index()
    .set_index(["visit_code"])
)

df_result = df_missed.merge(attended_and_missed_perc, on=["visit_code"], suffixes=("", "_perc"))
for col in ["10", "20", "30", "40", "60", "total"]:
    col_perc = f"{col}_perc"
    df_result[col] = df_result.apply(lambda x: f"{x[col]} ({x[col_perc] * 100:.2f})", axis=1)
df_result = df_result.reset_index().sort_values(by=["visit_code"], ascending=True)
df_result = df_result.fillna(0.0)

In [60]:
df_table = df_result[["visit_code", "10", "20", "30", "40", "60", "total"]].copy()
gt = df_as_great_table(
    df_table,
    title="Table 2a: Visits Missed to Date",
    subtitle="as % of Visits Attended + Visits Missed"
)
gt = (
    gt
    .cols_label({k: v for k, v in column_headers.items() if k != "label"})
    .cols_align(align="center", columns=["10", "20", "30", "40", "60", "total"])
    .cols_align(align="left", columns=["visit_code", "label"])
    .tab_style(
        style=[style.fill(color="snow"), style.text(color="black")],
        locations=loc.body(
            columns=[0],
            rows=list(range(0, len(df_table))),
        ),
    )
    .tab_source_note(source_note=f"{subject_count} participants had at least one missed visit.")

)
html_data.append(gt.as_raw_html())
gt.show()


Table 2a: Visits Missed to Date,Table 2a: Visits Missed to Date,Table 2a: Visits Missed to Date,Table 2a: Visits Missed to Date,Table 2a: Visits Missed to Date,Table 2a: Visits Missed to Date,Table 2a: Visits Missed to Date
as % of Visits Attended + Visits Missed,as % of Visits Attended + Visits Missed,as % of Visits Attended + Visits Missed,as % of Visits Attended + Visits Missed,as % of Visits Attended + Visits Missed,as % of Visits Attended + Visits Missed,as % of Visits Attended + Visits Missed
Visit code,Hindu Mandal,Amana,Temeke,Mwananyamala,Mnazi Moja,Total
1005.0,20.0 (11.30),18.0 (4.83),23.0 (7.03),51.0 (9.68),11.0 (4.91),123.0 (7.56)
1010.0,7.0 (3.95),12.0 (3.24),22.0 (6.75),36.0 (6.84),11.0 (4.93),88.0 (5.43)
1030.0,5.0 (2.86),8.0 (2.19),27.0 (8.33),52.0 (9.92),8.0 (3.64),100.0 (6.22)
1060.0,3.0 (1.74),8.0 (2.22),39.0 (12.04),62.0 (11.88),3.0 (1.40),115.0 (7.22)
1090.0,1.0 (0.59),21.0 (5.87),53.0 (16.46),81.0 (15.67),5.0 (2.39),161.0 (10.22)
1120.0,8.0 (4.76),10.0 (2.85),60.0 (18.81),81.0 (15.79),4.0 (1.97),163.0 (10.49)
1150.0,1.0 (0.62),32.0 (9.44),72.0 (22.93),87.0 (17.33),9.0 (4.74),201.0 (13.34)
1180.0,3.0 (1.88),19.0 (5.76),60.0 (20.13),82.0 (16.53),9.0 (4.81),173.0 (11.76)
1210.0,5.0 (3.27),29.0 (9.18),54.0 (19.49),82.0 (17.15),8.0 (4.44),178.0 (12.68)
1240.0,12.0 (8.89),14.0 (4.70),42.0 (16.87),80.0 (17.78),7.0 (4.58),155.0 (12.06)


In [61]:
# Table 2b: Number of missed visits by participant
subject_count = (
    df_visit
    .merge(df_eos_excluded[["subject_identifier"]], on="subject_identifier", how="left", indicator=True)
    .query("_merge=='left_only'")
    .drop(columns=["_merge"])
    .reset_index(drop=True)
    .query("visit_code_sequence==0 and appt_timing==@MISSED_APPT and ~appt_status.isin([@NEW_APPT, @CANCELLED_APPT])")
).subject_identifier.nunique()
df_tbl = (
    df_visit[(df_visit.visit_code_sequence == 0) & (df_visit.appt_timing == MISSED_APPT) & ~(
        df_visit.appt_status.isin([NEW_APPT, CANCELLED_APPT]))]
    .merge(df_eos_excluded[["subject_identifier"]], on="subject_identifier", how="left", indicator=True)
    .query("_merge=='left_only'")
    .drop(columns=["_merge"])
    .reset_index(drop=True)
    .groupby(by=["subject_identifier", "site_id"])
    .size()
    .to_frame()
    .reset_index()
)
df_tbl.columns = ["subject_identifier", "site_id", "missed_count"]
df_tbl["category"] = pd.cut(df_tbl["missed_count"], bins=[0, 1, 3, 5, 7, 100],
                            labels=["Missed at least 1", "2 to 3", "4 to 5", "6 to 7", "missed more than 7"])
df_tbl_pivot = df_tbl.pivot_table(index="category", columns="site_id", values="missed_count", observed=False,
                                  aggfunc="count").reset_index()

df_tbl_pivot['total'] = df_tbl_pivot.select_dtypes(include='int').sum(axis=1, skipna=True)

sum_row = df_tbl_pivot.select_dtypes(include='int64').sum()
sum_row['category'] = 'Total'

df_tbl_pivot = (
    pd.concat([df_tbl_pivot, sum_row.to_frame().T], axis=0)
    .rename(columns={10: "10", 20: "20", 30: "30", 40: "40", 60: "60"})
)

gt = df_as_great_table(
    df_tbl_pivot,
    title="Table 2b: Number of participants who missed one or more visits",
)
gt = (
    gt
    .cols_label(
        {"category": "Category", **{k: v for k, v in column_headers.items() if k not in ["visit_code", "label"]}})
    .cols_align(align="center", columns=["10", "20", "30", "40", "60", "total"])
    .cols_align(align="left", columns=["category"])
    .tab_style(
        style=[style.fill(color="snow"), style.text(color="black")],
        locations=loc.body(
            columns=[0],
            rows=list(range(0, len(df_table))),
        ),
    )
)
html_data.append(gt.as_raw_html())
gt.show()



Table 2b: Number of participants who missed one or more visits,Table 2b: Number of participants who missed one or more visits,Table 2b: Number of participants who missed one or more visits,Table 2b: Number of participants who missed one or more visits,Table 2b: Number of participants who missed one or more visits,Table 2b: Number of participants who missed one or more visits,Table 2b: Number of participants who missed one or more visits
Category,Hindu Mandal,Amana,Temeke,Mwananyamala,Mnazi Moja,Total
Missed at least 1,36,56,42,84,34,252
2 to 3,21,42,67,72,14,216
4 to 5,0,14,33,26,5,78
6 to 7,0,1,20,27,0,48
missed more than 7,0,0,8,32,0,40
Total,57,113,170,241,53,634


In [62]:
# func for tables 3,4,5
def get_row_df(row_df: pd.DataFrame, label: str = None, **kwargs) -> pd.DataFrame:
    row_df = row_df.groupby(by=["site_id"]).site_id.count().to_frame(name="n")
    row_df["label"] = label
    row_df = row_df.reset_index()
    row_df = row_df.pivot(index="label", values="n", columns="site_id").reset_index()
    row_df.columns.name = ""
    all_sites = [10, 20, 30, 40, 60]
    for site in all_sites:
        if site not in row_df.columns:
            row_df[site] = None
    row_df = row_df.reset_index(drop=True)
    return row_df


def get_table_df(
        df_source: pd.DataFrame,
        visit_code: float | None = None,
        month_label: str | None = None,
        visit_codes: list[float] | None = None,
        get_row_func: Callable | None = None,
        category_labels: list[str] | None = None,
) -> pd.DataFrame:
    get_row_df_func = get_row_func or get_row_df
    if visit_code:
        df_month = df_source[df_source.visit_code == visit_code].copy()
    elif visit_codes:
        df_month = df_source[df_source.visit_code.isin(visit_codes)].copy()
    elif month_label:
        df_month = df_source.copy()

    row_df = df_month.copy()
    table_df = get_row_df_func(row_df, "Total (n)", category_labels=category_labels)

    row_df = df_month.query("ogtt_value<7.8 and fbg_value<6.1").copy()
    table_df = pd.concat([table_df, get_row_df_func(row_df, "OGTT <7.8; FBG <6.1", category_labels=category_labels)])

    row_df = df_month[(df_month.ogtt_value < 7.8) & (df_month.fbg_value >= 6.1) & (df_month.fbg_value < 7.0)].copy()
    table_df = pd.concat(
        [table_df, get_row_df_func(row_df, "OGTT <7.8; FBG >=6.1 <7.0", category_labels=category_labels)])

    row_df = df_month[(df_month.ogtt_value < 7.8) & (df_month.fbg_value >= 7.0)].copy()
    table_df = pd.concat([table_df, get_row_df_func(row_df, "OGTT <7.8; FBG >=7.0", category_labels=category_labels)])

    row_df = df_month[(df_month.ogtt_value >= 7.8) & (df_month.ogtt_value < 11.1) & (df_month.fbg_value < 6.1)].copy()
    table_df = pd.concat(
        [table_df, get_row_df_func(row_df, "OGTT ≥7.8 to <11.1; FBG <6.1", category_labels=category_labels)])

    row_df = df_month[(df_month.ogtt_value >= 7.8) & (df_month.ogtt_value < 11.1) & (df_month.fbg_value >= 6.1) & (
            df_month.fbg_value < 7.0)].copy()
    table_df = pd.concat(
        [table_df, get_row_df_func(row_df, "OGTT ≥7.8 to <11.1; FBG >=6.1 <7.0", category_labels=category_labels)])

    row_df = df_month[(df_month.ogtt_value >= 7.8) & (df_month.ogtt_value < 11.1) & (df_month.fbg_value >= 7.0)].copy()
    table_df = pd.concat(
        [table_df, get_row_df_func(row_df, "OGTT ≥7.8 to <11.1; FBG >=7.0", category_labels=category_labels)])

    row_df = df_month[(df_month.ogtt_value >= 11.1) & (df_month.fbg_value < 6.1)].copy()
    table_df = pd.concat([table_df, get_row_df_func(row_df, "OGTT ≥11.1; FBG <6.1", category_labels=category_labels)])

    row_df = df_month[(df_month.ogtt_value >= 11.1) & (df_month.fbg_value >= 6.1) & (df_month.fbg_value < 7.0)].copy()
    table_df = pd.concat(
        [table_df, get_row_df_func(row_df, "OGTT ≥11.1; FBG >=6.1 <7.0", category_labels=category_labels)])

    row_df = df_month[(df_month.ogtt_value >= 11.1) & (df_month.fbg_value >= 7.0)].copy()
    table_df = pd.concat([table_df, get_row_df_func(row_df, "OGTT ≥11.1; FBG >=7.0", category_labels=category_labels)])

    row_df = df_month[(df_month.ogtt_value.isna())].copy()
    table_df = pd.concat([table_df, get_row_df_func(row_df, "Missing OGTT", category_labels=category_labels)])
    return table_df


def format_table_df(tbl_df, add_totals: bool | None = None):
    """Pivot on site"""
    add_totals = True if add_totals is None else add_totals
    tbl_df = tbl_df.fillna(0.0)
    tbl_df["total"] = tbl_df.iloc[:, 1:].sum(axis=1)
    tbl_df = tbl_df.reset_index(drop=True)

    if add_totals:
        df_last = tbl_df[1:].sum().to_frame()
        df_last.loc["label"] = np.nan
        df_last = df_last.reset_index()
        df_last.columns = ["label", "value"]
        df_last = df_last.pivot_table(columns="label", values="value").reset_index(drop=True)
        df_last.columns.name = ""
        df_last["label"] = "Totals"

        tbl_df = pd.concat([tbl_df, df_last])
        tbl_df = tbl_df.reset_index(drop=True)

    tbl_df.columns = ["label", "10", "20", "30", "40", "60", "total"]

    for site in ["10", "20", "30", "40", "60", "total"]:
        tbl_df[f"{site}_perc"] = (tbl_df[site] / tbl_df.iloc[0][site]) * 100 if tbl_df.iloc[0][site] > 0 else 0
        tbl_df[f"{site}_perc_str"] = tbl_df[f"{site}_perc"].map('{:.1f}'.format)

    for site in ["10", "20", "30", "40", "60", "total"]:
        tbl_df[f"{site}_str"] = tbl_df[[f"{site}", f"{site}_perc_str"]].apply(lambda x: ' ('.join(x.astype(str)),
                                                                              axis=1)
        tbl_df[f"{site}_str"] = tbl_df[f"{site}_str"] + ")"

    cols = ["label", *[f"{site}_str" for site in ["10", "20", "30", "40", "60", "total"]]]
    tbl_df1 = tbl_df[cols]
    tbl_df1.loc[tbl_df.label == "Total (n)"] = tbl_df.iloc[0][
        ["label", "10", "20", "30", "40", "60", "total"]].to_list()
    return tbl_df1


def format_table_with_bmi_df(tbl_df, add_totals: bool | None = None, category_labels: list[str] = None):
    """Pivot on BMI categories"""

    add_totals = True if add_totals is None else add_totals
    tbl_df = tbl_df.fillna(0.0)
    tbl_df["total"] = tbl_df.iloc[:, 1:].sum(axis=1)
    tbl_df = tbl_df.reset_index(drop=True)

    if add_totals:
        df_last = tbl_df[1:].sum().to_frame()
        df_last.loc["label"] = np.nan
        df_last = df_last.reset_index()
        df_last.columns = ["label", "value"]
        df_last = df_last.pivot_table(columns="label", values="value").reset_index(drop=True)
        df_last.columns.name = ""
        df_last["label"] = "Totals"

        tbl_df = pd.concat([tbl_df, df_last])
        tbl_df = tbl_df.reset_index(drop=True)

    tbl_df.columns = ["label", *category_labels, "total"]

    for label in [*category_labels, "total"]:
        tbl_df[f"{label}_perc"] = (tbl_df[label] / tbl_df.iloc[0][label]) * 100 if tbl_df.iloc[0][label] > 0 else 0
        tbl_df[f"{label}_perc_str"] = tbl_df[f"{label}_perc"].map('{:.1f}'.format)

    for cat in [*category_labels, "total"]:
        tbl_df[f"{label}_str"] = tbl_df[[f"{label}", f"{label}_perc_str"]].apply(lambda x: ' ('.join(x.astype(str)),
                                                                                 axis=1)
        tbl_df[f"{label}_str"] = tbl_df[f"{label}_str"] + ")"

    cols = ["label", *[f"{label}_str" for label in [*category_labels, "total"]]]
    tbl_df1 = tbl_df[cols]
    tbl_df1.loc[tbl_df.label == "Total (n)"] = tbl_df.iloc[0][["label", *category_labels, "total"]].to_list()
    return tbl_df1


def get_row_by_df(row_df: pd.DataFrame, label: str, category_labels: list[str]) -> pd.DataFrame:
    # if label not in category_labels:
    #     raise ValueError(f"Invalid label. Expected one of {category_labels}. Got {label}.")
    row_df = row_df.groupby(by=["site_id"]).site_id.count().to_frame(name="n")
    row_df["label"] = label
    row_df = row_df.reset_index()
    row_df = row_df.pivot(index="label", values="n", columns="site_id").reset_index()
    row_df.columns.name = ""

    for label in category_labels:
        if label not in row_df.columns:
            row_df[label] = None
    row_df = row_df.reset_index(drop=True)
    return row_df

In [63]:
def get_fbg_value(r):
    if not pd.isna(r["converted_fbg2_value"]):
        return r["converted_fbg2_value"]
    return r["converted_fbg_value"]


def get_ogtt_value(r):
    if not pd.isna(r["converted_ogtt2_value"]):
        return r["converted_ogtt2_value"]
    return r["converted_ogtt_value"]


In [64]:
# Table 3: OGTT and FBG at Enrolment

subjects = df_visit.subject_identifier.unique()
df_screening = get_screening_df().query("consented==True and subject_identifier.isin(@subjects)")
df_screening["visit_code"] = "Enrol"
df_screening["fbg_value"] = df_screening.apply(get_fbg_value, axis=1)
df_screening["ogtt_value"] = df_screening.apply(get_ogtt_value, axis=1)
df_screening["site_id"] = df_screening.site.astype(int)
df_screening = df_screening.drop(columns=["site"])
df_table3 = get_table_df(df_screening, month_label="enrol")
df_table3 = format_table_df(df_table3)
df_table3 = df_table3.fillna(0.0)
gt = df_as_great_table(df_table3, title="Table 3a: OGTT and FBG at Screening / Enrolment")

column_headers_enrol = {k: v for k, v in column_headers_with_str.items() if k not in "visit_code"}
gt = (
    gt
    .cols_label(column_headers_enrol)
    .cols_align(align="center", columns=["10_str", "20_str", "30_str", "40_str", "60_str", "total_str"])
    .cols_align(align="left", columns=["label"])
    .cols_width(cases={"label": "35%"})
    .tab_source_note(source_note="Excluding patients eventually withdrawn for `late exclusion` criteria")
)
html_data.append(gt.as_raw_html())
gt.show()


Table 3a: OGTT and FBG at Screening / Enrolment,Table 3a: OGTT and FBG at Screening / Enrolment,Table 3a: OGTT and FBG at Screening / Enrolment,Table 3a: OGTT and FBG at Screening / Enrolment,Table 3a: OGTT and FBG at Screening / Enrolment,Table 3a: OGTT and FBG at Screening / Enrolment,Table 3a: OGTT and FBG at Screening / Enrolment
Label,Hindu Mandal,Amana,Temeke,Mwananyamala,Mnazi Moja,Total
Total (n),177,374,328,527,224,1630
OGTT <7.8; FBG >=6.1 <7.0,60 (33.9),167 (44.7),166 (50.6),133 (25.2),51 (22.8),577 (35.4)
OGTT ≥7.8 to <11.1; FBG <6.1,28 (15.8),87 (23.3),64 (19.5),55 (10.4),121 (54.0),355 (21.8)
OGTT ≥7.8 to <11.1; FBG >=6.1 <7.0,89 (50.3),90 (24.1),69 (21.0),184 (34.9),39 (17.4),471 (28.9)
OGTT ≥7.8 to <11.1; FBG >=7.0,0.0 (0.0),30 (8.0),29 (8.8),155 (29.4),13 (5.8),227.0 (13.9)
Totals,177.0 (100.0),374.0 (100.0),328.0 (100.0),527.0 (100.0),224.0 (100.0),1630.0 (100.0)
Excluding patients eventually withdrawn for `late exclusion` criteria,Excluding patients eventually withdrawn for `late exclusion` criteria,Excluding patients eventually withdrawn for `late exclusion` criteria,Excluding patients eventually withdrawn for `late exclusion` criteria,Excluding patients eventually withdrawn for `late exclusion` criteria,Excluding patients eventually withdrawn for `late exclusion` criteria,Excluding patients eventually withdrawn for `late exclusion` criteria


In [65]:
# bmi_categories:
#     1 calculated_bmi_value<25
#     2 calculated_bmi_value>=25 & calculated_bmi_value<30
#     3 calculated_bmi_value>=30


# subjects = df_visit.subject_identifier.unique()
# df_screening = get_screening_df().query("consented==True and subject_identifier.isin(@subjects)")
# df_screening["visit_code"] = "Enrol"
# df_screening["fbg_value"] = df_screening.apply(get_fbg_value, axis=1)
# df_screening["ogtt_value"] = df_screening.apply(get_ogtt_value, axis=1)
# df_screening["site_id"] = df_screening.site.astype(int)
# df_screening = df_screening.drop(columns=["site"])
# df_screening["bmi"] = pd.NA
# df_screening.loc[df_screening["calculated_bmi_value"] < 25.0, "bmi"] = "bmi<25"
# df_screening.loc[(df_screening["calculated_bmi_value"]>=25.0) & (df_screening["calculated_bmi_value"] < 30.0), "bmi"] = "25<=bmi<30"
# df_screening.loc[df_screening["calculated_bmi_value"] > 30.0, "bmi"] = "bmi>30"
#
# category_labels = [ "bmi<25", "25<=bmi<30", "bmi>=30", "Total (n)"]
# df_table3 = get_table_df(df_screening, month_label="enrol", get_row_func=get_row_by_df, category_labels=category_labels)
# df_table3 = format_table_with_bmi_df(df_table3, category_labels=category_labels)
# df_table3 = df_table3.fillna(0.0)
# gt = df_as_great_table(df_table3, title="Table 3b: OGTT/FBG by BMI at Screening / Enrolment")
# column_headers_enrol = {"bmi<25_str":"bmi<25", "25<=bmi<30_str":"25<=bmi<30", "bmi>30_str":"bmi>30", "total_str": "total"}
# gt = (
#     gt
#     .cols_label(column_headers_enrol)
#     .cols_align(align="center", columns=["bmi<25_str", "25<=bmi<30_str", "bmi>30_str", "total_str"])
#     .cols_align(align="left", columns=["label"])
#     .cols_width(cases={"label": "35%"})
#     .tab_source_note(source_note="Excluding patients eventually withdrawn for `late exclusion` criteria")
# )
# html_data.append(gt.as_raw_html())
# gt.show()



In [66]:
[col for col in df_screening.columns if "bmi" in col]

['calculated_bmi_value']

In [67]:
# Table 4: OGTT and FBG at 12-month visit
df_table3 = get_table_df(df_glucose, visit_codes=[1120.0])
df_table3 = format_table_df(df_table3)
df_table3 = df_table3.fillna(0.0)
gt = df_as_great_table(df_table3, title="Table 4: OGTT and FBG at 12-month visit")
gt = (
    gt
    .cols_label(column_headers_with_str)
    .cols_align(align="center", columns=["10_str", "20_str", "30_str", "40_str", "60_str", "total_str"])
    .cols_align(align="left", columns=["label"])
    .cols_width(cases={"label": "35%"})
)
html_data.append(gt.as_raw_html())
gt.show()


Table 4: OGTT and FBG at 12-month visit,Table 4: OGTT and FBG at 12-month visit,Table 4: OGTT and FBG at 12-month visit,Table 4: OGTT and FBG at 12-month visit,Table 4: OGTT and FBG at 12-month visit,Table 4: OGTT and FBG at 12-month visit,Table 4: OGTT and FBG at 12-month visit
Label,Hindu Mandal,Amana,Temeke,Mwananyamala,Mnazi Moja,Total
Total (n),160,341,259,432,199,1391
OGTT <7.8; FBG <6.1,41 (25.6),155 (45.5),126 (48.6),119 (27.5),112 (56.3),553 (39.8)
OGTT <7.8; FBG >=6.1 <7.0,42 (26.2),85 (24.9),56 (21.6),125 (28.9),12 (6.0),320 (23.0)
OGTT <7.8; FBG >=7.0,4 (2.5),8 (2.3),3 (1.2),27 (6.2),1 (0.5),43 (3.1)
OGTT ≥7.8 to <11.1; FBG <6.1,15 (9.4),31 (9.1),42 (16.2),39 (9.0),39 (19.6),166 (11.9)
OGTT ≥7.8 to <11.1; FBG >=6.1 <7.0,36 (22.5),45 (13.2),26 (10.0),82 (19.0),22 (11.1),211 (15.2)
OGTT ≥7.8 to <11.1; FBG >=7.0,9 (5.6),7 (2.1),4 (1.5),36 (8.3),1 (0.5),57 (4.1)
OGTT ≥11.1; FBG <6.1,0.0 (0.0),1 (0.3),1 (0.4),0.0 (0.0),1 (0.5),3.0 (0.2)
OGTT ≥11.1; FBG >=6.1 <7.0,2 (1.2),1 (0.3),0.0 (0.0),0.0 (0.0),7 (3.5),10.0 (0.7)
OGTT ≥11.1; FBG >=7.0,3 (1.9),3 (0.9),1 (0.4),3 (0.7),3 (1.5),13 (0.9)


In [68]:
df_glucose.query("visit_code==1360.0 and ogtt_value.isna()")[["subject_identifier", "site_id"]].set_index("subject_identifier")


Unnamed: 0_level_0,site_id
subject_identifier,Unnamed: 1_level_1
105-10-0024-6,10
105-10-0031-1,10
105-10-0033-7,10
105-10-0044-4,10
105-10-0057-6,10
105-20-0157-3,20
105-20-0166-4,20
105-30-0005-3,30
105-30-0024-4,30
105-30-0051-7,30


In [69]:
# Table 5: OGTT and FBG at 24-month visit
df_table4 = get_table_df(df_glucose, 1240.0)
df_table4 = format_table_df(df_table4)
df_table4 = df_table4.fillna(0.0)
gt = df_as_great_table(df_table4, title="Table 5: OGTT and FBG at 24-month visit")
gt = (
    gt
    .cols_label(column_headers_with_str)
    .cols_align(align="center", columns=["10_str", "20_str", "30_str", "40_str", "60_str", "total_str"])
    .cols_align(align="left", columns=["label"])
    .cols_width(cases={"label": "35%"})
)
html_data.append(gt.as_raw_html())
gt.show()

Table 5: OGTT and FBG at 24-month visit,Table 5: OGTT and FBG at 24-month visit,Table 5: OGTT and FBG at 24-month visit,Table 5: OGTT and FBG at 24-month visit,Table 5: OGTT and FBG at 24-month visit,Table 5: OGTT and FBG at 24-month visit,Table 5: OGTT and FBG at 24-month visit
Label,Hindu Mandal,Amana,Temeke,Mwananyamala,Mnazi Moja,Total
Total (n),123,284,207,370,146,1130
OGTT <7.8; FBG <6.1,42 (34.1),130 (45.8),72 (34.8),117 (31.6),67 (45.9),428 (37.9)
OGTT <7.8; FBG >=6.1 <7.0,27 (22.0),66 (23.2),55 (26.6),102 (27.6),22 (15.1),272 (24.1)
OGTT <7.8; FBG >=7.0,3 (2.4),0.0 (0.0),6 (2.9),9 (2.4),0.0 (0.0),18.0 (1.6)
OGTT ≥7.8 to <11.1; FBG <6.1,15 (12.2),39 (13.7),25 (12.1),42 (11.4),33 (22.6),154 (13.6)
OGTT ≥7.8 to <11.1; FBG >=6.1 <7.0,16 (13.0),42 (14.8),38 (18.4),78 (21.1),18 (12.3),192 (17.0)
OGTT ≥7.8 to <11.1; FBG >=7.0,1 (0.8),4 (1.4),4 (1.9),17 (4.6),2 (1.4),28 (2.5)
OGTT ≥11.1; FBG <6.1,0.0 (0.0),1 (0.4),0.0 (0.0),0.0 (0.0),1 (0.7),2.0 (0.2)
OGTT ≥11.1; FBG >=6.1 <7.0,0.0 (0.0),0.0 (0.0),0.0 (0.0),0.0 (0.0),1 (0.7),1.0 (0.1)
OGTT ≥11.1; FBG >=7.0,2 (1.6),1 (0.4),0.0 (0.0),2 (0.5),0.0 (0.0),5.0 (0.4)


In [70]:
# Table 6: OGTT and FBG at 36-month visit
df_table5 = get_table_df(df_glucose, 1360.0)
df_table5 = format_table_df(df_table5)
df_table5 = df_table5.fillna(0.0)
gt = df_as_great_table(df_table5, title="Table 6: OGTT and FBG at 36-month visit")
gt = (
    gt
    .cols_label(column_headers_with_str)
    .cols_align(align="center", columns=["10_str", "20_str", "30_str", "40_str", "60_str", "total_str"])
    .cols_align(align="left", columns=["label"])
    .cols_width(cases={"label": "35%"})
)
html_data.append(gt.as_raw_html())
gt.show()

Table 6: OGTT and FBG at 36-month visit,Table 6: OGTT and FBG at 36-month visit,Table 6: OGTT and FBG at 36-month visit,Table 6: OGTT and FBG at 36-month visit,Table 6: OGTT and FBG at 36-month visit,Table 6: OGTT and FBG at 36-month visit,Table 6: OGTT and FBG at 36-month visit
Label,Hindu Mandal,Amana,Temeke,Mwananyamala,Mnazi Moja,Total
Total (n),41,118,90,145,17,411
OGTT <7.8; FBG <6.1,20 (48.8),53 (44.9),19 (21.1),50 (34.5),3 (17.6),145 (35.3)
OGTT <7.8; FBG >=6.1 <7.0,3 (7.3),26 (22.0),37 (41.1),40 (27.6),2 (11.8),108 (26.3)
OGTT <7.8; FBG >=7.0,2 (4.9),1 (0.8),2 (2.2),5 (3.4),0.0 (0.0),10.0 (2.4)
OGTT ≥7.8 to <11.1; FBG <6.1,6 (14.6),17 (14.4),8 (8.9),9 (6.2),4 (23.5),44 (10.7)
OGTT ≥7.8 to <11.1; FBG >=6.1 <7.0,3 (7.3),17 (14.4),18 (20.0),34 (23.4),5 (29.4),77 (18.7)
OGTT ≥7.8 to <11.1; FBG >=7.0,1 (2.4),0.0 (0.0),1 (1.1),5 (3.4),0.0 (0.0),7.0 (1.7)
OGTT ≥11.1; FBG >=6.1 <7.0,1 (2.4),1 (0.8),0.0 (0.0),0.0 (0.0),0.0 (0.0),2.0 (0.5)
OGTT ≥11.1; FBG >=7.0,0.0 (0.0),1 (0.8),0.0 (0.0),0.0 (0.0),0.0 (0.0),1.0 (0.2)
Missing OGTT,5 (12.2),2 (1.7),5 (5.6),2 (1.4),3 (17.6),17 (4.1)


In [71]:
# Table 7: Any OGTT>11.1 ever
row_df = df_glucose[df_glucose.ogtt_value >= 11.1].copy()
table_df = get_row_df(row_df, "Total (n)")
df_table6 = format_table_df(table_df)
df_table = df_table6[:1].fillna(0.0).copy().reset_index(drop=True)
gt = df_as_great_table(df_table, title="Table 7: Any OGTT>11.1 ever")
gt = (
    gt
    .cols_label(column_headers_with_str)
    .cols_align(align="center", columns=["10_str", "20_str", "30_str", "40_str", "60_str", "total_str"])
    .cols_align(align="left", columns=["label"])
    .cols_width(cases={"label": "35%"})
)
html_data.append(gt.as_raw_html())
gt.show()

Table 7: Any OGTT>11.1 ever,Table 7: Any OGTT>11.1 ever,Table 7: Any OGTT>11.1 ever,Table 7: Any OGTT>11.1 ever,Table 7: Any OGTT>11.1 ever,Table 7: Any OGTT>11.1 ever,Table 7: Any OGTT>11.1 ever
Label,Hindu Mandal,Amana,Temeke,Mwananyamala,Mnazi Moja,Total
Total (n),8,9,3,12,19,51


In [72]:
# func for table 7
def get_table7_df(df_source: pd.DataFrame, visit_code: float) -> pd.DataFrame:
    df_month = df_source[(df_source.visit_code >= visit_code) & (df_source.visit_code <= visit_code + 0.9)].copy()

    row_df = df_month.copy()
    table_df = get_row_df(row_df, "Total (n)")

    row_df = df_month[(df_month.fbg_value < 6.1)].copy()
    table_df = pd.concat([table_df, get_row_df(row_df, "FBG <6.1")])

    row_df = df_month[(df_month.fbg_value >= 6.1) & (df_month.fbg_value < 7.0)].copy()
    table_df = pd.concat([table_df, get_row_df(row_df, "FBG >=6.1 <7.0")])

    row_df = df_month[(df_month.fbg_value >= 7.0)].copy()
    table_df = pd.concat([table_df, get_row_df(row_df, "FBG >=7.0")])
    return table_df

In [73]:
# Table 8: Interim FBG results
df_table7 = get_table7_df(df_glucose, 1150.0)
df_table7 = format_table_df(df_table7, add_totals=False)
df_table7["visit_code"] = MONTH15

df_table71 = get_table7_df(df_glucose, 1180.0)
df_table71 = format_table_df(df_table71, add_totals=False)
df_table71["visit_code"] = MONTH18

df_table72 = get_table7_df(df_glucose, 1210.0)
df_table72 = format_table_df(df_table72, add_totals=False)
df_table72["visit_code"] = MONTH21

df_table73 = get_table7_df(df_glucose, 1270.0)
df_table73 = format_table_df(df_table73, add_totals=False)
df_table73["visit_code"] = MONTH27

df_table74 = get_table7_df(df_glucose, 1300.0)
df_table74 = format_table_df(df_table74, add_totals=False)
df_table74["visit_code"] = MONTH30

df_table75 = get_table7_df(df_glucose, 1330.0)
df_table75 = format_table_df(df_table75, add_totals=False)
df_table75["visit_code"] = MONTH33

df_table76 = get_table7_df(df_glucose, 1390.0)
df_table76 = format_table_df(df_table76, add_totals=False)
df_table76["visit_code"] = MONTH39

df_table = pd.concat([df_table7, df_table71, df_table72, df_table73, df_table74, df_table75, df_table76])
df_table = df_table.reset_index(drop=True)
df_table = df_table.fillna(0.0)

In [74]:
column_headers_with_str = {"visit_code": "Visit Code", **column_headers_with_str}
gt = df_as_great_table2(df_table, title="Table 8: Interim FBG results")
gt = (
    gt
    .cols_label(column_headers_with_str)
    .cols_move_to_start(columns="visit_code")
    .cols_align(align="center", columns=["10_str", "20_str", "30_str", "40_str", "60_str", "total_str"])
    .cols_align(align="left", columns=["visit_code", "label"])
    .cols_width(cases={"label": "15%"})
    .tab_style(
        style=[
            style.text(color="black", weight="bold"),
            style.fill(color="lightgray")
        ],
        locations=loc.row_groups()
    )
)
html_data.append(gt.as_raw_html())
gt.show()

Table 8: Interim FBG results,Table 8: Interim FBG results,Table 8: Interim FBG results,Table 8: Interim FBG results,Table 8: Interim FBG results,Table 8: Interim FBG results,Table 8: Interim FBG results
Unnamed: 0_level_1,Hindu Mandal,Amana,Temeke,Mwananyamala,Mnazi Moja,Total
1150,1150,1150,1150,1150,1150,1150
Total (n),162,316,246,421,181,1326
FBG <6.1,47 (29.0),203 (64.2),127 (51.6),161 (38.2),63 (34.8),601 (45.3)
FBG >=6.1 <7.0,48 (29.6),53 (16.8),57 (23.2),103 (24.5),26 (14.4),287 (21.6)
FBG >=7.0,17 (10.5),8 (2.5),8 (3.3),44 (10.5),10 (5.5),87 (6.6)
1180,1180,1180,1180,1180,1180,1180
Total (n),167,326,241,425,183,1342
FBG <6.1,87 (52.1),215 (66.0),137 (56.8),187 (44.0),116 (63.4),742 (55.3)
FBG >=6.1 <7.0,67 (40.1),99 (30.4),92 (38.2),164 (38.6),58 (31.7),480 (35.8)
FBG >=7.0,6 (3.6),7 (2.1),3 (1.2),66 (15.5),8 (4.4),90 (6.7)


In [75]:
# Table 9: Primary Endpoint met
df_endpoint_grp = df_endpoint.groupby(by=["site_id", "endpoint_label"]).size().to_frame().reset_index()
df_endpoint_grp.columns = ["site_id", "label", "endpoints"]
df_endpoint_pivot = df_endpoint_grp.pivot_table(index="label", columns="site_id", values="endpoints").reset_index()
df_endpoint_pivot.columns.name = ""
df_endpoint_pivot.columns = ['label', "10", "20", "30", "40", "60"]
df_endpoint_pivot.loc[len(df_endpoint_pivot)] = df_endpoint_pivot[['10', '20', '30', '40', '60']].sum().to_dict()
df_endpoint_pivot.at[len(df_endpoint_pivot) - 1, 'label'] = 'Total'
df_endpoint_pivot['total'] = df_endpoint_pivot[['10', '20', '30', '40', '60']].sum(axis=1)
df_endpoint_pivot = df_endpoint_pivot.fillna(0.0)

gt = df_as_great_table(
    df_endpoint_pivot,
    title="Table 9a: Primary Endpoint met"
)
gt = (
    gt
    .cols_label({k: v for k, v in column_headers.items() if k not in ["visit_code"]})
    .cols_align(align="center", columns=["10", "20", "30", "40", "60", "total"])
    .cols_align(align="left", columns=["label"])
    .cols_width(cases={"label": "25%"})
)
html_data.append(gt.as_raw_html())
gt.show()

Table 9a: Primary Endpoint met,Table 9a: Primary Endpoint met,Table 9a: Primary Endpoint met,Table 9a: Primary Endpoint met,Table 9a: Primary Endpoint met,Table 9a: Primary Endpoint met,Table 9a: Primary Endpoint met
Label,Hindu Mandal,Amana,Temeke,Mwananyamala,Mnazi Moja,Total
EOS - Patient developed diabetes,2.0,0.0,0.0,3.0,1.0,6.0
"FBG >= 7 x 2, first OGTT<=11.1",0.0,0.0,0.0,14.0,1.0,15.0
"FBG >= 7 x 2, second OGTT<=11.1",4.0,0.0,1.0,15.0,1.0,21.0
OGTT >= 11.1,8.0,9.0,3.0,10.0,19.0,49.0
Total,14.0,9.0,4.0,42.0,22.0,91.0


In [76]:
df_endpoint.query("subject_identifier=='105-30-0225-7'")

Unnamed: 0,subject_identifier,site_id,baseline_datetime,visit_datetime,interval_in_days,visit_code,fbg_value,ogtt_value,fbg_datetime,fasted,endpoint_label,endpoint_type,endpoint,offstudy_datetime,offstudy_reason,test,days_to_endpoint


In [77]:
from great_tables import html

# Table 9b: Primary Endpoint no EOS or DM Referral
df_subjecthistory = read_frame(
    SubjectScheduleHistory.objects.filter(offschedule_model="meta_prn.offschedule", offschedule_datetime__isnull=False),
    verbose=False).rename(columns={"site": "site_id"})
df_subjecthistory["site_id"] = df_subjecthistory["site_id"].astype(str)
df_endpoint_no_off = df_endpoint.merge(df_subjecthistory[["subject_identifier", "offschedule_datetime"]],
                                       on=["subject_identifier"], how="left")
df_endpoint_grp = df_endpoint_no_off.query("offschedule_datetime.isna()").groupby(
    by=["site_id", "endpoint_label"]).size().to_frame().reset_index()
df_endpoint_grp.columns = ["site_id", "label", "endpoints"]
df_endpoint_pivot = df_endpoint_grp.pivot_table(index="label", columns="site_id", values="endpoints").reset_index()
df_endpoint_pivot.columns.name = ""
df_endpoint_pivot.columns = ['label', *[str(col) for col in df_endpoint_pivot.columns if col != "label"]]
for col in [c for c in ['label', "10", "20", "30", "40", "60"] if str(c) not in df_endpoint_pivot.columns]:
    df_endpoint_pivot[str(col)] = np.nan
df_endpoint_pivot.columns = ['label', "10", "20", "30", "40", "60"]
df_endpoint_pivot.loc[len(df_endpoint_pivot)] = df_endpoint_pivot[['10', '20', '30', '40', '60']].sum().to_dict()
df_endpoint_pivot.at[len(df_endpoint_pivot) - 1, 'label'] = 'Total'
df_endpoint_pivot['total'] = df_endpoint_pivot[['10', '20', '30', '40', '60']].sum(axis=1)
df_endpoint_pivot = df_endpoint_pivot.fillna(0.0)
subjects = df_endpoint_no_off.query("offschedule_datetime.isna()").subject_identifier.to_list()

gt = df_as_great_table(
    df_endpoint_pivot,
    title="Table 9b: Primary Endpoint met -- participant not referred"
)
gt = (
    gt
    .cols_label({k: v for k, v in column_headers.items() if k not in ["visit_code"]})
    .cols_align(align="center", columns=["10", "20", "30", "40", "60", "total"])
    .cols_align(align="left", columns=["label"])
    .cols_width(cases={"label": "25%"})
    .tab_source_note(source_note=html("<BR>".join(subjects)))
)
html_data.append(gt.as_raw_html())
gt.show()

Table 9b: Primary Endpoint met -- participant not referred,Table 9b: Primary Endpoint met -- participant not referred,Table 9b: Primary Endpoint met -- participant not referred,Table 9b: Primary Endpoint met -- participant not referred,Table 9b: Primary Endpoint met -- participant not referred,Table 9b: Primary Endpoint met -- participant not referred,Table 9b: Primary Endpoint met -- participant not referred
Label,Hindu Mandal,Amana,Temeke,Mwananyamala,Mnazi Moja,Total
"FBG >= 7 x 2, first OGTT<=11.1",0.0,0.0,3.0,0.0,0.0,3.0
"FBG >= 7 x 2, second OGTT<=11.1",1.0,1.0,2.0,1.0,0.0,5.0
OGTT >= 11.1,1.0,0.0,0.0,0.0,0.0,1.0
Total,2.0,1.0,5.0,1.0,0.0,9.0
105-10-0060-0 105-10-0020-4 105-30-0052-5 105-40-0029-2 105-40-0111-8 105-40-0215-7 105-40-0298-3 105-40-0390-8 105-60-0169-4,105-10-0060-0 105-10-0020-4 105-30-0052-5 105-40-0029-2 105-40-0111-8 105-40-0215-7 105-40-0298-3 105-40-0390-8 105-60-0169-4,105-10-0060-0 105-10-0020-4 105-30-0052-5 105-40-0029-2 105-40-0111-8 105-40-0215-7 105-40-0298-3 105-40-0390-8 105-60-0169-4,105-10-0060-0 105-10-0020-4 105-30-0052-5 105-40-0029-2 105-40-0111-8 105-40-0215-7 105-40-0298-3 105-40-0390-8 105-60-0169-4,105-10-0060-0 105-10-0020-4 105-30-0052-5 105-40-0029-2 105-40-0111-8 105-40-0215-7 105-40-0298-3 105-40-0390-8 105-60-0169-4,105-10-0060-0 105-10-0020-4 105-30-0052-5 105-40-0029-2 105-40-0111-8 105-40-0215-7 105-40-0298-3 105-40-0390-8 105-60-0169-4,105-10-0060-0 105-10-0020-4 105-30-0052-5 105-40-0029-2 105-40-0111-8 105-40-0215-7 105-40-0298-3 105-40-0390-8 105-60-0169-4


In [95]:
# export subject_identifiers who reached endpoint
# df_endpoint_no_off[["subject_identifier", "endpoint_label"]].to_stata(analysis_folder / "endpoints_20250906.dta")

In [78]:
# Table 10: Incident Rate per 1000 person-years

def get_df_main(df_visit: pd.DataFrame, lower_days: float | None = None, upper_days: float | None = None):
    if not lower_days:
        lower_days = -1
    # cutoff_datetime = df_visit.query("@lower_days<followup_days<=@upper_days").visit_datetime.max()
    # exclude subjects for this reason
    offstudy_reasons = [
        'Patient fulfilled late exclusion criteria (due to abnormal blood values or raised blood pressure at enrolment']

    df_eos = get_eos_df()
    df_eos_excluded = (
        df_eos
        .query("followup_days>@lower_days and followup_days<=@upper_days and offstudy_reason.isin(@offstudy_reasons)")
        .copy()
        .reset_index()
    )
    df_visit_final = (
        df_visit.query("@lower_days<followup_days<=@upper_days and reason!='missed' and visit_code<2000.0")
        .merge(df_eos_excluded[["subject_identifier"]], on="subject_identifier", how="left", suffixes=("", "_y"),
               indicator=True)
        .query("_merge=='left_only'")
        .drop(columns=["_merge"])
    )
    df_main = (
        df_visit_final
        .groupby(by=["subject_identifier"])[["baseline_datetime", "visit_datetime", "followup_days"]]
        .max()
        .reset_index()
    )

    df_main = (
        df_main
        .merge(
            df_endpoint.query("days_to_endpoint>@lower_days")[
                ["subject_identifier", "endpoint_label", "endpoint_type", "days_to_endpoint"]],
            how="left",
            on=["subject_identifier"])
        .reset_index(drop=True)
    )
    if lower_days >= 365.25:
        df_main["followup_days"] = df_main["followup_days"] - lower_days
    df_main["followup_years"] = df_main["followup_days"] / 365.25
    return df_main, len(df_main), len(
        df_main.query("@lower_days<days_to_endpoint<=@upper_days and endpoint_label.notna()"))


def get_rate_and_ci(events, person_years_total):
    lower_ci = (chi2.ppf(0.025, 2 * events) / (2 * person_years_total)) * 1000
    upper_ci = (chi2.ppf(0.975, 2 * (events + 1)) / (2 * person_years_total)) * 1000
    return events / person_years_total * 1000, lower_ci, upper_ci


def get_incidence_data(term: str, lower_days: float, upper_days: float):
    data = {}
    df_main, subjects, events = get_df_main(df_visit, lower_days=lower_days, upper_days=upper_days)
    person_years_total = df_main.followup_years.sum()
    data.update({term: [person_years_total, subjects, events, *get_rate_and_ci(events, person_years_total)]})
    return data

In [79]:
incidence_data = {}
incidence_data.update(get_incidence_data("total", lower_days=-1, upper_days=10000))
incidence_data.update(get_incidence_data("0-1 years", lower_days=-1, upper_days=365.25))
incidence_data.update(get_incidence_data("1-2 years", lower_days=365.25, upper_days=2 * 365.25))
incidence_data.update(get_incidence_data("2-3 years", lower_days=2 * 365.25, upper_days=3 * 365.25))
incidence_data.update(get_incidence_data("3+ years", lower_days=3 * 365.25, upper_days=10 * 365.25))
data = dict(label=[], person_years=[], subjects=[], failures=[], rate=[], lower_ci=[], upper_ci=[])
for k in incidence_data:
    data["label"].append(k)

for v in incidence_data.values():
    data["person_years"].append(v[0])
    data["subjects"].append(v[1])
    data["failures"].append(v[2])
    data["rate"].append(v[3])
    data["lower_ci"].append(v[4])
    data["upper_ci"].append(v[5])

df_table9 = pd.DataFrame(data={k: v for k, v in data.items() if k != "subjects"})

In [80]:
gt = df_as_great_table(
    df_table9,
    title="Table 10: Incident Rate per 1000 person years",
    subtitle=md("using randomisation to diabetes/last seen"),
)
gt = gt.fmt_number(columns=["person_years", "failures", "rate", "lower_ci", "upper_ci"], decimals=2)
gt = (gt
      .cols_label(
    {"label": "Label", "person_years": "Person years", "failures": "Failures", "rate": "Rate", "lower_ci": "Lower",
     "upper_ci": "Upper"})
      .cols_align(align="left", columns=["label"])
      .cols_align(align="center", columns=["person_years", "failures", "rate", "lower_ci", "upper_ci"])
      .tab_spanner(
    label="95%CI",
    columns=["lower_ci", "upper_ci"],
)
      .tab_source_note(source_note="Excluding patients withdrawn for `late exclusion` criteria")
      )
gt.show()
html_data.append(gt.as_raw_html())

Table 10: Incident Rate per 1000 person years,Table 10: Incident Rate per 1000 person years,Table 10: Incident Rate per 1000 person years,Table 10: Incident Rate per 1000 person years,Table 10: Incident Rate per 1000 person years,Table 10: Incident Rate per 1000 person years
using randomisation to diabetes/last seen,using randomisation to diabetes/last seen,using randomisation to diabetes/last seen,using randomisation to diabetes/last seen,using randomisation to diabetes/last seen,using randomisation to diabetes/last seen
Label,Person years,Failures,Rate,95%CI,95%CI
Label,Person years,Failures,Rate,Lower,Upper
total,3770.41,85.00,22.54,18.01,27.88
0-1 years,1318.11,32.00,24.28,16.61,34.27
1-2 years,1092.10,37.00,33.88,23.85,46.70
2-3 years,712.73,12.00,16.84,8.70,29.41
3+ years,85.64,4.00,46.71,12.73,119.59
Excluding patients withdrawn for `late exclusion` criteria,Excluding patients withdrawn for `late exclusion` criteria,Excluding patients withdrawn for `late exclusion` criteria,Excluding patients withdrawn for `late exclusion` criteria,Excluding patients withdrawn for `late exclusion` criteria,Excluding patients withdrawn for `late exclusion` criteria


In [81]:
# Table 11: Proportion meeting primary endpoint
df_table10 = pd.DataFrame(data=data)
df_table10["proportion"] = df_table10["failures"] / df_table10["subjects"] * 100
gt = df_as_great_table(
    df_table10[["label", "subjects", 'failures', "proportion"]],
    title="Table 11: Proportion meeting primary endpoint",
)
gt = (
    gt
    .fmt_number(columns=["failures", "proportion"], decimals=2)
    .cols_label({"label": "Label", "subjects": "Participants", "failures": "Failures", "proportion": "%"})
    .cols_align(align="left", columns=["label"])
    .cols_align(align="center", columns=["subjects", "failures", "proportion"])
    .tab_source_note(source_note="Excluding patients withdrawn for `late exclusion` criteria")
)
html_data.append(gt.as_raw_html())
gt.show()


Table 11: Proportion meeting primary endpoint,Table 11: Proportion meeting primary endpoint,Table 11: Proportion meeting primary endpoint,Table 11: Proportion meeting primary endpoint
Label,Participants,Failures,%
total,1630,85.00,5.21
0-1 years,1630,32.00,1.96
1-2 years,1430,37.00,2.59
2-3 years,1203,12.00,1.00
3+ years,384,4.00,1.04
Excluding patients withdrawn for `late exclusion` criteria,Excluding patients withdrawn for `late exclusion` criteria,Excluding patients withdrawn for `late exclusion` criteria,Excluding patients withdrawn for `late exclusion` criteria


In [82]:
# Table 11a: End of Study Table (for those who have completed an end of study form)
df_eos = get_eos_df()
offstudy_reasons = {
    "Delivered / Completed followup from pregnancy": "Pregnancy",
    "Patient completed 36 months of follow-up": "Completed 36m",
    "Patient developed diabetes": "Developed diabetes",
    "Other reason (specify below)": "Other",
    "Patient fulfilled late exclusion criteria (due to abnormal blood values or raised blood pressure at enrolment": "Late exclusion",
    "Patient has been transferred to another health centre": "Transferred out",
    "Patient is withdrawn on CLINICAL grounds ...": "Withdrawal: Clinical grounds",
    "Patient lost to follow-up": "LTFU",
    "Patient reported/known to have died": "Died",
    "Patient withdrew consent to participate further": "Withdrawal: Consent",
}
df_eos["offstudy_reason"] = df_eos["offstudy_reason"].map(offstudy_reasons)
df_eos["offstudy_reason"] = pd.Categorical(df_eos["offstudy_reason"],
                                           categories=sorted(list(offstudy_reasons.values())), ordered=True)
df_eos["site_id"] = df_eos["site_id"].astype(str)
df_eos_pivot = (
    df_eos
    .groupby(by=["offstudy_reason", "site_id"], observed=True)
    .size()
    .reset_index()
    .pivot_table(index="offstudy_reason", columns="site_id", values=0, observed=True)
    .fillna(0)
    .astype(int)
    .reset_index()
)
df_eos_pivot["total"] = df_eos_pivot[["10", "20", "30", "40", "60"]].sum(axis=1)
df_eos_pivot.columns.name = ""
sum_row = df_eos_pivot.select_dtypes(include='int64').sum()
sum_row['offstudy_reason'] = 'Total'
sum_row_df = pd.DataFrame(sum_row).T
enrolled_1691_pivot["offstudy_reason"] = "Enrolled"
enrolled_1691_pivot = enrolled_1691_pivot[[*df_eos_pivot.columns]]
df_eos_pivot = pd.concat([enrolled_1691_pivot, df_eos_pivot, sum_row_df], ignore_index=True)

gt = df_as_great_table(
    df_eos_pivot,
    title="Table 12a: End of study report",
    subtitle=md("for those who have completed an End of study report"),
)
gt = (
    gt
    .cols_label(
        {"offstudy_reason": "Reason", **{k: v for k, v in column_headers.items() if k not in ["visit_code", "label"]}})
    .cols_align(align="left", columns=["offstudy_reason"])
    .cols_align(align="center", columns=["10", "20", "30", "40", "60", "total"])
    .tab_style(
        style=[style.fill(color="snow"), style.text(color="black")],
        locations=loc.body(
            columns=[0],
            rows=[len(df_eos_pivot) - 1]),
    )
    .tab_style(
        style=[style.fill(color="lightblue"), style.text(color="black")],
        locations=loc.body(
            columns=["10", "20", "30", "40", "60"],
            rows=[len(df_eos_pivot) - 1],
        ),
    )
    .tab_style(
        style=[style.fill(color="lightgreen"), style.text(color="black")],
        locations=loc.body(
            columns=["total"],
            rows=[len(df_eos_pivot) - 1],
        ),
    )
    .tab_style(
        style=[style.fill(color="snow"), style.text(color="black")],
        locations=loc.body(
            columns=["offstudy_reason"],
            rows=[0],
        ),
    )
)
html_data.append(gt.as_raw_html())
gt.show()


Table 12a: End of study report,Table 12a: End of study report,Table 12a: End of study report,Table 12a: End of study report,Table 12a: End of study report,Table 12a: End of study report,Table 12a: End of study report
for those who have completed an End of study report,for those who have completed an End of study report,for those who have completed an End of study report,for those who have completed an End of study report,for those who have completed an End of study report,for those who have completed an End of study report,for those who have completed an End of study report
Reason,Hindu Mandal,Amana,Temeke,Mwananyamala,Mnazi Moja,Total
Enrolled,185.0,391.0,340.0,545.0,230.0,1691.0
Completed 36m,0.0,4.0,0.0,2.0,0.0,6.0
Developed diabetes,7.0,7.0,3.0,26.0,16.0,59.0
Died,1.0,3.0,0.0,5.0,2.0,11.0
LTFU,0.0,0.0,0.0,11.0,0.0,11.0
Late exclusion,8.0,17.0,12.0,18.0,6.0,61.0
Other,0.0,0.0,0.0,2.0,0.0,2.0
Pregnancy,1.0,2.0,2.0,4.0,2.0,11.0
Transferred out,3.0,3.0,3.0,10.0,0.0,19.0
Withdrawal: Clinical grounds,0.0,2.0,0.0,2.0,0.0,4.0


In [83]:
# Table 12b: Study status
def get_schedule_df(df_subjecthistory: pd.DataFrame, onschedule_model: str, offschedule_model: str,
                    mode: str) -> pd.DataFrame:
    columns = {k: f"{k}_{mode}" for k in ["10", "20", "30", "40", "60"]}
    df_schedule = (
        df_subjecthistory
        .query(
            f"onschedule_model==@onschedule_model and offschedule_model==@offschedule_model and offschedule_datetime.{'isna' if mode == 'on' else 'notna'}()")
        .groupby(by=["onschedule_model", "site_id"])
        .size()
        .reset_index()
        .pivot_table(index="onschedule_model", columns="site_id", values=0, observed=True)
        .reset_index()
        .rename(columns={"onschedule_model": "schedule", **columns})
        .fillna(0)
        .copy()
    )
    df_schedule.columns.name = ""
    return df_schedule


df_subjecthistory = read_frame(SubjectScheduleHistory.objects.all(), verbose=False).rename(columns={"site": "site_id"})
df_subjecthistory["site_id"] = df_subjecthistory["site_id"].astype(str)

df_on = pd.concat([
    get_schedule_df(df_subjecthistory, 'meta_prn.onschedule', 'meta_prn.offschedule', "on"),
    get_schedule_df(df_subjecthistory, 'meta_prn.onscheduledmreferral', 'meta_prn.offscheduledmreferral', "on"),
    get_schedule_df(df_subjecthistory, 'meta_prn.onschedulepregnancy', 'meta_prn.offschedulepregnancy', "on"),
])

df_on = (
    df_on
    .fillna(0)
    .reset_index(drop=True)
)

df_off = pd.concat([
    get_schedule_df(df_subjecthistory, 'meta_prn.onschedule', 'meta_prn.offschedule', "off"),
    get_schedule_df(df_subjecthistory, 'meta_prn.onscheduledmreferral', 'meta_prn.offscheduledmreferral', "off"),
    get_schedule_df(df_subjecthistory, 'meta_prn.onschedulepregnancy', 'meta_prn.offschedulepregnancy', "off"),
])
df_off = (
    df_off
    .fillna(0)
    .reset_index(drop=True)
)

df_status = pd.merge(df_on, df_off, on=["schedule"], how="outer")
columns = []
for ele in [[f"{x}_on", f"{x}_off"] for x in ["10", "20", "30", "40", "60"]]:
    columns.extend(ele)
df_status = df_status[["schedule", *columns]]
df_status["total_on"] = df_status[[col for col in columns if "on" in col]].sum(axis=1)
df_status["total_off"] = df_status[[col for col in columns if "off" in col]].sum(axis=1)
df_status["total"] = df_status[columns].sum(axis=1)
df_status["schedule"] = df_status.schedule.map(
    {"meta_prn.onschedule": "Main trial", "meta_prn.onscheduledmreferral": "Diabetes",
     "meta_prn.onschedulepregnancy": "Pregnancy"})

gt = df_as_great_table(
    df_status,
    title="Table 12b: Study status",
    subtitle=md("Calculated from Offschedule form; not End of study report"),
)
# gt = gt.fmt_number(columns=["person_years", "failures", "rate", "lower_ci", "upper_ci"], decimals=0)
gt = (gt
      .tab_source_note(
    source_note=(
        "Note: Offschedule form is always submitted before the End of study report. "
        "When the Offschedule form is submitted, future appointments for the schedule are removed and "
        "the site staff are actioned to submit the End of study report."
    )
)
      .cols_label({
    "10_on": "On", "10_off": "Off",
    "20_on": "On", "20_off": "Off",
    "30_on": "On", "30_off": "Off",
    "40_on": "On", "40_off": "Off",
    "60_on": "On", "60_off": "Off",
    "total_on": "On", "total_off": "Off",
    "schedule": "Schedule", "total": "Total"})
      .cols_align(align="center")
      .cols_align(align="left", columns=["label"])
      .tab_spanner(
    label="Hindu mandal",
    columns=["10_on", "10_off"],
)
      .tab_spanner(
    label="Amana",
    columns=["20_on", "20_off"],
)
      .tab_spanner(
    label="Temeke",
    columns=["30_on", "30_off"],
)
      .tab_spanner(
    label="Mwananyamala",
    columns=["40_on", "40_off"],
)
      .tab_spanner(
    label="Mnazi Moja",
    columns=["60_on", "60_off"],
)
      .tab_spanner(
    label="Total",
    columns=["total_on", "total_off"],
)
      .tab_style(
    style=[style.fill(color="lightblue"), style.text(color="black")],
    locations=loc.body(
        columns=["10_off", "20_off", "30_off", "40_off", "60_off"],
        rows=list(range(0, 1)),
    ),
)
      .tab_style(
    style=[style.fill(color="lightgreen"), style.text(color="black")],
    locations=loc.body(
        columns=["total_off"],
        rows=list(range(0, 1)),
    ),
)
      .fmt_number(columns=[*[c for c in df_status.columns if c not in ["schedule"]]], decimals=0)
      )
html_data.append(gt.as_raw_html())
gt.show()

Table 12b: Study status,Table 12b: Study status,Table 12b: Study status,Table 12b: Study status,Table 12b: Study status,Table 12b: Study status,Table 12b: Study status,Table 12b: Study status,Table 12b: Study status,Table 12b: Study status,Table 12b: Study status,Table 12b: Study status,Table 12b: Study status,Table 12b: Study status
Calculated from Offschedule form; not End of study report,Calculated from Offschedule form; not End of study report,Calculated from Offschedule form; not End of study report,Calculated from Offschedule form; not End of study report,Calculated from Offschedule form; not End of study report,Calculated from Offschedule form; not End of study report,Calculated from Offschedule form; not End of study report,Calculated from Offschedule form; not End of study report,Calculated from Offschedule form; not End of study report,Calculated from Offschedule form; not End of study report,Calculated from Offschedule form; not End of study report,Calculated from Offschedule form; not End of study report,Calculated from Offschedule form; not End of study report,Calculated from Offschedule form; not End of study report
Schedule,Hindu mandal,Hindu mandal,Amana,Amana,Temeke,Temeke,Mwananyamala,Mwananyamala,Mnazi Moja,Mnazi Moja,Total,Total,Total
Schedule,On,Off,On,Off,On,Off,On,Off,On,Off,On,Off,Total
Main trial,153,32,335,56,312,28,432,113,184,46,1416,275,1691
Diabetes,5,3,2,1,0,1,9,22,5,11,21,38,59
Pregnancy,2,1,2,3,3,2,0,8,2,6,9,20,29
"Note: Offschedule form is always submitted before the End of study report. When the Offschedule form is submitted, future appointments for the schedule are removed and the site staff are actioned to submit the End of study report.","Note: Offschedule form is always submitted before the End of study report. When the Offschedule form is submitted, future appointments for the schedule are removed and the site staff are actioned to submit the End of study report.","Note: Offschedule form is always submitted before the End of study report. When the Offschedule form is submitted, future appointments for the schedule are removed and the site staff are actioned to submit the End of study report.","Note: Offschedule form is always submitted before the End of study report. When the Offschedule form is submitted, future appointments for the schedule are removed and the site staff are actioned to submit the End of study report.","Note: Offschedule form is always submitted before the End of study report. When the Offschedule form is submitted, future appointments for the schedule are removed and the site staff are actioned to submit the End of study report.","Note: Offschedule form is always submitted before the End of study report. When the Offschedule form is submitted, future appointments for the schedule are removed and the site staff are actioned to submit the End of study report.","Note: Offschedule form is always submitted before the End of study report. When the Offschedule form is submitted, future appointments for the schedule are removed and the site staff are actioned to submit the End of study report.","Note: Offschedule form is always submitted before the End of study report. When the Offschedule form is submitted, future appointments for the schedule are removed and the site staff are actioned to submit the End of study report.","Note: Offschedule form is always submitted before the End of study report. When the Offschedule form is submitted, future appointments for the schedule are removed and the site staff are actioned to submit the End of study report.","Note: Offschedule form is always submitted before the End of study report. When the Offschedule form is submitted, future appointments for the schedule are removed and the site staff are actioned to submit the End of study report.","Note: Offschedule form is always submitted before the End of study report. When the Offschedule form is submitted, future appointments for the schedule are removed and the site staff are actioned to submit the End of study report.","Note: Offschedule form is always submitted before the End of study report. When the Offschedule form is submitted, future appointments for the schedule are removed and the site staff are actioned to submit the End of study report.","Note: Offschedule form is always submitted before the End of study report. When the Offschedule form is submitted, future appointments for the schedule are removed and the site staff are actioned to submit the End of study report.","Note: Offschedule form is always submitted before the End of study report. When the Offschedule form is submitted, future appointments for the schedule are removed and the site staff are actioned to submit the End of study report."


In [84]:
# off schedule no eos

subjects_preg_dm = df_subjecthistory[~(df_subjecthistory.offschedule_datetime.isna()) & (
        df_subjecthistory.schedule_name != "schedule")].subject_identifier

df_subjecthistory[
    ~(df_subjecthistory.subject_identifier.isin(df_eos_1691.subject_identifier))].sort_values(
    by=["subject_identifier", "onschedule_datetime"])

Unnamed: 0,revision,created,modified,user_created,user_modified,hostname_created,hostname_modified,device_created,device_modified,locale_created,...,id,subject_identifier,site_id,visit_schedule_name,schedule_name,onschedule_model,offschedule_model,onschedule_datetime,offschedule_datetime,schedule_status
1665,0.1.61-233-g493c02f:develop:493c02fa1931cfe26b...,2021-11-16 09:19:59.916145+00:00,2021-11-16 09:19:59.916145+00:00,,,meta3,meta3,99,99,,...,f2137437-19c6-458c-b115-f4bfc14f347c,105-10-0001-4,10,visit_schedule,schedule,meta_prn.onschedule,meta_prn.offschedule,2021-11-16 09:19:07+00:00,NaT,onschedule
1378,0.1.61-233-g493c02f:develop:493c02fa1931cfe26b...,2021-11-30 09:05:58.150673+00:00,2021-11-30 09:05:58.150673+00:00,,,meta3,meta3,99,99,,...,cba2dd82-7354-41cc-9c69-e4ca7ae223aa,105-10-0004-8,10,visit_schedule,schedule,meta_prn.onschedule,meta_prn.offschedule,2021-11-30 09:02:08+00:00,NaT,onschedule
1562,0.1.61-233-g493c02f:develop:493c02fa1931cfe26b...,2021-12-02 09:18:45.001285+00:00,2021-12-02 09:18:45.001285+00:00,,,meta3,meta3,99,99,,...,e3088404-6b95-4165-b3e2-3acf5ad35775,105-10-0005-5,10,visit_schedule,schedule,meta_prn.onschedule,meta_prn.offschedule,2021-12-02 09:16:19+00:00,NaT,onschedule
459,0.1.61-233-g493c02f:develop:493c02fa1931cfe26b...,2021-12-03 08:16:42.984813+00:00,2021-12-03 08:16:42.984813+00:00,,,meta3,meta3,99,99,,...,43d5e0ef-2cc9-4a9f-bf1e-ae8e493f87da,105-10-0007-1,10,visit_schedule,schedule,meta_prn.onschedule,meta_prn.offschedule,2021-12-03 08:08:44+00:00,NaT,onschedule
1064,0.1.61-233-g493c02f:develop:493c02fa1931cfe26b...,2021-12-06 10:10:12.319320+00:00,2021-12-06 10:10:12.319320+00:00,,,meta3,meta3,99,99,,...,a12bbbfa-92c0-4b0b-87d5-88f8b0d58d21,105-10-0008-9,10,visit_schedule,schedule,meta_prn.onschedule,meta_prn.offschedule,2021-12-06 10:08:43+00:00,NaT,onschedule
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1371,0.2.41:main:fec3ea86c27d8bcdb2047bdba4665843dd...,2023-11-29 08:19:21.460863+00:00,2023-11-29 08:19:21.460863+00:00,,,meta3,meta3,99,99,,...,cac75984-04fc-4500-9594-d0e0b06af953,105-60-0226-2,60,visit_schedule,schedule,meta_prn.onschedule,meta_prn.offschedule,2023-11-29 08:17:23+00:00,NaT,onschedule
566,0.2.41:main:fec3ea86c27d8bcdb2047bdba4665843dd...,2023-11-30 08:29:53.589314+00:00,2023-11-30 08:29:53.589314+00:00,,,meta3,meta3,99,99,,...,527719d6-15d7-4218-b7fc-8b8802d4e0f2,105-60-0227-0,60,visit_schedule,schedule,meta_prn.onschedule,meta_prn.offschedule,2023-11-30 08:26:43+00:00,NaT,onschedule
1613,0.2.41:main:fec3ea86c27d8bcdb2047bdba4665843dd...,2023-12-21 08:48:26.036598+00:00,2023-12-21 08:48:26.036598+00:00,,,meta3,meta3,99,99,,...,eac04e0e-7286-460f-b4f6-cf0e00904e1e,105-60-0228-8,60,visit_schedule,schedule,meta_prn.onschedule,meta_prn.offschedule,2023-12-21 08:45:47+00:00,NaT,onschedule
1535,0.2.41:main:fec3ea86c27d8bcdb2047bdba4665843dd...,2023-12-28 09:01:03.816240+00:00,2023-12-28 09:01:03.816240+00:00,,,meta3,meta3,99,99,,...,df7c06e5-26d0-4f1e-aae6-47ce81a5302c,105-60-0229-6,60,visit_schedule,schedule,meta_prn.onschedule,meta_prn.offschedule,2023-12-28 08:58:26+00:00,NaT,onschedule


In [85]:
# Table 13: Loss to Follow Up
df_ltfu = read_frame(LossToFollowup.objects.all(), verbose=False).rename(columns={"site": "site_id"})
df_ltfu_pivot = (
    df_ltfu
    .groupby(by=["loss_category", "site_id"], observed=True, dropna=False)
    .size()
    .reset_index()
    .pivot_table(index="loss_category", columns="site_id", values=0, observed=True, dropna=False)
    .fillna(0)
    .astype(int)
    .reset_index()
)
df_ltfu_pivot["total"] = df_eos_pivot[["10", "20", "30", "40", "60"]].sum(axis=1)
df_ltfu_pivot.columns.name = ""
sum_row = df_ltfu_pivot.select_dtypes(include='int64').sum()
sum_row['loss_category'] = 'Total'
sum_row_df = pd.DataFrame(sum_row).T
df_ltfu_pivot = pd.concat([df_ltfu_pivot, sum_row_df], ignore_index=True)
df_ltfu_pivot


Unnamed: 0,loss_category,60,total
0,OTHER,1,1691.0
1,Total,1,


In [86]:
# Table 13c: End of study report not submitted

df1 = (
    df_status
    .query("schedule=='Main trial'")[[col for col in columns if "off" in col]]
    .rename(columns=dict(zip([col for col in columns if "off" in col], ["10", "20", "30", "40", "60"], strict=False)))
    .reset_index(drop=True)
)
df2 = (
    df_eos_pivot
    .query("offstudy_reason=='Total'")[["10", "20", "30", "40", "60"]]
    .reset_index(drop=True)
)

df_eos_not_reported = df1 - df2
df_eos_not_reported["schedule"] = 'Main trial'
df_eos_not_reported["total"] = df_eos_not_reported[["10", "20", "30", "40", "60"]].sum(axis=1)
df_eos_not_reported = df_eos_not_reported[["schedule", "10", "20", "30", "40", "60", "total"]]

gt = df_as_great_table(
    df_eos_not_reported,
    title="Table 13c: End of study report not submitted",
    subtitle=md("End of study report expected based on Offschedule form"),
)
gt = (
    gt
    .cols_label(
        {"schedule": "Schedule", **{k: v for k, v in column_headers.items() if k not in ["visit_code", "label"]}})
    .cols_align(align="left", columns=["schedule"])
    .cols_align(align="center", columns=["10", "20", "30", "40", "60", "total"])
    .tab_style(
        style=[style.fill(color="snow"), style.text(color="black")],
        locations=loc.body(
            columns=[0],
            rows=[len(df_eos_pivot) - 1]),
    )
    .tab_style(
        style=[style.fill(color="lightblue"), style.text(color="black")],
        locations=loc.body(
            columns=["10", "20", "30", "40", "60"],
            rows=[len(df_eos_pivot) - 1],
        ),
    )
    .tab_style(
        style=[style.fill(color="lightgreen"), style.text(color="black")],
        locations=loc.body(
            columns=["total"],
            rows=[len(df_eos_pivot) - 1],
        ),
    )
)
html_data.append(gt.as_raw_html())
gt.show()


Table 13c: End of study report not submitted,Table 13c: End of study report not submitted,Table 13c: End of study report not submitted,Table 13c: End of study report not submitted,Table 13c: End of study report not submitted,Table 13c: End of study report not submitted,Table 13c: End of study report not submitted
End of study report expected based on Offschedule form,End of study report expected based on Offschedule form,End of study report expected based on Offschedule form,End of study report expected based on Offschedule form,End of study report expected based on Offschedule form,End of study report expected based on Offschedule form,End of study report expected based on Offschedule form
Schedule,Hindu Mandal,Amana,Temeke,Mwananyamala,Mnazi Moja,Total
Main trial,7.0,7.0,4.0,15.0,17.0,50.0


In [87]:
# Table 14: Baseline Sample

In [88]:
# Table 15: Consented to extended followup
df_consented = (
    read_frame(SubjectConsentV1Ext.objects.all(), verbose=False)
    .query("agrees_to_extension==@YES")
    .rename(columns={"site": "site_id"})
)
df_consented["site_id"] = df_consented.site_id.astype(str)
df_consented["month"] = df_consented.report_datetime.dt.strftime("%m")
df_consented["year"] = df_consented.report_datetime.dt.strftime("%Y")
df_consented_grp = (
    df_consented.groupby(by=["site_id", "year", "month"]).
    size()
    .reset_index()
    .sort_values(by=["site_id", "year", "month"], ascending=True)
    .reset_index(drop=True)
)
df_consented_pivot = (
    df_consented_grp
    .pivot_table(index=["year", "month"], columns="site_id", values=0, aggfunc="sum")
    .reset_index()
    .fillna(0)
)
if "60" not in df_consented_pivot.columns:
    df_consented_pivot["60"] = 0.0 * len(df_consented_pivot)
df_consented_pivot.columns.name = ""
df_consented_pivot["year"] = df_consented_pivot["year"].astype(str)
df_consented_pivot["month"] = df_consented_pivot["month"].astype(str)

sum_row = df_consented_pivot[["10", "20", "30", "40", "60"]].sum()
sum_row['year'] = "Total"
sum_row['month'] = ""
df_consented_pivot = pd.concat([df_consented_pivot, sum_row.to_frame().T], ignore_index=True)
df_consented_pivot["total"] = df_consented_pivot[["10", "20", "30", "40", "60"]].sum(axis=1).astype(int)
df_consented_pivot[["10", "20", "30", "40", "60"]] = df_consented_pivot[["10", "20", "30", "40", "60"]].astype(int)
gt = df_as_great_table2(
    df_consented_pivot,
    title="Table 15: Consented to extended followup",
    rowname_col="month",
    groupname_col="year",
)
gt = (
    gt
    .cols_label({"year": "Year", "month": "Month",
                 **{k: v for k, v in column_headers.items() if k not in ["visit_code", "label"]}})
    .cols_align(align="center")
    .fmt_number(columns=["10", "20", "30", "40", "60", "total"], decimals=0)
    .tab_stubhead(label="Consented")
    .tab_style(
        style=[
            style.text(color="black", weight="bold"),
            style.fill(color="lightgray")
        ],
        locations=loc.row_groups()
    )
)
html_data.append(gt.as_raw_html())
gt.show()

Table 15: Consented to extended followup,Table 15: Consented to extended followup,Table 15: Consented to extended followup,Table 15: Consented to extended followup,Table 15: Consented to extended followup,Table 15: Consented to extended followup,Table 15: Consented to extended followup
Consented,Hindu Mandal,Amana,Temeke,Mwananyamala,Mnazi Moja,Total
2024,2024,2024,2024,2024,2024,2024
12,1,9,4,7,0,21
2025,2025,2025,2025,2025,2025,2025
01,4,11,8,8,0,31
02,4,6,6,9,0,25
03,12,12,8,2,0,34
04,1,14,7,11,0,33
05,2,24,13,37,0,76
06,8,17,12,33,0,70
07,3,12,21,19,5,60


In [89]:
# gather raw html
raw_html = [f'<div class="page-break">{s}</div>' for s in html_data]
style_css = """
<style>
  .page-break {
    page-break-inside: avoid; /* Always add page break before this element */
  }
  .table-header {
    font-weight: bold;
    font-size: 18px;
    text-align: center;
    border-bottom: None;
  }
</style>
"""
raw_html = ''.join(raw_html)
raw_html = f'<!DOCTYPE html>\n<html lang="en">\n{style_css}\n<head>\n<meta charset="utf-8"/>\n</head>\n<body>\n' + document_title + raw_html + '\n</body>\n</html>\n'

In [90]:
# render html to PDF
pdfkit.from_string(raw_html, str(analysis_folder / pdf_filename),
                   options={
                       'footer-center': 'Page [page] of [topage]',
                       'footer-font-size': '8',
                       'footer-spacing': '5',
                       'encoding': "UTF-8",
                       'margin-top': '10mm',
                       'margin-right': '15mm',
                       'margin-bottom': '15mm',
                       'margin-left': '15mm',
                       'header-center': study_title,
                       'header-font-size': '6',
                       'header-spacing': '0',
                       'disable-javascript': None,
                       'no-outline': None,
                   },
                   verbose=True)

Loading pages (1/6)
Counting pages (2/6)                                               
Resolving links (4/6)                                                       
Loading headers and footers (5/6)                                           
Printing pages (6/6)
Done                                                                        


True