In [None]:
%%capture
import os
import pandas as pd
import numpy as np
from dj_notebook import activate
from pathlib import Path

# Runtime configuration is read from environment variables; indexing
# os.environ raises KeyError immediately if one of them is not set.
env_file = os.environ["META_ENV"]
# NOTE(review): reports_folder is not referenced anywhere else in the visible notebook.
reports_folder = Path(os.environ["META_REPORTS_FOLDER"])
# All CSV/TXT exports below are written into this folder.
analysis_folder = Path(os.environ["META_ANALYSIS_FOLDER"])
# NOTE(review): presumably this boots the Django project described by the
# dotenv file so the model imports below work -- confirm against dj_notebook.
plus = activate(dotenv_file=env_file)


In [None]:
"""
Magreth data request
* Medication adherence statistics
* Median (interquartile) age of the cohort
* Median follow-up duration and range
"""

In [None]:
from meta_consent.models import SubjectConsent
from edc_pdutils.dataframes import get_subject_consent, get_subject_visit, get_eos
from edc_constants.constants import FEMALE, MALE
from edc_pdutils.dataframes import get_crf
from datetime import datetime

In [None]:
# Data cutoff: midnight at the start of 1 February 2025.
cutoff_datetime = datetime(2025, 2, 1)

In [None]:
# Consent data as a DataFrame; the cells below read its subject_identifier,
# gender and age_in_years columns.
df_consent = get_subject_consent(model_cls=SubjectConsent)

In [None]:
# df_consent

In [None]:
# Number of consent rows per gender value.
df_consent.gender.value_counts()

In [None]:
# Summary statistics (count, mean, std, quartiles, min, max) of age at consent.
df_consent.age_in_years.describe()


In [None]:
# Subject visits dated strictly before the cutoff, re-indexed from 0.
df_visit = (
    get_subject_visit("meta_subject.subjectvisit")
    .loc[lambda visits: visits.visit_datetime < pd.Timestamp(cutoff_datetime)]
    .reset_index(drop=True)
)

In [None]:
# Inspect the column dtypes of the visit frame.
df_visit.dtypes

In [None]:

# Attach the end-of-study date and reason to every visit row of the subject.
# The left join keeps visits of subjects without an end-of-study record
# (their offstudy_* values are missing).
df_eos = get_eos("meta_prn.endofstudy")
df_visit = df_visit.merge(
    df_eos[["subject_identifier", "offstudy_datetime", "offstudy_reason"]],
    on="subject_identifier",
    how="left",
)


In [None]:
def get_cells_for_categorical(df:pd.DataFrame, col:str, categories:list[str]|None=None, arm:str|None=None)->list[str]:
    if arm:
        n = len(df[(df['assignment']==arm) & (df[col].notna())])
        counts = df[(df['assignment'] == arm) & (df[col].notna())][col].value_counts()
        percentages = df[(df['assignment'] == arm) & (df[col].notna())][col].value_counts(normalize=True) * 100
    else:
        n = len(df[(df[col].notna())])
        counts = df[(df[col].notna())][col].value_counts()
        percentages = df[(df[col].notna())][col].value_counts(normalize=True) * 100
    cells = [n]
    for cat in categories:
        cells.append(f"{counts.get(cat, 0)} ({percentages.get(cat, 0):.1f}%)",)
    return cells

def get_cells_for_continuous(df)->list[str]:
    """Turn a ``describe()`` result into three display cells.

    ``df`` is indexed by the statistic names ``count``, ``mean``, ``std``,
    ``50%``, ``min`` and ``max``. The returned cells are, in order:

        n                     e.g. ``930``
        mean(sd)              e.g. ``127.69(16.84)``
        median(min–max)       e.g. ``127.00(82.00–183.00)``
    """
    n_cell = str(int(df['count']))
    mean_sd_cell = "{:.2f}({:.2f})".format(df['mean'], df['std'])
    median_range_cell = "{:.2f}({:.2f}–{:.2f})".format(df['50%'], df['min'], df['max'])
    return [n_cell, mean_sd_cell, median_range_cell]

def get_formatted_rows(df, col:str|None=None):
    """Summarise one continuous column as two table columns.

    Rows where ``col`` is missing are dropped first. The result maps
    ``'Statistics'`` to the three row labels and ``'All'`` to the matching
    formatted cells (n, mean(sd), median(min-max)).
    """
    observed = df[df[col].notna()].copy()
    overall_summary = observed[col].describe()

    row_labels = ['n', 'Mean(sd)', 'Median(min-max)']
    return {
        'Statistics': row_labels,
        'All': get_cells_for_continuous(overall_summary),
    }

def get_formatted_rows_mf(df, col:str|None=None):
    """Summarise one continuous column overall and split by gender.

    Rows where ``col`` is missing are dropped first. The result has four
    table columns: ``'Statistics'`` (row labels) plus ``'All'``, ``'Female'``
    and ``'Male'``, each holding the formatted n, mean(sd) and
    median(min-max) cells. ``df`` must have a ``gender`` column.
    """
    observed = df[df[col].notna()].copy()

    def summarise(subset):
        # describe() the column for this subset and format it as three cells
        return get_cells_for_continuous(subset[col].describe())

    return {
        'Statistics': ['n', 'Mean(sd)', 'Median(min-max)'],
        'All': summarise(observed),
        'Female': summarise(observed[observed.gender==FEMALE]),
        'Male': summarise(observed[observed.gender==MALE]),
    }


In [None]:
def days_on_study(s):
    """Return the ``.days`` component of endline visit minus baseline.

    ``s`` is a row-like mapping with ``endline_visit_datetime`` and
    ``baseline_datetime`` entries.
    """
    elapsed = s["endline_visit_datetime"] - s["baseline_datetime"]
    return elapsed.days
# Row-wise apply (axis=1): days_on_study is called once per visit row.
df_visit["days_on_study"] = df_visit.apply(days_on_study, axis=1)

In [None]:
# Approximate months by treating every month as 30 days; fractional months are kept.
df_visit["months_on_study"] = df_visit["days_on_study"]/30

In [None]:


# Follow-up summary: age at consent, then for each visit window the longest
# months_on_study per subject, summarised for All / Female / Male.

def _followup_rows(visit_mask, category, parameter):
    """Build one three-row section of the follow-up table.

    Args:
        visit_mask: boolean Series aligned with ``df_visit`` selecting the
            visits that belong to the section.
        category: label for the 'Category' column (first row only).
        parameter: label for the 'Parameter' column (first row only).

    Returns:
        dict of table columns: 'Category', 'Parameter', then the columns
        produced by ``get_formatted_rows_mf`` for ``months_on_study``.
    """
    # one row per subject: the largest months_on_study among the selected
    # visits, with gender attached from the consent data
    df_months = (
        df_visit[visit_mask]
        .groupby(by=['subject_identifier'])["months_on_study"]
        .max()
        .to_frame()
        .reset_index()
        .merge(df_consent[['subject_identifier', 'gender']], on='subject_identifier', how='left')
    )
    return {
        'Category': [category, '', ''],
        'Parameter': [parameter, '', ''],
        **get_formatted_rows_mf(df_months, "months_on_study"),
    }


# age at consent, all consented subjects
table1a = {
    'Category': ['Age at consent', '', ''],
    'Parameter': ['Age (Years)', '', ''],
    **get_formatted_rows_mf(df_consent, "age_in_years"),
}

# Main cohort: visit codes up to 1480.9.
# NOTE(review): the fractional part presumably encodes unscheduled visits
# that follow the 1480 (48m) visit -- confirm.
table1 = _followup_rows(df_visit.visit_code <= 1480.9, 'Follow-up (main)', '>= 0m')

# has a visit at or after the 12m visit (1120)
table2a = _followup_rows(
    (df_visit.visit_code >= 1120.0) & (df_visit.visit_code <= 1480.9), '', '>= 12m')

# has a visit at or after the 24m visit (1240)
table2b = _followup_rows(
    (df_visit.visit_code >= 1240.0) & (df_visit.visit_code <= 1480.9), '', '>= 24m')

# has a visit at or after the 36m visit (1360)
table3a = _followup_rows(
    (df_visit.visit_code >= 1360.0) & (df_visit.visit_code <= 1480.9), '', '>= 36m')

# has a visit strictly after the 36m visit
table3b = _followup_rows(
    (df_visit.visit_code > 1360.0) & (df_visit.visit_code <= 1480.9), '', '> 36m')

# has the 48m visit itself (1480.0 exactly)
table3c = _followup_rows(df_visit.visit_code == 1480.0, '', '48m')

# visit codes in [2000, 3000)
table4 = _followup_rows(
    (df_visit.visit_code >= 2000.0) & (df_visit.visit_code < 3000.0), 'Pregnancy cohort', 'months')

# visit codes from 3000 upwards
table5 = _followup_rows(df_visit.visit_code >= 3000.0, 'Diabetes cohort', 'months')

# subjects with an end-of-study record
# NOTE: this may need to be the delta from baseline to offstudy_datetime instead of to endline_visit_datetime
table6 = _followup_rows(df_visit.offstudy_datetime.notna(), 'Offstudy (main)', 'months')


table1a_df = pd.DataFrame(table1a)
table1_df = pd.DataFrame(table1)
table2a_df = pd.DataFrame(table2a)
table2b_df = pd.DataFrame(table2b)
table3a_df = pd.DataFrame(table3a)
table3b_df = pd.DataFrame(table3b)
table3c_df = pd.DataFrame(table3c)
table4_df = pd.DataFrame(table4)
table5_df = pd.DataFrame(table5)
table6_df = pd.DataFrame(table6)
table_df = pd.concat([
    table1a_df, table1_df, table2a_df, table2b_df, table3a_df,
    table3b_df, table3c_df, table4_df, table5_df, table6_df,
])

# export as csv
path = analysis_folder / 'meta3_magreth_followup.csv'
table_df.to_csv(path_or_buf=path, index=False)



In [None]:
from tabulate import tabulate

# Render the follow-up table as a text grid, save it and show it.
table_formatted = tabulate(table_df, headers='keys', tablefmt='grid')

path = analysis_folder / 'meta3_magreth_followup.txt'
# The median cells contain an en dash ("–"), so the encoding is pinned
# instead of relying on the platform default.
with open(path, 'w', encoding='utf-8') as file:
    file.write(table_formatted)

print(table_formatted)


In [None]:
# medication adherence

In [None]:
# Medication adherence CRF rows whose visit is dated strictly before the
# cutoff, re-indexed from 0.
df_adherence = (
    get_crf(model="meta_subject.medicationadherence", subject_visit_model="meta_subject.subjectvisit")
    .loc[lambda crf: crf.visit_datetime < pd.Timestamp(cutoff_datetime)]
    .reset_index(drop=True)
)


In [None]:
# Attach gender and age at consent to every adherence row; the left join
# keeps all adherence rows.
# NOTE(review): this cell is not idempotent -- running it a second time merges
# again and pandas suffixes the duplicated columns (gender_x / gender_y, ...).
df_adherence = df_adherence.merge(df_consent[['subject_identifier', 'gender', "age_in_years"]], on='subject_identifier', how='left')

In [None]:
# calculate mean per subject by visit
# Mean confirmed visual score per subject and visit code, with gender and
# age at consent attached.
mean_visual_score_by_visit = (
    df_adherence
    .groupby(by=['subject_identifier', 'visit_code'])['visual_score_confirmed']
    .mean()
    .to_frame()
    .reset_index()
    .merge(df_consent[['subject_identifier', 'gender', 'age_in_years']], on='subject_identifier', how='left')
)

# Mean confirmed visual score per subject over all of the subject's adherence
# rows, with gender and age at consent attached.
mean_visual_score_confirmed = (
    df_adherence
    .groupby(by=['subject_identifier'])['visual_score_confirmed']
    .mean()
    .to_frame()
    .reset_index()
    .merge(df_consent[['subject_identifier', 'gender', 'age_in_years']], on='subject_identifier', how='left')
)


In [None]:
#
def _adherence_rows(scores, parameter, category=''):
    """Build one three-row adherence section from a frame of visual scores."""
    return {
        'Category': [category, '', ''],
        'Parameter': [parameter, '', ''],
        **get_formatted_rows_mf(scores, "visual_score_confirmed"),
    }


def _scores_at(visit_code):
    """Per-subject mean scores recorded at exactly this visit code."""
    return mean_visual_score_by_visit[mean_visual_score_by_visit.visit_code==visit_code]


# adherence at selected visits
table0 = _adherence_rows(_scores_at(1005.0), 'at 2 weeks (%)')
table1 = _adherence_rows(_scores_at(1120.0), 'at 12m (%)')
table2 = _adherence_rows(_scores_at(1240.0), 'at 24m (%)')
table3 = _adherence_rows(_scores_at(1360.0), 'at 36m (%)')
table4 = _adherence_rows(_scores_at(1480.0), 'at 48m (%)')

# overall: summary of the per-subject mean scores
table5 = _adherence_rows(mean_visual_score_confirmed, 'mean of means %', category='Adherence')


In [None]:

# One frame per adherence section, stacked below the age-at-consent section
# with the overall 'Adherence' section placed first.
table0_df, table1_df, table2_df, table3_df, table4_df, table5_df = (
    pd.DataFrame(section)
    for section in (table0, table1, table2, table3, table4, table5)
)
table_df = pd.concat([table1a_df, table5_df, table0_df, table1_df, table2_df, table3_df, table4_df])
table_df


In [None]:
# Save the adherence table as CSV and as a text grid.
table = tabulate(table_df, headers='keys', tablefmt='grid')
path = analysis_folder / 'meta3_magreth_adherence.csv'
table_df.to_csv(path_or_buf=path, index=False)

path = analysis_folder / 'meta3_magreth_adherence.txt'
# The median cells contain an en dash ("–"), so the encoding is pinned
# instead of relying on the platform default.
with open(path, 'w', encoding='utf-8') as file:
    file.write(table)


In [None]:
# Separate frame indexed by visit_datetime; df_adherence itself is left untouched.
df = df_adherence.set_index('visit_datetime')



In [None]:
# Pairwise correlation between pill count and the confirmed visual score,
# using DataFrame.corr() with its default method.
correlation = df[['pill_count', 'visual_score_confirmed']].corr()
print(correlation)

In [None]:
# Quick look at the raw pill counts, plotted against the frame's index.
df[['pill_count']].plot()

In [None]:
# Mean confirmed visual score per visit code, restricted to rows with
# visit_code_sequence == 0; the result is a one-column frame indexed by visit_code.
df1 = (
    df[df.visit_code_sequence==0]
    .groupby("visit_code")["visual_score_confirmed"]
    .mean()
    .to_frame()
)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Scatter of the mean confirmed visual score at each visit code.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df1, x='visit_code', y='visual_score_confirmed', ax=ax)
ax.set_title('Scatter Plot Visual Score Confirmed')
ax.set_xlabel('visit')
ax.set_ylabel('Visual Score Confirmed')
plt.show()


In [None]:
# Mean confirmed visual score per visit code over all rows (no
# visit_code_sequence filter). This rebinds df1 from a DataFrame to a Series.
df1 = df.groupby("visit_code")["visual_score_confirmed"].mean()


In [None]:
# Show the current value of df1.
df1

In [None]:
# Mean pill count per visit code, restricted to rows with visit_code_sequence == 0.
# The former `df2` copy (re-indexed and sorted by visit_code) was never read,
# so it has been removed; groupby already returns the visit codes in sorted order.
mean_pill_count = df[df.visit_code_sequence==0].groupby("visit_code")["pill_count"].mean().to_frame().reset_index()



In [None]:
import matplotlib.pyplot as plt
# Line plot of the mean pill count per visit code.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(mean_pill_count['visit_code'], mean_pill_count['pill_count'], marker='o')
ax.set_title('Pill count')
ax.set_xlabel('visit_code')
ax.set_ylabel('pills')
# only visit codes between 1000 and 1400 are shown
ax.set_xlim(1000, 1400)
plt.show()
