In [None]:
%%capture
import os
from pathlib import Path

import pandas as pd
from dj_notebook import activate

# All locations come from environment variables; a missing variable raises
# KeyError here. They are read before activate() is called.
env_file = os.environ["META_ENV"]
reports_folder, analysis_folder, pharmacy_folder = (
    Path(os.environ[variable])
    for variable in (
        "META_REPORTS_FOLDER",
        "META_ANALYSIS_FOLDER",
        "META_PHARMACY_FOLDER",
    )
)

# Activate dj_notebook using the dotenv file named by META_ENV.
plus = activate(dotenv_file=env_file)

# Opt in to the pandas "future.no_silent_downcasting" behaviour for the session.
pd.set_option("future.no_silent_downcasting", True)

In [None]:
from datetime import date

from edc_pdutils.dataframes import get_crf, get_subject_consent, get_subject_visit

from meta_analytics.dataframes import get_screening_df

In [None]:
# Cutoff date, 31 May 2025.
# NOTE(review): not referenced by any other cell visible in this notebook.
cutoff_date = date(year=2025, month=5, day=31)

In [None]:
# Load every subject visit, then keep only the rows with visit code 1000.0
# (presumably the baseline visit -- confirm against the visit schedule).
subject_visits = get_subject_visit("meta_subject.subjectvisit")
df_visit = subject_visits.query("visit_code==1000.0").reset_index(drop=True)

In [None]:
# Attach gender, date of birth, age and consent date from the consent model.
consent_fields = [
    "subject_identifier",
    "gender",
    "dob",
    "age_in_years",
    "consent_datetime",
]
df_consent = get_subject_consent(model="meta_consent.subjectconsent")[consent_fields]

# Left merge so every visit row is kept even without a matching consent row.
df_visit = df_visit.merge(df_consent, how="left", on="subject_identifier")

# Nullable integer for age (unmatched rows are missing); plain datetime for dob.
df_visit = df_visit.astype({"age_in_years": "Int64", "dob": "datetime64[ns]"})

# Keep only the date part of the consent timestamp (time set to midnight).
df_visit["consent_datetime"] = df_visit["consent_datetime"].dt.normalize()

In [None]:
# Screening measurements carried into the visit dataframe.
# gender and age_in_years are intentionally not taken from screening.
columns = [
    "subject_identifier",
    "ethnicity",
    "fbg_value",
    "fbg_date",
    "fbg_units",
    "bmi",
    "weight",
    "height",
    "severe_htn",
    "dia_blood_pressure_avg",
    "sys_blood_pressure_avg",
    "waist_circumference",
]

# Numeric screening measurements stored as nullable floats.
float_columns = [
    "fbg_value",
    "bmi",
    "height",
    "weight",
    "waist_circumference",
    "dia_blood_pressure_avg",
    "sys_blood_pressure_avg",
]

# Minimum fasting duration (8 hours, in seconds) for a glucose result to be kept.
MIN_FASTING_SECONDS = 8 * 60 * 60

# Restrict screening rows to subject identifiers starting with "105-" and
# rename the BMI / FBG timestamp columns to the names used in the output.
df_screening = (
    get_screening_df()
    .query("subject_identifier.str.startswith('105-')")
    .rename(columns={"calculated_bmi_value": "bmi", "fbg_datetime": "fbg_date"})
    .reset_index()
)
df_screening = df_screening.astype({name: "Float64" for name in float_columns})

# Drop timezone info and the time-of-day component from the FBG timestamp.
df_screening["fbg_date"] = df_screening["fbg_date"].dt.tz_localize(None).dt.normalize()

# Blank the glucose result where the recorded fasting duration is too short.
# Rows with a missing duration compare False and are left untouched.
# The value column is "fbg_value": assigning to "fbg" (as before) only touched
# a column that is never merged, leaving the value in place while its units
# were blanked.
under_fasted = (
    df_screening["fasting_duration_delta"].dt.total_seconds() < MIN_FASTING_SECONDS
)
df_screening.loc[under_fasted, "fbg_value"] = pd.NA
df_screening.loc[under_fasted, "fbg_units"] = pd.NA

# merge with vars from screening
df_visit = df_visit.merge(df_screening[columns], on="subject_identifier", how="left")

In [None]:
# RFT blood-result columns to bring into the visit dataframe: the merge key,
# the report date, a source tag, then value/units/abnormal per analyte.
columns = [
    "subject_visit_id",
    "rft_date",
    "crf",
    # creatinine
    "creatinine_value",
    "creatinine_units",
    "creatinine_abnormal",
    # eGFR
    "egfr_value",
    "egfr_units",
    "egfr_abnormal",
    # urea
    "urea_value",
    "urea_units",
    "urea_abnormal",
    # uric acid
    "uric_acid_value",
    "uric_acid_units",
    "uric_acid_abnormal",
]

rft_crf = get_crf(
    model="meta_subject.bloodresultsrft",
    subject_visit_model="meta_subject.subjectvisit",
)
df_bloodresultsrft = rft_crf.rename(columns={"report_datetime": "rft_date"}).reset_index(
    drop=True
)

# Report date without timezone or time-of-day.
rft_dates = df_bloodresultsrft["rft_date"].dt.tz_localize(None)
df_bloodresultsrft["rft_date"] = rft_dates.dt.normalize()

# Tag each row with the CRF it came from.
df_bloodresultsrft["crf"] = "bloodresultsrft"

# Every "*_value" column becomes a nullable float.
value_dtypes = {name: "Float64" for name in columns if name.endswith("_value")}
df_bloodresultsrft = df_bloodresultsrft.astype(value_dtypes)

# Only results recorded at visit code 1000.0 are merged onto the visit rows.
rft_at_visit_1000 = df_bloodresultsrft.query("visit_code==1000.0")[columns]
df_visit = df_visit.merge(rft_at_visit_1000, how="left", on="subject_visit_id")

In [None]:
# LFT blood-result columns to bring into the visit dataframe.
# "lft_date" is included so the report date prepared below actually reaches
# the output, matching the rft_date / lipids_date / fbc_date columns merged by
# the other blood-result cells. Previously it was computed but never merged.
columns = [
    "subject_visit_id",
    "lft_date",
    "ast_value",
    "ast_units",
    "ast_abnormal",
    "alt_value",
    "alt_units",
    "alt_abnormal",
    "alp_value",
    "alp_units",
    "alp_abnormal",
    "amylase_value",
    "amylase_units",
    "amylase_abnormal",
    "ggt_value",
    "ggt_units",
    "ggt_abnormal",
    "albumin_value",
    "albumin_units",
    "albumin_abnormal",
]
df_bloodresultslft = (
    get_crf(
        model="meta_subject.bloodresultslft", subject_visit_model="meta_subject.subjectvisit"
    )
    .rename(columns={"report_datetime": "lft_date"})
    .reset_index(drop=True)
)

# Report date without timezone or time-of-day.
df_bloodresultslft["lft_date"] = (
    df_bloodresultslft["lft_date"].dt.tz_localize(None).dt.normalize()
)

# Tag each row with the CRF it came from (not part of the merged columns).
df_bloodresultslft["crf"] = "bloodresultslft"

# Every "*_value" column becomes a nullable float.
for col in columns:
    if col.endswith("_value"):
        df_bloodresultslft[col] = df_bloodresultslft[col].astype("Float64")

# Only results recorded at visit code 1000.0 are merged onto the visit rows.
df_visit = df_visit.merge(
    df_bloodresultslft.query("visit_code==1000.0")[columns], on="subject_visit_id", how="left"
)

In [None]:
# Lipid blood-result columns to bring into the visit dataframe: the merge key,
# the report date, a source tag, then value/units/abnormal per analyte.
columns = [
    "subject_visit_id",
    "lipids_date",
    "crf",
    # HDL
    "hdl_value",
    "hdl_units",
    "hdl_abnormal",
    # LDL
    "ldl_value",
    "ldl_units",
    "ldl_abnormal",
    # triglycerides
    "trig_value",
    "trig_units",
    "trig_abnormal",
    # cholesterol
    "chol_value",
    "chol_units",
    "chol_abnormal",
]

lipids_crf = get_crf(
    model="meta_subject.bloodresultslipids",
    subject_visit_model="meta_subject.subjectvisit",
)
df_bloodresultslipids = lipids_crf.rename(
    columns={"report_datetime": "lipids_date"}
).reset_index(drop=True)

# Report date without timezone or time-of-day.
lipids_dates = df_bloodresultslipids["lipids_date"].dt.tz_localize(None)
df_bloodresultslipids["lipids_date"] = lipids_dates.dt.normalize()

# Tag each row with the CRF it came from.
df_bloodresultslipids["crf"] = "bloodresultslipids"

# Every "*_value" column becomes a nullable float.
value_dtypes = {name: "Float64" for name in columns if name.endswith("_value")}
df_bloodresultslipids = df_bloodresultslipids.astype(value_dtypes)

# Only results recorded at visit code 1000.0 are merged onto the visit rows.
# NOTE(review): if df_visit already carries a "crf" column from an earlier
# merge, pandas suffixes the pair as crf_x / crf_y; the clean-up cell drops
# columns with those suffixes.
lipids_at_visit_1000 = df_bloodresultslipids.query("visit_code==1000.0")[columns]
df_visit = df_visit.merge(lipids_at_visit_1000, how="left", on="subject_visit_id")

In [None]:
# FBC blood-result columns to bring into the visit dataframe: the merge key,
# the report date, a source tag, then value/units/abnormal per analyte.
columns = [
    "subject_visit_id",
    "fbc_date",
    "crf",
    # haemoglobin
    "haemoglobin_value",
    "haemoglobin_units",
    "haemoglobin_abnormal",
    # white blood cells
    "wbc_value",
    "wbc_units",
    "wbc_abnormal",
    # red blood cells
    "rbc_value",
    "rbc_units",
    "rbc_abnormal",
    # platelets
    "platelets_value",
    "platelets_units",
    "platelets_abnormal",
]

fbc_crf = get_crf(
    model="meta_subject.bloodresultsfbc",
    subject_visit_model="meta_subject.subjectvisit",
)
df_bloodresultsfbc = fbc_crf.rename(columns={"report_datetime": "fbc_date"}).reset_index(
    drop=True
)

# Report date without timezone or time-of-day.
fbc_dates = df_bloodresultsfbc["fbc_date"].dt.tz_localize(None)
df_bloodresultsfbc["fbc_date"] = fbc_dates.dt.normalize()

# Tag each row with the CRF it came from.
df_bloodresultsfbc["crf"] = "bloodresultsfbc"

# Every "*_value" column becomes a nullable float.
value_dtypes = {name: "Float64" for name in columns if name.endswith("_value")}
df_bloodresultsfbc = df_bloodresultsfbc.astype(value_dtypes)

# Only results recorded at visit code 1000.0 are merged onto the visit rows.
fbc_at_visit_1000 = df_bloodresultsfbc.query("visit_code==1000.0")[columns]
df_visit = df_visit.merge(fbc_at_visit_1000, how="left", on="subject_visit_id")

In [None]:
# Patient-history fields to bring into the visit dataframe.
columns = [
    "subject_visit_id",
    "current_smoker",
    "former_smoker",
    "hiv_diagnosis_date",
    "current_arv_regimen",
    "current_arv_regimen_start_date",
]

df_patient_history = get_crf(
    model="meta_subject.patienthistory",
    subject_visit_model="meta_subject.subjectvisit",
)

# Strip timezone and time-of-day from both date columns.
for date_column in ("hiv_diagnosis_date", "current_arv_regimen_start_date"):
    naive_dates = df_patient_history[date_column].dt.tz_localize(None)
    df_patient_history[date_column] = naive_dates.dt.normalize()

# Only history recorded at visit code 1000.0 is merged onto the visit rows.
history_at_visit_1000 = df_patient_history.query("visit_code==1000.0")[columns]
df_visit = df_visit.merge(history_at_visit_1000, how="left", on="subject_visit_id")

In [None]:
# Measurements backfilled from the physical exam CRF when the value already on
# df_visit is missing. Existing df_visit values take precedence; the exam value
# is only used to fill gaps.
# NOTE(review): an earlier, unused version of `columns` in this cell also named
# severe_htn, dia_blood_pressure_avg and sys_blood_pressure_avg, but those were
# never merged or backfilled -- confirm whether they should be.
backfill_columns = ["weight", "waist_circumference"]
columns = ["subject_visit_id", *backfill_columns]

df_physical_exam = get_crf(
    model="meta_subject.physicalexam",
    subject_visit_model="meta_subject.subjectvisit",
)

# Exam columns arrive with a "_physicalexam" suffix so they do not clash with
# the same-named columns already on df_visit.
df_visit = df_visit.merge(
    df_physical_exam[columns],
    on="subject_visit_id",
    how="left",
    suffixes=("", "_physicalexam"),
)

for name in backfill_columns:
    exam_name = f"{name}_physicalexam"
    # Match the nullable-float dtype before filling.
    df_visit[exam_name] = df_visit[exam_name].astype("Float64")
    df_visit[name] = df_visit[name].fillna(df_visit[exam_name])

# The suffixed helper columns are no longer needed once the gaps are filled.
df_visit = df_visit.drop(columns=[f"{name}_physicalexam" for name in backfill_columns])

In [None]:
# Columns left over from merges where both sides shared a name (pandas' default
# "_x" / "_y" suffixes).
merge_suffix_columns = [
    name for name in df_visit.columns if name.endswith("_y") or name.endswith("_x")
]

# Columns that are not wanted in the exported dataset.
unwanted_columns = [
    "appointment_id",
    "appt_datetime",
    "appt_status",
    "appt_timing",
    "baseline_datetime",
    "crf",
    "endline_visit_code",
    "endline_visit_code_str",
    "endline_visit_datetime",
    "followup_days",
    "reason",
    "reason_missed",
    "reason_missed_other",
    "reason_unscheduled",
    "reason_unscheduled_other",
    "subject_visit_id",
    "visit_code_str",
]

# drop() raises KeyError if any listed column is absent.
df_visit = df_visit.drop(columns=merge_suffix_columns + unwanted_columns)

In [None]:
# Convert the "*_abnormal" and "*_units" columns to plain strings for export.
# Missing values (NaN / None / pd.NA, e.g. from the left merges) are replaced
# with "" first: a bare astype(str) would write the literal text "nan" or
# "<NA>" into the dataset instead of an empty string.
text_columns = [
    col for col in df_visit.columns if col.endswith("_abnormal") or col.endswith("_units")
]
for col in text_columns:
    # Cast to object first so fillna("") also works on non-object dtypes.
    df_visit[col] = df_visit[col].astype(object).fillna("").astype(str)

In [None]:
# Write the assembled dataframe to the analysis folder as a Stata file
# (format version 118), without the dataframe index.
liver_dta_path = analysis_folder / "liver.dta"
df_visit.to_stata(liver_dta_path, write_index=False, version=118)