In [None]:
%%capture
# output is suppressed but normally would spew out all the edc loading messages

import os
from pathlib import Path
from datetime import datetime
import pandas as pd
import numpy as np
import math
# import matplotlib.pyplot as plt
# import seaborn as sns
import scipy.stats as stats

from dj_notebook import activate

# Both settings come from the environment; a missing variable raises KeyError.
env_file = os.environ["META_ENV"]
documents_folder = os.environ["META_DOCUMENTS_FOLDER"]
report_folder = Path(documents_folder)

# Pass the dotenv path to dj_notebook's activate().
# NOTE(review): presumably this boots the project so that the meta_analytics
# imports in the following cells work -- confirm.
plus = activate(dotenv_file=env_file)


In [None]:
import itertools
from meta_analytics.dataframes import GlucoseEndpointsByDate, get_eos_df, get_screening_df
from meta_analytics.dataframes.screening import get_glucose_tested_only_df


In [None]:

df = get_screening_df()


In [None]:
# unwilling to stay or not living nearby
# (reason mentions "nearby" and none of ART, VL, Pregnant, META)
reasons = df["reasons_ineligible_part_one"]
nearby_only = reasons.str.contains("nearby", na=False)
for other_reason in ("ART", "VL", "Pregnant", "META"):
    nearby_only = nearby_only & ~reasons.str.contains(other_reason, na=False)
df[nearby_only].reasons_ineligible_part_one.value_counts(dropna=False)


In [None]:
# number of rows whose reason mentions "nearby" and none of ART, VL, Pregnant, META
reasons = df["reasons_ineligible_part_one"]
nearby_only = reasons.str.contains("nearby", na=False)
for other_reason in ("ART", "VL", "Pregnant", "META"):
    nearby_only = nearby_only & ~reasons.str.contains(other_reason, na=False)
df[nearby_only].reasons_ineligible_part_one.count()

In [None]:
# number of rows whose reason mentions "ART" and none of VL, Pregnant, META
reasons = df["reasons_ineligible_part_one"]
mentions_other = (
    reasons.str.contains("VL", na=False)
    | reasons.str.contains("Pregnant", na=False)
    | reasons.str.contains("META", na=False)
)
art_only = reasons.str.contains("ART", na=False) & ~mentions_other
df[art_only].reasons_ineligible_part_one.count()

In [None]:
# VL not suppressed or not measured within last 6-12
# (reason mentions "VL" and none of ART, Pregnant, META)
reasons = df["reasons_ineligible_part_one"]
mentions_other = (
    reasons.str.contains("ART", na=False)
    | reasons.str.contains("Pregnant", na=False)
    | reasons.str.contains("META", na=False)
)
vl_only = reasons.str.contains("VL", na=False) & ~mentions_other
df[vl_only].reasons_ineligible_part_one.count()

In [None]:
# pregnant (unconfirmed)
# (reason mentions "Pregnant" and none of ART, VL, META)
df[
    (~df["reasons_ineligible_part_one"].str.contains("ART", na=False))
    & (~df["reasons_ineligible_part_one"].str.contains("VL", na=False))
    & (df["reasons_ineligible_part_one"].str.contains("Pregnant", na=False))
    & (~df["reasons_ineligible_part_one"].str.contains("META", na=False))
].reasons_ineligible_part_one.count()  # Series has .count(); .counts() raises AttributeError

In [None]:
# META 2 participant
# (reason mentions "META" and none of ART, VL, Pregnant)
reasons = df["reasons_ineligible_part_one"]
mentions_other = (
    reasons.str.contains("ART", na=False)
    | reasons.str.contains("VL", na=False)
    | reasons.str.contains("Pregnant", na=False)
)
meta_only = reasons.str.contains("META", na=False) & ~mentions_other
df[meta_only].reasons_ineligible_part_one.count()

In [None]:
# Cross-tabulate the three eligibility flags: one row per (p1, p2, p3)
# combination with the number of matching screening rows.
# Part one is restricted to "Yes"/"No"; combinations starting with "tbd" are
# not tabulated. Parts two and three may also be "tbd".
assessed_values = ["Yes", "No"]
pending_values = ["Yes", "No", "tbd"]
eligibility_rows = [
    (
        part_one,
        part_two,
        part_three,
        df[
            (df.eligible_part_one == part_one)
            & (df.eligible_part_two == part_two)
            & (df.eligible_part_three == part_three)
        ].eligible_part_three.count(),
    )
    for part_one, part_two, part_three in itertools.product(
        assessed_values, pending_values, pending_values
    )
]

# Build the frame once from all records rather than concatenating
# single-row DataFrames.
df_eligibility = pd.DataFrame(eligibility_rows, columns=["p1", "p2", "p3", "count"])
df_eligibility

In [None]:
# assessed part one only (part two is still "tbd")
part_two_pending = df_eligibility["p2"].isin(["tbd"])
p1 = df_eligibility.loc[part_two_pending, "count"].sum()
p1

In [None]:
# assessed part one and part two (part three is still "tbd")
decided = ["Yes", "No"]
parts_one_two_only = (
    df_eligibility["p1"].isin(decided)
    & df_eligibility["p2"].isin(decided)
    & df_eligibility["p3"].eq("tbd")
)
p12 = df_eligibility.loc[parts_one_two_only, "count"].sum()
p12

In [None]:
# assessed part one, part two, part three (none of them "tbd")
decided = ["Yes", "No"]
all_parts = (
    df_eligibility["p1"].isin(decided)
    & df_eligibility["p2"].isin(decided)
    & df_eligibility["p3"].ne("tbd")
)
p123 = df_eligibility.loc[all_parts, "count"].sum()
p123

In [None]:
# Sanity check: the three assessment groups must add up to the hard-coded total.
# NOTE(review): 10574 is presumably the number of screening rows -- confirm.
assert p1+p12+p123 == 10574

In [None]:
p12 + p123

In [None]:
cond = (df["eligible_part_one"] == "Yes") 
df[cond].count()

In [None]:
# 9706 evaluated for part one and two
cond = (df["eligible_part_one"].isin(["Yes", "No"])) & (df["eligible_part_two"].isin(["Yes", "No"]))
df[cond].count()

In [None]:
# 9706 evaluated for part one and two
cond = (df["eligible_part_one"].isin(["Yes", "No"])) & (df["eligible_part_two"].isin(["Yes", "No"]))
df[cond].eligible_part_three.value_counts()



In [None]:
df_glu = get_glucose_tested_only_df()

In [None]:
df_glu.eligible_part_three.value_counts()


In [None]:
cond = (df["eligible_part_one"].isin(["Yes"])) & (df["eligible_part_two"].isin(["Yes"]))
df[cond].agree_to_p3.value_counts()

In [None]:
df_glu.gender.value_counts()

In [None]:
# Index the glucose-tested rows by screening identifier.
# Guarded so the cell can be re-run: once the column has become the index,
# a second set_index() call raises KeyError.
if df_glu.index.name != "screening_identifier":
    df_glu = df_glu.set_index("screening_identifier")

In [None]:
# parts one and two both decided ("Yes"/"No") and has_dm == "No"
decided = ["Yes", "No"]
cond = (
    df["eligible_part_one"].isin(decided)
    & df["eligible_part_two"].isin(decided)
    & df["has_dm"].eq("No")
)
df[cond].eligible_part_three.count()


In [None]:
# Rows selected by `cond`, re-indexed by screening identifier.
df2 = (
    df[cond]
    .copy()
    .reset_index(drop=True)
    .set_index("screening_identifier")
)
df2.count()

In [None]:
# Keep only the rows whose screening identifier is not in df_glu.
# Filtering with isin() keeps the cell re-runnable: drop(index=...) raises
# KeyError on a second run (labels already removed) and whenever a df_glu
# label is absent from df2.
df2 = df2[~df2.index.isin(df_glu.index)]

In [None]:
df2.count()

In [None]:
df2.gender.value_counts()

In [None]:
df2[df2["gender"] == "F"].age_in_years.describe()

In [None]:
df_glu[df_glu["gender"] == "F"].age_in_years.describe()

In [None]:
df_glu.age_in_years.describe()

In [None]:
df_glu[df_glu["gender"] == "M"].age_in_years.describe()

In [None]:
import scipy.stats as stats
df_glu[(df_glu["gender"] == "F")].count()

In [None]:
# Row masks reused by the cells that follow.
cond_f = df_glu["gender"].eq("F")
cond_m = df_glu["gender"].eq("M")
# fasting == "Yes" with fasting_fbg_hrs of at least 8
cond_fasting = df_glu["fasting"].eq("Yes") & df_glu["fasting_fbg_hrs"].ge(8.0)

# gender split of the fasting rows
df_glu.loc[cond_fasting, "gender"].value_counts()

In [None]:
# Show both counts. A bare expression that is not the last line of a cell is
# evaluated but never rendered, so the first one goes through display().
display(df_glu[cond_fasting & cond_f].count())
df_glu[cond_fasting & cond_f & (df_glu.ogtt.notna())].count()

In [None]:
# Threshold flags computed on the fasting rows only. The assignment aligns on
# the index, so rows outside cond_fasting receive NaN.
fasting_rows = df_glu.loc[cond_fasting]
df_glu["fbg_threshold"] = fasting_rows["fbg"].ge(7.0)
df_glu["ogtt_threshold"] = fasting_rows["ogtt"].ge(11.1)


In [None]:
# female, fasting: row count for each (fbg_threshold, ogtt_threshold) pair
df_glu_female = (
    df_glu.loc[cond_f & cond_fasting, ["fbg_threshold", "ogtt_threshold"]]
    .value_counts()
    .to_frame()
    .reset_index()
)

In [None]:
assert df_glu_female["count"].sum() == 4201

In [None]:
assert df_glu_female[df_glu_female.fbg_threshold == True]["count"].sum() == 534

In [None]:
assert df_glu_female[df_glu_female.ogtt_threshold == True]["count"].sum() == 148

In [None]:
assert df_glu[cond_f & cond_fasting & (df_glu.fbg >= 7.0)]["gender"].count() == 534
assert df_glu_female[df_glu_female.fbg_threshold == True]["count"].sum() == 534

In [None]:
# men fbg
df_glu[cond_m & cond_fasting][["fbg", "ogtt"]].count()

In [None]:
# men fbg
assert df_glu[cond_m & cond_fasting & (df_glu.fbg >= 7.0)]["gender"].count() == 194

In [None]:
# men fbg
194/1414

In [None]:
# men ogtt
assert df_glu[cond_m & cond_fasting & (df_glu.ogtt >= 11.1)]["gender"].count() == 76

In [None]:
76/1393

In [None]:

# male, fasting: row count for each (fbg_threshold, ogtt_threshold) pair.
# The filter is the male counterpart of the female table; the unfinished
# `& (df_glu.)` term that made this cell a SyntaxError has been dropped.
df_glu_male = (
    df_glu[cond_m & cond_fasting][["fbg_threshold", "ogtt_threshold"]]
    .value_counts()
    .to_frame()
    .reset_index()
)


In [None]:
df_glu_male

In [None]:
assert df_glu_male["count"].sum() == 1414

In [None]:
from scipy.stats.contingency import odds_ratio

# female
df_glu_female

In [None]:
# female
# Rows: fbg >= 7.0 (yes, no); columns: ogtt >= 11.1 (yes, no).
# Row one sums to 534 and column one to 148, matching the asserts above.
female_table = [
    [98, 436],
    [50, 3617],
]
res = odds_ratio(female_table)
res.statistic

In [None]:
# male
df_glu_male

In [None]:
# male
# Rows: ogtt >= 11.1 (yes, no); columns: fbg >= 7.0 (yes, no) -- transposed
# relative to the female table, which leaves the odds ratio unchanged.
# Row one sums to 76 and column one to 194, matching the asserts above.
male_table = [
    [44, 32],
    [150, 1188],
]
res = odds_ratio(male_table)
res.statistic

In [None]:
# female: 95% confidence interval of the odds ratio.
# `res` was rebound to the male table in the preceding cell, so the female
# result is recomputed under its own name instead of reusing `res`.
res_female = odds_ratio([[98, 436], [50, 3617]])
res_female.confidence_interval(confidence_level=0.95)

In [None]:
res.confidence_interval(confidence_level=0.95)

In [None]:
df_glu["ogtt"].dtype

In [None]:
# df_glu[cond_f & cond_fasting & (df_glu.ogtt.notna()) & ((df_glu.fbg>=7.0) | (df_glu.ogtt>=11.1))].count()

# when ogtt not done 
# df_glu[cond_f & cond_fasting & (df_glu.ogtt.isna())].fbg.describe()

# we never have ogtt w/o fbg
# df_glu[cond_f & cond_fasting & (df_glu.fbg.isna())].ogtt.describe()
df_glu2 = get_glucose_tested_only_df()
cond_fasting2 = (df_glu2.fasting == "Yes") & (df_glu2.fasting_fbg_hrs >= 8.0)

# df_glu = df_glu.reset_index(drop=False)
def dx(row):
    """Label a row by which glucose thresholds it reaches.

    Reads ``row.fbg`` (compared with 7.0) and ``row.ogtt`` (compared with 11.1).

    Returns:
        "fbg_ogtt", "fbg_only", "ogtt_only" or "neither"; "error" when none of
        the four comparisons holds (for example when a value is NaN).
    """
    fbg_high, fbg_low = row.fbg >= 7.0, row.fbg < 7.0
    ogtt_high, ogtt_low = row.ogtt >= 11.1, row.ogtt < 11.1

    if fbg_high & ogtt_high:
        return "fbg_ogtt"
    if fbg_high & ogtt_low:
        return "fbg_only"
    if fbg_low & ogtt_high:
        return "ogtt_only"
    if fbg_low & ogtt_low:
        return "neither"
    # Reached only when a comparison is False on both sides, e.g. NaN input.
    return "error"
    
# Classify only the rows selected by cond_fasting2; when the result is aligned
# back onto df_glu2 every other row gets NaN. dx is passed directly, and the
# earlier placeholder assignment of "" is gone because this overwrites the
# whole column anyway.
df_glu2["glucose"] = df_glu2[cond_fasting2].apply(dx, axis=1)
df_glu2["glucose"].value_counts()
# df_glu[cond_fasting & cond_f]


In [None]:
df_glu[cond_m & cond_fasting & ((df_glu.fbg>=7.0) | (df_glu.ogtt>=11.1))].count()

In [None]:
# female, fasting, restricted to rows where fbg or ogtt is at/above its
# threshold or missing.
# NOTE(review): this rebinds df_glu_female, replacing the table built earlier
# from all fasting females -- confirm that is intended.
high_or_missing = (
    df_glu.fbg.ge(7.0)
    | df_glu.fbg.isna()
    | df_glu.ogtt.ge(11.1)
    | df_glu.ogtt.isna()
)
df_glu_female = (
    df_glu.loc[cond_f & cond_fasting & high_or_missing, ["fbg_threshold", "ogtt_threshold"]]
    .value_counts()
    .to_frame()
    .reset_index()
)
df_glu_female

In [None]:
# Odds ratio for a 2x2 table whose first row sums to 148 and first column to 534.
# NOTE(review): the last cell is 3619 here but 3617 in the earlier female
# table (the one that sums to the asserted 4201) -- confirm which is correct.
female_counts = [
    [98, 50],
    [436, 3619],
]
res = odds_ratio(female_counts)
res.statistic

In [None]:
res.confidence_interval(confidence_level=0.95)

In [None]:
import numpy as np
from scipy.stats import hypergeom
# 2x2 table and the hypergeometric probabilities over its full support.
# NOTE(review): 3619 here vs 3617 in the first female odds-ratio table -- confirm.
table = np.array([[98, 436], [50, 3619]])
M = table.sum()             # grand total
n = table.sum(axis=1)[0]    # first-row total
N = table.sum(axis=0)[0]    # first-column total
start, end = hypergeom.support(M, n, N)
support_values = np.arange(start, end + 1)
hypergeom.pmf(support_values, M, n, N)


In [None]:
from scipy.stats import fisher_exact
res = fisher_exact(table, alternative='two-sided')
res.pvalue

In [None]:
res.statistic