In [None]:
%%capture
from remote_read_sql import get_db_connection
from pathlib import Path
import pandas as pd
from scipy import stats
pd.set_option("future.no_silent_downcasting", True)

In [None]:
# change as needed
# Option file and connection name handed to get_db_connection (via db_conn_opts).
# The leading "~" is expanded here, like data_folder below, so the value is a
# concrete path regardless of whether the callee expands the home directory itself.
my_cnf_path = Path("~/.my.cnf").expanduser()
# NOTE(review): presumably the section name inside the option file — confirm.
my_cnf_connection_name = "client"
# Local folder for study data files; not referenced by the cells shown here.
data_folder = Path("~/Library/CloudStorage/OneDrive-UniversityCollegeLondon/Documents - igh.respond-africa/META2-data/").expanduser()

In [None]:
# Keyword arguments that are unpacked into get_db_connection when the table is read.
db_conn_opts = {
    "my_cnf_path": my_cnf_path,
    "connection_name": my_cnf_connection_name,
    "db_name": "meta2_production",
    "local_bind_port": 3306,
}

In [None]:
# SQL text used to load every column of the subject screening table.
sql_screen = (
    "select * "
    "from meta_screening_subjectscreening"
)

In [None]:
# Pull the screening table into a DataFrame while the connection context is open.
with get_db_connection(**db_conn_opts) as db_conn:
    df_screen = pd.read_sql(sql=sql_screen, con=db_conn)


In [None]:
# Shorten the two raw selection-method codes and store the column as an ordered
# categorical with exactly the levels "purposeful" < "random".
# Any value outside those two levels becomes NaN when cast to this dtype.
cat_type = pd.CategoricalDtype(categories=["purposeful", "random"], ordered=True)
df_screen["selection_method"] = (
    df_screen["selection_method"]
    .replace({"purposively_selected": "purposeful", "random_sampling": "random"})
    .astype(cat_type)
)
# Last expression of the cell: show how many rows fall in each level.
df_screen["selection_method"].value_counts()


In [None]:
# List the int64/float64 columns of the screening frame so the numeric
# variables to compare can be picked by hand in the next cell.
cols = df_screen.select_dtypes(include=["int64", "float64"]).columns
print(cols)

In [None]:
# Numeric columns compared between the two selection methods in the t-test cell.
cols = [
    "sys_blood_pressure",
    "dia_blood_pressure",
    "age_in_years",
    "eligible",
    "consented",
    "calculated_bmi_value",
    "calculated_egfr_value",
    "converted_creatinine_value",
    "converted_ogtt_value",
    "weight",
    "height",
    "hba1c_value",
    "creatinine_value",
    "ifg_value",
    "ogtt_value",
    "waist_circumference",
    "converted_ifg_value",
    "refused",
]

In [None]:
# For every numeric column, compare the purposeful and random groups with a
# two-sample t-test that does not assume equal variances (equal_var=False),
# and build a plain-text report per column.
#
# interesting_cols collects "column:p-value" strings for columns whose p-value
# is at or below 0.05; the chi-square cell further down appends to the same list.
interesting_cols = []
result_text = []

# The row subsets do not depend on the column under test, so filter once
# instead of re-applying the boolean mask twice per column.
purposeful_rows = df_screen[df_screen["selection_method"] == "purposeful"]
random_rows = df_screen[df_screen["selection_method"] == "random"]

for col in cols:
    # Missing values are dropped per column before testing.
    purposeful_values = purposeful_rows[col].dropna()
    # Named random_values (not `random`) to avoid shadowing the stdlib module name.
    random_values = random_rows[col].dropna()
    ttest_result = stats.ttest_ind(purposeful_values, random_values, equal_var=False)
    result_text.append(
        f"{col}\n"
        f"---------------------------------\n"
        f"All: {df_screen[col].count()}\n"
        f"{df_screen[col].describe()}\n\n"
        f"Purposeful: {purposeful_values.count()}\n"
        f"{purposeful_values.describe()}\n\n"
        f"Random: {random_values.count()}\n"
        f"{random_values.describe()}\n\n"
        f"Mean Purposeful: {purposeful_values.mean():.2f}\n"
        f"Mean Random:     {random_values.mean():.2f}\n"
        f"T-test P-value:  {ttest_result.pvalue:.4f}\n"
    )
    # A NaN p-value compares False here, so such columns are never flagged.
    if ttest_result.pvalue <= 0.05:
        interesting_cols.append(f"{col}:{ttest_result.pvalue:.4f}")

print("\n\n".join(result_text))

In [None]:
# Columns cross-tabulated against selection_method in the chi-square cell.
cat_cols = [
    "gender",
    "unsuitable_for_study",
    "art_six_months",
    "on_rx_stable",
    "pregnant",
    "congestive_heart_failure",
    "liver_disease",
    "alcoholism",
    "acute_metabolic_acidosis",
    "renal_function_condition",
    "tissue_hypoxia_condition",
    "acute_condition",
    "metformin_sensitivity",
    "has_dm",
    "on_dm_medication",
    "severe_htn",
]

In [None]:
# For every column in cat_cols, cross-tabulate it against selection_method and
# run a chi-square test of independence on the resulting contingency table.
# Columns whose p-value is at or below 0.05 are appended to interesting_cols
# (created by the t-test cell) as "column:p-value" strings.
result_cat_text = []

# Remove entries this cell added on an earlier run so that re-running it does
# not leave duplicate "column:p-value" strings in interesting_cols.
interesting_cols = [
    entry for entry in interesting_cols
    if entry.rsplit(":", 1)[0] not in cat_cols
]

for col in cat_cols:
    crosstab_col = pd.crosstab(df_screen['selection_method'], df_screen[col])
    try:
        # Only the p-value (second element of the result) is used.
        _, p_col, _, _ = stats.chi2_contingency(crosstab_col)
    except ValueError as exc:
        # chi2_contingency rejects tables it cannot test; report why and move on.
        print(f"Skipping {col}: {exc}")
    else:
        result_cat_text.append(
            f"---------------------------------\n"
            f"{col}\n"
            f"Chi2 P-value:  {p_col:.4f}\n"
        )
        if p_col <= 0.05:
            interesting_cols.append(f"{col}:{p_col:.4f}")

print("\n\n".join(result_cat_text))

In [None]:
# Show every "column:p-value" entry collected by the t-test and chi-square cells
# (columns whose p-value was at or below 0.05).
print(interesting_cols)

In [None]:
# Hard-coded list literal; it is not computed from `interesting_cols`, and as
# the only expression in the cell it is simply displayed.
# NOTE(review): presumably a pasted snapshot of an earlier run — the p-values
# here are unrounded, whereas the cells above store them formatted with `:.4f`.
# Confirm whether this should be refreshed from the current data or removed.
[
    'calculated_bmi_value:1.1014125530703846e-07',
    'converted_ogtt_value:0.014267344922641038',
    'weight:6.570385603951388e-10',
    'height:0.010089513582398957',
    'creatinine_value:1.4779039902876427e-08',
    'ifg_value:2.2597113620615133e-14',
    'ogtt_value:0.0021018558850009745',
    'waist_circumference:8.083437777983912e-11',
    'converted_ifg_value:0.030812252089487797']