In [None]:
import pandas as pd
import numpy as np
import random
import os
import statsmodels.api as sm

In [None]:
# Input CSV for this analysis. The literal is split into adjacent pieces purely
# for readability; the resulting path string is unchanged.
# Alternative input that was used previously (kept for reference):
# ysi_mp_csv_path = '/Users/liangwang/Library/CloudStorage/OneDrive-Biolinq/Documents - Clinical Data Analysis/Data/CP-008 Session 25 - Pre-Pivotal2/Archive/missing references/all_data/prospective_8S17_v14h_2022_5day_AllRef/performance/ysi including 19c 20 22.csv'
ysi_mp_csv_path = (
    '/Users/liangwang/Library/CloudStorage/OneDrive-Biolinq/'
    'Documents - Clinical Data Analysis/Data/'
    'CP-008 Session 25 - Pre-Pivotal2/Archive/'
    'missing references/experimental/'
    'prospective_8S20_v15_fault_dt_2022_5day_dayOneEnabled_thresh0.5_v2_AllRef/'
    'performance/ysi including 19c 20 22.csv'
)
# Load the whole file into a DataFrame; later cells filter it by column values.
ysi_data = pd.read_csv(ysi_mp_csv_path)

In [None]:
# Allow-lists used to select rows of the loaded data
# (matched against the 'Chemistry', 'Sorting Tier' and 'Grouping' columns).
chem_list = [
    '2.5.11-PPD+KCL',
    'Dynamic-Thick',
    'Dynamic-Thin',
    'Static-Thin',
]
tier_list = ['Tier 1: 3/3 Clinical Use']

# Alternative group selections (swap in as needed):
#   ['Pre-Piv-2', 'eBlinq19c', 'eBlinq20', 'eBlinq22']
#   ['eBlinq19c', 'eBlinq20', 'eBlinq22']
group_list = ['Pre-Piv-2']

# Directory that receives one CSV per bootstrap resample.
save_loc_bootstrap_df = (
    '/Users/liangwang/Library/CloudStorage/OneDrive-Biolinq/'
    'Documents - Clinical Data Analysis/Data/'
    'CP-008 Session 25 - Pre-Pivotal2/Archive/'
    'missing references/experimental/'
    'prospective_8S20_v15_fault_dt_2022_5day_dayOneEnabled_thresh0.5_v2_AllRef/'
    'bootstrapped data'
)

In [None]:
# Index labels of the rows whose chemistry, sorting tier AND grouping are all
# present in the configured allow-lists.
valid_index = ysi_data.loc[
    ysi_data['Chemistry'].isin(chem_list)
    & ysi_data['Sorting Tier'].isin(tier_list)
    & ysi_data['Grouping'].isin(group_list)
].index

In [None]:
# Take an independent (deep) copy of the selected rows so that columns added to
# ysi_data_valid later on do not write through to ysi_data.
ysi_data_valid = ysi_data.loc[valid_index, :].copy(deep=True)

In [None]:
"""
Use bootstrap method by sensor to estimate confidence interval lower bound.
For each iteration, it resample sensor and put all matched pairs from that sensor into
the bootstap pool. Thus it remains intra-sensor correlation without making any assumptions
"""
seed_of_seed = 1
seed_range = 100000
num_bootstrap = 999

sid_df = pd.DataFrame({'sensor_id': pd.unique(ysi_data_valid['sensor_id'])})

all_seeds = list(range(1,seed_range))
random.seed(seed_of_seed)
random.shuffle(all_seeds)
seeds_for_bootstrap = all_seeds[0:num_bootstrap]
mard_by_bootstrap = np.full(num_bootstrap,np.nan)
for seedIdx,iSeed in enumerate(seeds_for_bootstrap):      
    bootstrap_df = sid_df.sample(frac=1, replace=True, random_state=iSeed)
    bootstrap_df = bootstrap_df.merge(ysi_data_valid,how='left')
    mard_by_bootstrap[seedIdx] = bootstrap_df['ard'].mean()
    bootstrap_df.to_csv(os.path.join(save_loc_bootstrap_df,f'bootstrap_df_{seedIdx}.csv'), header=True)
print(np.percentile(mard_by_bootstrap,2.5))
print(np.percentile(mard_by_bootstrap,97.5))

In [None]:
"""
Use GEE (Generalized Estimating Equations) method to estimate confidence interval lower bound of 
agreement rate. Employ statsmodel GEE module, which only accepts a matrix with two columns as input,
but in our case, we only need intercept, thus played a trick and created a dummy column filled value 0
"""
if 'dummy' not in ysi_data_valid.columns:
    ysi_data_valid.insert(0,'dummy',0)
ysi_data_valid['ard2'] = ysi_data_valid['ard'].div(100)
fam = sm.families.Gaussian()
ind = sm.cov_struct.Exchangeable()
mod = sm.GEE.from_formula("ard2 ~ dummy", groups="sensor_id", data=ysi_data_valid, cov_struct=ind, family=fam)
fitResult = mod.fit()
ci_matrix = fitResult.conf_int(alpha = 0.05, cov_type = 'robust')
ci_matrix.loc['Intercept',[0,1]]