# Setup

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from MetabolomicsPaper.GeneralHelperFunctions import *

paper_dir = '/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/Cardio/ACS_study/v2/'
curr_dir = os.path.join(paper_dir, 'Table1')
make_dir_if_not_exists(curr_dir)
os.chdir(paper_dir)

In [2]:
from LabData.DataLoaders.NightingaleLoader import NightingaleLoader
from LabData.DataLoaders.MetabolonLoader import MetabolonLoader

# Load data

In [14]:
acs_data = pd.read_csv('data/ACS_full_data.csv', index_col=0)
pnp_data = pd.read_csv('data/pnp1_full_data.csv', index_col='RegistrationCode')

all_pnp_ids = pd.read_csv('data/all_PNP1_RegIDs.csv', index_col=0)
all_acs_ids = pd.read_csv('data/all_ACS_RegIDs.csv', index_col=0)
# pnp_data = pnp_data[pnp_data.IsGenotek == 0]
acs_data['ACS sample'], pnp_data['ACS sample'] = 1, 0

pnp_data.index = pnp_data.index.astype(str)
acs_data.index = acs_data.index.astype(str)

pnp_data = pnp_data.loc[all_pnp_ids['PNP RegIDs'].astype(str)]
acs_data = acs_data.loc[all_acs_ids['ACS RegIDs'].astype(str)]

  interactivity=interactivity, compiler=compiler, result=result)


In [16]:
matching_dir = 'Matching'
pnp_acs_matching_dir = os.path.join('Matching', 'PNP1-ACS_matching')
make_dir_if_not_exists(pnp_acs_matching_dir)

Prepare two tables one for pnp and one for acs with all relevant features for table1

features 

In [18]:
features = ['Age', 'gender', 'bmi', 'smoke_tobacco_now', 'HbA1C', 'DM', 'Hypertension', 'Dyslipidemia', 'treatment for DM', 
            'treatment for Hypertension', 'treatment for Dyslipidemia']
# in healthy controls, what fraction medicated for (hypertension, dyslipidemia, dysglycemia not yet diagnosed as T2DM)?

In [19]:
pnp_data.loc[:, 'HbA1C'] = pnp_data['bt__hba1c']
acs_data.loc[:, 'DM'] = (acs_data['Glucose Disorder: DM1']==1) | (acs_data['Glucose Disorder: DM2']==1) | (acs_data['HbA1C']>6.4)
acs_data.loc[:, 'waist'] = None

In [20]:
diagnosis_by_drugs = pd.read_excel(os.path.join('data', 'pnp_drugs-diagnosis.xls'), 'Diagnosis')
diagnosis_by_drugs

Unnamed: 0,Hypertension,Dyslipidemia,DM,PreDM
0,Amiloride,Atorvastatin,Glimepiride,HbA1C 6-6.4
1,Amlodipine,Bezafibrate,Metformin,
2,Atenolol,Ciprofibrate,Sitagliptin,
3,Bisoprolol,Ezetimibe,Vildagliptin,
4,Candesartan,Pravastatin,,
5,Cilazapril,Rosuvastatin calcium,HbA1C 6.5 and above,
6,Clonidine Hydrochloride,Simvastatin,,
7,Enalapril,,,
8,Hydrochlorothiazide,,,
9,Lercanidipine,,,


In [21]:
# pd.Series(pnp_data.columns[3309:3461]).to_csv(os.path.join(paper_dir, 'data/', 'pnp_drug_names.csv'))
pd.Series(acs_data.columns[40:64]).to_csv(os.path.join(paper_dir, 'data/', 'acs_drug_names.csv'))

In [22]:
## pnp_data_metabolon_full.loc[:, 'treatment for DM'] = pnp_data_metabolon_full.reindex(diagnosis_by_drugs['DM'].dropna(), axis=1).any(1)
pnp_data.loc[:, 'treatment for Hypertension'] = pnp_data.reindex(diagnosis_by_drugs['Hypertension'].dropna(), axis=1).any(1)
pnp_data.loc[:, 'treatment for Dyslipidemia'] = pnp_data.reindex(diagnosis_by_drugs['Dyslipidemia'].dropna(), axis=1).any(1)

acs_diagnosis_by_drugs_dic = {'DM': ['Admission Metformin', 'Admission Sulfunylurea', 'Admission DPP-4 blockers', 'Admission GLP-1 Analogs', 'Admission SGLT-2 Blockers'], 
                                       'Hypertension': ['Admission Beta Blockers', 'Admission ACEI', 'Admission ARB', 'Admission Calcium', 'Admission Channel Blockers'], 
                                       'Dyslipidemia': ['Admission Statins']}
acs_diagnosis_by_drugs = pd.DataFrame.from_dict(acs_diagnosis_by_drugs_dic, orient='index').T

acs_data.loc[:, 'treatment for DM'] = acs_data.reindex(acs_diagnosis_by_drugs['DM'].dropna(), axis=1).any(1)
acs_data.loc[:, 'treatment for Hypertension'] = acs_data.reindex(acs_diagnosis_by_drugs['Hypertension'].dropna(), axis=1).any(1)
acs_data.loc[:, 'treatment for Dyslipidemia'] = acs_data.reindex(acs_diagnosis_by_drugs['Dyslipidemia'].dropna(), axis=1).any(1)

pnp_data.loc[:, 'treatment for DM'] = pnp_data.reindex(diagnosis_by_drugs['DM'].dropna(), axis=1).any(1)
pnp_data.loc[:, 'treatment for Hypertension'] = pnp_data.reindex(diagnosis_by_drugs['Hypertension'].dropna(), axis=1).any(1)
pnp_data.loc[:, 'treatment for Dyslipidemia'] = pnp_data.reindex(diagnosis_by_drugs['Dyslipidemia'].dropna(), axis=1).any(1)

In [23]:
features

['Age',
 'gender',
 'bmi',
 'smoke_tobacco_now',
 'HbA1C',
 'DM',
 'Hypertension',
 'Dyslipidemia',
 'treatment for DM',
 'treatment for Hypertension',
 'treatment for Dyslipidemia']

# Table1

In [24]:
acs_data['Dyslipidemia'] = acs_data['Dyslipidemia_asin_PNP']

## all samples

In [27]:
table1_names = {'Age (years)':'Age', 'Sex, Males (%)':'gender', 'BMI':'bmi', 'Currently smokes (%)':'smoke_tobacco_now',
                'HbA1C (%)':'HbA1C', 'DM (%)':'DM', 'Hypertension (%)':'Hypertension', 'Dyslipidemia (%)':'Dyslipidemia', 
                'Treated for DM (%)':'treatment for DM', 'Treated for Hypertension (%)':'treatment for Hypertension', 
                'Treated for Dyslipidemia (%)':'treatment for Dyslipidemia'}
table1_tests = {k:fisher_exact for k in table1_names}
cont_vars = ['Age (years)', 'BMI', 'HbA1C (%)']
bin_vars = list(set(table1_names.keys()) - set(cont_vars))
for cont_k in cont_vars:
    table1_tests[cont_k] = mannwhitneyu
    
pts_data = acs_data.copy()
normal_data = pnp_data.copy()

table1 = pd.DataFrame(index=['Age (years)', 'Sex, Males (%)', 'BMI', 'Currently smokes (%)'],
                      columns=[f'ACS (n={pts_data.shape[0]})', f'Healthy (n={normal_data.shape[0]})', 'P value'])
table1.index.names = ['Characteristics']

for b_v in bin_vars:
    print (b_v)
    pts_n = pts_data[table1_names[b_v]].sum()
    pts_non_na = pts_data[table1_names[b_v]].dropna().shape[0]
    pts_per, pts_rest = 100 * pts_n / pts_non_na, pts_non_na - pts_n
    norm_n = normal_data[table1_names[b_v]].sum()
    norm_non_na = normal_data[table1_names[b_v]].dropna().shape[0]
    norm_per, norm_rest = 100 * norm_n / norm_non_na, norm_non_na - norm_n
    p = fisher_exact([[pts_n, pts_rest], [norm_n, norm_rest]])[1]
    table1.loc[b_v] = ['%0.1f'%(pts_per) + '%','%0.1f'%(norm_per) + '%', '%0.2g'%p]
    
for c_v in cont_vars:
    pts_mean, pts_std = pts_data[table1_names[c_v]].mean(), pts_data[table1_names[c_v]].std()
    norm_mean, norm_std = normal_data[table1_names[c_v]].mean(), normal_data[table1_names[c_v]].std()
    p = mannwhitneyu(pts_data[table1_names[c_v]].dropna(), normal_data[table1_names[c_v]].dropna())[1]
    table1.loc[c_v] = ['%0.1f +- %0.1f'%(pts_mean, pts_std), '%0.1f +- %0.1f'%(norm_mean, norm_std), '%0.2g'%p]

    
# table1 = table1.applymap(lambda x: x.replace('+-', r'$\pm$'))
table1.to_csv(os.path.join(curr_dir, 'Table1_all_samples.csv'))
table1
# bin_vars

Hypertension (%)
DM (%)
Sex, Males (%)
Treated for Dyslipidemia (%)
Treated for Hypertension (%)
Treated for DM (%)
Dyslipidemia (%)
Currently smokes (%)


Unnamed: 0_level_0,ACS (n=199),Healthy (n=970),P value
Characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Age (years),60.3 +- 9.6,43.3 +- 12.7,1e-52
"Sex, Males (%)",89.9%,39.3%,1.1e-42
BMI,27.5 +- 4.2,26.5 +- 5.1,4.5e-05
Currently smokes (%),32.8%,10.4%,6e-14
Hypertension (%),56.1%,16.0%,6.5e-30
DM (%),31.2%,3.6%,7.5e-28
Treated for Dyslipidemia (%),48.2%,7.5%,3.4e-39
Treated for Hypertension (%),42.7%,7.1%,1.6e-32
Treated for DM (%),20.6%,2.0%,8.8e-20
Dyslipidemia (%),64.3%,21.8%,1.6e-30


## MMC no HPN + Dyslipidemia

In [28]:
mmc = pd.read_csv(os.path.join(pnp_acs_matching_dir, 'PNP1-ACS_metabolon_age+sex+bmi+DM+smoking_1:1.csv'), index_col=0)
mmc_pnp = mmc[mmc['ACS sample']==0].index.astype(str)
mmc_acs = mmc[mmc['ACS sample']==1].index.astype(str)

In [30]:
table1_names = {'Age (years)':'Age', 'Sex, Males (%)':'gender', 'BMI':'bmi', 'Currently smokes (%)':'smoke_tobacco_now',
                'HbA1C (%)':'HbA1C', 'DM (%)':'DM', 'Hypertension (%)':'Hypertension', 'Dyslipidemia (%)':'Dyslipidemia', 
                'Treated for DM (%)':'treatment for DM', 'Treated for Hypertension (%)':'treatment for Hypertension', 'Treated for Dyslipidemia (%)':'treatment for Dyslipidemia'}
table1_tests = {k:fisher_exact for k in table1_names}
cont_vars = ['Age (years)', 'BMI', 'HbA1C (%)']
bin_vars = list(set(table1_names.keys()) - set(cont_vars))
for cont_k in cont_vars:
    table1_tests[cont_k] = mannwhitneyu
    
pts_data = acs_data.loc[mmc_acs]
normal_data = pnp_data.loc[mmc_pnp]

table1 = pd.DataFrame(index=['Age (years)', 'Sex, Males (%)', 'BMI', 'Currently smokes (%)'],
                      columns=[f'ACS (n={pts_data.shape[0]})', f'Healthy (n={normal_data.shape[0]})', 'P value'])
table1.index.names = ['Characteristics']

for b_v in bin_vars:
    print (b_v)
    pts_n = pts_data[table1_names[b_v]].sum()
    pts_non_na = pts_data[table1_names[b_v]].dropna().shape[0]
    pts_per, pts_rest = 100 * pts_n / pts_non_na, pts_non_na - pts_n
    norm_n = normal_data[table1_names[b_v]].sum()
    norm_non_na = normal_data[table1_names[b_v]].dropna().shape[0]
    norm_per, norm_rest = 100 * norm_n / norm_non_na, norm_non_na - norm_n
    p = fisher_exact([[pts_n, pts_rest], [norm_n, norm_rest]])[1]
    table1.loc[b_v] = ['%0.1f'%(pts_per) + '%','%0.1f'%(norm_per) + '%', '%0.2g'%p]
    
for c_v in cont_vars:
    pts_mean, pts_std = pts_data[table1_names[c_v]].mean(), pts_data[table1_names[c_v]].std()
    norm_mean, norm_std = normal_data[table1_names[c_v]].mean(), normal_data[table1_names[c_v]].std()
    p = mannwhitneyu(pts_data[table1_names[c_v]].dropna(), normal_data[table1_names[c_v]].dropna())[1]
    table1.loc[c_v] = ['%0.1f +- %0.1f'%(pts_mean, pts_std), '%0.1f +- %0.1f'%(norm_mean, norm_std), '%0.2g'%p]

    
# table1 = table1.applymap(lambda x: x.replace('+-', r'$\pm$'))
table1.to_csv(os.path.join(curr_dir, 'Table1_age+sex+bmi+DM+smoking.csv'))
table1
# bin_vars

Hypertension (%)
DM (%)
Sex, Males (%)
Treated for Dyslipidemia (%)
Treated for Hypertension (%)
Treated for DM (%)
Dyslipidemia (%)
Currently smokes (%)


Unnamed: 0_level_0,ACS (n=83),Healthy (n=83),P value
Characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Age (years),57.8 +- 9.9,57.4 +- 9.5,0.36
"Sex, Males (%)",79.5%,79.5%,1.0
BMI,28.1 +- 4.3,27.7 +- 4.8,0.24
Currently smokes (%),21.7%,14.5%,0.31
Hypertension (%),48.8%,38.6%,0.21
DM (%),18.1%,15.7%,0.84
Treated for Dyslipidemia (%),43.4%,25.3%,0.022
Treated for Hypertension (%),34.9%,22.9%,0.12
Treated for DM (%),13.3%,9.6%,0.63
Dyslipidemia (%),55.4%,38.6%,0.043


## microbiome - MMC no HPN+Dys

In [32]:
mmc = pd.read_csv(os.path.join(pnp_acs_matching_dir, 'PNP1-ACS_microbiome_age+sex+bmi+DM+smoking_1:1.csv'), index_col=0)
mmc_pnp = mmc[mmc['ACS sample']==0].index.astype(str)
mmc_acs = mmc[mmc['ACS sample']==1].index.astype(str)

In [33]:
table1_names = {'Age (years)':'Age', 'Sex, Males (%)':'gender', 'BMI':'bmi', 'Currently smokes (%)':'smoke_tobacco_now',
               'HbA1C (%)':'HbA1C', 'DM (%)':'DM', 'Hypertension (%)':'Hypertension', 'Dyslipidemia (%)':'Dyslipidemia', 
                'Treated for DM (%)':'treatment for DM', 'Treated for Hypertension (%)':'treatment for Hypertension', 'Treated for Dyslipidemia (%)':'treatment for Dyslipidemia'}
table1_tests = {k:fisher_exact for k in table1_names}
cont_vars = ['Age (years)', 'BMI', 'HbA1C (%)']
bin_vars = list(set(table1_names.keys()) - set(cont_vars))
for cont_k in cont_vars:
    table1_tests[cont_k] = mannwhitneyu
    
pts_data = acs_data.loc[mmc_acs]
normal_data = pnp_data.loc[mmc_pnp]

table1 = pd.DataFrame(index=['Age (years)', 'Sex, Males (%)', 'BMI', 'Currently smokes (%)'],
                      columns=[f'ACS (n={pts_data.shape[0]})', f'Healthy (n={normal_data.shape[0]})', 'P value'])
table1.index.names = ['Characteristics']

for b_v in bin_vars:
    print (b_v)
    pts_n = pts_data[table1_names[b_v]].sum()
    pts_non_na = pts_data[table1_names[b_v]].dropna().shape[0]
    pts_per, pts_rest = 100 * pts_n / pts_non_na, pts_non_na - pts_n
    norm_n = normal_data[table1_names[b_v]].sum()
    norm_non_na = normal_data[table1_names[b_v]].dropna().shape[0]
    norm_per, norm_rest = 100 * norm_n / norm_non_na, norm_non_na - norm_n
    p = fisher_exact([[pts_n, pts_rest], [norm_n, norm_rest]])[1]
    table1.loc[b_v] = ['%0.1f'%(pts_per) + '%','%0.1f'%(norm_per) + '%', '%0.2g'%p]
    
for c_v in cont_vars:
    pts_mean, pts_std = pts_data[table1_names[c_v]].mean(), pts_data[table1_names[c_v]].std()
    norm_mean, norm_std = normal_data[table1_names[c_v]].mean(), normal_data[table1_names[c_v]].std()
    p = mannwhitneyu(pts_data[table1_names[c_v]].dropna(), normal_data[table1_names[c_v]].dropna())[1]
    table1.loc[c_v] = ['%0.1f +- %0.1f'%(pts_mean, pts_std), '%0.1f +- %0.1f'%(norm_mean, norm_std), '%0.2g'%p]

    
# table1 = table1.applymap(lambda x: x.replace('+-', r'$\pm$'))
table1.to_csv(os.path.join(curr_dir, 'Table1_microbiome_age+sex+bmi+DM+smoking.csv'))
table1
# bin_vars

Hypertension (%)
DM (%)
Sex, Males (%)
Treated for Dyslipidemia (%)
Treated for Hypertension (%)
Treated for DM (%)
Dyslipidemia (%)
Currently smokes (%)


Unnamed: 0_level_0,ACS (n=64),Healthy (n=64),P value
Characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Age (years),53.8 +- 9.8,54.6 +- 9.1,0.25
"Sex, Males (%)",75.0%,67.2%,0.44
BMI,28.2 +- 4.5,27.9 +- 3.9,0.33
Currently smokes (%),26.6%,25.0%,1.0
Hypertension (%),39.7%,37.5%,0.86
DM (%),15.6%,12.5%,0.8
Treated for Dyslipidemia (%),31.2%,28.1%,0.85
Treated for Hypertension (%),25.0%,18.8%,0.52
Treated for DM (%),15.6%,4.7%,0.076
Dyslipidemia (%),50.0%,50.0%,1.0
