# 03 Evaluate Select Additional Relationships

If a plasma marker was significantly related to both FDG-PET and volumetrics in the same brain region, an additional model was created to directly estimate the relationship between FDG-PET and volume.   

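To explore the unexpected findings in the relationship between plasma pTau181 and `MTL_gm`, the same model was also fit separately within subgroups defined by age, `cdrsum`, Aβ42/40 and GFAP bins.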

In [69]:
# install required packages - commented out so it doesn't install every time
#%conda install -n Lauren openpyxl numpy pandas statsmodels plotnine matplotlib scikit-learn scipy mizani nbconvert pandoc pyreadstat kmodes seaborn

# import required packages
import numpy as np;
import pandas as pd;
from sklearn import preprocessing 
import statsmodels as sm;
import statsmodels.formula.api as smf;
import plotnine as p9;
import itertools
import pickle

import scipy; # for spearmann correlation

# read in and clean data

In [70]:
# load the prepared data: the pickle holds two objects, unpacked in stored order
# (only read pickles produced by this project - unpickling can execute arbitrary code)
pons_rr, new_para_rr = pd.read_pickle(
    './output/00_newdata.pkl'
)

In [71]:
# Choose which dataset to analyse.
# Change ONLY data_name (to 'pons_rr') to get the result when using pons as the
# FDG-PET reference region. all_data is looked up from data_name, so the data
# being analysed and the label used in the output file name cannot drift apart;
# an unrecognised name fails immediately with a KeyError.
data_name = 'new_para_rr'
all_data = {'pons_rr': pons_rr, 'new_para_rr': new_para_rr}[data_name]

In [72]:
# biomarker groups to pair up position-by-position (FDG-PET column i with volume column i)
# 'Par_gm' is deliberately listed twice: once for the _Lz and once for the _Rz volume column
FDG_columns2 = ['Par_gm'] * 2

vol_columns2 = ['VOL_InfPar_' + side + 'z' for side in ('L', 'R')]

# Additional FDG-Volumetric Analyses

## version using log10 plasma data

In [73]:
# p-value cutoff: a model term must have p below this value to be kept during covariate selection
covariate_p_threshold = 0.05

In [74]:
# Model each volume column as a function of its paired FDG-PET column.
# For every (FDG-PET, volume) pair:
#   1. keep the subjects that have both measures and z-score the continuous variables
#   2. repeatedly refit, dropping covariates whose p-value is not below covariate_p_threshold
#      (the FDG-PET predictor itself is always kept)
#   3. refit with the surviving covariates and store the p-values, coefficients and n
# The one-row result frames are collected in a list and concatenated once at the end,
# instead of growing the results frame inside the loop.
pair_results = []

for varlist in [vol_columns2]:
    # explicit column labels keep the frame well-formed even when there is nothing to pair
    combinations = pd.DataFrame(list(zip(FDG_columns2, varlist)), columns = [0, 1])
    # only keep pairs where both columns exist in the data
    combinations = combinations[combinations[0].isin(all_data.columns) & combinations[1].isin(all_data.columns)].reset_index(drop = True)

    for idx in range(combinations[0].size):
        x_var = combinations[0][idx]
        y_var = combinations[1][idx]

        temp_df_baseline = all_data.dropna(subset = [y_var, x_var]).copy()

        # z-score data (outcome, predictor and the continuous covariates)
        zscore_columns = [y_var, x_var, 'Education_years', 'age']
        temp_df_baseline[zscore_columns] = preprocessing.StandardScaler().fit_transform(temp_df_baseline[zscore_columns])

        # run 1 - check for significant covariates
        sig_pvals_start = ['age', 'Education_years', 'apoe4_carrier', 'sex', 'race_ethnicity', x_var]
        change = 1
        # residuals of the model with every candidate covariate are kept on the frame
        # (not used again in this cell)
        full_model = smf.ols(formula = y_var + ' ~ ' + ' + '.join(sig_pvals_start), data = temp_df_baseline).fit()
        temp_df_baseline['full_model_residuals'] = full_model.resid

        # stop once a refit no longer removes any term
        while change > 0:
            model_vars = ' + '.join(sig_pvals_start)
            test_model = smf.ols(formula = y_var + ' ~ ' + model_vars, data = temp_df_baseline).fit()
            sig_terms = test_model.pvalues[test_model.pvalues < covariate_p_threshold].index
            # drop the [] level indicators and the intercept, de-duplicate while keeping order
            sig_pvals_unique = list(dict.fromkeys(
                term.split('[')[0] for term in sig_terms if term.split('[')[0] != 'Intercept'
            ))
            if x_var not in sig_pvals_unique:
                sig_pvals_unique.append(x_var)
            change = len(sig_pvals_start) - len(sig_pvals_unique)
            sig_pvals_start = sig_pvals_unique

        # run 2 - with just the significant covariates (but forcing the x_var to be included)
        model_vars = ' + '.join(sig_pvals_unique)
        model = smf.ols(formula = y_var + ' ~ ' + model_vars, data = temp_df_baseline).fit()

        model_p_val_baseline = model.pvalues
        model_B_val_baseline = model.params
        # number of rows with no missing value in the outcome or any retained term
        model_n_baseline = len(temp_df_baseline.dropna(subset = sig_pvals_unique + [y_var]))

        # save results (one row per model; the full p-value / coefficient Series are kept in single cells)
        temp_results = pd.DataFrame([y_var, x_var, model_p_val_baseline, model_B_val_baseline, model_n_baseline]).T
        temp_results.columns = ['y_var', 'x_var',
                                'model_p_val_baseline', 'model_B_val_baseline', 'model_n_baseline']

        # separate out p and B values into separate columns (one column per model term)
        df_mv_age_pvals = pd.DataFrame(model_p_val_baseline).T.add_prefix('model_p_val_')
        temp_results[df_mv_age_pvals.columns] = df_mv_age_pvals

        df_mv_age_bvals = pd.DataFrame(model_B_val_baseline).T.add_prefix('model_B_val_')
        temp_results[df_mv_age_bvals.columns] = df_mv_age_bvals

        # remove self-correlations
        temp_results = temp_results[~(temp_results['y_var'] == temp_results['x_var'])]

        # replace the predictor's name in the column labels with the generic 'x_var',
        # so that every model's predictor statistics end up in the same columns
        temp_results = temp_results.rename(columns = lambda col: col.replace(x_var, 'x_var'))

        pair_results.append(temp_results)

# full list of results (empty frame when no pair could be modelled)
biomarker_relationship_results_log10 = pd.concat(pair_results) if pair_results else pd.DataFrame()

# Subgroup Analyses

In [75]:
# average (bilateral) hippocampal volume: row-wise mean of the right and left columns
all_data['VOL_Hip_z'] = all_data[['VOL_Hip_Rz', 'VOL_Hip_Lz']].mean(axis = 1)

# groups / bins of continuous variables to use when graphing
# age groups are kept as pandas intervals (ages outside the edges stay missing)
all_data['age_decade2'] = pd.cut(all_data['age'], [57, 70, 77, 88])

# every other variable: bin edges per source column; the interval labels are stored as text
# and anything that falls outside the edges (or is missing) is labelled 'unknown'
bin_edges = {
    'MMSE': [18, 20, 23, 27],
    'cdrsum': [0, 2.5, 3.5, 10],
    'Ab42_40': [0.04, 0.06, 0.07,  0.11],
    'pTau217': [0.2, 1.1, 2, 4.5],
    'GFAP': [75, 175, 275, 475],
    'NFL': [15, 30, 45, 90],
    'VOL_Hip_z': [-5.4, -2.7, -1.2, 1.3],
}
for source_col, edges in bin_edges.items():
    interval_labels = pd.cut(all_data[source_col], edges).astype('str')
    all_data[source_col + '_bin'] = interval_labels.str.replace('nan', 'unknown')

## version using log10 plasma data

In [76]:
# p-value cutoff for keeping a term during covariate selection
# (same 0.05 value as assigned for the FDG-volumetric analyses earlier in the notebook)
covariate_p_threshold = 0.05

In [77]:
# subgroup variables for the subgroup analyses - this bare expression is only displayed, not assigned
['age_decade2', 'cdrsum_bin', 'Ab42_40_bin', 'GFAP_bin']

['age_decade2', 'cdrsum_bin', 'Ab42_40_bin', 'GFAP_bin']

In [78]:
# inspect the (FDG-PET column, volume column) pairs built for the FDG-volumetric models (display only)
combinations

Unnamed: 0,0,1
0,Par_gm,VOL_InfPar_Lz
1,Par_gm,VOL_InfPar_Rz


In [79]:
# NOTE(review): `all_na_covariates` is not defined in any cell shown in this notebook, so this
# scratch cell raises NameError on a fresh kernel (Restart & Run All) - define it above or delete the cell
temp_df_baseline[all_na_covariates]

0
1
2
3
4
5
6
7
8
9
11


In [80]:
# NOTE(review): the reset below is commented out, so any results added further down are appended to
# whatever biomarker_relationship_results_log10 already holds - presumably intentional; confirm
#biomarker_relationship_results_log10 = pd.DataFrame()


In [None]:

# Subgroup analyses: model MTL_gm as a function of pTau181 separately within each
# level of each subgroup variable, and append the results to
# biomarker_relationship_results_log10.
#
# The data preparation and covariate selection do not depend on the subgroup
# variable, so they are done once, before the loops.
x_var = 'pTau181'
y_var = 'MTL_gm'

temp_df_baseline = all_data.dropna(subset = [y_var, x_var]).copy()

# z-score data (outcome, predictor and the continuous covariates)
zscore_columns = [y_var, x_var, 'Education_years', 'age']
temp_df_baseline[zscore_columns] = preprocessing.StandardScaler().fit_transform(temp_df_baseline[zscore_columns])

# run 1 - check for significant covariates (using all subjects, so each subgroup is run using same set of covariates as others)
sig_pvals_start = ['age', 'Education_years', 'apoe4_carrier', 'sex', 'race_ethnicity', x_var]
change = 1
# residuals of the model with every candidate covariate are kept on the frame
# (not used again in this cell)
full_model = smf.ols(formula = y_var + ' ~ ' + ' + '.join(sig_pvals_start), data = temp_df_baseline).fit()
temp_df_baseline['full_model_residuals'] = full_model.resid

# stop once a refit no longer removes any term
while change > 0:
    model_vars = ' + '.join(sig_pvals_start)
    test_model = smf.ols(formula = y_var + ' ~ ' + model_vars, data = temp_df_baseline).fit()
    sig_terms = test_model.pvalues[test_model.pvalues < covariate_p_threshold].index
    # drop the [] level indicators and the intercept, de-duplicate while keeping order
    sig_pvals_unique = list(dict.fromkeys(
        term.split('[')[0] for term in sig_terms if term.split('[')[0] != 'Intercept'
    ))
    if x_var not in sig_pvals_unique:
        sig_pvals_unique.append(x_var)
    change = len(sig_pvals_start) - len(sig_pvals_unique)
    sig_pvals_start = sig_pvals_unique

# run 2 uses the same formula for every subgroup
model_vars = ' + '.join(sig_pvals_unique)

subgroup_results = []

for subgroup_type in ['age_decade2', 'cdrsum_bin', 'Ab42_40_bin', 'GFAP_bin']:

    # candidate subgroups: labels that occur more than once in the full dataset
    label_counts = all_data[subgroup_type].value_counts()

    for subgroup_label in label_counts[label_counts > 1].index:
        temp_df_baseline2 = temp_df_baseline[temp_df_baseline[subgroup_type].isin([subgroup_label])].copy()

        # rows that can actually enter the model (outcome and every retained term present)
        model_n_baseline = len(temp_df_baseline2.dropna(subset = sig_pvals_unique + [y_var]))

        # label_counts is taken on the full dataset, but subjects missing the outcome,
        # the predictor or a retained covariate never reach the model - skip subgroups
        # that are left with fewer than two usable rows instead of handing them to OLS
        if model_n_baseline < 2:
            print('skipping ' + subgroup_type + ' = ' + str(subgroup_label) + ': fewer than 2 complete cases')
            continue

        # run 2 - with just the significant covariates (but forcing the x_var to be included)
        model = smf.ols(formula = y_var + ' ~ ' + model_vars, data = temp_df_baseline2).fit()

        model_p_val_baseline = model.pvalues
        model_B_val_baseline = model.params

        # save results (one row per model; the full p-value / coefficient Series are kept in single cells)
        temp_results = pd.DataFrame([subgroup_type, subgroup_label, y_var, x_var, model_p_val_baseline, model_B_val_baseline, model_n_baseline]).T
        temp_results.columns = ['subgroup_type', 'subgroup_label', 'y_var', 'x_var',
                                'model_p_val_baseline', 'model_B_val_baseline', 'model_n_baseline']

        # separate out p and B values into separate columns (one column per model term)
        df_mv_age_pvals = pd.DataFrame(model_p_val_baseline).T.add_prefix('model_p_val_')
        temp_results[df_mv_age_pvals.columns] = df_mv_age_pvals

        df_mv_age_bvals = pd.DataFrame(model_B_val_baseline).T.add_prefix('model_B_val_')
        temp_results[df_mv_age_bvals.columns] = df_mv_age_bvals

        # remove self-correlations
        temp_results = temp_results[~(temp_results['y_var'] == temp_results['x_var'])]

        # replace the predictor's name in the column labels with the generic 'x_var',
        # so that every model's predictor statistics end up in the same columns
        temp_results = temp_results.rename(columns = lambda col: col.replace(x_var, 'x_var'))

        subgroup_results.append(temp_results)

# make the cell safe to re-run: discard subgroup rows appended by an earlier run of this cell
# (rows that did not come from a subgroup model have no subgroup_type)
if 'subgroup_type' in biomarker_relationship_results_log10.columns:
    biomarker_relationship_results_log10 = biomarker_relationship_results_log10[biomarker_relationship_results_log10['subgroup_type'].isna()]

# add to full list of results
if subgroup_results:
    biomarker_relationship_results_log10 = pd.concat([biomarker_relationship_results_log10] + subgroup_results)

In [86]:
# display the combined results table
# NOTE(review): the saved output under this cell has columns named `var` / `subgroup`, while the
# current code names them `subgroup_type` / `subgroup_label` - the output looks stale; re-run top to bottom
biomarker_relationship_results_log10

Unnamed: 0,y_var,x_var,model_p_val_baseline,model_B_val_baseline,model_n_baseline,model_p_val_Intercept,model_p_val_x_var,model_B_val_Intercept,model_B_val_x_var,var,subgroup,model_p_val_Education_years,model_B_val_Education_years
0,VOL_InfPar_Lz,Par_gm,Intercept 1.000000 Par_gm 0.000017 dt...,Intercept 2.437464e-17 Par_gm 6.44681...,37,1.0,1.7e-05,2.4374640000000002e-17,0.644681,,,,
0,VOL_InfPar_Rz,Par_gm,Intercept 1.000000 Par_gm 0.000093 dt...,Intercept -1.345467e-16 Par_gm 5.97992...,37,1.0,9.3e-05,-1.345467e-16,0.597993,,,,
0,MTL_gm,pTau181,Intercept 0.846228 Education_years ...,Intercept -0.037026 Education_years ...,20,0.846228,0.011544,-0.03702563,0.428834,age_decade2,"(70, 77]",0.043057,0.428322
0,MTL_gm,pTau181,Intercept 0.527290 Education_years ...,Intercept 0.205544 Education_years ...,13,0.52729,0.034501,0.2055439,1.21859,age_decade2,"(77, 88]",0.03937,0.654713
0,MTL_gm,pTau181,Intercept 0.161545 Education_years ...,Intercept 0.294920 Education_years ...,8,0.161545,0.302512,0.2949196,0.193763,age_decade2,"(57, 70]",0.091521,0.393498
0,MTL_gm,pTau181,Intercept 0.262581 Education_years ...,Intercept -0.171073 Education_years ...,17,0.262581,0.017342,-0.1710733,0.320768,cdrsum_bin,"(2.5, 3.5]",0.006156,0.451301
0,MTL_gm,pTau181,Intercept 0.078591 Education_years ...,Intercept 0.615858 Education_years ...,10,0.078591,0.125999,0.6158584,0.620362,cdrsum_bin,"(0.0, 2.5]",0.721027,0.136995
0,MTL_gm,pTau181,Intercept 0.775778 Education_years ...,Intercept -0.081465 Education_years ...,10,0.775778,0.352489,-0.08146529,0.35215,cdrsum_bin,"(3.5, 10.0]",0.040761,0.721472
0,MTL_gm,pTau181,Intercept 0.544203 Education_years ...,Intercept 0.691717 Education_years ...,4,0.544203,0.33893,0.691717,2.212269,cdrsum_bin,unknown,0.372933,1.282956
0,MTL_gm,pTau181,Intercept 0.641823 Education_years ...,Intercept -0.104241 Education_years ...,14,0.641823,0.05004,-0.1042405,0.582485,Ab42_40_bin,"(0.06, 0.07]",0.009731,0.632668


# save results

In [87]:
# write one workbook per dataset (data_name is part of the file name):
#   sheet 'All'         - every model
#   sheet 'Significant' - only models whose x_var p-value is below 0.05, rounded to 3 decimals
results_path = './output/03_model_results_' + data_name + '.xlsx'

with pd.ExcelWriter(results_path) as writer:
    biomarker_relationship_results_log10.to_excel(writer, sheet_name='All', index=False)

    x_var_p = biomarker_relationship_results_log10[['model_p_val_x_var']].min(axis = 1)
    significant_results = biomarker_relationship_results_log10.loc[(x_var_p<0.05), :].round(3)
    significant_results.to_excel(writer, sheet_name='Significant', index=False)