# 01 Main Analyses

The main analysis consisted of evaluating each plasma biomarker for relationships with 1) other plasma biomarkers, 2) FDG-PET SUVR in target brain regions, 3) cognitive test results, and 4) regional brain volumes in Alzheimer’s disease-relevant regions. Relationships to plasma biomarkers were assessed using general linear models with the log-transformed plasma biomarker as the independent variable (using Python 3.9.1 ‘statsmodels’ package ver. 0.14.0) [41]. The plasma biomarkers were log10-transformed due to their non-normal distribution, as is common in the field.  
  
The specific set of covariates included was individualized to each model, but could include age, years of education, sex, a combination of race and ethnicity, and APOE ε4 carrier status. The covariates included in an individual model were determined by first evaluating a model that included all potential covariates in addition to the main variable of interest. Non-significant covariates were dropped from the model before refitting, and the process was repeated until all remaining covariates were significant. Covariates were considered non-significant, and were dropped, at p ≥ 0.05, although using a cutoff of p ≥ 0.15 produced similar results. The covariates ultimately included in each model can be seen in Supplementary Table S1, where the standardized coefficient (β-weight) and p-value for each covariate are listed if it was included and left blank if it was not. If no models contained a specific covariate, the corresponding column in Supplementary Table S1 is not present. The same information (p-value and β-value) is also listed for the main independent variable; it is these values that are described in the main results of the paper. Due to the exploratory nature of this work, the reported p-values have not been adjusted for multiple comparisons.  

In [1]:
# install required packages - commented out so it doesn't install every time
#%conda install -n Lauren openpyxl numpy pandas statsmodels plotnine matplotlib scikit-learn scipy mizani nbconvert pandoc pyreadstat kmodes seaborn

# import required packages
import numpy as np;
import pandas as pd;
from sklearn import preprocessing 
import statsmodels as sm;
import statsmodels.formula.api as smf;
import plotnine as p9;
import itertools
import pickle

import scipy; # for spearmann correlation

# Root folder (network share) for the study's input files; the data-dictionary
# CSV used to relabel variables is read from a sub-folder of this path.
data_folder_loc = "//admsyn/Primary/ADM/CustomerStudies/Rockefeller/Riluzole_Biomarkers/"

  from pandas.core import (


# read in and clean data

In [2]:
# read in data
# The pickle holds a two-element sequence that is unpacked into pons_rr and new_para_rr.
# NOTE(review): unpickling can execute arbitrary code; this presumably loads a file
# written by an earlier step of the same pipeline - confirm it is never a shared/untrusted file.
pons_rr, new_para_rr = pd.read_pickle('./output/00_newdata.pkl')

In [3]:
# Single switch for the FDG-PET reference region: set data_name to 'pons_rr' to get
# the result when using pons as the reference region. all_data is looked up from
# data_name so the data set and the label used in output file names cannot drift apart.
data_name = 'new_para_rr'
all_data = {'pons_rr': pons_rr, 'new_para_rr': new_para_rr}[data_name]

In [4]:
# define biomarker groups

# FDG-PET columns (only those actually present in the data are used later on)
FDG_columns = [
    'Avg_MedOrbFrontal',
    'Graycer_gm',
    'Avg_PCC',
    'Avg_Hip',
    'MTL_gm',
    'CO',
    'Temp',
    'SensMot_gm',
    'L_Hip',
    'Vermis_gm',
    'FRONTAL_gm',
    'AC_gm',
    'Precun_gm',
    'Par_gm',
    'Temp_gm',
    'R_Hip',
    'PostCing_gm',
]

# plasma markers, and the names of their log10-transformed columns
plasma_columns = [
    'Ab42_40',
    'GFAP',
    'NFL',
    'pTau181',
    'pTau217',
    'pTau231',
    'pTau217_Ab42',
]
log10_plasma_columns = [marker + '_log10' for marker in plasma_columns]

# cognitive / clinical test columns
cog_columns = [
    'MMSE',
    'adascogtotal',
    'bvrt',
    'dstotal',
    'tma',
    'tmb',
    'cowattotal',
    'cdrtotal',
    'cdrsum',
    'adltotal',
    'npitotal',
    'gds',
]

# volume columns, written one left/right pair per line
vol_columns = [
    'VOL_Ventricles_Lz', 'VOL_Ventricles_Rz',
    'VOL_Putamen_Lz', 'VOL_Putamen_Rz',
    'VOL_ParaHip_Lz', 'VOL_ParaHip_Rz',
    'VOL_Fusi_Lz', 'VOL_Fusi_Rz',
    'VOL_InfTemp_Lz', 'VOL_InfTemp_Rz',
    'VOL_MidTemp_Lz', 'VOL_MidTemp_Rz',
    'VOL_SupTemp_Lz', 'VOL_SupTemp_Rz',
    'VOL_Precun_Lz', 'VOL_Precun_Rz',
    'VOL_InfPar_Lz', 'VOL_InfPar_Rz',
    'VOL_ParaPostCentr_Lz', 'VOL_ParaPostCentr_Rz',
    'VOL_SupraMarg_Lz', 'VOL_SupraMarg_Rz',
    'VOL_SupPar_Lz', 'VOL_SupPar_Rz',
    'VOL_OrbitFront_Lz', 'VOL_OrbitFront_Rz',
    'VOL_Insula_Lz', 'VOL_Insula_Rz',
    'VOL_InfFront_Lz', 'VOL_InfFront_Rz',
    'VOL_MidFront_Lz', 'VOL_MidFront_Rz',
    'VOL_SupFront_Lz', 'VOL_SupFront_Rz',
    'VOL_PrecFront_Lz', 'VOL_PrecFront_Rz',
    'VOL_LatOcc_Lz', 'VOL_LatOcc_Rz',
    'VOL_Lingual_Lz', 'VOL_Lingual_Rz',
    'VOL_Cuneus_Lz', 'VOL_Cuneus_Rz',
    'VOL_Pericalc_Lz', 'VOL_Pericalc_Rz',
    'VOL_AntCingulate_Lz', 'VOL_AntCingulate_Rz',
    'VOL_PostCingulate_Lz', 'VOL_PostCingulate_Rz',
    'VOL_Entorhinal_Lz', 'VOL_Entorhinal_Rz',
    'VOL_Hip_Lz', 'VOL_Hip_Rz',
    'VOL_TotalGrayz',
    'VOL_TotalGray_Lz', 'VOL_TotalGray_Rz',
    'VOL_LatTemp_Lz', 'VOL_LatTemp_Rz',
    'VOL_Parietal_Lz', 'VOL_Parietal_Rz',
    'VOL_Frontal_Lz', 'VOL_Frontal_Rz',
    'VOL_InfMidTemp_Lz', 'VOL_InfMidTemp_Rz',
    'VOL_InfInsFrontal_Lz', 'VOL_InfInsFrontal_Rz',
    'VOL_MidSupFrontal_Lz', 'VOL_MidSupFrontal_Rz',
    'VOL_Inf_Mid_Fus_Temp_Lz', 'VOL_Inf_Mid_Fus_Temp_Rz',
    'VOL_Precun_InfPar_Lz', 'VOL_Precun_InfPar_Rz',
    'VOL_Precun_InfPar_Supramarg_Lz', 'VOL_Precun_InfPar_Supramarg_Rz',
    'VOL_LatOccLingCun_Lz', 'VOL_LatOccLingCun_Rz',
    'VOL_InfParSupra_Lz', 'VOL_InfParSupra_Rz',
]

In [5]:
print('any missing covariate data?')
# Restrict to baseline rows that have at least one FDG-PET value, then tabulate
# which combinations of covariates are missing (True = missing).
fdg_cols_in_data = [col for col in FDG_columns if col in all_data.columns]
baseline_rows = all_data[all_data['timepoint'].isin(['base'])]
baseline_with_fdg = baseline_rows.dropna(subset = fdg_cols_in_data, how = 'all')
covariate_is_missing = baseline_with_fdg[['age', 'Education_years', 'sex', 'apoe4_carrier', 'race_ethnicity']].isna()
covariate_is_missing.value_counts().sort_index()

any missing covariate data?


age    Education_years  sex    apoe4_carrier  race_ethnicity
False  False            False  False          False             37
                               True           False              1
                                              True               4
True   True             True   True           True               1
Name: count, dtype: int64

# Analyses

In [6]:
# Cutoff used by the covariate selection below: a term is kept only while its p-value is below this value.
covariate_p_threshold = 0.05 # significance required for a covariate to be included in the final model

In [7]:
# Fit one OLS model per (plasma predictor, outcome) pair and collect the p-values,
# coefficients and row count of each final model, one row per pair.
biomarker_relationship_results = pd.DataFrame()


# Outcome groups, in order: FDG-PET columns present in the data, the log10 plasma
# markers themselves, the cognitive tests, and the volumes.
for varlist in [ [ele for ele in FDG_columns if ele in all_data.columns], log10_plasma_columns, cog_columns, vol_columns]:
    # every predictor/outcome pairing: column 0 is the plasma predictor, column 1 the outcome
    combinations = pd.DataFrame(list(itertools.product(log10_plasma_columns, varlist)))

    for idx in range(combinations[0].size):
        x_var = combinations[0][idx]
        y_var = combinations[1][idx]
        
        # work on a copy restricted to rows where both the outcome and the predictor are present
        temp_df_baseline =  all_data.copy().dropna(subset = [y_var, x_var])

        # z-score data (outcome, predictor and the two continuous covariates) so coefficients are standardized
        temp_df_baseline[[y_var, x_var, 'Education_years', 'age']] = preprocessing.StandardScaler().fit_transform(temp_df_baseline[[y_var, x_var, 'Education_years', 'age']])

        # run 1 - check for significant covariates
        # start from every candidate covariate plus the predictor of interest
        sig_pvals_start = ['age', 'Education_years', 'apoe4_carrier', 'sex', 'race_ethnicity', x_var]
        change = 1
        # NOTE(review): this fit uses the same formula as the first pass of the loop below;
        # its residuals are stored on the temporary frame but are not read again in this cell.
        full_model = smf.ols(formula = y_var + ' ~ ' + ' + '.join(sig_pvals_start), data = temp_df_baseline).fit()
        temp_df_baseline['full_model_residuals'] = full_model.resid

        # Backward elimination: refit, keep only terms with p < covariate_p_threshold,
        # and repeat until the list of terms stops shrinking.
        # NOTE(review): the formula interface presumably drops rows with missing covariate
        # values, so the fitted sample may change between passes - confirm.
        while change > 0:
            model_vars = ' + '.join(sig_pvals_start)        
            test_model = smf.ols(formula = y_var + ' ~ ' + model_vars, data = temp_df_baseline).fit()
            sig_pvals = test_model.pvalues[test_model.pvalues < covariate_p_threshold].index.to_list()
            sig_pvals = [ele.split('[')[0] for ele in sig_pvals] # drop the [] indicators (e.g. 'sex[T.M]' -> 'sex')
            sig_pvals_unique = []
            for item in sig_pvals:
                # de-duplicate and leave out the intercept, which is not a formula term
                if (item not in sig_pvals_unique) & (item not in ['Intercept']): sig_pvals_unique.append(item)
            # the predictor of interest always stays in the model, significant or not
            if (x_var not in sig_pvals_unique):
                sig_pvals_unique = sig_pvals_unique + [x_var]
            # number of terms removed in this pass; zero ends the loop
            change = len(sig_pvals_start) - len(sig_pvals_unique)
            sig_pvals_start = sig_pvals_unique
        
        # run 2 - fit model with just the significant covariates (but forcing the x_var to be included)
        model_vars = ' + '.join(sig_pvals_unique)        
        model = smf.ols(formula = y_var + ' ~ ' + model_vars , data = temp_df_baseline).fit()

        model_p_val_baseline = model.pvalues
        model_B_val_baseline = model.params
        # rows with no missing value in the outcome or any retained term
        model_n_baseline = len(temp_df_baseline.dropna(subset = sig_pvals_unique + [y_var]))


        # save results
        # one-row frame; the p-value and coefficient Series are stored whole in single cells and expanded below
        temp_results  =  pd.DataFrame([y_var, x_var, model_p_val_baseline, model_B_val_baseline, model_n_baseline]).T

        temp_results.columns = ['y_var', 'x_var', 
        'model_p_val_baseline', 'model_B_val_baseline', 'model_n_baseline']
        
        # separate out p and B values into separate columns (one column per model term)
        df_mv_age_pvals = pd.DataFrame(temp_results['model_p_val_baseline'][0]).T
        df_mv_age_pvals.columns = 'model_p_val_' + df_mv_age_pvals.columns
        temp_results[df_mv_age_pvals.columns] = df_mv_age_pvals

        df_mv_age_bvals = pd.DataFrame(temp_results['model_B_val_baseline'][0]).T
        df_mv_age_bvals.columns = 'model_B_val_' + df_mv_age_bvals.columns
        temp_results[df_mv_age_bvals.columns] = df_mv_age_bvals                
        
        # remove self-correlations
        # NOTE(review): pairs with x_var == y_var arise when the outcome group is the plasma
        # list; they are fitted above and only discarded here.
        temp_results = temp_results[~(temp_results['y_var'] == temp_results['x_var'])]

        # rename so var2 is in the same column: the predictor's own name inside a column
        # label is replaced by the literal 'x_var', giving every pair the same columns
        temp_results = temp_results.rename(columns = dict(zip(temp_results.columns, [ele.replace(x_var, 'x_var') for ele in temp_results.columns ])))

        # add to full list of results
        biomarker_relationship_results = pd.concat([biomarker_relationship_results, temp_results])


In [8]:
# add indicator of biomarker group to results
# Families are applied in this order; a later family overwrites an earlier one
# if an outcome name were to appear in both.
fdg_outcomes = [col for col in FDG_columns if col in all_data.columns]
outcome_families = [
    ('FDG PET', fdg_outcomes),
    ('Plasma', log10_plasma_columns),
    ('Cognitive', cog_columns),
    ('Volumetric', vol_columns),
]
biomarker_relationship_results['type'] = 'other'
for family_label, family_columns in outcome_families:
    in_family = biomarker_relationship_results['y_var'].isin(family_columns)
    biomarker_relationship_results.loc[in_family, 'type'] = family_label

# if any covariates were never included then a column was never created for that covariate
# create it here (all NaN) so that the spreadsheet stays consistent no matter which covariates were included
# NOTE(review): the models are fitted with a `race_ethnicity` term, so its columns would
# presumably be named '..._race_ethnicity[...]' rather than '..._race[T.White/NH]' - confirm.
expected_result_columns = [
    'type', 'y_var', 'x_var', 'model_n_baseline',
    'model_B_val_x_var', 'model_p_val_x_var',
    'model_B_val_age', 'model_p_val_age',
    'model_B_val_Education_years', 'model_p_val_Education_years',
    'model_B_val_apoe4_carrier[T.Yes]', 'model_p_val_apoe4_carrier[T.Yes]',
    'model_B_val_race[T.White/NH]', 'model_p_val_race[T.White/NH]',
    'model_B_val_sex[T.M]', 'model_p_val_sex[T.M]',
    'model_B_val_Intercept', 'model_p_val_Intercept',
]
for column_name in expected_result_columns:
    if column_name not in biomarker_relationship_results.columns:
        biomarker_relationship_results[column_name] = np.nan

# save results

In [9]:
import pickle

# Saving the objects: the results table and the data it was computed from go into
# one pickle, named after the reference-region data set in use.
results_pickle_path = './output/01_data_' + data_name + '.pkl'
with open(results_pickle_path, 'wb') as pickle_file:
    pickle.dump([biomarker_relationship_results, all_data], pickle_file)

In [10]:
# clean and organize results before saving as an excel file

# Data dictionary: maps each original variable name to the more decipherable
# name used in the manuscript.
data_dictionary = pd.read_csv(data_folder_loc + 'discrete/data-dictionary.csv')
full_names = {
    variable: label
    for variable, label in zip(data_dictionary['variable_name'], data_dictionary['full_name'])
}

In [11]:
def pval_rounder(pval):
    """Format a p-value as a string for the results table.

    Rules:
      * below 0.001                -> '<0.001'
      * 0.001 up to (not incl.) 0.01 -> 3 decimals
      * 0.045 up to (not incl.) 0.05 -> 3, 4 or 5 decimals depending on how close
        the value is to 0.05, with further decimals added whenever the rounded
        value would otherwise read as 0.05 or more
      * every other value >= 0.01  -> 2 decimals

    A value that satisfies none of the comparisons (NaN, used for covariates
    that are not in a model) is returned unchanged.

    Parameters
    ----------
    pval : float
        The p-value to format.

    Returns
    -------
    str, or the input unchanged when it is NaN.
    """
    if pval < 0.001:
        return '<0.001'

    # Lower bound is inclusive so that exactly 0.001 is formatted too instead of
    # falling through every branch and coming back as a raw float.
    if 0.001 <= pval < 0.01:
        return str(round(pval, 3))

    if 0.045 <= pval < 0.05:
        # Just below the significance cutoff: show enough decimals that the text
        # still reads as less than 0.05 (it is parsed back to a number later when
        # the significant results are selected).
        if pval >= 0.04995:
            digits = 5
        elif pval >= 0.0495:
            digits = 4
        else:
            digits = 3
        rounded = round(pval, digits)
        # 17 decimals is enough to reproduce any float exactly, so this terminates.
        while rounded >= 0.05 and digits < 17:
            digits += 1
            rounded = round(pval, digits)
        return str(rounded)

    if pval >= 0.01:
        return str(round(pval, 2))

    # NaN: no comparison above is true
    return pval


# round p values: format every p-value column of the results table for display
p_value_columns = [
    'model_p_val_x_var',
    'model_p_val_age',
    'model_p_val_Education_years',
    'model_p_val_apoe4_carrier[T.Yes]',
    'model_p_val_race[T.White/NH]',
    'model_p_val_sex[T.M]',
    'model_p_val_Intercept',
]
for p_column in p_value_columns:
    biomarker_relationship_results[p_column] = biomarker_relationship_results[p_column].apply(pval_rounder)

def betaval_rounder(bval):
    """Format a model coefficient as a string for the results table.

    Coefficients with magnitude of at least 0.001 are rounded to 3 decimals;
    smaller ones (including zero) are written in scientific notation with two
    decimals. A value for which neither comparison holds (NaN) is returned
    unchanged.
    """
    magnitude = abs(bval)
    if magnitude < 0.001:
        return '{:.2e}'.format(bval)
    if magnitude >= 0.001:
        return str(round(bval, 3))
    # NaN: neither comparison is true
    return bval

# format every coefficient column of the results table for display
beta_value_columns = [
    'model_B_val_x_var',
    'model_B_val_age',
    'model_B_val_Education_years',
    'model_B_val_apoe4_carrier[T.Yes]',
    'model_B_val_race[T.White/NH]',
    'model_B_val_sex[T.M]',
    'model_B_val_Intercept',
]
for beta_column in beta_value_columns:
    biomarker_relationship_results[beta_column] = biomarker_relationship_results[beta_column].apply(betaval_rounder)

In [12]:
# rename to clarify: readable name for the sample-size column, then drop the
# 'model_' prefix from every column label
biomarker_relationship_results = biomarker_relationship_results.rename(
    columns = {'model_n_baseline': 'number of subjects'})
biomarker_relationship_results = biomarker_relationship_results.rename(
    columns = {col: col.replace('model_', '') for col in biomarker_relationship_results.columns})

# rename to clarify: swap variable names for their manuscript names and spell out 'beta'
for variable_column in ['x_var', 'y_var']:
    biomarker_relationship_results[variable_column] = biomarker_relationship_results[variable_column].replace(full_names)
biomarker_relationship_results.columns = biomarker_relationship_results.columns.str.replace('B_val', 'beta_val')

# assign category order so can more easily sort into desired order
plasma_display_order = [
    'Plasma P-tau217 (log10)',
    'Plasma GFAP (log10)',
    'Plasma NFL (log10)',
    'Plasma Aβ42/40 (log10)',
    'Plasma P-tau181 (log10)',
    'Plasma P-tau231 (log10)',
    'Plasma P-tau217 / Aβ42 (log10)',
]
type_display_order = ['FDG PET', 'Plasma', 'Cognitive', 'Volumetric']
x_var_as_category = biomarker_relationship_results['x_var'].astype('category')
biomarker_relationship_results['x_var'] = x_var_as_category.cat.reorder_categories(plasma_display_order)
type_as_category = biomarker_relationship_results['type'].astype('category')
biomarker_relationship_results['type'] = type_as_category.cat.reorder_categories(type_display_order)

# sort and add an ID number for when table spreads to multiple printed tables
biomarker_relationship_results = (
    biomarker_relationship_results
    .sort_values(['x_var', 'type', 'y_var'])
    .reset_index(drop = True)
)
biomarker_relationship_results['model_id_number'] = biomarker_relationship_results.index

# reorder columns
final_column_order = [
    'model_id_number', 'type', 'y_var', 'x_var', 'number of subjects',
    'beta_val_x_var', 'p_val_x_var',
    'beta_val_age', 'p_val_age',
    'beta_val_Education_years', 'p_val_Education_years',
    'beta_val_apoe4_carrier[T.Yes]', 'p_val_apoe4_carrier[T.Yes]',
    'beta_val_race[T.White/NH]', 'p_val_race[T.White/NH]',
    'beta_val_sex[T.M]', 'p_val_sex[T.M]',
    'beta_val_Intercept', 'p_val_Intercept',
]
biomarker_relationship_results = biomarker_relationship_results[final_column_order]

biomarker_relationship_results

Unnamed: 0,model_id_number,type,y_var,x_var,number of subjects,beta_val_x_var,p_val_x_var,beta_val_age,p_val_age,beta_val_Education_years,p_val_Education_years,beta_val_apoe4_carrier[T.Yes],p_val_apoe4_carrier[T.Yes],beta_val_race[T.White/NH],p_val_race[T.White/NH],beta_val_sex[T.M],p_val_sex[T.M],beta_val_Intercept,p_val_Intercept
0,0,FDG PET,Anterior Cingulate FDG-PET SUVR,Plasma P-tau217 (log10),38,-0.06,0.72,,,,,,,,,,,2.36e-16,1.0
1,1,FDG PET,Frontal FDG-PET SUVR,Plasma P-tau217 (log10),38,-0.3,0.07,,,,,,,,,,,1.77e-15,1.0
2,2,FDG PET,Inferior Parietal FDG-PET SUVR,Plasma P-tau217 (log10),38,-0.492,0.002,,,,,,,,,,,4.58e-16,1.0
3,3,FDG PET,Lateral Temporal FDG-PET SUVR,Plasma P-tau217 (log10),38,-0.514,<0.001,,,,,,,,,,,6.25e-16,1.0
4,4,FDG PET,Medial Temporal FDG-PET SUVR,Plasma P-tau217 (log10),37,-0.081,0.62,,,0.414,0.01,,,,,,,3.22e-04,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
716,716,Volumetric,Right Superior Frontal Volume z-score,Plasma P-tau217 / Aβ42 (log10),27,-0.197,0.32,,,,,,,,,,,0.00e+00,1.0
717,717,Volumetric,Right Superior Parietal Volume z-score,Plasma P-tau217 / Aβ42 (log10),27,0.042,0.83,0.472,0.02,,,,,,,,,-3.47e-17,1.0
718,718,Volumetric,Right Superior Temporal Volume z-score,Plasma P-tau217 / Aβ42 (log10),27,0.183,0.36,,,,,,,,,,,1.39e-17,1.0
719,719,Volumetric,Right Supramarginal Gyrus Volume z-score,Plasma P-tau217 / Aβ42 (log10),27,0.109,0.59,,,,,,,,,,,1.39e-17,1.0


In [13]:
# save results, split with each plasma marker in its own sheet, followed by one
# sheet holding every model whose predictor p-value is below 0.05
sheet_for_marker = [
    ('pTau217', 'Plasma P-tau217 (log10)'),
    ('GFAP', 'Plasma GFAP (log10)'),
    ('NFL', 'Plasma NFL (log10)'),
    ('Ab42_40', 'Plasma Aβ42/40 (log10)'),
    ('pTau181', 'Plasma P-tau181 (log10)'),
    ('pTau231', 'Plasma P-tau231 (log10)'),
    ('pTau217_Ab42', 'Plasma P-tau217 / Aβ42 (log10)'),
]
with pd.ExcelWriter('./output/01_supplemental_full_results_' + data_name + '.xlsx') as writer:
    for sheet_name, marker_label in sheet_for_marker:
        marker_rows = biomarker_relationship_results[biomarker_relationship_results['x_var'].isin([marker_label])]
        marker_rows.to_excel(writer, sheet_name=sheet_name, index=False)

    # p-values are stored as text by now ('<0.001' etc.); strip the '<' to compare numerically
    predictor_p_numeric = biomarker_relationship_results['p_val_x_var'].str.replace('<', '').astype('float64')
    biomarker_relationship_results.loc[(predictor_p_numeric < 0.05), :].to_excel(writer, sheet_name='Significant', index=False)