# 03 Evaluate Select FDG-Volumetric Relationships

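If a plasma marker was significantly related to both FDG-PET and volumetrics in the same brain region, an additional model was created to directly estimate the relationship between FDG-PET and volume. In that model the regional volume is the outcome and FDG-PET is the predictor, adjusted for the covariates that remain statistically significant.   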

In [9]:
# install required packages - commented out so it doesn't install every time
#%conda install -n Lauren openpyxl numpy pandas statsmodels plotnine matplotlib scikit-learn scipy mizani nbconvert pandoc pyreadstat kmodes seaborn

# import required packages
import numpy as np;
import pandas as pd;
from sklearn import preprocessing 
import statsmodels as sm;
import statsmodels.formula.api as smf;
import plotnine as p9;
import itertools
import pickle

import scipy; # for spearmann correlation

# read in and clean data

In [10]:
# Root folders used by this notebook (absolute network-share paths).
# Both values end with a trailing '/', so sub-paths can be appended directly.
# NOTE(review): data_folder_loc is not referenced by the cells visible here.
data_folder_loc = "//admsyn/Primary/ADM/CustomerStudies/Rockefeller/Riluzole_Biomarkers/"
code_folder_loc = "//admsyn/homes/@DH-ADMDX/0/lauren.koenig-1606/code/ptau-217 and FDG PET AD - official copy/"

In [11]:
# read in data
# The pickle holds a two-item sequence that is unpacked into the two data sets
# (presumably one per FDG-PET reference region -- confirm against the notebook that writes it).
# code_folder_loc already ends with '/', so no leading slash is added here; this
# matches how the results path is built when the workbook is saved.
# NOTE: unpickling can execute arbitrary code -- only load files produced by this project.
pons_rr, new_para_rr = pd.read_pickle(code_folder_loc + 'output/00_newdata.pkl')

In [12]:
# Single switch for the data set: change data_name to 'pons_rr' to get the result
# when using pons as the FDG-PET reference region.
# all_data is looked up from the same name, so the analysed data and the label used
# in the output file name cannot drift apart (an unknown name raises KeyError).
data_name = 'new_para_rr'
all_data = {'pons_rr': pons_rr, 'new_para_rr': new_para_rr}[data_name]

In [13]:
# define biomarker groups
# The two lists are paired position-by-position (zipped) by the analysis cell:
# FDG_columns2[i] is used as the predictor and vol_columns2[i] as the outcome.
# NOTE(review): 'Par_gm' is listed twice, so the same FDG column is paired with
# both volume columns (names suggest left/right) -- confirm this is intended.
FDG_columns2 = ['Par_gm', 'Par_gm']
vol_columns2 = ['VOL_InfPar_Lz', 'VOL_InfPar_Rz']

# Analyses

## version using log10 plasma data

In [14]:
# p-value cut-off: model terms with p below this value are kept during covariate selection
covariate_p_threshold = 0.05

In [15]:
def _select_significant_covariates(data, y_var, x_var, candidates, p_threshold):
    """Backward-eliminate covariates of an OLS model by p-value.

    Repeatedly fits ``y_var ~ <current terms>`` and keeps only the terms with at
    least one coefficient whose p-value is below ``p_threshold``. ``x_var`` is
    always retained, whatever its p-value. Stops once a fit removes nothing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``y_var`` and every candidate column.
    y_var : str
        Outcome column name.
    x_var : str
        Predictor of interest; forced to stay in the model.
    candidates : list of str
        Starting set of right-hand-side terms (should include ``x_var``).
    p_threshold : float
        Significance cut-off applied to each coefficient's p-value.

    Returns
    -------
    list of str
        The retained term names, in the order they appear in the last fit,
        with ``x_var`` appended if it was not significant itself.
    """
    current = list(candidates)
    while True:
        fitted = smf.ols(formula = y_var + ' ~ ' + ' + '.join(current), data = data).fit()
        significant_terms = fitted.pvalues[fitted.pvalues < p_threshold].index.to_list()

        kept = []
        for term in significant_terms:
            name = term.split('[')[0] # drop the [] level indicators of categorical terms
            if name != 'Intercept' and name not in kept:
                kept.append(name)
        if x_var not in kept:
            kept.append(x_var)

        # kept is always a subset of current, so "no shrinkage" means the selection is stable
        if len(kept) >= len(current):
            return kept
        current = kept


# One single-row result frame per (x_var, y_var) pair; concatenated once at the end
# instead of growing a DataFrame inside the loop.
result_frames = []

for varlist in [vol_columns2]:
    # pair FDG and volume columns position-by-position; explicit column labels keep
    # the frame usable even when there is nothing to pair
    combinations = pd.DataFrame(list(zip(FDG_columns2, varlist)), columns = [0, 1])
    # keep only pairs where both columns exist in the data
    combinations = combinations[combinations[0].isin(all_data.columns) & combinations[1].isin(all_data.columns)].reset_index(drop = True)

    for x_var, y_var in combinations.itertuples(index = False, name = None):

        # rows with both measures present; copy so all_data itself is never modified
        temp_df_baseline = all_data.dropna(subset = [y_var, x_var]).copy()

        # z-score data
        zscore_columns = [y_var, x_var, 'Education_years', 'age']
        temp_df_baseline[zscore_columns] = preprocessing.StandardScaler().fit_transform(temp_df_baseline[zscore_columns])

        # run 1 - check for significant covariates
        all_covariates = ['age', 'Education_years', 'apoe4_carrier', 'sex', 'race_ethnicity', x_var]
        # residuals of the full (all-covariate) model are stored on the working frame;
        # they are not used by the rest of this cell
        full_model = smf.ols(formula = y_var + ' ~ ' + ' + '.join(all_covariates), data = temp_df_baseline).fit()
        temp_df_baseline['full_model_residuals'] = full_model.resid

        sig_pvals_unique = _select_significant_covariates(
            temp_df_baseline, y_var, x_var, all_covariates, covariate_p_threshold)

        # run 2 - with just the significant covariates (but forcing the x_var to be included)
        model_vars = ' + '.join(sig_pvals_unique)
        model = smf.ols(formula = y_var + ' ~ ' + model_vars, data = temp_df_baseline).fit()

        model_p_val_baseline = model.pvalues
        model_B_val_baseline = model.params
        # number of rows with complete data for every term in the final model
        model_n_baseline = len(temp_df_baseline.dropna(subset = sig_pvals_unique + [y_var]))

        # save results: one row; the two *_baseline columns hold the full pvalues/params Series
        temp_results = pd.DataFrame([y_var, x_var, model_p_val_baseline, model_B_val_baseline, model_n_baseline]).T
        temp_results.columns = ['y_var', 'x_var',
                                'model_p_val_baseline', 'model_B_val_baseline', 'model_n_baseline']

        # separate out p and B values into separate columns (one column per model term)
        df_mv_age_pvals = pd.DataFrame(temp_results['model_p_val_baseline'][0]).T
        df_mv_age_pvals.columns = 'model_p_val_' + df_mv_age_pvals.columns
        temp_results[df_mv_age_pvals.columns] = df_mv_age_pvals

        df_mv_age_bvals = pd.DataFrame(temp_results['model_B_val_baseline'][0]).T
        df_mv_age_bvals.columns = 'model_B_val_' + df_mv_age_bvals.columns
        temp_results[df_mv_age_bvals.columns] = df_mv_age_bvals

        # remove self-correlations
        temp_results = temp_results[~(temp_results['y_var'] == temp_results['x_var'])]

        # rename so the predictor's columns share one name across pairs
        # (e.g. 'model_p_val_<x_var>' becomes 'model_p_val_x_var')
        temp_results = temp_results.rename(columns = {col: col.replace(x_var, 'x_var') for col in temp_results.columns})

        # add to full list of results
        result_frames.append(temp_results)

# pd.concat raises on an empty list, so fall back to an empty frame when no pair was fitted
biomarker_relationship_results_log10 = pd.concat(result_frames) if result_frames else pd.DataFrame()

# save results

In [16]:
# Write the results workbook with two sheets:
#   'All'         - every fitted pair, unrounded
#   'Significant' - pairs whose x_var p-value is below 0.05, rounded to 3 decimals
results_path = code_folder_loc + 'output/03_model_results_' + data_name + '.xlsx'

with pd.ExcelWriter(results_path) as writer:
    biomarker_relationship_results_log10.to_excel(writer, sheet_name='All', index=False)

    x_var_p_values = biomarker_relationship_results_log10[['model_p_val_x_var']].min(axis = 1)
    is_significant = x_var_p_values < 0.05
    significant_results = biomarker_relationship_results_log10.loc[is_significant, :].round(3)
    significant_results.to_excel(writer, sheet_name='Significant', index=False)