## Notebook for performing eQTS analysis for PD GRS and expression

In [None]:
# run the shell `date` command so the notebook output records when it was executed
!date

#### import libraries and set notebook variables

In [None]:
import pandas as pd
import os
import numpy as np
import statsmodels.stats.multitest as smm
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

In [None]:
# parameters
# NOTE(review): kept as plain literal assignments; presumably this cell is
# overridden by a notebook parameterisation tool - confirm before restructuring
cohort = 'ppmi'  # prefix for the working directory and the input/output file names
visit = 0  # appended to the tissue code when cohort_build is assembled
visit_name = 'BLM0T1'  # not referenced by any other cell visible in this notebook
tissue = 'wb'  # tissue code used when cohort_build is assembled
quant_type = 'genes'  # not referenced by any other cell visible in this notebook

In [None]:
# naming
# '<cohort>.<tissue><visit>', i.e. 'ppmi.wb0' with the parameter values set in this notebook
cohort_build = f'{cohort}.{tissue}{visit}'

# directories
# everything lives under <home_dir>/<cohort>
home_dir = '/home/jupyter'
wrk_dir = f'{home_dir}/{cohort}'
expr_dir = f'{wrk_dir}/expression'
info_dir = f'{wrk_dir}/sample_info'
eqts_dir = f'{wrk_dir}/eqts'

# in files
# sample covariates (csv), the GRS table (csv) and the expression matrix (hdf5)
covs_file = f'{info_dir}/{cohort}_rna_sample_info.csv'
grs_file = f'{info_dir}/{cohort}.grs.scaled.csv'
quants_file = f'{expr_dir}/{cohort_build}.norm.adj.hdf5'

# out files
# regression results table written by the final cell
eqts_file = f'{eqts_dir}/{cohort_build}.eqts.csv'

In [None]:
# create the output directory (and any missing parents); exist_ok makes re-runs safe
os.makedirs(eqts_dir, exist_ok=True)

#### analysis functions

In [None]:
# fit one ordinary least squares model and report the statistics of a single term
def reg_model(y, x, term):
    """Regress ``y`` on ``x`` plus an intercept and summarise one coefficient.

    Args:
        y: response values, passed to ``sm.OLS`` as the dependent variable.
        x: predictor table; ``sm.add_constant`` adds the intercept column.
        term: label of the predictor in ``x`` whose statistics are returned.

    Returns:
        list: ``[coef, stderr, adjusted r2, number of fitted parameters,
        p-value]``, where coef, stderr and p-value belong to ``term``.
    """
    design = sm.add_constant(x)
    fit = sm.OLS(y, design).fit()

    # a formula-api variant with a GRS * predicted_Lymphocytes interaction was
    # tried here previously; the plain design-matrix fit above is what is used

    coef = fit.params[term]
    stderr = fit.bse[term]
    r2_adj = fit.rsquared_adj
    term_cnt = fit.params.shape[0]
    pvalue = fit.pvalues[term]
    return [coef, stderr, r2_adj, term_cnt, pvalue]


def grsresgression(cov_df, traits_df, dep_term='', extra_dep_terms=None):
    """Regress every trait column on one covariate term and FDR-adjust the p-values.

    Samples are matched on the index of the two frames and only samples found
    in both are analysed. Each column of ``traits_df`` is fitted separately
    with ``reg_model`` against ``dep_term`` plus any ``extra_dep_terms``.
    ``dep_term`` and the number of shared samples are printed as progress
    output.

    Args:
        cov_df: sample-indexed table holding ``dep_term`` and any extra terms.
        traits_df: sample-indexed table with one column per trait.
        dep_term: column of ``cov_df`` whose coefficient is reported. Despite
            the name, it enters the model as a predictor.
        extra_dep_terms: optional sequence of further ``cov_df`` columns to
            include as predictors.

    Returns:
        DataFrame with one row per trait and the columns ``coef``, ``stderr``,
        ``r2_adj``, ``term_cnt``, ``p-value`` and ``bh_fdr``. Traits without
        a p-value keep NaN in ``bh_fdr``.

    Raises:
        ValueError: if the two frames have no sample in common.
    """
    print(dep_term)
    this_sample = set(cov_df.index) & set(traits_df.index)
    print(len(this_sample))
    if not this_sample:
        # fail here with a clear message instead of deep inside the model fit
        raise ValueError('no samples are shared between cov_df and traits_df')

    this_traits_df = traits_df.loc[traits_df.index.isin(this_sample)]
    # put the covariate rows in the same order as the trait rows so that each
    # regression pairs the values of the same sample
    this_cov_df = cov_df.loc[cov_df.index.isin(this_sample)]
    this_cov_df = this_cov_df.reindex(this_traits_df.index)

    model_terms = [dep_term]
    if extra_dep_terms is not None:
        model_terms = model_terms + list(extra_dep_terms)
    this_cov_df = this_cov_df[model_terms]

    # one regression per trait column; the five returned values become rows,
    # which are labelled and then transposed to one row per trait
    lm_results = (
        this_traits_df
        .apply(lambda x: reg_model(x, this_cov_df, dep_term), result_type='expand')
        .rename(index={0: 'coef', 1: 'stderr', 2: 'r2_adj', 3: 'term_cnt', 4: 'p-value'})
        .T
    )

    alpha = 0.05
    method = 'fdr_bh'
    # a single NaN p-value would turn every adjusted value into NaN, so only
    # the traits that produced a p-value are adjusted; the rest stay NaN
    pvalues = lm_results['p-value'].to_numpy(dtype=float)
    has_pvalue = ~np.isnan(pvalues)
    adjusted = np.full(pvalues.shape, np.nan)
    if has_pvalue.any():
        test_adjust = smm.multipletests(pvalues[has_pvalue], alpha=alpha, method=method)
        adjusted[has_pvalue] = test_adjust[1]
    lm_results['bh_fdr'] = adjusted
    return lm_results

In [None]:
# plot the eQTS
def ploteqts(trait_id, study_name, score_df, traits_df):
    """Plot one trait against the GRS with a regression line, coloured by DX.

    Args:
        trait_id: column of ``traits_df`` to draw on the x axis.
        study_name: label used in the plot title.
        score_df: sample-indexed table that must contain the ``GRS`` and
            ``DX`` columns.
        traits_df: sample-indexed table with one column per trait.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # only the three plotted columns are needed; merging the whole trait
    # matrix with the score table just to draw one trait is wasted work
    this_scores_df = score_df.loc[score_df.index.isin(traits_df.index), ['GRS', 'DX']]
    this_scores_df = this_scores_df.reindex(traits_df.index)

    temp = traits_df[[trait_id]].merge(this_scores_df, left_index=True, right_index=True)

    plt.figure(figsize=(9, 9))
    # regression line with a 95% confidence band, then the points again
    # coloured by the DX column
    sns.regplot(x=trait_id, y='GRS', data=temp, ci=95)
    sns.scatterplot(x=trait_id, y='GRS', data=temp, hue='DX')
    plt.xlabel('Trait')
    plt.ylabel('GRS')
    plt.title(f'{trait_id} in {study_name}')
    # place the legend outside the axes, to the right of the plot
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0, prop={'size': 10})
    plt.show()

#### load the known sample covariates

In [None]:
# read the sample covariates; the first csv column becomes the index
covs_df = pd.read_csv(covs_file, index_col=0)
print(covs_df.shape)
# display(covs_df.head())

#### load and add GRS

In [None]:
# read the GRS table; no index column is set, so the rows get a default integer index
grs_df = pd.read_csv(grs_file)
print(grs_df.shape)
# display(grs_df.head())

In [None]:
# attach the GRS columns by matching wgsid (covariates) to IID (GRS table);
# the inner join keeps only samples present in both tables, and the index is
# reset first so that sample_id survives the merge and can be restored after
covs_df = (
    covs_df
    .reset_index()
    .merge(grs_df, left_on='wgsid', right_on='IID', how='inner')
    .set_index('sample_id')
)
print(covs_df.shape)
# display(covs_df.head())

#### load the expression data

In [None]:
%%time
expr_df = pd.read_hdf(quants_file, index_col=0)
print(expr_df.shape)
# display(expr_df.head())

#### run the eQTS (expression ~ GRS regression)

In [None]:
%%time
# first model: every expression column regressed on GRS alone (no extra terms passed)
grs_results = grsresgression(covs_df, expr_df, dep_term='GRS')

In [None]:
# summary statistics for every column of the results table
grs_results.describe()

In [None]:
# size of the full results table, then of the subset with bh_fdr at or below 0.05
print(grs_results.shape)
print(grs_results[grs_results['bh_fdr'].le(0.05)].shape)

#### plot the most significant results

In [None]:
# rescale each expression column with MinMaxScaler (default settings) and
# rebuild a DataFrame with the original row and column labels; this copy is
# only passed to ploteqts in the cells of this notebook
scaled_expr_df =  pd.DataFrame(data=MinMaxScaler().fit_transform(expr_df), 
                               index=expr_df.index, columns=expr_df.columns)
print(scaled_expr_df.shape)
# display(scaled_expr_df.head())

In [None]:
# pick the trait with the smallest FDR; idxmin ignores missing values and
# returns the first label on ties, whereas the builtin min() can return NaN
# when the column contains any and then no row matches
this_trait = grs_results['bh_fdr'].idxmin()
print(grs_results.loc[this_trait])
ploteqts(this_trait, cohort.upper(), covs_df, expr_df)

In [None]:
# same trait again, drawn from the min-max scaled expression values
ploteqts(this_trait, cohort.upper(), covs_df, scaled_expr_df)

#### now run the model with the cell fractions as covariates

In [None]:
%%time
cell_cols = covs_df.columns[covs_df.columns.str.startswith('predicted_')].to_list()
# cell_cols = ['age_at_baseline'] + cell_cols
grs_results = grsresgression(covs_df, expr_df, dep_term='GRS', extra_dep_terms=cell_cols)

In [None]:
# summary statistics for every column of the covariate-adjusted results table
grs_results.describe()

In [None]:
# size of the full results table, then of the subset with bh_fdr at or below 0.05
print(grs_results.shape)
print(grs_results[grs_results['bh_fdr'].le(0.05)].shape)

#### plot the most significant results

In [None]:
# pick the trait with the smallest FDR; idxmin ignores missing values and
# returns the first label on ties, whereas the builtin min() can return NaN
# when the column contains any and then no row matches
this_trait = grs_results['bh_fdr'].idxmin()
print(grs_results.loc[this_trait])
ploteqts(this_trait, cohort.upper(), covs_df, expr_df)

In [None]:
# same trait again, drawn from the min-max scaled expression values
ploteqts(this_trait, cohort.upper(), covs_df, scaled_expr_df)

#### save the results files

In [None]:
# write the current grs_results table to csv; at this point it holds the run
# with the 'predicted_' covariates, which replaced the GRS-only results
grs_results.to_csv(eqts_file)