## Notebook to browse the replicated ieQTL result for risk variants

In [1]:
!date

Fri Jul  2 21:52:04 UTC 2021


#### import libraries and set notebook variables

In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import concurrent.futures
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# parameters
disc_cohort = 'ppmi'
rep_cohort = 'pdbp'
tissue = 'wb'
interaction_term = 'GRS'

In [4]:
# naming
cohort_build = f'{disc_cohort}.{rep_cohort}'

# directories
home_dir = '/home/jupyter'
wrk_dir = f'{home_dir}/{disc_cohort}'
genos_dir = f'{wrk_dir}/genotypes'
tensorqtl_dir = f'{wrk_dir}/tensorqtl'
results_dir = f'{wrk_dir}/results'
gwas_dir = f'{home_dir}/gwas'

# input files
gencode_pkl = f'{home_dir}/amppd/expression/gencode.v29.primary_assembly.annotation.pkl'
eqtl_results_file = f'{results_dir}/{cohort_build}.replicated.wb.{interaction_term}.cis.ieqtl.csv'
meta5_st2_clean_file = f'{gwas_dir}/pd.table_s2.clean.txt'
meta5_stats_file = f'{gwas_dir}/pdmeta_sumstats_hg38.h5'

# output files

# constant values
autosomes = [str(x) for x in list(range(1,23))]
alpha_value = 0.05
capture_out = !(nproc)
max_threads = int(capture_out[0])


#### analysis functions

In [5]:
def mixed_model(formula, df, group_name):
    """Fit a linear mixed-effects model and return the fitted result.

    formula: model formula string passed to statsmodels (e.g. 'y ~ x')
    df: DataFrame holding the columns named in the formula and `group_name`
    group_name: name of the column in `df` whose values define the groups
    """
    grouping_values = df[group_name]
    return sm.MixedLM.from_formula(formula, df, groups=grouping_values).fit()

def regress_zscores_keep(this_df):
    """Regress absolute GWAS z-scores on absolute eQTL z-scores.

    Fits `gwas_zscore_abs ~ eqtl_zscore_abs` as a mixed-effects model grouped
    by the `month` (visit) column of `this_df`.

    Returns a list `[coef, stderr, term_cnt, p-value]` for the
    `eqtl_zscore_abs` term, where `term_cnt` is the number of fitted
    parameters. The list is never empty, so it is always truthy: callers must
    inspect the coefficient and p-value rather than test the return value
    directly.
    """
    term = 'eqtl_zscore_abs'
    # mixed effects model with visit as the grouping (random effect)
    result = mixed_model(f'gwas_zscore_abs ~ {term}', this_df, 'month')
    return [result.params[term], result.bse[term],
            result.params.shape[0], result.pvalues[term]]

def create_merged_df_to_regress(eqtl_df, pheno_id, gwas_df):
    """Build the per-phenotype frame used for the GWAS~eQTL regression.

    Selects the rows of `eqtl_df` for `pheno_id`, inner-joins them to
    `gwas_df` (eQTL `variant_id` against GWAS `SNP`), adds absolute z-score
    columns for both studies, and returns only the variants that are
    nominally significant (below the notebook-level `alpha_value`) in either
    the eQTL interaction test (`pval_gi`) or the GWAS (`p`).
    """
    is_pheno = eqtl_df['phenotype_id'] == pheno_id
    merged_df = eqtl_df.loc[is_pheno].merge(gwas_df, how='inner',
                                            left_on='variant_id', right_on='SNP')

    # the regression is run on absolute z-scores (beta / standard error)
    # rather than on p-values; month is stored as object for use as a grouping
    merged_df = merged_df.assign(
        month=merged_df['month'].astype(object),
        gwas_zscore_abs=np.abs(merged_df['b'] / merged_df['se']),
        eqtl_zscore_abs=np.abs(merged_df['b_gi'] / merged_df['b_gi_se']),
    )

    nominal_in_eqtl = merged_df['pval_gi'] < alpha_value
    nominal_in_gwas = merged_df['p'] < alpha_value
    return merged_df.loc[nominal_in_eqtl | nominal_in_gwas]

def load_chrom_result(chrom, visits, in_dir, cohort, tissue, term,
                      pval_threshold=None, variants_df=None):
    """Load one chromosome's cis-QTL pairs for phenotypes tied to risk variants.

    For every visit the file
    `{in_dir}/{cohort}.{tissue}{visit}.{term}.cis_qtl_pairs.chr{chrom}.parquet`
    is read twice: a first pass finds the phenotypes that have a variant of
    interest with `pval_gi` below the threshold, and a second pass keeps all
    rows belonging to those phenotypes, tagged with the visit in `month`.
    Reading twice avoids holding every visit's full table in memory at once.

    pval_threshold: interaction p-value cut-off; defaults to the notebook-level
        `max_pvalue` when omitted
    variants_df: frame with a `SNP` column of variants of interest; defaults
        to the notebook-level `variants_oi_df` when omitted

    Returns `(phenos_oi, df)` where `phenos_oi` lists each selected phenotype
    id once (in order of first appearance) and `df` is the concatenated rows,
    or None when `visits` is empty.
    """
    # fall back to the notebook globals so existing positional calls still work
    if pval_threshold is None:
        pval_threshold = max_pvalue
    if variants_df is None:
        variants_df = variants_oi_df

    def visit_file(visit):
        return f'{in_dir}/{cohort}.{tissue}{visit}.{term}.cis_qtl_pairs.chr{chrom}.parquet'

    # first pass: find all phenotypes to capture; a phenotype can be flagged
    # at several visits, so track what has been seen to list each one once
    phenos_oi = []
    seen = set()
    for visit in visits:
        chrom_eqtl_df = pd.read_parquet(visit_file(visit))
        oi_results = chrom_eqtl_df.loc[
            (chrom_eqtl_df['pval_gi'] < pval_threshold) &
            (chrom_eqtl_df['variant_id'].isin(variants_df['SNP']))]
        for pheno in oi_results['phenotype_id'].unique():
            if pheno not in seen:
                seen.add(pheno)
                phenos_oi.append(pheno)

    # second pass: keep every result that belongs to those phenotypes
    kept_frames = []
    for visit in visits:
        chrom_eqtl_df = pd.read_parquet(visit_file(visit))
        possible_results_oi = chrom_eqtl_df.loc[
            chrom_eqtl_df['phenotype_id'].isin(phenos_oi)].copy()
        possible_results_oi['month'] = visit
        kept_frames.append(possible_results_oi)

    df_to_return = pd.concat(kept_frames) if kept_frames else None
    return phenos_oi, df_to_return

def process_regression_check(chrom, visits, in_dir, cohort, tissue, term, gwas_df):
    """Run the GWAS~eQTL z-score regression for one chromosome's phenotypes.

    Loads the chromosome's results with `load_chrom_result`, then for each
    phenotype of interest builds the merged regression frame and keeps it when
    the `eqtl_zscore_abs` coefficient is positive and its p-value is below the
    notebook-level `alpha_value`.

    Returns the concatenated frames of the phenotypes kept, or None when no
    phenotype passes (or nothing could be loaded).
    """
    phenos_oi, results_to_test = load_chrom_result(chrom, visits, in_dir, cohort, tissue, term)
    if results_to_test is None:
        # no visits were supplied, so there is nothing to test
        print(f'chr {chrom} no results loaded')
        return None
    print(f'chr {chrom} shape {results_to_test.shape}')

    kept_frames = []
    # dict.fromkeys drops repeated ids while preserving order, so a phenotype
    # flagged at several visits is regressed and kept only once
    for phenotype_id in dict.fromkeys(phenos_oi):
        alpha_df = create_merged_df_to_regress(results_to_test, phenotype_id, gwas_df)
        # nothing nominally significant overlapped the GWAS: no model to fit
        if alpha_df.shape[0] == 0:
            continue

        coef, _stderr, _term_cnt, p_value = regress_zscores_keep(alpha_df)
        # must have positive coefficient and nominally significant p-value
        if coef > 0 and p_value < alpha_value:
            kept_frames.append(alpha_df)

    return pd.concat(kept_frames) if kept_frames else None

#### load the gencode annotations

In [6]:
%%time
gencode_df = pd.read_pickle(gencode_pkl)
# drop the ont and tag columns
discard_cols = gencode_df.columns[(gencode_df.columns.str.startswith('ont:')) |
                                (gencode_df.columns.str.startswith('tag:'))]
gencode_df.drop(columns=discard_cols, inplace=True)
# should only be autosomal but sometimes annotation quirks allow in others, so force
gencode_df = gencode_df.loc[(gencode_df['seqname'].str.startswith('chr')) & 
                           (~gencode_df['seqname'].isin(['chrX','chrY','chrM']))]
# every now and again having problem with mics_RNA genes mapping every chrom so drop
gencode_df = gencode_df.loc[~gencode_df['gene_type'].isin(['misc_RNA'])]
print(gencode_df.shape)
display(gencode_df.head())

(2636995, 22)


Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,ccdsid,exon_id,...,gene_name,gene_type,havana_gene,havana_transcript,level,protein_id,transcript_id,transcript_name,transcript_support_level,transcript_type
0,chr1,HAVANA,gene,11869,14409,.,+,.,,,...,DDX11L1,transcribed_unprocessed_pseudogene,OTTHUMG00000000961.2,,2,,,,,
1,chr1,HAVANA,transcript,11869,14409,.,+,.,,,...,DDX11L1,transcribed_unprocessed_pseudogene,OTTHUMG00000000961.2,OTTHUMT00000362751.1,2,,ENST00000456328.2,DDX11L1-202,1.0,processed_transcript
2,chr1,HAVANA,exon,11869,12227,.,+,.,,ENSE00002234944.1,...,DDX11L1,transcribed_unprocessed_pseudogene,OTTHUMG00000000961.2,OTTHUMT00000362751.1,2,,ENST00000456328.2,DDX11L1-202,1.0,processed_transcript
3,chr1,HAVANA,exon,12613,12721,.,+,.,,ENSE00003582793.1,...,DDX11L1,transcribed_unprocessed_pseudogene,OTTHUMG00000000961.2,OTTHUMT00000362751.1,2,,ENST00000456328.2,DDX11L1-202,1.0,processed_transcript
4,chr1,HAVANA,exon,13221,14409,.,+,.,,ENSE00002312635.1,...,DDX11L1,transcribed_unprocessed_pseudogene,OTTHUMG00000000961.2,OTTHUMT00000362751.1,2,,ENST00000456328.2,DDX11L1-202,1.0,processed_transcript


CPU times: user 5.39 s, sys: 2.22 s, total: 7.61 s
Wall time: 7.61 s


#### load the risk variants of interest

In [7]:
variants_oi_df = pd.read_csv(meta5_st2_clean_file, sep='\t')
print(variants_oi_df.shape)
variants_oi_df.head()

(91, 12)


Unnamed: 0,SNP,CHR,BP,Nearest_Gene,QTL_Nominated_Gene,Effect_allele,Other_allele,EAF,Beta_all_studies,SE_all_studies,P_all_studies,P_COJO_all_studies
0,rs114138760,1,154898185,PMVK,,c,g,0.0112,0.2812,0.0478,4.19e-09,1.09e-08
1,rs35749011,1,155135036,KRTCAP2,EFNA3,a,g,0.0169,0.6068,0.0342,1.72e-70,3.8199999999999996e-77
2,rs76763715,1,155205634,GBAP1,,t,c,0.9953,-0.7467,0.0765,1.59e-22,9.900000000000001e-23
3,rs6658353,1,161469054,FCGR2A,FCGR2A,c,g,0.5011,0.065,0.0094,6.1e-12,4.69e-12
4,rs11578699,1,171719769,VAMP4,VAMP4,t,c,0.1949,-0.0704,0.012,4.47e-09,4.45e-09


#### load the full gwas summary stats

In [8]:
%%time
gwas_stats_df = pd.read_hdf(meta5_stats_file)
print(gwas_stats_df.shape)
display(gwas_stats_df.head())

(7769022, 11)


Unnamed: 0,SNP,A1,A2,freq,b,se,p,N,chr,position,id
2802885,rs61818144,T,C,0.7683,-0.0136,0.0119,0.2531,1460059,chr1,112204228,rs61818144
3747368,rs117477158,A,C,0.0965,0.0039,0.016,0.8079,1474097,chr22,50090194,rs117477158
6997819,rs1484646,T,G,0.2516,-0.0015,0.011,0.8919,1474097,chr8,10170886,rs1484646
2114874,rs150320983,A,G,0.0313,-0.0228,0.0306,0.4562,1460059,chr16,82860411,rs150320983
2275491,rs118177303,A,G,0.9459,0.0134,0.0256,0.6013,1460059,chr17,57782246,rs118177303


CPU times: user 24.5 s, sys: 2.05 s, total: 26.6 s
Wall time: 26.6 s


#### load the replicated eQTL

In [9]:
eqtl_df = pd.read_csv(eqtl_results_file)
print(eqtl_df.shape)
number_genes = len(eqtl_df['phenotype_id'].unique())
print(f'replicated eqtl for {number_genes} genes')
eqtl_df.head()

(20, 22)
replicated eqtl for 2 genes


Unnamed: 0,phenotype_id,variant_id,tss_distance,af,ma_samples,ma_count,pval_g,b_g,b_g_se,pval_i,...,pval_gi,b_gi,b_gi_se,cispair,month,log10_pvalue,z_score,z_score_abs,is_sig,cohort
0,ENSG00000214401.4,rs76969039,-486579,0.418205,802,997,1.1131840000000001e-23,-0.969207,0.094553,4.441574e-11,...,2.397787e-08,1.392393,0.247816,ENSG00000214401.4:rs76969039,0,7.620189,5.618665,5.618665,1,ppmi
1,ENSG00000214401.4,rs76969039,-486579,0.407115,499,618,1.3094319999999999e-20,-1.065344,0.111134,3.184163e-11,...,2.598226e-08,1.893998,0.336624,ENSG00000214401.4:rs76969039,12,7.585323,5.626458,5.626458,1,ppmi
2,ENSG00000214425.7,rs9898399,322621,0.378828,466,569,1.793138e-30,-1.215024,0.101201,0.07218832,...,1.126462e-07,1.574987,0.293973,ENSG00000214425.7:rs9898399,6,6.948283,5.357581,5.357581,1,ppmi
3,ENSG00000214401.4,rs76969039,-486579,0.404794,489,608,5.3010629999999995e-20,-1.03033,0.109307,1.100697e-10,...,1.424551e-07,1.633694,0.30747,ENSG00000214401.4:rs76969039,6,6.846322,5.313346,5.313346,0,ppmi
4,ENSG00000214425.7,rs9898399,322621,0.361224,296,354,2.512451e-20,-1.45583,0.150572,0.02606193,...,7.086006e-07,2.778783,0.552912,ENSG00000214425.7:rs9898399,36,6.149598,5.025722,5.025722,0,ppmi


#### which visits are present in the results (typically all visits in the cohort)

In [10]:
visits = sorted(list(eqtl_df.loc[eqtl_df['cohort'] == disc_cohort]['month'].unique()))
print(visits)

[0, 6, 12, 24, 36]


#### grab the max pval_gi

In [11]:
# largest interaction p-value among the results flagged as significant
max_nomical_pvalue = eqtl_df.loc[eqtl_df['is_sig'] == 1, 'pval_gi'].max()

# alpha divided by the number of genes; uses the shared alpha_value rather
# than repeating the literal 0.05 so both thresholds stay in sync
if number_genes > 0:
    max_cnt_pvalue = alpha_value/number_genes
else:
    print('number of genes less than one, switching to raw alpha value')
    max_cnt_pvalue = alpha_value

print(f'max_nomical_pvalue == {max_nomical_pvalue}')
print(f'max_cnt_pvalue == {max_cnt_pvalue}')

# keep the larger (more permissive) of the two thresholds; if no result is
# flagged significant the max is NaN, the comparison is False, and the
# gene-count threshold is used
max_pvalue = max_nomical_pvalue if max_nomical_pvalue > max_cnt_pvalue else max_cnt_pvalue
print(f'max nominal pvalue {max_pvalue}')

max p-value: 0.0008990619741416


#### see if any of the top significant results happen to be independent risk variants

In [12]:
def check_eqtls_for_variants(eqtl_df, variants_df, gencode_df):
    """Report which eQTL results involve one of the given variants.

    Filters `eqtl_df` to rows whose `variant_id` is listed in
    `variants_df['SNP']`, prints the shape, the distinct variants, the number
    of distinct phenotypes and the matching gene names looked up in
    `gencode_df` (`gene_id` -> `gene_name`), and returns the filtered rows.
    """
    is_variant_oi = eqtl_df['variant_id'].isin(variants_df['SNP'])
    eqtl_variants_oi_df = eqtl_df.loc[is_variant_oi]
    print(eqtl_variants_oi_df.shape)

    variant_ids = eqtl_variants_oi_df['variant_id'].unique()
    print(f'variants {len(variant_ids)}')
    print(variant_ids)

    gene_ids = eqtl_variants_oi_df['phenotype_id'].unique()
    print(f'genes {len(gene_ids)}')

    # translate the phenotype (gene) ids into gene names for readability
    is_gene_oi = gencode_df['gene_id'].isin(eqtl_variants_oi_df['phenotype_id'])
    oi_genes = gencode_df.loc[is_gene_oi, 'gene_name'].unique()
    print(oi_genes)
    return eqtl_variants_oi_df

In [13]:
risk_genes = check_eqtls_for_variants(eqtl_df, variants_oi_df, gencode_df)
if len(risk_genes) > 0:
    display(risk_genes.head())

(0, 22)
variants 0
[]
genes 0
[]


#### now load rest of results and see if risk index variants are eQTL

In [14]:
import warnings
# warnings.filterwarnings(action='once')
warnings.filterwarnings('ignore')

In [15]:
%%time

# Submit one regression-check task per autosome to a pool of worker processes.
# fs_list holds the submitted futures; lm_results is created empty here and is
# filled from the futures in a following cell.
fs_list = []
lm_results = []
# The pool is sized at twice the detected processing-unit count. Leaving the
# `with` block shuts the executor down and waits for the submitted tasks.
with concurrent.futures.ProcessPoolExecutor(max_workers=max_threads*2) as ppe:
    for chrom in autosomes:
        # gwas_stats_df is passed as an argument to every task
        fs_list.append(ppe.submit(process_regression_check, chrom, visits, 
                                  tensorqtl_dir, disc_cohort, tissue, 
                                  interaction_term, gwas_stats_df))
# Results are not gathered in this cell; they are read from fs_list with

# concurrent.futures.as_completed and concatenated in the next cell.

chr 1 shape (25039, 16)
chr 2 shape (0, 16)
chr 3 shape (0, 16)
chr 4 shape (0, 16)
chr 5 shape (0, 16)
chr 6 shape (0, 16)
chr 7 shape (0, 16)
chr 8 shape (0, 16)
chr 9 shape (0, 16)
chr 10 shape (0, 16)
chr 11 shape (0, 16)
chr 13 shape (0, 16)
chr 12 shape (0, 16)
chr 14 shape (0, 16)
chr 15 shape (0, 16)
chr 16 shape (0, 16)
chr 18 shape (0, 16)
chr 17 shape (19803, 16)
chr 20 shape (0, 16)
chr 19 shape (0, 16)
chr 21 shape (0, 16)
chr 22 shape (0, 16)
CPU times: user 3min 17s, sys: 28.8 s, total: 3min 46s
Wall time: 4min 5s


In [None]:
# Rebuild the list from the futures instead of appending to a list created in
# another cell, so re-running this cell does not duplicate results.
lm_results = [future.result() for future in concurrent.futures.as_completed(fs_list)]

# process_regression_check returns None for a chromosome where nothing passed
# the regression check; drop those before flattening
kept_frames = [item for item in lm_results if item is not None]
if not kept_frames:
    raise ValueError('no phenotype passed the GWAS~eQTL regression check on any chromosome')

# flatten the list
results_to_keep = pd.concat(kept_frames)

In [None]:
print(results_to_keep.shape)
display(results_to_keep.head())

In [None]:
print(len(results_to_keep['phenotype_id'].unique()))
print(results_to_keep['phenotype_id'].unique())

In [None]:
genes_kept  = gencode_df.loc[gencode_df['gene_id'].isin(results_to_keep['phenotype_id']), 
                             ['gene_name']]['gene_name'].unique()
print(len(genes_kept))
print(genes_kept)

In [None]:
import random
# Pick one kept phenotype to inspect. The ids are sorted and a fixed seed is
# used so the same gene is chosen on every run and the cells below reproduce;
# change the seed (or assign phenotype_id directly) to look at another gene.
candidate_ids = sorted(results_to_keep['phenotype_id'].unique())
phenotype_id = random.Random(42).choice(candidate_ids)
print(phenotype_id)

In [None]:
gencode_df.loc[gencode_df['gene_id'] == phenotype_id, 
               ['gene_name']]['gene_name'].unique()

In [None]:
# phenotype_id = 'ENSG00000143537.13'
# phenotype_id = 'ENSG00000164733.20'
temp = results_to_keep.loc[results_to_keep['phenotype_id'] == phenotype_id]
print(temp.shape)
display(temp.head())

In [None]:
sns.lmplot(x='gwas_zscore_abs', y='eqtl_zscore_abs', hue='month', data=temp)

In [None]:
sns.relplot(x='gwas_zscore_abs', 
            y='eqtl_zscore_abs', 
            hue='month', alpha=.5, palette="dark", 
            height=12, data=temp) 

In [None]:
this_formula = 'gwas_zscore_abs ~ eqtl_zscore_abs'
grouping = 'month'
result = mixed_model(this_formula, temp, grouping)
print(result.summary())

In [None]:
term = 'eqtl_zscore_abs'
print(['feature', 'coef', 'stderr', 'term_cnt', 'p-value'])
print(result.params[term], result.bse[term], 
      result.params.shape[0], result.pvalues[term])

In [None]:
this_formula = 'gwas_zscore_abs ~ eqtl_zscore_abs'
reg_model = smf.ols(this_formula, data=temp).fit()
print(reg_model.summary())

In [None]:
this_formula = 'gwas_zscore_abs ~ eqtl_zscore_abs + month'
reg_model = smf.ols(this_formula, data=temp).fit()
print(reg_model.summary())

In [None]:
this_formula = 'gwas_zscore_abs ~ eqtl_zscore_abs + month + eqtl_zscore_abs * month'
reg_model = smf.ols(this_formula, data=temp).fit()
print(reg_model.summary())

In [None]:
# The lme4-style `(1|month)` term is not random-effect syntax in statsmodels
# formulas, and OLS cannot fit random effects. Fit the random intercept for
# month with a mixed model, passing the grouping through `groups`.
this_formula = 'gwas_zscore_abs ~ eqtl_zscore_abs'
reg_model = smf.mixedlm(this_formula, data=temp, groups=temp['month']).fit()
print(reg_model.summary())

#### now scan kept genes in the replication cohort

#### which visits are present in the replication cohort results (typically all visits in the cohort)

In [None]:
rep_visits = sorted(list(eqtl_df.loc[eqtl_df['cohort'] == rep_cohort]['month'].unique()))
print(rep_visits)

In [None]:
def load_pheno_result(ids, chrom, visits, in_dir, cohort, tissue, term):
    """Load one chromosome's cis-QTL pairs for the given phenotype ids.

    Reads `{in_dir}/{cohort}.{tissue}{visit}.{term}.cis_qtl_pairs.chr{chrom}.parquet`
    for each visit, keeps the rows whose `phenotype_id` is in `ids`, and tags
    them with the visit in a `month` column.

    Returns the rows of all visits concatenated, or None when `visits` is empty.
    """
    per_visit_frames = []
    for visit in visits:
        chrom_file = f'{in_dir}/{cohort}.{tissue}{visit}.{term}.cis_qtl_pairs.chr{chrom}.parquet'
        visit_df = pd.read_parquet(chrom_file)
        is_wanted = visit_df['phenotype_id'].isin(ids)
        per_visit_frames.append(visit_df.loc[is_wanted].assign(month=visit))

    if not per_visit_frames:
        return None
    return pd.concat(per_visit_frames)

def process_replication_check(pheno_ids, chrom, visits, in_dir, cohort, tissue,
                              term, gwas_df):
    """Re-run the GWAS~eQTL z-score regression for given phenotypes in a cohort.

    Loads the rows for `pheno_ids` on one chromosome with `load_pheno_result`,
    then for each phenotype present builds the merged regression frame and
    keeps it when the `eqtl_zscore_abs` coefficient is positive and its
    p-value is below the notebook-level `alpha_value` (the same criteria used
    by `process_regression_check`).

    Returns the concatenated frames of the phenotypes kept, or None when no
    phenotype passes (or nothing could be loaded).
    """
    results_to_test = load_pheno_result(pheno_ids, chrom, visits, in_dir, cohort,
                                        tissue, term)
    if results_to_test is None:
        # no visits were supplied, so there is nothing to test
        print(f'chr {chrom} no results loaded')
        return None

    # test the GWAS~eQTL regression for possible significance
    phenos_oi = list(results_to_test['phenotype_id'].unique())
    print(f'chr {chrom} shape {results_to_test.shape} pheno-count {len(phenos_oi)}')

    kept_frames = []
    for phenotype_id in phenos_oi:
        alpha_df = create_merged_df_to_regress(results_to_test, phenotype_id, gwas_df)
        if alpha_df.shape[0] == 0:
            continue

        # regress_zscores_keep returns [coef, stderr, term_cnt, p-value]; that
        # list is always truthy, so the values themselves must be checked
        coef, _stderr, _term_cnt, p_value = regress_zscores_keep(alpha_df)
        if coef > 0 and p_value < alpha_value:
            kept_frames.append(alpha_df)

    return pd.concat(kept_frames) if kept_frames else None

In [None]:
%%time

rep_tensorqtl_dir = f'{home_dir}/{rep_cohort}/tensorqtl'

pheno_ids = list(results_to_keep['phenotype_id'].unique())

rep_results_to_keep = None
for chrom in autosomes:
    chr_result = process_replication_check(pheno_ids, chrom, rep_visits, rep_tensorqtl_dir, 
                                           rep_cohort, tissue, interaction_term, gwas_stats_df)
    if not chr_result is None:
        rep_results_to_keep = pd.concat([rep_results_to_keep, chr_result])

print(rep_results_to_keep.shape)
display(rep_results_to_keep.head())

In [None]:
print(len(rep_results_to_keep['phenotype_id'].unique()))
print(rep_results_to_keep['phenotype_id'].unique())

In [None]:
rep_genes_kept  = gencode_df.loc[gencode_df['gene_id'].isin(rep_results_to_keep['phenotype_id']), 
                             ['gene_name']]['gene_name'].unique()
print(len(rep_genes_kept))
print(rep_genes_kept)

In [None]:
set(genes_kept) - set(rep_genes_kept)