## Notebook to look for ieQTL replicated between cohorts

In [None]:
# print the current date/time so the saved output shows when the notebook was run
!date

#### import libraries and set notebook variables

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import concurrent.futures
import os

In [None]:
# parameters
# cohort names; each is used below to build that cohort's input/output paths
disc_cohort = 'ppmi'
rep_cohort = 'pdbp'
# tissue label embedded in the result file names (presumably whole blood — TODO confirm)
tissue = 'wb'
# term label embedded in the result file names
term = 'GRS'

In [None]:
# directories
home_dir = '/home/jupyter'
# per-chromosome tensorQTL pair results of the replication cohort are read from here
rep_tensorqtl_dir = f'{home_dir}/{rep_cohort}/tensorqtl'

# in files: top-level cis ieQTL result table of each cohort
disc_eqtl_file = f'{home_dir}/{disc_cohort}/results/{disc_cohort}.{tissue}.{term}.cis.ieqtl.csv'
rep_eqtl_file = f'{home_dir}/{rep_cohort}/results/{rep_cohort}.{tissue}.{term}.cis.ieqtl.csv'

# out files
# adjacent f-strings are joined by the parser, so the file name cannot pick up
# stray whitespace the way a backslash-continued string literal would if indented
replicated_file = (f'{home_dir}/{disc_cohort}/results/{disc_cohort}.{rep_cohort}.'
                   f'replicated.{tissue}.{term}.cis.ieqtl.csv')

# constants
autosomes = [str(chrom_num) for chrom_num in range(1, 23)]
# number of CPUs this process may run on; determined in pure Python instead of
# shelling out to `nproc`, with a fallback where the affinity call is unavailable
max_threads = (len(os.sched_getaffinity(0)) if hasattr(os, 'sched_getaffinity')
               else (os.cpu_count() or 1))
# visit months used to build the per-visit replication file names
months = [0, 1, 6, 12, 18, 24, 36]

#### load top-level results

In [None]:
# load discovery results
# read the discovery cohort's top-level result table, then report its
# (rows, columns) and preview the first rows
disc_eqtl_df = pd.read_csv(disc_eqtl_file)
print(disc_eqtl_df.shape)
display(disc_eqtl_df.head())
# load replication results
# same steps for the replication cohort's top-level result table
rep_eqtl_df = pd.read_csv(rep_eqtl_file)
print(rep_eqtl_df.shape)
display(rep_eqtl_df.head())

#### the cohorts are not expected to share exactly the same index eQTL variants, but check the overlap anyway

In [None]:
# rows flagged significant (`is_sig == 1`) in each cohort's top-level table
disc_sig_df = disc_eqtl_df.loc[disc_eqtl_df['is_sig'] == 1]
rep_sig_df = rep_eqtl_df.loc[rep_eqtl_df['is_sig'] == 1]

# discovery cohort: unique significant cis pairs and genes
cis_pairs = disc_sig_df['cispair'].unique()
print(f'disc cohort cis pairs count {len(cis_pairs)}')
features = disc_sig_df['phenotype_id'].unique()
print(f'disc gene count {len(features)}')

# replication cohort: unique significant cis pairs and genes
rep_cis_pairs = rep_sig_df['cispair'].unique()
print(f'rep cohort cis pairs count {len(rep_cis_pairs)}')
rep_features = rep_sig_df['phenotype_id'].unique()
print(f'rep gene count {len(rep_features)}')

# overlap between the two cohorts, by exact cis pair and by gene
cp_replicated = set(cis_pairs).intersection(rep_cis_pairs)
print(f'replicated index cis pairs {len(cp_replicated)}')
genes_replicated = set(features).intersection(rep_features)
print(f'replicated eQTL genes {len(genes_replicated)}')

#### choose the nominal p-value threshold used to call a discovery signal replicated

In [None]:
# The nominal cutoff is the larger (more permissive) of two candidates:
#   - the largest `pval_gi` among rows flagged significant (`is_sig == 1`) in the
#     full analysis of the replication cohort, i.e. what was FDR significant there
#   - a simple Bonferroni cutoff based on the number of discovery features

rep_is_sig = rep_eqtl_df['is_sig'] == 1
max_fdr_pvalue = rep_eqtl_df.loc[rep_is_sig, 'pval_gi'].max()
max_cnt_pvalue = 0.05/len(features)
print(f'max_fdr_pvalue == {max_fdr_pvalue}')
print(f'max_cnt_pvalue == {max_cnt_pvalue}')
# if no replication row is flagged significant the max above is NaN; the
# comparison is then False and the Bonferroni cutoff is used
if max_fdr_pvalue > max_cnt_pvalue:
    max_pvalue = max_fdr_pvalue
else:
    max_pvalue = max_cnt_pvalue
print(f'max nominal pvalue {max_pvalue}')

#### now that all the cispairs of interest are known get complete data for these

In [None]:
def load_missing_qtl_results(find_items, in_file, month):
    """Read one parquet result file and return only the requested cis pairs.

    A `cispair` key is built for every row as `phenotype_id:variant_id`; rows
    whose key is in `find_items` are kept and tagged with `month`.

    Args:
        find_items: collection of `phenotype_id:variant_id` keys to keep.
        in_file: path of the parquet file to read.
        month: value written to the `month` column of every returned row.

    Returns:
        A new DataFrame holding the matching rows, with the `cispair` and
        `month` columns added; the original row index labels are retained.
    """
    matched_df = (
        pd.read_parquet(in_file)
        .assign(cispair=lambda pairs: pairs['phenotype_id'] + ':' + pairs['variant_id'])
        .loc[lambda pairs: pairs['cispair'].isin(find_items)]
        .assign(month=month)
    )
    return matched_df

In [None]:
# unique cis pairs flagged significant in the discovery cohort; these are the
# keys pulled from the replication cohort's per-chromosome files
disc_sig_rows = disc_eqtl_df['is_sig'] == 1
cis_pairs_to_find = disc_eqtl_df.loc[disc_sig_rows, 'cispair'].unique()
print(f'total cis pairs to pull {len(cis_pairs_to_find)}')

In [None]:
%%time
fs_list = []
lm_results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads*2) as tpe:
    for month in months:
        cohort_build = f'{rep_cohort}.{tissue}{month}'
        for chrom in autosomes:
            this_result_file = f'{rep_tensorqtl_dir}/{cohort_build}.{term}.cis_qtl_pairs.chr{chrom}.parquet'
            if os.path.isfile(this_result_file):
                fs_list.append(tpe.submit(load_missing_qtl_results, cis_pairs_to_find, 
                                          this_result_file, month))
    for future in concurrent.futures.as_completed(fs_list):
        lm_results.append(future.result())

In [None]:
# combine the read results
# stack the per-file frames into one table; each frame keeps its own row index
# labels, so labels can repeat in the combined frame
# NOTE(review): pd.concat raises ValueError when lm_results is empty, i.e. when
# no replication result file was found — confirm the paths if that happens
results_df = pd.concat(lm_results)
print(results_df.shape)
display(results_df.head())

In [None]:
# retrieved replication rows whose `pval_gi` is below the nominal cutoff
passes_cutoff = results_df['pval_gi'] < max_pvalue
replicated_results = results_df[passes_cutoff]
print(replicated_results.shape)
display(replicated_results.head())

In [None]:
# number of rows passing the cutoff at each month
replicated_results['month'].value_counts()

In [None]:
# distinct cis pairs that passed the cutoff in at least one month
replicated_cis_pairs = {pair for pair in replicated_results['cispair']}
print(f'count of replicated cis pairs {len(replicated_cis_pairs)}')

In [None]:
# log10 of zero is undefined, so any p-value of exactly zero is first replaced
# with the smallest float64 above zero
zero_pvalue = results_df['pval_gi'] == 0
results_df.loc[zero_pvalue, 'pval_gi'] = np.nextafter(0, 1)

# derived columns: -log10(p), estimate divided by its standard error, and a
# 1/0 flag for rows below the nominal cutoff
results_df['log10_pvalue'] = -np.log10(results_df['pval_gi'])
results_df['z_score'] = results_df['b_gi'].div(results_df['b_gi_se'])
results_df['is_sig'] = np.where(results_df['pval_gi'] < max_pvalue, 1, 0)

#### subset and combine the discovery and replication cohorts based on the replicated cis pairs
i.e., every month is kept for a replicated cis pair, so individual months may have a sub-significant value, and that is OK

In [None]:
# discovery rows for the replicated cis pairs, labelled with their cohort
disc_replicated_df = disc_eqtl_df.loc[disc_eqtl_df['cispair'].isin(replicated_cis_pairs)].copy()
disc_replicated_df['cohort'] = disc_cohort
print(f'disc shape {disc_replicated_df.shape}')
# pull from the retrieved replication cohort results; every month of a
# replicated cis pair is kept, not only the months that passed the cutoff
rep_replicated_df = results_df.loc[results_df['cispair'].isin(replicated_cis_pairs)].copy()
rep_replicated_df['cohort'] = rep_cohort
# report the replication frame's shape (this line previously printed the discovery shape)
print(f'rep shape {rep_replicated_df.shape}')
# now combine; the cohort label added above tells the rows apart
replicated_df = pd.concat([disc_replicated_df, rep_replicated_df])

In [None]:
# size and first rows of the combined discovery + replication table
print(replicated_df.shape)
display(replicated_df.head())

#### sort this final table by most sig and then month

In [None]:
# order by ascending p-value, ties broken by month; rebinding the name gives
# the same result as an in-place sort
replicated_df = replicated_df.sort_values(by=['pval_gi', 'month'])

In [None]:
# both ends of the sorted table: smallest p-values first, largest last
display(replicated_df.head(), replicated_df.tail())

#### save the file

In [None]:
# write the combined, sorted table as CSV without the row index
replicated_df.to_csv(replicated_file, index=False)

#### annotate direction of effect for plotting

In [None]:
# label each discovery row by the sign of `b_gi`; rows where `b_gi` is zero or
# missing fall into the 'Decrease' branch because only `b_gi > 0` is tested
# NOTE(review): the label is added to the discovery subset only, so the plots
# below show discovery rows — confirm that is intended rather than replicated_df
disc_replicated_df['Direction'] = np.where(disc_replicated_df['b_gi'] > 0,
                                           'Increase', 'Decrease')
display(disc_replicated_df.head())

#### do some quick plotting

In [None]:
# regression of -log10(p) on month, one fit per effect direction
sns.lmplot(data=disc_replicated_df, x='month', y='log10_pvalue',
           hue='Direction', palette='Set1')

In [None]:
# regression of z-score on month, one fit per effect direction
sns.lmplot(data=disc_replicated_df, x='month', y='z_score',
           hue='Direction', palette='Set1')

In [None]:
# regression of `b_gi` on month, one fit per effect direction
sns.lmplot(data=disc_replicated_df, x='month', y='b_gi',
           hue='Direction', palette='Set1')

In [None]:
# regression of `b_gi_se` on month, one fit per effect direction
sns.lmplot(data=disc_replicated_df, x='month', y='b_gi_se',
           hue='Direction', palette='Set1')

In [None]:
# regression of `ma_count` on month, one fit per effect direction
sns.lmplot(data=disc_replicated_df, x='month', y='ma_count',
           hue='Direction', palette='Set1')

In [None]:
# regression of `tss_distance` on month, one fit per effect direction
sns.lmplot(data=disc_replicated_df, x='month', y='tss_distance',
           hue='Direction', palette='Set1')

In [None]:
# line plot of z-score across months, one line per effect direction
sns.relplot(data=disc_replicated_df, kind='line', x='month', y='z_score',
            hue='Direction', palette='Set1')

In [None]:
# line plot of `b_gi` across months, one line per effect direction
sns.relplot(data=disc_replicated_df, kind='line', x='month', y='b_gi',
            hue='Direction', palette='Set1')

In [None]:
# line plot of `b_gi_se` across months, one line per effect direction
sns.relplot(data=disc_replicated_df, kind='line', x='month', y='b_gi_se',
            hue='Direction', palette='Set1')

In [None]:
# line plot of -log10(p) across months, one line per effect direction
sns.relplot(data=disc_replicated_df, kind='line', x='month', y='log10_pvalue',
            hue='Direction', palette='Set1')

In [None]:
# -log10(p) against `tss_distance`, coloured by effect direction
sns.relplot(data=disc_replicated_df, x='tss_distance', y='log10_pvalue',
            hue='Direction', palette='Set1')

In [None]:
# -log10(p) against `tss_distance`, coloured by month; rows are shuffled before
# plotting (presumably so no month is always drawn on top), and the fixed seed
# keeps the point layering identical from run to run
sns.scatterplot(x='tss_distance', y='log10_pvalue', hue='month',
                data=disc_replicated_df.sample(frac=1, random_state=42), palette='Set1')

In [None]:
# z-score against `tss_distance`, coloured by effect direction
sns.relplot(data=disc_replicated_df, x='tss_distance', y='z_score',
            hue='Direction', palette='Set1')

In [None]:
# z-score against `tss_distance`, coloured by month; rows are shuffled before
# plotting (presumably so no month is always drawn on top), and the fixed seed
# keeps the point layering identical from run to run
sns.relplot(x='tss_distance', y='z_score', hue='month',
            data=disc_replicated_df.sample(frac=1, random_state=42), palette='Set1')

In [None]:
# number of month-0 rows in each effect direction
disc_replicated_df.loc[disc_replicated_df['month'] == 0, 'Direction'].value_counts()

In [None]:
# frequency of each -log10(p) value among month-0 rows labelled 'Increase'
month0_increase = (disc_replicated_df['month'] == 0) & (disc_replicated_df['Direction'] == 'Increase')
disc_replicated_df.loc[month0_increase, 'log10_pvalue'].value_counts()