## Notebook to look for eQTS replicated between cohorts

In [1]:
!date

Mon May 10 21:47:39 UTC 2021


#### import libraries and set notebook variables

In [2]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.multitest as smm
import seaborn as sns
import matplotlib.pyplot as plt
import concurrent.futures

In [3]:
# parameters
disc_cohort = 'ppmi'
rep_cohort = 'pdbp'
tissue = 'wb'

# directories
home_dir = '/home/jupyter'

# in files
disc_eqtl_file = f'{home_dir}/{disc_cohort}/eqts/{disc_cohort}.{tissue}.eqts.csv'
rep_eqtl_file = f'{home_dir}/{rep_cohort}/eqts/{rep_cohort}.{tissue}.eqts.csv'

# out files
replicated_file = f'{home_dir}/{disc_cohort}/eqts/{disc_cohort}.{rep_cohort}.replicated.wb.eqts.csvv'

# constants
autosomes = [str(x) for x in list(range(1,23))]
capture_out = !(nproc)
max_threads = int(capture_out[0])

#### plotting functions

In [18]:
#plot the eQTS
def ploteqts(trait_id, study_name, score_df, traits_df):
    this_scores_df = score_df.loc[score_df.index.isin(traits_df.index)]
    this_scores_df = this_scores_df.reindex(traits_df.index)

    temp = traits_df.merge(this_scores_df,left_index=True,right_index=True)

    plt.figure(figsize=(9, 9))
    sns.regplot(x=trait_id,y='GRS', data=temp, ci=95)
    sns.scatterplot(x=trait_id,y='GRS', data=temp, hue='DX')
    plt.xlabel('Trait')
    plt.ylabel('GRS')
    plt.title(f'{trait_id} in {study_name}')
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0,prop={'size': 10})
    plt.show()

#### load top-level results

In [4]:
# load discovery results
disc_eqtl_df = pd.read_csv(disc_eqtl_file)
print(disc_eqtl_df.shape)
display(disc_eqtl_df.head())
# load replication results
rep_eqtl_df = pd.read_csv(rep_eqtl_file)
print(rep_eqtl_df.shape)
display(rep_eqtl_df.head())

(11062, 8)


Unnamed: 0,Name,coef,stderr,r2_adj,term_cnt,p-value,bh_fdr,month
0,ENSG00000253251.2,-0.716597,0.187662,0.019267,7.0,0.000141,0.001355,0
1,ENSG00000136014.11,0.598698,0.174418,0.16238,7.0,0.000619,0.004356,0
2,ENSG00000223819.2,-0.681088,0.161034,0.275661,7.0,2.5e-05,0.000335,0
3,ENSG00000100335.14,0.591136,0.178069,0.125939,7.0,0.000929,0.006035,0
4,ENSG00000090487.10,0.64402,0.165394,0.261476,7.0,0.000104,0.001058,0


(15, 8)


Unnamed: 0,Name,coef,stderr,r2_adj,term_cnt,p-value,bh_fdr,month
0,ENSG00000234997.1,-0.967461,0.227221,0.127156,7.0,2.228767e-05,0.03961992,0
1,ENSG00000265798.6,-1.019431,0.240515,0.021482,7.0,2.425709e-05,0.03961992,0
2,ENSG00000264057.1,-2.064923,0.237017,0.064595,7.0,9.946178e-18,1.462088e-13,0
3,ENSG00000176681.14,-1.413974,0.2313,0.036641,7.0,1.326473e-09,4.874789e-06,0
4,ENSG00000266504.1,-1.119212,0.236929,0.054644,7.0,2.593112e-06,0.006353124,0


#### see if any of the discovery cohort eQTS are also detected in the replications cohort
this is without regard for visit, ie can be in any month

In [7]:
replicated_features = set(disc_eqtl_df['Name']) & set(rep_eqtl_df['Name'])
print(len(replicated_features))

9


In [8]:
replicated_features

{'ENSG00000176681.14',
 'ENSG00000214401.4',
 'ENSG00000214425.7',
 'ENSG00000226904.1',
 'ENSG00000234997.1',
 'ENSG00000238083.7',
 'ENSG00000264057.1',
 'ENSG00000265798.6',
 'ENSG00000266504.1'}

#### so there are a few, but some look to be MAPT region
pull results for these and save

In [9]:
disc_replicated_df = disc_eqtl_df.loc[disc_eqtl_df['Name'].isin(replicated_features)]
disc_replicated_df['cohort'] = disc_cohort
print(f'disc shape {disc_replicated_df.shape}')
# pull from the retrieve replication cohort results
rep_replicated_df = rep_eqtl_df.loc[rep_eqtl_df['Name'].isin(replicated_features)]
rep_replicated_df['cohort'] = rep_cohort
print(f'rep shape {rep_replicated_df.shape}')
# now combine, why I added cohort label to rows
replicated_df = pd.concat([disc_replicated_df, rep_replicated_df])

disc shape (24, 9)
rep shape (15, 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [10]:
print(disc_replicated_df['month'].value_counts())
print(rep_replicated_df['month'].value_counts())

6     9
0     8
12    4
24    3
Name: month, dtype: int64
0     9
6     2
24    2
12    2
Name: month, dtype: int64


In [11]:
print(replicated_df.shape)
display(replicated_df.sample(10))

(39, 9)


Unnamed: 0,Name,coef,stderr,r2_adj,term_cnt,p-value,bh_fdr,month,cohort
5179,ENSG00000238083.7,-0.987833,0.252762,0.026735,7.0,0.0001014846,0.002242,6,ppmi
6144,ENSG00000265798.6,-0.90595,0.246473,0.029177,7.0,0.0002542891,0.004337,6,ppmi
5,ENSG00000238083.7,-1.212011,0.233676,0.058494,7.0,2.51824e-07,0.00074,0,pdbp
10156,ENSG00000264057.1,-1.470728,0.278569,0.030603,7.0,1.696075e-07,2.1e-05,12,ppmi
549,ENSG00000266504.1,-0.871806,0.188467,0.028187,7.0,4.14207e-06,7.7e-05,0,ppmi
9244,ENSG00000214401.4,-1.14679,0.277851,0.017236,7.0,4.079918e-05,0.001236,12,ppmi
7,ENSG00000226904.1,-0.961887,0.218599,0.175704,7.0,1.179388e-05,0.024767,0,pdbp
10373,ENSG00000264057.1,-1.167509,0.297714,0.017414,7.0,9.701653e-05,0.009686,24,ppmi
2314,ENSG00000214425.7,1.007172,0.18561,0.026147,7.0,6.972136e-08,3e-06,0,ppmi
3588,ENSG00000226904.1,-0.628665,0.170569,0.185682,7.0,0.0002383909,0.002058,0,ppmi


#### sort this final table by most sig and then month

In [14]:
replicated_df.sort_values(by=['p-value', 'month'], inplace=True)

In [15]:
display(replicated_df.head())
display(replicated_df.tail())

Unnamed: 0,Name,coef,stderr,r2_adj,term_cnt,p-value,bh_fdr,month,cohort
2,ENSG00000264057.1,-2.064923,0.237017,0.064595,7.0,9.946178e-18,1.462088e-13,0,pdbp
6,ENSG00000214401.4,-1.736081,0.237274,0.043284,7.0,4.688636e-13,3.446147e-09,0,pdbp
8,ENSG00000214425.7,1.467655,0.237622,0.060612,7.0,9.014108e-10,4.416913e-06,0,pdbp
3,ENSG00000176681.14,-1.413974,0.2313,0.036641,7.0,1.326473e-09,4.874789e-06,0,pdbp
2452,ENSG00000264057.1,-1.078744,0.186873,0.030056,7.0,9.958665e-09,5.555977e-07,0,ppmi


Unnamed: 0,Name,coef,stderr,r2_adj,term_cnt,p-value,bh_fdr,month,cohort
6274,ENSG00000176681.14,-0.829641,0.250616,0.018198,7.0,0.000977,0.011206,6,ppmi
10580,ENSG00000214401.4,-0.966901,0.304893,0.015106,7.0,0.001587,0.039668,24,ppmi
2558,ENSG00000238083.7,-0.539441,0.184592,0.004358,7.0,0.00354,0.016928,0,ppmi
5264,ENSG00000266504.1,-0.688821,0.251348,0.02433,7.0,0.006281,0.041667,6,ppmi
9678,ENSG00000265798.6,-0.739441,0.279022,0.035795,7.0,0.008215,0.048494,12,ppmi


#### save the file

In [16]:
replicated_df.to_csv(replicated_file, index=False)