# DECON-EQTL PART3: Looking at results

## Looking at Decon-eQTL results in nabec brain bulk expression data with cell fractions

### November 18, 2020

#### https://github.com/molgenis/systemsgenetics/tree/master/Decon2/Decon-eQTL


### Set up environment

In [2]:
import pandas as pd
from rpy2.robjects.packages import importr
#from rpy2.robjects.vectors import FloatVector
import h5py

stats = importr('stats')
#qvalue = importr('qvalue')


# Cohort being analysed and the root directory of its Decon-eQTL run.
cohort = 'nabec'
WRKDIR = f'/labshare/anni/eqtl/Decon-eQTL/{cohort}'

# Autosomes as strings; the "_sub" list is everything after chromosome 1
# (chromosome 1 seeds the concatenation loops further down).
CHROMOSOMES = [str(number) for number in range(1, 23)]
CHROMOSOMES_sub = CHROMOSOMES[1:]

# Cell types in the order used for the result columns, plus one alias per entry.
cell_types = ['Microglia', 'Astrocyte', 'OPC', 'Oligodendrocyte', 'Neuron']
cell1, cell2, cell3, cell4, cell5 = cell_types


# Saved gene / variant lists are written to and read back from the same folder.
outdir = indir = f'{WRKDIR}/results'

In [3]:
##functions

def find_signif(cell_type):
    """Return the rows of the global results_df whose BH FDR for cell_type is below 0.05.

    Reads the '{cell_type}_bh_fdr' column, prints the shape of the selection
    prefixed with the cell type, and returns the selected rows.
    """
    fdr_column = f'{cell_type}_bh_fdr'
    passes_cutoff = results_df[fdr_column] < 0.05
    hits = results_df.loc[passes_cutoff]
    print(f'{cell_type}: {hits.shape}')
    return hits

def find_nonsignif(cell_type):
    """Return the rows of the global results_df that are not significant for cell_type.

    A row is nonsignificant when its '{cell_type}_bh_fdr' value is >= 0.05, which
    is the exact complement of the '< 0.05' cut used by find_signif. The previous
    '> 0.05' test dropped rows sitting exactly on the threshold from both sets.
    Rows whose FDR is NaN are still excluded, because comparisons with NaN are False.

    Prints the shape of the selection prefixed with the cell type and returns
    the selected rows.
    """
    nonsig_df = results_df.loc[results_df[f'{cell_type}_bh_fdr'] >= 0.05]
    print(f'{cell_type}: '+str(nonsig_df.shape))
    return nonsig_df

def find_nonsignif_genes(cell_type_sig_genes, cell_type):
    """Return the set of genes in the global all_genes_decon that are not in cell_type_sig_genes.

    Prints the size of that set prefixed with the cell type.
    """
    every_gene = set(all_genes_decon)
    remaining = every_gene.difference(set(cell_type_sig_genes))
    print(f'{cell_type}: {len(remaining)}')
    return remaining
    
def to_list(sig_df):
    """Split a results frame into (variant ids, unique gene ids).

    The variant ids are the frame's index in row order. The gene ids are the
    distinct values of the 'GENEID' column, in no particular order. Prints the
    number of distinct genes.
    """
    variant_ids = sig_df.index.tolist()
    unique_genes = list(set(sig_df['GENEID'].tolist()))
    print(f'significant genes: {len(unique_genes)}')
    return variant_ids, unique_genes

    
def to_list_non(sig_df):
    """Split a results frame into (variant ids, unique gene ids), labelled as nonsignificant.

    Same extraction as to_list: index values in row order plus the distinct
    'GENEID' values in no particular order. Only the printed label differs.
    """
    variant_ids = sig_df.index.tolist()
    unique_genes = list(set(sig_df['GENEID'].tolist()))
    print(f'nonsignificant genes: {len(unique_genes)}')
    return variant_ids, unique_genes

def compare(list1, list2, cell_type):
    """Return the distinct items present in both list1 and list2, as a list in no particular order.

    Prints the number of shared items prefixed with cell_type.
    """
    shared = set(list1).intersection(set(list2))
    overlap = list(shared)
    print(f'{cell_type}: {len(overlap)}')
    return overlap

def compare_plinknonsig(cell_list, cell_type):
    """Return the distinct items of cell_list that are absent from the global plink_sig_genes.

    The result is a list in no particular order. Its length is printed
    prefixed with cell_type.
    """
    not_in_bulk = set(cell_list).difference(set(plink_sig_genes))
    overlap = list(not_in_bulk)
    print(f'{cell_type}: {len(overlap)}')
    return overlap

def genelist_to_file(gene_list, file_name):
    """Write every item of gene_list on its own line to '{outdir}/{file_name}'.

    outdir is the module-level output directory; an existing file is overwritten.
    """
    target_path = f'{outdir}/{file_name}'
    with open(target_path, 'w') as handle:
        handle.writelines("%s\n" % entry for entry in gene_list)
            
def file_to_list(filename, cell_type):
    """Read '{indir}/{filename}' and return its lines as a list of strings.

    The trailing newline of each line is stripped and nothing else is changed,
    so a blank line becomes an empty string. The number of lines is printed
    prefixed with cell_type.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the list is built. The previous bare ``open()`` inside the comprehension
    left the handle for the garbage collector.
    """
    with open(f'{indir}/{filename}') as handle:
        list_name = [line.rstrip('\n') for line in handle]
    print(f'{cell_type}: '+str(len(list_name)))
    return list_name

def p_adj(cell_type, in_df):
    """Add a '{cell_type}_bh_fdr' column holding BH-adjusted p-values.

    The '{cell_type}_pvalue' column is cast to float in place, passed to R's
    p.adjust (via the rpy2 `stats` handle) with method 'BH', and the result is
    stored in the new column. in_df is modified in place and also returned.
    Its shape is printed after the column is added.
    """
    pvalue_column = f'{cell_type}_pvalue'
    fdr_column = f'{cell_type}_bh_fdr'

    in_df[pvalue_column] = in_df[pvalue_column].astype(float)

    # The adjustment covers every row currently in in_df.
    raw_pvalues = in_df[pvalue_column].tolist()
    in_df[fdr_column] = stats.p_adjust(raw_pvalues, method='BH')

    print(in_df.shape)
    return in_df

def calc_fdr(chrom):
    """Add BH FDR columns to one chromosome's Decon-eQTL results and save them.

    Reads the header-less, tab-separated file
    '{WRKDIR}/{cohort}.deconResults.nov2020.chr{chrom}.csv' (first column is the
    index), names the 15 result columns, derives VAR (the index) and GENEID (the
    part of VAR before the first underscore), runs p_adj for every entry of
    cell_types, and writes the frame next to the input with a '.fdr.csv' suffix.

    Note that the adjustment is computed over this chromosome's rows only.
    """
    source_path = f'{WRKDIR}/{cohort}.deconResults.nov2020.chr{chrom}.csv'
    target_path = f'{WRKDIR}/{cohort}.deconResults.nov2020.chr{chrom}.fdr.csv'

    column_names = [
        'Microglia_pvalue', 'Astrocyte_pvalue', 'OPC_pvalue',
        'Oligodendrocyte_pvalue', 'Neuron_pvalue',
        'Beta1_Microglia', 'Beta2_Astrocyte', 'Beta3_OPC',
        'Beta4_Oligodendrocyte', 'Beta5_Neuron',
        'Beta6_Microglia:GT', 'Beta7_Astrocyte:GT', 'Beta8_OPC:GT',
        'Beta9_Oligodendrocyte:GT', 'Beta10_Neuron:GT',
    ]

    chrom_df = pd.read_csv(source_path, sep='\t', index_col=0, header=None)
    chrom_df.columns = column_names
    chrom_df['VAR'] = chrom_df.index
    chrom_df['GENEID'] = chrom_df['VAR'].str.split("_").str[0]
    print(f'chrom{chrom}: {chrom_df.shape}')

    # One FDR column per cell type; p_adj prints the growing shape each time.
    for cell in cell_types:
        chrom_df = p_adj(cell, chrom_df)

    chrom_df.to_csv(target_path, sep='\t', index=True)




In [None]:
# Run the FDR correction for every chromosome in CHROMOSOMES; each call reads
# one per-chromosome results file and writes the matching '.fdr.csv' file.
for x in CHROMOSOMES:
    calc_fdr(x)

## Format uploaded biowulf files 


In [None]:
## move to results location to manipulate
# !cp /labshare/anni/eqtl/Decon-eQTL/ppmi/from_biowulf /labshare/anni/eqtl/Decon-eQTL/ppmi/results

In [None]:
# ## format biowulf files
# ##stripping out any headers that also got concatenated
# for chrom in CHROMOSOMES:
#     cmd = 'awk '"'BEGIN {{ OFS=FS="'"\\t"'" }} $2 !~ /^Microglia/'"' {}.deconeqtl.chr{}.results.csv > {}.deconeqtl.chr{}.results.strip.csv &'.format(cohort,chrom,cohort, chrom)
#     print(cmd)
    
# ##concat all into one file
# cmd = f'cat ./*strip.csv > {cohort}.deconeqtl.results.all.cat.csv'  
    

In [None]:
## filter for just significant variants for each cell type
## subset for significant into separate files

# awk -F"\t" '$2<0.05 {print}' ppmi.deconeqtl.results.all.cat.csv > ppmi.deconeqtl.results.all.cat.basophils_sig.csv &
# awk -F"\t" '$3<0.05 {print}' ppmi.deconeqtl.results.all.cat.csv > ppmi.deconeqtl.results.all.cat.eosinophils_sig.csv &
# awk -F"\t" '$4<0.05 {print}' ppmi.deconeqtl.results.all.cat.csv > ppmi.deconeqtl.results.all.cat.lymphocytes_sig.csv &
# awk -F"\t" '$5<0.05 {print}' ppmi.deconeqtl.results.all.cat.csv > ppmi.deconeqtl.results.all.cat.monocytes_sig.csv &
# awk -F"\t" '$6<0.05 {print}' ppmi.deconeqtl.results.all.cat.csv > ppmi.deconeqtl.results.all.cat.neutrophils_sig.csv &




## Correcting p values

In [5]:
#print(f'cp /labshare/anni/notebooks/eqtl/ppmi/a_adj.by_chrom.py /labshare/anni/notebooks/eqtl/{cohort}/')

cp /labshare/anni/notebooks/eqtl/ppmi/a_adj.by_chrom.py /labshare/anni/notebooks/eqtl/nabec/


In [22]:
## /labshare/anni/notebooks/eqtl/nabec/nabec_a_adj.by_chrom.py

import pandas as pd
from rpy2.robjects.packages import importr
#from rpy2.robjects.vectors import FloatVector



stats = importr('stats')
#qvalue = importr('qvalue')


# Cohort being processed; this script works directly in its results_biowulf folder.
cohort = 'nabec'
WRKDIR = f'/labshare/anni/eqtl/Decon-eQTL/{cohort}/results_biowulf'

# Autosomes as strings; the "_sub" list is everything after chromosome 1.
CHROMOSOMES = [str(number) for number in range(1, 23)]
CHROMOSOMES_sub = CHROMOSOMES[1:]

# Cell types in the order used for the result columns.
cell_types = ['Microglia', 'Astrocyte', 'OPC', 'Oligodendrocyte', 'Neuron']

def p_adj(cell_type, in_df):
    """Add a '{cell_type}_bh_fdr' column holding BH-adjusted p-values.

    The '{cell_type}_pvalue' column is cast to float in place, passed to R's
    p.adjust (via the rpy2 `stats` handle) with method 'BH', and the result is
    stored in the new column. in_df is modified in place and also returned.
    Its shape is printed after the column is added.
    """
    pvalue_column = f'{cell_type}_pvalue'
    fdr_column = f'{cell_type}_bh_fdr'

    in_df[pvalue_column] = in_df[pvalue_column].astype(float)

    # The adjustment covers every row currently in in_df.
    raw_pvalues = in_df[pvalue_column].tolist()
    in_df[fdr_column] = stats.p_adjust(raw_pvalues, method='BH')

    print(in_df.shape)
    return in_df

def calc_fdr(chrom):
    """Add BH FDR columns to one chromosome's Decon-eQTL results and save them.

    Reads the header-less, tab-separated file
    '{WRKDIR}/{cohort}.deconResults.nov2020.chr{chrom}.csv' (first column is the
    index), names the 15 result columns, derives VAR (the index) and GENEID (the
    part of VAR before the first underscore), runs p_adj for every entry of
    cell_types, and writes the frame next to the input with a '.fdr.csv' suffix.

    Note that the adjustment is computed over this chromosome's rows only.
    """
    source_path = f'{WRKDIR}/{cohort}.deconResults.nov2020.chr{chrom}.csv'
    target_path = f'{WRKDIR}/{cohort}.deconResults.nov2020.chr{chrom}.fdr.csv'

    column_names = [
        'Microglia_pvalue', 'Astrocyte_pvalue', 'OPC_pvalue',
        'Oligodendrocyte_pvalue', 'Neuron_pvalue',
        'Beta1_Microglia', 'Beta2_Astrocyte', 'Beta3_OPC',
        'Beta4_Oligodendrocyte', 'Beta5_Neuron',
        'Beta6_Microglia:GT', 'Beta7_Astrocyte:GT', 'Beta8_OPC:GT',
        'Beta9_Oligodendrocyte:GT', 'Beta10_Neuron:GT',
    ]

    chrom_df = pd.read_csv(source_path, sep='\t', index_col=0, header=None)
    chrom_df.columns = column_names
    chrom_df['VAR'] = chrom_df.index
    chrom_df['GENEID'] = chrom_df['VAR'].str.split("_").str[0]
    print(f'chrom{chrom}: {chrom_df.shape}')

    # One FDR column per cell type; p_adj prints the growing shape each time.
    for cell in cell_types:
        chrom_df = p_adj(cell, chrom_df)

    chrom_df.to_csv(target_path, sep='\t', index=True)

    

# Run the FDR correction for every chromosome in CHROMOSOMES; each call reads
# one per-chromosome results file and writes the matching '.fdr.csv' file.
for x in CHROMOSOMES:
    calc_fdr(x)

chrom1: (6360474, 17)
(6360474, 18)
(6360474, 19)
(6360474, 20)
(6360474, 21)
(6360474, 22)
chrom2: (4902388, 17)
(4902388, 18)
(4902388, 19)
(4902388, 20)
(4902388, 21)
(4902388, 22)
chrom3: (4366689, 17)
(4366689, 18)
(4366689, 19)
(4366689, 20)
(4366689, 21)
(4366689, 22)
chrom4: (3249307, 17)
(3249307, 18)
(3249307, 19)
(3249307, 20)
(3249307, 21)
(3249307, 22)
chrom5: (3581520, 17)
(3581520, 18)
(3581520, 19)
(3581520, 20)
(3581520, 21)
(3581520, 22)
chrom6: (5128387, 17)
(5128387, 18)
(5128387, 19)
(5128387, 20)
(5128387, 21)
(5128387, 22)
chrom7: (3767901, 17)
(3767901, 18)
(3767901, 19)
(3767901, 20)
(3767901, 21)
(3767901, 22)
chrom8: (2875022, 17)
(2875022, 18)
(2875022, 19)
(2875022, 20)
(2875022, 21)
(2875022, 22)
chrom9: (2775991, 17)
(2775991, 18)
(2775991, 19)
(2775991, 20)
(2775991, 21)
(2775991, 22)
chrom10: (2905647, 17)
(2905647, 18)
(2905647, 19)
(2905647, 20)
(2905647, 21)
(2905647, 22)
chrom11: (3797409, 17)
(3797409, 18)
(3797409, 19)
(3797409, 20)
(3797409, 21)


## Load in all chrom results

In [None]:
# #first 6 columns are the p-values for each of the cell types, and the last 6 columns are the beta (effect size) of the cell type - 
# #genotype interaction effect, relative to the allele that is coded as 2 in the dosage file.

# results_df = pd.read_csv(f'{WRKDIR}/ppmi.deconeqtl.results.chr1.cat.strip.csv', sep='\t', index_col=0)
# results_df.columns = ['Basophils_pvalue','Eosinophils_pvalue','Lymphocytes_pvalue','Monocytes_pvalue','Neutrophils_pvalue','Beta1_Basophils','Beta2_Eosinophils','Beta3_Lymphocytes','Beta4_Monocytes','Beta5_Neutrophils','Beta6_Basophils:GT','Beta7_Eosinophils:GT','Beta8_Lymphocytes:GT','Beta9_Monocytes:GT','Beta10_Neutrophils:GT']
# results_df['VAR'] = results_df.index
# results_df['GENEID'] = results_df['VAR'].str.split("_").str[0]
# # for cell in cell_types:
# #     results_df = p_adj(cell, results_df)
# print('chrom1: '+str(results_df.shape))

# for chrom in CHROMOSOMES_sub:
#     next_df = pd.read_csv(f'{WRKDIR}/ppmi.deconeqtl.results.chr{chrom}.cat.strip.csv', sep='\t', index_col=0)
#     next_df.columns = ['Basophils_pvalue','Eosinophils_pvalue','Lymphocytes_pvalue','Monocytes_pvalue','Neutrophils_pvalue','Beta1_Basophils','Beta2_Eosinophils','Beta3_Lymphocytes','Beta4_Monocytes','Beta5_Neutrophils','Beta6_Basophils:GT','Beta7_Eosinophils:GT','Beta8_Lymphocytes:GT','Beta9_Monocytes:GT','Beta10_Neutrophils:GT']
#     next_df['VAR'] = next_df.index
#     next_df['GENEID'] = next_df['VAR'].str.split("_").str[0]
# #     for cell in cell_types:
# #         results_df = p_adj(cell, results_df)
#     results_df = pd.concat([results_df,next_df])
#     print(f'plus chrom{chrom}: '+str(results_df.shape))

   
# results_df.head()

In [32]:
# Column layout of the per-chromosome '.fdr.csv' files (as named in calc_fdr):
# the first 5 columns are the p-values for each of the cell types, followed by
# 5 cell-type betas and 5 cell-type:genotype interaction betas (effect size
# relative to the allele that is coded as 2 in the dosage file), then VAR,
# GENEID and one '<cell type>_bh_fdr' column per cell type.
#
# NOTE(review): these paths append 'results_biowulf' to WRKDIR, so WRKDIR has to
# be the cohort root (.../Decon-eQTL/{cohort}) when this cell runs, not the
# results_biowulf directory itself. Confirm which setup cell was executed last.

# Collect the per-chromosome frames and concatenate once at the end. Growing
# results_df with pd.concat inside the loop copied every row loaded so far on
# each iteration.
chrom_frames = [pd.read_csv(f'{WRKDIR}/results_biowulf/{cohort}.deconResults.nov2020.chr1.fdr.csv', sep='\t', index_col=0)]
total_rows, n_cols = chrom_frames[0].shape
print('chrom1: '+str(chrom_frames[0].shape))

for chrom in CHROMOSOMES_sub:
    next_df = pd.read_csv(f'{WRKDIR}/results_biowulf/{cohort}.deconResults.nov2020.chr{chrom}.fdr.csv', sep='\t', index_col=0)
    chrom_frames.append(next_df)
    # Running total reproduces the cumulative shape the old loop printed
    # (every file is written by calc_fdr with the same columns).
    total_rows += next_df.shape[0]
    print(f'plus chrom{chrom}: '+str((total_rows, n_cols)))

results_df = pd.concat(chrom_frames)
del chrom_frames  # drop the extra references to the per-chromosome frames

results_df.head()

chrom1: (6360474, 22)
plus chrom2: (11262862, 22)
plus chrom3: (15629551, 22)
plus chrom4: (18878858, 22)
plus chrom5: (22460378, 22)
plus chrom6: (27588765, 22)
plus chrom7: (31356666, 22)
plus chrom8: (34231688, 22)
plus chrom9: (37007679, 22)
plus chrom10: (39913326, 22)
plus chrom11: (43710735, 22)
plus chrom12: (47852587, 22)
plus chrom13: (49441116, 22)
plus chrom14: (52017366, 22)
plus chrom15: (54418046, 22)
plus chrom16: (57503395, 22)
plus chrom17: (61445011, 22)
plus chrom18: (62805122, 22)
plus chrom19: (67930583, 22)
plus chrom20: (69720560, 22)
plus chrom21: (70569647, 22)
plus chrom22: (72241266, 22)


Unnamed: 0_level_0,Microglia_pvalue,Astrocyte_pvalue,OPC_pvalue,Oligodendrocyte_pvalue,Neuron_pvalue,Beta1_Microglia,Beta2_Astrocyte,Beta3_OPC,Beta4_Oligodendrocyte,Beta5_Neuron,...,Beta8_OPC:GT,Beta9_Oligodendrocyte:GT,Beta10_Neuron:GT,VAR,GENEID,Microglia_bh_fdr,Astrocyte_bh_fdr,OPC_bh_fdr,Oligodendrocyte_bh_fdr,Neuron_bh_fdr
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000457.14_chr1:168849631:C:A,1.0,1.0,1.0,0.358177,1.0,0.0,0.0,0.0,0.001498,0.0,...,0.0,-0.015554,0.0,ENSG00000000457.14_chr1:168849631:C:A,ENSG00000000457.14,1.0,1.0,1.0,1.0,1.0
ENSG00000000457.14_chr1:168851510:G:A,1.0,1.0,1.0,0.54974,0.830599,0.0,0.0,0.0,0.0,0.0,...,0.0,0.011772,-0.000231,ENSG00000000457.14_chr1:168851510:G:A,ENSG00000000457.14,1.0,1.0,1.0,1.0,1.0
ENSG00000000457.14_chr1:168851541:T:G,1.0,1.0,1.0,0.54974,0.830599,0.0,0.0,0.0,0.0,0.0,...,0.0,0.011772,-0.000231,ENSG00000000457.14_chr1:168851541:T:G,ENSG00000000457.14,1.0,1.0,1.0,1.0,1.0
ENSG00000000457.14_chr1:168851651:G:A,1.0,1.0,1.0,0.552327,1.0,0.0,0.0,0.0,0.002907,0.0,...,0.0,0.010761,0.0,ENSG00000000457.14_chr1:168851651:G:A,ENSG00000000457.14,1.0,1.0,1.0,1.0,1.0
ENSG00000000457.14_chr1:168852658:C:T,1.0,1.0,1.0,0.640277,1.0,0.0,0.0,0.0,0.005355,0.0,...,0.0,-0.009515,0.0,ENSG00000000457.14_chr1:168852658:C:T,ENSG00000000457.14,1.0,1.0,1.0,1.0,1.0


In [6]:
# How the gene list file was originally produced (kept for reference):
# all_genes_decon = list(set(results_df['GENEID'].tolist()))

# genelist_to_file(all_genes_decon, 'deconeqtl.all.genes')

# Reload the saved list of every gene covered by the Decon-eQTL results.
# The file is opened in a with-block so the handle is closed after reading;
# the previous bare open() inside the comprehension was never closed.
with open(f'{WRKDIR}/results/deconeqtl.all.genes') as genes_file:
    all_genes_decon = [line.rstrip('\n') for line in genes_file]

print('total genes covered: '+str(len(all_genes_decon)))

total genes covered: 16663


In [7]:
##### find variants found to be signif in each cell type


 
# cell1_sig_df = find_signif(cell1)
# cell1_sig_vars,cell1_sig_genes = to_list(cell1_sig_df)    
    
# cell2_sig_df = find_signif(cell2)
# cell2_sig_vars,cell2_sig_genes = to_list(cell2_sig_df)

# cell3_sig_df = find_signif(cell3)
# cell3_sig_vars,cell3_sig_genes = to_list(cell3_sig_df)

# cell4_sig_df = find_signif(cell4)
# cell4_sig_vars,cell4_sig_genes = to_list(cell4_sig_df)

# cell5_sig_df = find_signif(cell5)
# cell5_sig_vars,cell5_sig_genes = to_list(cell5_sig_df)


# ## saving lists to files
# genelist_to_file(cell1_sig_genes, f'deconeqtl.significant.fdr.genes.{cell1}')
# genelist_to_file(cell2_sig_genes, f'deconeqtl.significant.fdr.genes.{cell2}')
# genelist_to_file(cell3_sig_genes, f'deconeqtl.significant.fdr.genes.{cell3}')
# genelist_to_file(cell4_sig_genes, f'deconeqtl.significant.fdr.genes.{cell4}')
# genelist_to_file(cell5_sig_genes, f'deconeqtl.significant.fdr.genes.{cell5}')

# genelist_to_file(cell1_sig_vars, f'deconeqtl.significant.fdr.variants.{cell1}')
# genelist_to_file(cell2_sig_vars, f'deconeqtl.significant.fdr.variants.{cell2}')
# genelist_to_file(cell3_sig_vars, f'deconeqtl.significant.fdr.variants.{cell3}')
# genelist_to_file(cell4_sig_vars, f'deconeqtl.significant.fdr.variants.{cell4}')
# genelist_to_file(cell5_sig_vars, f'deconeqtl.significant.fdr.variants.{cell5}')


##loading in lists
# Reload the saved per-cell-type significant gene and variant lists instead of
# recomputing them from results_df; file_to_list prints the size of each list.
print('Significant genes:')
(cell1_sig_genes,
 cell2_sig_genes,
 cell3_sig_genes,
 cell4_sig_genes,
 cell5_sig_genes) = [
    file_to_list(f'deconeqtl.significant.fdr.genes.{name}', name)
    for name in (cell1, cell2, cell3, cell4, cell5)
]


print('')
print('Signficant variants:')
(cell1_sig_vars,
 cell2_sig_vars,
 cell3_sig_vars,
 cell4_sig_vars,
 cell5_sig_vars) = [
    file_to_list(f'deconeqtl.significant.fdr.variants.{name}', name)
    for name in (cell1, cell2, cell3, cell4, cell5)
]

Significant genes:
Microglia: 0
Astrocyte: 0
OPC: 34
Oligodendrocyte: 741
Neuron: 39

Signficant variants:
Microglia: 0
Astrocyte: 0
OPC: 1618
Oligodendrocyte: 17493
Neuron: 15817


In [8]:
##### find variants found to be nonsignif in each cell type

def find_nonsignif_vars(cell_type):
    """Return the index values of the global results_df rows that are not significant for cell_type.

    A row is nonsignificant when its '{cell_type}_bh_fdr' value is >= 0.05, the
    exact complement of the '< 0.05' significance cut. The previous '> 0.05'
    test left rows sitting exactly on the threshold out of both the significant
    and the nonsignificant lists. Rows whose FDR is NaN are still excluded,
    because comparisons with NaN are False.

    Prints the number of selected variants prefixed with the cell type.
    """
    nonsig_df = results_df.loc[results_df[f'{cell_type}_bh_fdr'] >= 0.05]
    nonsig_vars = nonsig_df.index.tolist()
    print(f'{cell_type} variants: '+str(len(nonsig_vars)))
    return nonsig_vars

def find_nonsignif_genes(cell_type_sig_genes, cell_type):
    """Return the set of genes in the global all_genes_decon that are not in cell_type_sig_genes.

    Prints the size of that set as '<cell_type> genes: <count>'.
    """
    every_gene = set(all_genes_decon)
    remaining = every_gene.difference(set(cell_type_sig_genes))
    print(f'{cell_type} genes: {len(remaining)}')
    return remaining

    
    
# cell1_nonsig_vars = find_nonsignif_vars(cell1)
# cell1_nonsig_genes = find_nonsignif_genes(cell1_sig_genes,cell1)    
    
# cell2_nonsig_vars = find_nonsignif_vars(cell2)
# cell2_nonsig_genes = find_nonsignif_genes(cell2_sig_genes,cell2) 

# cell3_nonsig_vars = find_nonsignif_vars(cell3)
# cell3_nonsig_genes = find_nonsignif_genes(cell3_sig_genes,cell3) 

# cell4_nonsig_vars = find_nonsignif_vars(cell4)
# cell4_nonsig_genes = find_nonsignif_genes(cell4_sig_genes,cell4) 

# cell5_nonsig_vars = find_nonsignif_vars(cell5)
# cell5_nonsig_genes = find_nonsignif_genes(cell5_sig_genes,cell5) 

# # saving lists to files
# genelist_to_file(cell1_nonsig_genes, f'deconeqtl.nonsignificant.fdr.genes.{cell1}')
# genelist_to_file(cell2_nonsig_genes, f'deconeqtl.nonsignificant.fdr.genes.{cell2}')
# genelist_to_file(cell3_nonsig_genes, f'deconeqtl.nonsignificant.fdr.genes.{cell3}')
# genelist_to_file(cell4_nonsig_genes, f'deconeqtl.nonsignificant.fdr.genes.{cell4}')
# genelist_to_file(cell5_nonsig_genes, f'deconeqtl.nonsignificant.fdr.genes.{cell5}')

# genelist_to_file(cell1_nonsig_vars, f'deconeqtl.nonsignificant.fdr.variants.{cell1}')
# genelist_to_file(cell2_nonsig_vars, f'deconeqtl.nonsignificant.fdr.variants.{cell2}')
# genelist_to_file(cell3_nonsig_vars, f'deconeqtl.nonsignificant.fdr.variants.{cell3}')
# genelist_to_file(cell4_nonsig_vars, f'deconeqtl.nonsignificant.fdr.variants.{cell4}')
# genelist_to_file(cell5_nonsig_vars, f'deconeqtl.nonsignificant.fdr.variants.{cell5}')



##loading in lists
# Reload the saved per-cell-type nonsignificant gene and variant lists instead
# of recomputing them; file_to_list prints the size of each list.
print('Nonsignificant genes:')
(cell1_nonsig_genes,
 cell2_nonsig_genes,
 cell3_nonsig_genes,
 cell4_nonsig_genes,
 cell5_nonsig_genes) = [
    file_to_list(f'deconeqtl.nonsignificant.fdr.genes.{name}', name)
    for name in (cell1, cell2, cell3, cell4, cell5)
]


print('')
print('Nonsignficant variants:')
(cell1_nonsig_vars,
 cell2_nonsig_vars,
 cell3_nonsig_vars,
 cell4_nonsig_vars,
 cell5_nonsig_vars) = [
    file_to_list(f'deconeqtl.nonsignificant.fdr.variants.{name}', name)
    for name in (cell1, cell2, cell3, cell4, cell5)
]

Nonsignificant genes:
Microglia: 16663
Astrocyte: 16663
OPC: 16629
Oligodendrocyte: 15922
Neuron: 16624

Nonsignficant variants:
Microglia: 72241266
Astrocyte: 72241266
OPC: 72239648
Oligodendrocyte: 72223773
Neuron: 72225449


## Load in plink bulk results to compare

In [42]:
# ##load significant variants

def _add_pair_id(glm_df):
    """Cast the locus columns to str and add the SNP and gene_SNP PAIR id columns in place."""
    for locus_column in ('#CHROM', 'POS', 'REF', 'ALT'):
        glm_df[locus_column] = glm_df[locus_column].astype(str)
    glm_df['SNP'] = 'chr'+glm_df['#CHROM']+':'+glm_df['POS']+':'+glm_df['REF']+':'+glm_df['ALT']
    glm_df['PAIR'] = glm_df['gene_id']+'_'+glm_df['SNP']
    return glm_df


# Chromosome 1 seeds the table of significant bulk pairs.
plink_df = _add_pair_id(pd.read_hdf(f'/labshare/raph/eqtl/{cohort}/plink/output/glm/{cohort}.aug2020.chr1.glm.hdf5'))
plink_sig = plink_df.loc[plink_df['bh_fdr'] < 0.05]
print(f'chrom1: {plink_sig.shape}')

for chrom in CHROMOSOMES_sub:
    next_df = _add_pair_id(pd.read_hdf(f'/labshare/raph/eqtl/{cohort}/plink/output/glm/{cohort}.aug2020.chr{chrom}.glm.hdf5'))

    ## signif bulk pairs on this chromosome
    next_sig = next_df.loc[next_df['bh_fdr'] < 0.05]

    plink_sig = pd.concat([plink_sig, next_sig])
    print(f'plus chrom{chrom}: {plink_sig.shape}')


chrom1: (15995, 17)
plus chrom2: (30228, 17)
plus chrom3: (40394, 17)
plus chrom4: (48403, 17)
plus chrom5: (61537, 17)
plus chrom6: (92701, 17)
plus chrom7: (112683, 17)
plus chrom8: (118803, 17)
plus chrom9: (129890, 17)
plus chrom10: (140077, 17)
plus chrom11: (152663, 17)
plus chrom12: (163495, 17)
plus chrom13: (167440, 17)
plus chrom14: (171244, 17)
plus chrom15: (179757, 17)
plus chrom16: (186377, 17)
plus chrom17: (230093, 17)
plus chrom18: (232536, 17)
plus chrom19: (249923, 17)
plus chrom20: (253927, 17)
plus chrom21: (258081, 17)
plus chrom22: (263144, 17)


In [41]:
# Distinct genes with at least one significant bulk pair, and every significant
# gene_SNP pair (not de-duplicated).
plink_sig_genes = list(set(plink_sig['gene_id'].tolist()))
plink_sig_vars = plink_sig['PAIR'].tolist()
print(len(plink_sig_genes), len(plink_sig_vars), sep='\n')

# ## saving list to file
genelist_to_file(plink_sig_genes, 'plink.eqtl.significant.genes')
genelist_to_file(plink_sig_vars, 'plink.eqtl.significant.variants')

9007
263144


In [43]:
# ##load nonsignificant variants
#
# Changes from the previous version of this cell:
#   * the plink directory is built from `cohort`, matching the cell that loads
#     the significant pairs, instead of hard-coding 'nabec';
#   * nonsignificant is 'bh_fdr >= 0.05', the exact complement of the '< 0.05'
#     significance cut, so pairs sitting on the threshold are not lost;
#   * per-chromosome frames are concatenated once at the end, because growing
#     the frame with pd.concat inside the loop re-copied every row kept so far.

def _load_plink_chrom(chrom):
    """Read one chromosome of bulk plink glm results and add the SNP and gene_SNP PAIR id columns."""
    glm_df = pd.read_hdf(f'/labshare/raph/eqtl/{cohort}/plink/output/glm/{cohort}.aug2020.chr{chrom}.glm.hdf5')
    for locus_column in ('#CHROM', 'POS', 'REF', 'ALT'):
        glm_df[locus_column] = glm_df[locus_column].astype(str)
    glm_df['SNP'] = 'chr'+glm_df['#CHROM']+':'+glm_df['POS']+':'+glm_df['REF']+':'+glm_df['ALT']
    glm_df['PAIR'] = glm_df['gene_id']+'_'+glm_df['SNP']
    return glm_df


plink_df = _load_plink_chrom('1')
nonsig_frames = [plink_df.loc[plink_df['bh_fdr'] >= 0.05]]
total_rows, n_cols = nonsig_frames[0].shape
print('chrom1: '+str(nonsig_frames[0].shape))

for chrom in CHROMOSOMES_sub:
    next_df = _load_plink_chrom(chrom)

    ## non signif bulk pairs
    next_nonsig = next_df.loc[next_df['bh_fdr'] >= 0.05]
    nonsig_frames.append(next_nonsig)

    # Running total reproduces the cumulative shape the old loop printed
    # (all chromosomes share the same columns).
    total_rows += next_nonsig.shape[0]
    print(f'plus chrom{chrom}: '+str((total_rows, n_cols)))

plink_nonsig = pd.concat(nonsig_frames)
del nonsig_frames  # drop the extra references to the per-chromosome frames



chrom1: (9361342, 17)
plus chrom2: (16540548, 17)
plus chrom3: (22848540, 17)
plus chrom4: (27508649, 17)
plus chrom5: (32769992, 17)
plus chrom6: (39973600, 17)
plus chrom7: (45447560, 17)
plus chrom8: (49667451, 17)
plus chrom9: (53744946, 17)
plus chrom10: (57927128, 17)
plus chrom11: (63404111, 17)
plus chrom12: (69352218, 17)
plus chrom13: (71656824, 17)
plus chrom14: (75390984, 17)
plus chrom15: (78900649, 17)
plus chrom16: (83501800, 17)
plus chrom17: (89168326, 17)
plus chrom18: (91116993, 17)
plus chrom19: (98501090, 17)
plus chrom20: (101086524, 17)
plus chrom21: (102260890, 17)
plus chrom22: (104712693, 17)


In [44]:
# Distinct genes with at least one nonsignificant bulk pair, and every
# nonsignificant gene_SNP pair (not de-duplicated).
plink_nonsig_genes = list(set(plink_nonsig['gene_id'].tolist()))
plink_nonsig_vars = plink_nonsig['PAIR'].tolist()
print(len(plink_nonsig_genes), len(plink_nonsig_vars), sep='\n')

# ## saving list to file
genelist_to_file(plink_nonsig_genes, 'plink.eqtl.nonsignificant.genes')
genelist_to_file(plink_nonsig_vars, 'plink.eqtl.nonsignificant.variants')

16670
104712693


In [45]:
##all
# Number of distinct genes across both bulk lists, then the total number of
# gene_SNP pairs (the two variant lists are counted, not merged).
print(len(set(plink_sig_genes) | set(plink_nonsig_genes)))
print(len(plink_sig_vars) + len(plink_nonsig_vars))

16670
104975837


In [None]:
# #plink_df = pd.read_csv(f'/labshare/anni/eqtl/ppmi/plink/output/eqtl/V08.genes.chr1.plink.glm.linear.gz', sep = '\t')
# plink_df = pd.read_csv(f'/labshare/raph/eqtl/nabec/plink/output/glm/{cohort}.aug2020.chr1.glm.hdf5', sep = '\t')


# #plink_df['PAIR'] = plink_df['GENEID']+'_'+plink_df['ID']
# #plink_nonsig = plink_df.loc[plink_df['P'] > 0.05]
# print('chrom1: '+str(plink_df.shape))

# for chrom in CHROMOSOMES_sub:
#     #next_df = pd.read_csv(f'/labshare/anni/eqtl/ppmi/plink/output/eqtl/V08.genes.chr{chrom}.plink.glm.linear.gz', sep = '\t')
#     next_df = pd.read_csv(f'/labshare/anni/eqtl/nabec/plink/output/glm/{cohort}.aug2020.chr{chrom}.glm.hdf5', sep = '\t')


#     #next_df['PAIR'] = next_df['GENEID']+'_'+next_df['ID']

#     ## non signif bulk pairs
#     #next_nonsig = next_df.loc[next_df['P'] > 0.05]

#     plink_df = pd.concat([plink_df,next_df])
#     print(f'plus chrom{chrom}: '+str(plink_df.shape))

# plink_genes = list(set(plink_nonsig['GENEID'].tolist()))
# plink_vars = list(set(plink_nonsig['PAIR'].tolist()))
# plink_df.head()

In [None]:
# ##load significant variants
# #plink_df = pd.read_csv(f'/labshare/anni/eqtl/ppmi/plink/output/eqtl/V08.genes.chr1.plink.glm.linear.gz', sep = '\t')
# plink_sig = pd.read_csv(f'/labshare/anni/eqtl/ppmi/plink/output/eqtl/V08.chr1.suggestive.plink.glm.linear.gz', sep = '\t')


# plink_sig['PAIR'] = plink_sig['GENEID']+'_'+plink_sig['ID']
# #print(plink_df.shape)
# plink_sig = plink_sig.loc[plink_sig['P'] < 0.05]
# #print(plink_sig.shape)
# print('chrom1: '+str(plink_sig.shape))

# for chrom in CHROMOSOMES_sub:
#     #next_df = pd.read_csv(f'/labshare/anni/eqtl/ppmi/plink/output/eqtl/V08.genes.chr{chrom}.plink.glm.linear.gz', sep = '\t')
#     next_df = pd.read_csv(f'/labshare/anni/eqtl/ppmi/plink/output/eqtl/V08.chr{chrom}.suggestive.plink.glm.linear.gz', sep = '\t')
#     next_df['PAIR'] = next_df['GENEID']+'_'+next_df['ID']

#     ## non signif bulk pairs
#     next_df = next_df.loc[next_df['P'] < 0.05]

#     plink_sig = pd.concat([plink_sig,next_df])
#     print(f'plus chrom{chrom}: '+str(plink_sig.shape))
    
# plink_sig_genes = list(set(plink_sig['GENEID'].tolist()))
# plink_sig_vars = list(set(plink_sig['PAIR'].tolist()))

# plink_df.head()

In [10]:


##loading in lists from file
# Significant bulk genes and gene_SNP pairs, reloaded from the saved lists.
plink_sig_genes, plink_sig_vars = [
    file_to_list(f'plink.eqtl.significant.{kind}', 'All')
    for kind in ('genes', 'variants')
]


# saving list to file
#genelist_to_file(plink_nonsig_genes, 'plink.eqtl.nonsignificant.genes')
#genelist_to_file(plink_nonsig_vars, 'plink.eqtl.nonsignificant.variants')

#loading in lists from file
# Only the nonsignificant pairs are reloaded here; the nonsignificant gene list
# is left commented out (that call would also need the label argument).
#plink_nonsig_genes = file_to_list('plink.eqtl.nonsignificant.genes')
plink_nonsig_vars = file_to_list('plink.eqtl.nonsignificant.variants', 'All')

All: 9007
All: 263144
All: 104712693


In [46]:
test = list(set(plink_nonsig_genes) & set(plink_sig_genes))
len(test)

9007

In [32]:
# print(len(baso_sig_genes))
# baso_noplink = set(baso_sig_genes) - set(plink_sig_genes)
# print(len(baso_noplink))
# baso_plink = set(baso_sig_genes) & set(plink_sig_genes)
# print(len(baso_plink))

2201
199
2002


### Significant in cell type of interest, not in plink bulk

In [17]:

outdir = '/labshare/anni/eqtl/Decon-eQTL/ppmi/results'

# def nobulk_cell_genes(cell):
#     cell_sig_genes = f'{cell}_sig_genes'
#     nobulk_cell_genes = compare_plinknonsig(cell_sig_genes, cell)
#     genelist_to_file(nobulk_cell_genes, f'decon_sig_plink_nonsig/{cohort}.{cell}_sig.fdr.plink_nonsig.genes')
    
# def nobulk_cell_variants(cell):
#     cell_sig_vars = f'{cell}_sig_vars'
#     nobulk_cell_vars = compare(plink_nonsig_vars, cell_sig_vars, cell)
#     genelist_to_file(nobulk_cell_vars, f'decon_sig_plink_nonsig/{cohort}.{cell}_sig.fdr.plink_nonsig.variants')
    

    
# print('genes signif in cell type, not in bulk')
# for x in cell_types:
#     nobulk_cell_genes(x)
#     nobulk_cell_variants(x)

nobulk_cell1_genes = compare_plinknonsig(cell1_sig_genes, cell1)
nobulk_cell2_genes = compare_plinknonsig(cell2_sig_genes, cell2)
nobulk_cell3_genes = compare_plinknonsig(cell3_sig_genes, cell3)
nobulk_cell4_genes = compare_plinknonsig(cell4_sig_genes, cell4)
nobulk_cell5_genes = compare_plinknonsig(cell5_sig_genes, cell5)

## saving gene lists to files           
genelist_to_file(nobulk_cell1_genes, f'decon_sig_plink_nonsig/{cohort}.{cell1}_sig.fdr.plink_nonsig.genes')
genelist_to_file(nobulk_cell2_genes, f'decon_sig_plink_nonsig/{cohort}.{cell2}_sig.fdr.plink_nonsig.genes')
genelist_to_file(nobulk_cell3_genes, f'decon_sig_plink_nonsig/{cohort}.{cell3}_sig.fdr.plink_nonsig.genes')
genelist_to_file(nobulk_cell4_genes, f'decon_sig_plink_nonsig/{cohort}.{cell4}_sig.fdr.plink_nonsig.genes')
genelist_to_file(nobulk_cell5_genes, f'decon_sig_plink_nonsig/{cohort}.{cell5}_sig.fdr.plink_nonsig.genes')


print('variants')
nobulk_cell1_vars = compare(plink_nonsig_vars, cell1_sig_vars, cell1)
nobulk_cell2_vars = compare(plink_nonsig_vars, cell2_sig_vars, cell2)
nobulk_cell3_vars = compare(plink_nonsig_vars, cell3_sig_vars, cell3)
nobulk_cell4_vars = compare(plink_nonsig_vars, cell4_sig_vars, cell4)
nobulk_cell5_vars = compare(plink_nonsig_vars, cell5_sig_vars, cell5)



# saving variant lists to files           
genelist_to_file(nobulk_cell1_vars, f'decon_sig_plink_nonsig/{cohort}.{cell1}_sig.fdr.plink_nonsig.variants')
genelist_to_file(nobulk_cell2_vars, f'decon_sig_plink_nonsig/{cohort}.{cell2}_sig.fdr.plink_nonsig.variants')
genelist_to_file(nobulk_cell3_vars, f'decon_sig_plink_nonsig/{cohort}.{cell3}_sig.fdr.plink_nonsig.variants')
genelist_to_file(nobulk_cell4_vars, f'decon_sig_plink_nonsig/{cohort}.{cell4}_sig.fdr.plink_nonsig.variants')
genelist_to_file(nobulk_cell5_vars, f'decon_sig_plink_nonsig/{cohort}.{cell5}_sig.fdr.plink_nonsig.variants')

Microglia: 0
Astrocyte: 0
OPC: 17
Oligodendrocyte: 252
Neuron: 0
variants
Microglia: 0
Astrocyte: 0
OPC: 1558
Oligodendrocyte: 17485
Neuron: 0


### Significant in bulk not in cell type of interest

In [16]:
## comparing significant genes in plink bulk, not in the cell types
# find overlap

# def bulk_nocell_genes(cell):
#     cell_nonsig_genes = f'{cell}_nonsig_genes'
#     bulk_nocell_genes = compare(plink_sig_genes, cell_nonsig_genes, cell)
#     genelist_to_file(bulk_nocell_genes, f'decon_nonsig_plink_sig/{cohort}.{cell}_nonsig.fdr.plink_sig.genes')
    
# def bulk_nocell_variants(cell):
#     cell_nonsig_vars = f'{cell}_nonsig_vars'
#     bulk_nocell_vars = compare(plink_sig_genes, cell_nonsig_vars, cell)
#     genelist_to_file(bulk_nocell_vars, f'decon_nonsig_plink_sig/{cohort}.{cell}_nonsig.fdr.plink_sig.variants')
    
   
# print('genes signif in bulk, not in cell type')
# for x in cell_types:
#     bulk_nocell_genes(x)
#     bulk_nocell_variants(x)
    
    
    

print('genes signif in bulk, not in cell type')
bulk_nocell1_genes = compare(plink_sig_genes, cell1_nonsig_genes, cell1)
bulk_nocell2_genes = compare(plink_sig_genes, cell2_nonsig_genes, cell2)
bulk_nocell3_genes = compare(plink_sig_genes, cell3_nonsig_genes, cell3)
bulk_nocell4_genes = compare(plink_sig_genes, cell4_nonsig_genes, cell4)
bulk_nocell5_genes = compare(plink_sig_genes, cell5_nonsig_genes, cell5)

## saving gene lists to files
genelist_to_file(bulk_nocell1_genes, f'decon_nonsig_plink_sig/{cohort}.{cell1}_nonsig.fdr.plink_sig.genes')
genelist_to_file(bulk_nocell2_genes, f'decon_nonsig_plink_sig/{cohort}.{cell2}_nonsig.fdr.plink_sig.genes')
genelist_to_file(bulk_nocell3_genes, f'decon_nonsig_plink_sig/{cohort}.{cell3}_nonsig.fdr.plink_sig.genes')
genelist_to_file(bulk_nocell4_genes, f'decon_nonsig_plink_sig/{cohort}.{cell4}_nonsig.fdr.plink_sig.genes')
genelist_to_file(bulk_nocell5_genes, f'decon_nonsig_plink_sig/{cohort}.{cell5}_nonsig.fdr.plink_sig.genes')


print('variants')
bulk_nocell1_vars = compare(plink_sig_vars, cell1_nonsig_vars, cell1)
bulk_nocell2_vars = compare(plink_sig_vars, cell2_nonsig_vars, cell2)
bulk_nocell3_vars = compare(plink_sig_vars, cell3_nonsig_vars, cell3)
bulk_nocell4_vars = compare(plink_sig_vars, cell4_nonsig_vars, cell4)
bulk_nocell5_vars = compare(plink_sig_vars, cell5_nonsig_vars, cell5)



## saving gene lists to files           
genelist_to_file(bulk_nocell1_vars, f'decon_nonsig_plink_sig/{cohort}.{cell1}_nonsig.fdr.plink_sig.variants')
genelist_to_file(bulk_nocell2_vars, f'decon_nonsig_plink_sig/{cohort}.{cell2}_nonsig.fdr.plink_sig.variants')
genelist_to_file(bulk_nocell3_vars, f'decon_nonsig_plink_sig/{cohort}.{cell3}_nonsig.fdr.plink_sig.variants')
genelist_to_file(bulk_nocell4_vars, f'decon_nonsig_plink_sig/{cohort}.{cell4}_nonsig.fdr.plink_sig.variants')
genelist_to_file(bulk_nocell5_vars, f'decon_nonsig_plink_sig/{cohort}.{cell5}_nonsig.fdr.plink_sig.variants')

genes signif in bulk, not in cell type
Microglia: 9003
Astrocyte: 9003
OPC: 8986
Oligodendrocyte: 8514
Neuron: 8964
variants
Microglia: 224474
Astrocyte: 224474
OPC: 224414
Oligodendrocyte: 224466
Neuron: 208657


### Significant in both cell type of interest and in plink bulk

In [18]:
# # comparing significant genes in different cell types and overall bulk
# # find overlap


# def bulk_cell_genes(cell):
#     cell_sig_genes = f'{cell}_sig_genes'
#     bulk_cell_genes = compare(plink_sig_genes, cell_sig_genes, cell)
#     genelist_to_file(bulk_cell_genes, f'decon_sig_plink_sig/{cohort}.{cell}_sig.fdr.plink_sig.genes')
    
# def bulk_cell_variants(cell):
#     cell_sig_vars = f'{cell}_sig_vars'
#     bulk_cell_vars = compare(plink_sig_genes, cell_sig_vars, cell)
#     genelist_to_file(bulk_cell_vars, f'decon_sig_plink_sig/{cohort}.{cell}_sig.fdr.plink_sig.variants')
    
   
# print('genes signif in bulk and cell type')
# for x in cell_types:
#     bulk_cell_genes(x)
#     bulk_cell_variants(x)



print('significant genes in bulk and decon:')
bulk_cell1_genes = compare(plink_sig_genes, cell1_sig_genes, cell1)
bulk_cell2_genes = compare(plink_sig_genes, cell2_sig_genes, cell2)
bulk_cell3_genes = compare(plink_sig_genes, cell3_sig_genes, cell3)
bulk_cell4_genes = compare(plink_sig_genes, cell4_sig_genes, cell4)
bulk_cell5_genes = compare(plink_sig_genes, cell5_sig_genes, cell5)

## saving gene lists to files
genelist_to_file(bulk_cell1_genes, f'decon_sig_plink_sig/{cohort}.{cell1}_sig.fdr.plink_sig.genes')
genelist_to_file(bulk_cell2_genes, f'decon_sig_plink_sig/{cohort}.{cell2}_sig.fdr.plink_sig.genes')
genelist_to_file(bulk_cell3_genes, f'decon_sig_plink_sig/{cohort}.{cell3}_sig.fdr.plink_sig.genes')
genelist_to_file(bulk_cell4_genes, f'decon_sig_plink_sig/{cohort}.{cell4}_sig.fdr.plink_sig.genes')
genelist_to_file(bulk_cell5_genes, f'decon_sig_plink_sig/{cohort}.{cell5}_sig.fdr.plink_sig.genes')

print('significant variants in bulk and decon:')
bulk_cell1_vars = compare(plink_sig_vars, cell1_sig_vars, cell1)
bulk_cell2_vars = compare(plink_sig_vars, cell2_sig_vars, cell2)
bulk_cell3_vars = compare(plink_sig_vars, cell3_sig_vars, cell3)
bulk_cell4_vars = compare(plink_sig_vars, cell4_sig_vars, cell4)
bulk_cell5_vars = compare(plink_sig_vars, cell5_sig_vars, cell5)

## saving gene lists to files
genelist_to_file(bulk_cell1_vars, f'decon_sig_plink_sig/{cohort}.{cell1}_sig.fdr.plink_sig.variants')
genelist_to_file(bulk_cell2_vars, f'decon_sig_plink_sig/{cohort}.{cell2}_sig.fdr.plink_sig.variants')
genelist_to_file(bulk_cell3_vars, f'decon_sig_plink_sig/{cohort}.{cell3}_sig.fdr.plink_sig.variants')
genelist_to_file(bulk_cell4_vars, f'decon_sig_plink_sig/{cohort}.{cell4}_sig.fdr.plink_sig.variants')
genelist_to_file(bulk_cell5_vars, f'decon_sig_plink_sig/{cohort}.{cell5}_sig.fdr.plink_sig.variants')

significant genes in bulk and decon:
Microglia: 0
Astrocyte: 0
OPC: 17
Oligodendrocyte: 489
Neuron: 39
significant variants in bulk and decon:
Microglia: 0
Astrocyte: 0
OPC: 60
Oligodendrocyte: 8
Neuron: 15817


In [None]:
## Pulling up specific gene lists

In [5]:
neutro_sig_noplink = file_to_list('decon_sig_plink_nonsig/ppmi.neutrophil_sig.fdr.plink_nonsig.genes', 'neutrophil')
neutro_sig_noplink

neutrophil: 44


['ENSG00000117115.13',
 'ENSG00000165650.12',
 'ENSG00000106772.18',
 'ENSG00000163993.7',
 'ENSG00000257433.6',
 'ENSG00000277879.1',
 'ENSG00000162069.16',
 'ENSG00000123838.11',
 'ENSG00000260286.4',
 'ENSG00000091039.17',
 'ENSG00000279996.1',
 'ENSG00000005020.13',
 'ENSG00000170448.12',
 'ENSG00000256660.6',
 'ENSG00000241764.4',
 'ENSG00000135953.11',
 'ENSG00000258476.6',
 'ENSG00000172322.14',
 'ENSG00000148110.16',
 'ENSG00000127946.17',
 'ENSG00000256751.6',
 'ENSG00000163293.12',
 'ENSG00000171617.14',
 'ENSG00000260943.1',
 'ENSG00000278745.1',
 'ENSG00000072401.15',
 'ENSG00000124588.20',
 'ENSG00000129295.9',
 'ENSG00000188056.12',
 'ENSG00000112299.8',
 'ENSG00000259235.2',
 'ENSG00000287733.1',
 'ENSG00000110203.9',
 'ENSG00000285106.2',
 'ENSG00000116299.17',
 'ENSG00000163803.13',
 'ENSG00000234426.3',
 'ENSG00000184106.8',
 'ENSG00000164251.5',
 'ENSG00000286830.1',
 'ENSG00000286480.1',
 'ENSG00000242110.8',
 'ENSG00000274256.1',
 'ENSG00000237803.6']