In [1]:
import os,glob
import pandas as pd
import pickle
import numpy as np
import scipy.stats as stats

In [7]:
# Collect the per-colour gene-list CSVs (files matching "*genes.csv") for each
# of the three assay directories: rna, hichip and atac.
# NOTE: glob.glob returns paths in arbitrary (filesystem-dependent) order.
# NOTE(review): in the saved output the hichip directory has no all_genes.csv,
# unlike rna and atac -- confirm this is intended.
rna_genes_files = glob.glob('../data/processed/fig1/rna/*genes.csv')
print(rna_genes_files)
hichip_genes_files = glob.glob('../data/processed/fig1/hichip/*genes.csv')
print(hichip_genes_files)
# NOTE(review): this variable is named "gene_files" (singular "gene"), unlike
# the two "genes_files" names above; later cells reference it by this name.
atac_gene_files = glob.glob('../data/processed/fig1/atac/*genes.csv')
atac_gene_files

['../data/processed/fig1/rna/purple_genes.csv', '../data/processed/fig1/rna/green_genes.csv', '../data/processed/fig1/rna/all_genes.csv', '../data/processed/fig1/rna/grey_genes.csv', '../data/processed/fig1/rna/blue_genes.csv']
['../data/processed/fig1/hichip/purple_genes.csv', '../data/processed/fig1/hichip/green_genes.csv', '../data/processed/fig1/hichip/grey_genes.csv', '../data/processed/fig1/hichip/blue_genes.csv']


['../data/processed/fig1/atac/purple_genes.csv',
 '../data/processed/fig1/atac/green_genes.csv',
 '../data/processed/fig1/atac/all_genes.csv',
 '../data/processed/fig1/atac/grey_genes.csv',
 '../data/processed/fig1/atac/blue_genes.csv']

In [56]:
# Directory that the comparison tables are written to (used with os.path.join
# in the to_csv calls further down).
save_dir = '../data/processed/fig1/'

In [35]:
def get_genes(gene_files):
    """
    Read colour-labelled gene lists from header-less CSV files.

    The colour of a file is the part of its base name that precedes the first
    underscore (``purple_genes.csv`` -> ``'purple'``). Only the first column of
    each file is read.

    Parameters
    ----------
    gene_files : iterable of str
        Paths of the CSV files to load.

    Returns
    -------
    dict
        Maps each colour to the sorted list of unique genes in its file, plus
        a ``'background'`` key holding the sorted unique genes pooled over all
        files.
    """
    genes_by_color = {}
    pooled_genes = set()
    for path in gene_files:
        color = os.path.basename(path).partition('_')[0]
        unique_genes = set(pd.read_csv(path, header=None)[0].tolist())
        genes_by_color[color] = sorted(unique_genes)
        pooled_genes |= unique_genes

    # Assigned last so it replaces any file whose prefix is literally 'background'.
    genes_by_color['background'] = sorted(pooled_genes)
    return genes_by_color
        
# Build one {colour: sorted gene list, ..., 'background': pooled genes}
# dictionary per assay from the file lists collected above.
rna_genes = get_genes(rna_genes_files)
hichip_genes = get_genes(hichip_genes_files)
atac_genes = get_genes(atac_gene_files)

In [60]:
# Scratch check of numpy axis semantics: on [[0, 1], [2, 3]], sum(axis=0)
# adds down the rows and returns the column totals.
np.arange(4).reshape((2,2)).sum(axis=0)

array([2, 4])

In [113]:
def comp_two_gene_dicts(gene_dict1, gene_dict2, pseudo=0, colors=None, bonf_factor=None):
    """
    Test every pair of colour-labelled gene sets from two dictionaries for overlap.

    1: rows
    2: columns

    For each (row colour, column colour) pair a 2x2 contingency table is built
    against the shared background (union of both 'background' entries):

        [[in 1 and in 2, in 1 not in 2],
         [in 2 not in 1, in neither   ]]

    and a one-sided (``alternative='greater'``) Fisher exact test is run on it.

    Parameters
    ----------
    gene_dict1, gene_dict2 : dict
        Map colour -> iterable of genes and must contain a 'background' key
        (the structure returned by ``get_genes``).
    pseudo : int, optional
        Added to every cell of the table before the Fisher test only; the
        count-based columns (intersection, observed, expected, ...) always use
        the raw counts.
    colors : iterable of str, optional
        Colours to compare, used for both rows and columns. Defaults to
        ``['purple', 'green', 'grey', 'blue']``.
    bonf_factor : number, optional
        Multiplier applied to the raw p-values. ``None`` (default) keeps the
        historical behaviour of multiplying by the background size, which
        equals the contingency-table total whenever every colour set is
        contained in the background.
        NOTE(review): a textbook Bonferroni correction multiplies by the
        number of tests (``len(colors) ** 2``); pass that value here if that
        is what is wanted.

    Returns
    -------
    pandas.DataFrame
        One row per colour pair with the columns color_row, color_col,
        jaccard, intersect_over_min, intersection, union, num_in_1, num_in_2,
        observed, expected, oddsratio, pval, pval_bonf, log_pval_bonf.
        Ratios with a zero denominator (empty sets) are NaN.

    Raises
    ------
    KeyError
        If a requested colour or 'background' is missing from either dict.
    """
    if colors is None:
        colors = ['purple', 'green', 'grey', 'blue']
    colors = list(colors)

    bg = set(gene_dict1['background']) | set(gene_dict2['background'])

    def _ratio(numerator, denominator):
        # Undefined ratios become NaN instead of raising ZeroDivisionError
        # (plain ints) or emitting a numpy divide warning.
        return numerator / denominator if denominator else np.nan

    columns = ['color_row', 'color_col', 'jaccard', 'intersect_over_min',
               'intersection', 'union', 'num_in_1', 'num_in_2',
               'observed', 'expected', 'oddsratio', 'pval']

    records = []
    for color_1 in colors:
        geneset_1 = set(gene_dict1[color_1])
        for color_2 in colors:
            geneset_2 = set(gene_dict2[color_2])

            in_1_and_in_2 = geneset_1 & geneset_2
            in_1_or_in_2 = geneset_1 | geneset_2

            mat_counts = np.array(
                [[len(in_1_and_in_2), len(geneset_1 - geneset_2)],
                 [len(geneset_2 - geneset_1), len(bg - in_1_or_in_2)]])

            num_in_1 = int(mat_counts[0].sum())      # row total: genes in set 1
            num_in_2 = int(mat_counts[:, 0].sum())   # column total: genes in set 2
            total = int(mat_counts.sum())
            observed_num = int(mat_counts[0, 0])

            # Pseudo-counts only affect the test statistic, not the raw counts.
            oddsratio_pseudo, pvalue_pseudo = stats.fisher_exact(
                mat_counts + pseudo, alternative='greater')

            records.append({
                'color_row': color_1, 'color_col': color_2,
                'jaccard': _ratio(len(in_1_and_in_2), len(in_1_or_in_2)),
                'intersect_over_min': _ratio(len(in_1_and_in_2),
                                             min(num_in_1, num_in_2)),
                'intersection': len(in_1_and_in_2),
                'union': len(in_1_or_in_2),
                'num_in_1': num_in_1, 'num_in_2': num_in_2,
                'observed': observed_num,
                'expected': _ratio(num_in_1 * num_in_2, total),
                'oddsratio': oddsratio_pseudo, 'pval': pvalue_pseudo})

    result_df = pd.DataFrame(records, columns=columns)

    # Use the background size explicitly rather than whatever table the loop
    # happened to leave behind in a local variable.
    factor = len(bg) if bonf_factor is None else bonf_factor
    result_df['pval_bonf'] = (result_df['pval'].astype(float) * factor).clip(upper=1.0)
    # 1e-100 avoids log10(0); the cap keeps the scale bounded at 100; adding
    # 0.0 turns the -0.0 produced by -log10(1.0) into a plain 0.0.
    result_df['log_pval_bonf'] = (
        -np.log10(result_df['pval_bonf'] + 1e-100)).clip(upper=100) + 0.0
    return result_df

In [114]:
# Overlap statistics with RNA colours as rows and HiChIP colours as columns,
# written to <save_dir>/rna_hichip_df.csv.
rna_hichip_df = comp_two_gene_dicts(rna_genes, hichip_genes)
rna_hichip_df.to_csv(os.path.join(save_dir, 'rna_hichip_df.csv'))

In [115]:
# Overlap statistics with RNA colours as rows and ATAC colours as columns,
# written to <save_dir>/rna_atac_df.csv.
rna_atac_df = comp_two_gene_dicts(rna_genes, atac_genes)
rna_atac_df.to_csv(os.path.join(save_dir, 'rna_atac_df.csv'))

In [116]:
# Overlap statistics with ATAC colours as rows and HiChIP colours as columns,
# written to <save_dir>/atac_hichip_df.csv.
# NOTE(review): this is the only comparison run with pseudo=1; the RNA
# comparisons above use the default pseudo=0 -- confirm this is intentional.
atac_hichip_df = comp_two_gene_dicts(atac_genes, hichip_genes,pseudo=1)
atac_hichip_df.to_csv(os.path.join(save_dir, 'atac_hichip_df.csv'))

In [117]:
# Display the ATAC-vs-HiChIP comparison table.
atac_hichip_df

Unnamed: 0,color_row,color_col,jaccard,intersect_over_min,intersection,union,num_in_1,num_in_2,observed,expected,oddsratio,pval,pval_bonf,log_pval_bonf
0,purple,purple,0.223009,0.706542,1890,8475,7690,2675,1890,1107.025616,4.192097,2.1054139999999998e-240,3.9122799999999997e-236,100.0
1,purple,green,0.16665,0.43476,1636,9817,7690,3763,1636,1557.285007,1.11369,0.001879675,1.0,-0.0
2,purple,grey,0.171483,0.565314,1519,8858,7690,2687,1519,1111.991712,2.048768,1.0643739999999999e-65,1.97782e-61,60.703813
3,purple,blue,0.022214,0.764444,172,7743,7690,225,172,93.114304,4.618719,7.415245e-27,1.3779010000000001e-22,21.860782
4,green,purple,0.152778,0.485234,1298,8496,7119,2675,1298,1024.826445,1.633239,2.009004e-31,3.733131e-27,26.427927
5,green,green,0.232669,0.545841,2054,8828,7119,3763,2054,1441.653051,2.314077,2.14924e-114,3.993717e-110,100.0
6,green,grey,0.179882,0.556383,1495,8311,7119,2687,1495,1029.423797,2.289938,1.7601209999999998e-86,3.270657e-82,81.485365
7,green,blue,0.022414,0.715556,161,7183,7119,225,161,86.200355,4.082815,3.247567e-24,6.03463e-20,19.219349
8,grey,purple,0.164603,0.526729,1409,8560,7294,2675,1409,1050.018835,1.895047,2.6041900000000002e-52,4.8391059999999993e-48,47.315235
9,grey,green,0.167952,0.422535,1590,9467,7294,3763,1590,1477.091917,1.16939,1.371797e-05,0.2549074,0.593618


In [102]:
# Scratch check of a single 2x2 table. Its margins (row 1 total 7294,
# column 1 total 2687) match the grey/grey ATAC-vs-HiChIP counts shown above.
# NOTE: no `alternative` is passed, so this uses fisher_exact's default
# rather than the alternative='greater' used in comp_two_gene_dicts.
stats.fisher_exact([[ 2067,  5227],
 [  620, 10668]])

(6.804234835253677, 0.0)

In [75]:
# RNA (rows) x ATAC (columns) matrix of the capped -log10 corrected p-values.
# NOTE(review): the saved output below contains values above 100 although the
# current comp_two_gene_dicts caps this column at 100, and this cell's
# execution count (75) precedes the function's (113) -- re-run to refresh.
rna_atac_df.pivot(index='color_row',columns='color_col',values='log_pval_bonf')

color_col,blue,green,grey,purple
color_row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
blue,111.606543,21.584649,10.629782,18.079265
green,-0.0,109.046824,0.721344,-0.0
grey,1.491707,65.651829,236.974806,10.261124
purple,-0.0,-0.0,-0.0,72.881129


In [76]:
# RNA (rows) x ATAC (columns) matrix of Jaccard indices
# (intersection size / union size).
rna_atac_df.pivot(index='color_row',columns='color_col',values='jaccard')

color_col,blue,green,grey,purple
color_row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
blue,0.076139,0.045996,0.042335,0.044718
green,0.019503,0.053349,0.030454,0.028359
grey,0.025686,0.041135,0.058258,0.029718
purple,0.025901,0.02966,0.030967,0.049079


In [77]:
# RNA (rows) x ATAC (columns) matrix of intersection size divided by the
# size of the smaller of the two gene sets.
rna_atac_df.pivot(index='color_row',columns='color_col',values='intersect_over_min')

color_col,blue,green,grey,purple
color_row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
blue,0.281971,0.372117,0.351153,0.387841
green,0.090909,0.525692,0.31357,0.306983
grey,0.138978,0.488818,0.696486,0.383387
purple,0.115578,0.286432,0.305276,0.498744
