In [1]:
import os,glob
import pandas as pd
import pickle
import numpy as np
import scipy.stats as stats

In [65]:
# Discover the per-time-point gene lists for each assay.
# NOTE: glob.glob returns paths in arbitrary (filesystem) order.
rna_genes_files = glob.glob('../data/processed/fig1/rna/rna_genes*.csv')
hichip_genes_files = glob.glob('../data/processed/fig1/hichip/time_time_genes_*.csv')

# The ATAC files are enumerated explicitly, one per time point, instead of
# being globbed with '../data/processed/fig1/atac/atac_time_genes*.csv'.
atac_gene_files = [
    '../data/processed/fig1/atac/atac_time_genes_H9_' + day + '.csv'
    for day in ('D0', 'D2', 'D10', 'D28')
]

print(rna_genes_files)
print(hichip_genes_files)
atac_gene_files

['../data/processed/fig1/rna/rna_genes_H9_time_H9_D10.csv', '../data/processed/fig1/rna/rna_genes_H9_time_H9_D28.csv', '../data/processed/fig1/rna/rna_genes_H9_time_H9_D2.csv', '../data/processed/fig1/rna/rna_genes_H9_time_H9_D0.csv']
['../data/processed/fig1/hichip/time_time_genes_H9_D0.csv', '../data/processed/fig1/hichip/time_time_genes_H9_D2.csv', '../data/processed/fig1/hichip/time_time_genes_H9_D10.csv', '../data/processed/fig1/hichip/time_time_genes_H9_D28.csv']


['../data/processed/fig1/atac/atac_time_genes_H9_D0.csv',
 '../data/processed/fig1/atac/atac_time_genes_H9_D2.csv',
 '../data/processed/fig1/atac/atac_time_genes_H9_D10.csv',
 '../data/processed/fig1/atac/atac_time_genes_H9_D28.csv']

In [66]:
# Directory the comparison tables below are written to (joined with each CSV name).
save_dir = '../data/processed/fig1/'

In [67]:
def get_genes(gene_files):
    """Load one gene list per file and collect them into a dictionary.

    Each file is read as a header-less CSV and only its first column is used.
    The dictionary key for a file is built from the last two underscore-
    separated tokens of its base name (text before the first '.'), e.g.
    'atac_time_genes_H9_D0.csv' -> 'H9_D0'.

    Parameters
    ----------
    gene_files : iterable of str
        Paths of the CSV files to read.

    Returns
    -------
    dict
        Maps each file label to the sorted list of unique genes in that file,
        plus a 'background' key holding the sorted union of all the lists.
        If two files yield the same label, the later file overwrites the
        earlier one (its genes still contribute to 'background').

    Notes
    -----
    Missing values in the first column are dropped; otherwise sorting a mix
    of NaN and strings raises TypeError. A completely empty file is treated
    as an empty gene list instead of raising pandas.errors.EmptyDataError.
    """
    all_gene_dict = {}
    background = set()
    for file in gene_files:
        filename = os.path.basename(file).split('.')[0]
        tissue = '_'.join(filename.split('_')[-2:])  ## modify
        try:
            first_column = pd.read_csv(file, header=None).loc[:, 0]
        except pd.errors.EmptyDataError:
            # Nothing to parse: this file contributes no genes.
            first_column = pd.Series([], dtype=object)
        genes = set(first_column.dropna())
        all_gene_dict[tissue] = sorted(genes)
        background |= genes

    all_gene_dict['background'] = sorted(background)
    return all_gene_dict
        
# Build the label -> gene-list dictionaries (each with a 'background' entry)
# for the three assays, in the order RNA, HiChIP, ATAC.
rna_genes, hichip_genes, atac_genes = map(
    get_genes,
    (rna_genes_files, hichip_genes_files, atac_gene_files),
)

In [68]:
# Show the labels parsed from the ATAC file names, plus the added 'background' entry.
atac_genes.keys()

dict_keys(['H9_D0', 'H9_D2', 'H9_D10', 'H9_D28', 'background'])

In [69]:
def comp_two_gene_dicts(gene_dict1, gene_dict2, comp_groups, pseudo=0):
    """Pairwise overlap statistics between the gene sets of two dictionaries.

    For every ordered pair (row group, column group) drawn from
    ``comp_groups``, a 2x2 contingency table is built against the union of
    both dictionaries' 'background' entries::

                      in set 2        not in set 2
        in set 1      both            only set 1
        not in set 1  only set 2      neither (background only)

    and a one-sided (``alternative='greater'``) Fisher's exact test is run.

    Parameters
    ----------
    gene_dict1 : dict
        Group name -> iterable of genes, with a 'background' key.
        Supplies the row sets (``color_row``).
    gene_dict2 : dict
        Same layout as ``gene_dict1``. Supplies the column sets (``color_col``).
    comp_groups : iterable
        Group names; each must be a key of both dictionaries.
    pseudo : int, optional
        Added to every cell of the table before the Fisher test only; all
        other statistics use the raw counts. Default 0.

    Returns
    -------
    pandas.DataFrame
        One row per (row group, column group) pair with the columns
        color_row, color_col, jaccard, intersect_over_min, intersection,
        union, num_in_1, num_in_2, observed, expected, oddsratio, pval,
        pval_bonf and log_pval_bonf. An empty ``comp_groups`` gives an empty
        frame with these columns.

    Notes
    -----
    * ``pval_bonf`` is ``min(1, pval * number_of_tests)``, where the number
      of tests is the number of rows of the result. (Previously the factor
      was the gene total of the last contingency table.)
    * Ratios with a zero denominator (empty sets) are reported as NaN.
    * ``log_pval_bonf`` is ``-log10(pval_bonf + 1e-100)`` capped at 100.
    """
    comp_groups = list(comp_groups)
    bg = set(gene_dict1['background']) | set(gene_dict2['background'])

    columns = ['color_row', 'color_col', 'jaccard', 'intersect_over_min',
               'intersection', 'union', 'num_in_1', 'num_in_2',
               'observed', 'expected', 'oddsratio', 'pval']
    records = []
    for color_1 in comp_groups:
        geneset_1 = set(gene_dict1[color_1])
        for color_2 in comp_groups:
            geneset_2 = set(gene_dict2[color_2])

            in_1_and_in_2 = geneset_1 & geneset_2
            in_1_not_2 = geneset_1 - geneset_2
            not_1_in_2 = geneset_2 - geneset_1
            in_1_or_in_2 = geneset_1 | geneset_2
            not_1_not_2 = bg - in_1_or_in_2

            mat_counts = np.array([[len(in_1_and_in_2), len(in_1_not_2)],
                                   [len(not_1_in_2), len(not_1_not_2)]])

            num_in_1 = int(mat_counts[0, :].sum())   # size of set 1
            num_in_2 = int(mat_counts[:, 0].sum())   # size of set 2
            total = int(mat_counts.sum())
            n_both = len(in_1_and_in_2)
            n_union = len(in_1_or_in_2)
            smaller = min(num_in_1, num_in_2)

            # Guard every ratio against empty sets instead of raising or warning.
            expected_num = num_in_1 * num_in_2 / total if total else float('nan')
            jaccard = n_both / n_union if n_union else float('nan')
            intersect_over_min = n_both / smaller if smaller else float('nan')

            # The pseudocount only affects the test, not the reported counts.
            oddsratio_pseudo, pvalue_pseudo = stats.fisher_exact(
                mat_counts + pseudo, alternative='greater')

            records.append({'color_row': color_1, 'color_col': color_2,
                            'jaccard': jaccard,
                            'intersect_over_min': intersect_over_min,
                            'intersection': n_both,
                            'union': n_union,
                            'num_in_1': num_in_1, 'num_in_2': num_in_2,
                            'observed': n_both, 'expected': expected_num,
                            'oddsratio': oddsratio_pseudo, 'pval': pvalue_pseudo})

    result_df = pd.DataFrame(records, columns=columns)

    # Bonferroni: multiply by the number of tests performed, capped at 1.
    n_tests = len(result_df)
    pvals = result_df['pval'].astype(float)
    result_df['pval_bonf'] = (pvals * n_tests).clip(upper=1.0)
    # "+ 0.0" turns the IEEE negative zero produced for pval_bonf == 1 into 0.0.
    result_df['log_pval_bonf'] = (
        -np.log10(result_df['pval_bonf'] + 1e-100)).clip(upper=100.0) + 0.0
    return result_df

In [70]:
# Group labels compared against each other (rows x columns) in the cells below.
comp_val = ['H9_D0', 'H9_D2', 'H9_D10', 'H9_D28']

In [71]:
# RNA gene sets (rows) vs HiChIP gene sets (columns); table saved as CSV.
rna_hichip_df = comp_two_gene_dicts(
    gene_dict1=rna_genes,
    gene_dict2=hichip_genes,
    comp_groups=comp_val,
)
rna_hichip_df.to_csv(os.path.join(save_dir, 'rna_hichip_df.csv'))

In [72]:
# RNA gene sets (rows) vs ATAC gene sets (columns); table saved as CSV.
rna_atac_df = comp_two_gene_dicts(
    gene_dict1=rna_genes,
    gene_dict2=atac_genes,
    comp_groups=comp_val,
)
rna_atac_df.to_csv(os.path.join(save_dir, 'rna_atac_df.csv'))

In [73]:
# ATAC gene sets (rows) vs HiChIP gene sets (columns); table saved as CSV.
# NOTE(review): this is the only comparison that passes pseudo=1 (the RNA
# comparisons use the default pseudo=0) - confirm the difference is intended.
atac_hichip_df = comp_two_gene_dicts(
    gene_dict1=atac_genes,
    gene_dict2=hichip_genes,
    comp_groups=comp_val,
    pseudo=1,
)
atac_hichip_df.to_csv(os.path.join(save_dir, 'atac_hichip_df.csv'))

In [75]:
# Display the RNA-vs-HiChIP overlap table (one row per row/column group pair).
rna_hichip_df

Unnamed: 0,color_row,color_col,jaccard,intersect_over_min,intersection,union,num_in_1,num_in_2,observed,expected,oddsratio,pval,pval_bonf,log_pval_bonf
0,H9_D0,H9_D0,0.009688,0.342857,72,7432,210,7294,72,110.794937,0.46185,1.0,1,-0.0
1,H9_D0,H9_D2,0.007134,0.157143,33,4626,210,4449,33,67.579747,0.388376,1.0,1,-0.0
2,H9_D0,H9_D10,0.007811,0.285714,60,7681,210,7531,60,114.394937,0.328952,1.0,1,-0.0
3,H9_D0,H9_D28,0.006073,0.190476,40,6587,210,6417,40,97.473418,0.267063,1.0,1,-0.0
4,H9_D2,H9_D0,0.003664,0.267327,27,7368,101,7294,27,53.287089,0.324196,1.0,1,-0.0
5,H9_D2,H9_D2,0.003972,0.178218,18,4532,101,4449,18,32.50264,0.454829,0.999627,1,-0.0
6,H9_D2,H9_D10,0.004475,0.336634,34,7598,101,7531,34,55.018517,0.421498,0.999992,1,-0.0
7,H9_D2,H9_D28,0.004779,0.306931,31,6487,101,6417,31,46.880072,0.508877,0.999569,1,-0.0
8,H9_D10,H9_D0,0.018816,0.318486,143,7600,449,7294,143,236.890127,0.406806,1.0,1,-0.0
9,H9_D10,H9_D2,0.019143,0.2049,92,4806,449,4449,92,144.491935,0.533446,1.0,1,-0.0


In [49]:
# Ad-hoc Fisher's exact test on a hand-entered 2x2 table. No `alternative` is
# passed here, unlike comp_two_gene_dicts which uses alternative='greater'.
# NOTE(review): the counts are typed in by hand; their origin is not shown here.
stats.fisher_exact([[ 2067,  5227],
 [  620, 10668]])

(6.804234835253677, 0.0)

In [50]:
# RNA (rows) x ATAC (columns) matrix of the log_pval_bonf column.
rna_atac_df.pivot(index='color_row',columns='color_col',values='log_pval_bonf')

color_col,H9_D0,H9_D10,H9_D2,H9_D28
color_row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
H9_D0,-0.0,-0.0,-0.0,-0.0
H9_D10,-0.0,-0.0,-0.0,-0.0
H9_D2,-0.0,-0.0,-0.0,-0.0
H9_D28,-0.0,-0.0,-0.0,-0.0


In [55]:
# RNA (rows) x ATAC (columns) matrix of Jaccard indices (intersection / union).
rna_atac_df.pivot(index='color_row',columns='color_col',values='jaccard')

color_col,H9_D0,H9_D10,H9_D2,H9_D28
color_row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
H9_D0,0.004625,0.005848,0.003937,0.006826
H9_D10,0.005311,0.008011,0.009434,0.016607
H9_D2,0.002051,0.002463,0.005013,0.004412
H9_D28,0.019542,0.009636,0.018478,0.023681


In [56]:
# RNA (rows) x ATAC (columns) matrix of intersection size / size of the smaller set.
rna_atac_df.pivot(index='color_row',columns='color_col',values='intersect_over_min')

color_col,H9_D0,H9_D10,H9_D2,H9_D28
color_row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
H9_D0,0.02381,0.014286,0.009524,0.047619
H9_D10,0.01559,0.019608,0.023333,0.062361
H9_D2,0.019802,0.009901,0.019802,0.059406
H9_D28,0.045526,0.029412,0.056667,0.069074
