07/06/2020

In [1]:
# basic packages
import os, glob
import pandas as pd
import numpy as np; np.random.seed(0)
import itertools
from collections import Counter, defaultdict
import time

# machine learning packages from sklearn
from sklearn.preprocessing import MinMaxScaler #StandardScaler 
from sklearn import preprocessing, metrics
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split, KFold, cross_validate, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression, Lasso, LassoCV
from IPython.display import Image
from scipy import stats

# for IRF
from functools import reduce
# Needed for the scikit-learn wrapper function
import irf
from irf import (irf_utils, utils,
                 irf_jupyter_utils)
from irf.ensemble.wrf import RandomForestClassifierWithWeights
from math import ceil

# Import our custom utilities
from imp import reload


# Import tools needed for visualization
import seaborn as sns; sns.set()
import matplotlib
import matplotlib.pyplot as plt
from sklearn.tree import export_graphviz
import pydot

In [2]:
%load_ext autoreload
%autoreload 2

In [48]:
save_dir = '../data/processed/fig4_modelling/common_genes'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

In [4]:
THRES=1

In [5]:
normal_tissues = ['Airway','Astrocytes','Bladder','Colon','Esophageal','GDSD6','GM12878','HMEC','Melanocytes','Ovarian',
'Pancreas','Prostate','Renal','Thyroid','Uterine']
normal_tissues_dict = dict(zip(normal_tissues,range(len(normal_tissues))))

In [14]:
rna_df = pd.read_csv('../data/interim/rna/tissue_tpm_sym.csv',index_col=0)
rna_df_norm = rna_df[normal_tissues]
rna_D0_dict = pd.Series(rna_df.GDSD0.values, index=rna_df.index.values).to_dict()
rna_D3_dict = pd.Series(rna_df.GDSD3.values, index=rna_df.index.values).to_dict()
rna_D6_dict = pd.Series(rna_df.GDSD6.values, index=rna_df.index.values).to_dict()

# 0. CRM data wrangling

In [7]:
%%time
# import
data_all = pd.read_csv('/Users/mguo123/Google Drive/1_khavari/omics_project-LD/pan_omics/data/processed/tissue_crms/all_count_comb_overall.csv',index_col=0,header=0)
data_all = data_all[data_all.tissue.isin(normal_tissues)]
data_all = data_all[data_all.iloc[:,2:].sum(axis=1)>1e-1]

# expression labels
exp_label = list(np.log10(data_all.exp.values+1e-2))
labels_all  = np.array(np.array(exp_label)>THRES)

tissues_label  = data_all.tissue.values#np.array((data_all.exp>THRES).values)
tissue_num_labels =  data_all.tissue.map(normal_tissues_dict).values

genes_all = data_all.index.values
gene_to_num_dict = dict(zip(np.unique(genes_all),range(len(np.unique(genes_all)))))
genes_num_all = np.vectorize(gene_to_num_dict.get)(genes_all)



In [8]:
print('files_loaded', data_all.shape)


files_loaded (137054, 1063)


In [9]:
data_all[:5]

Unnamed: 0,tissue,exp,num_loop_counts,num_loops,num_atac_regions_pro,num_atac_regions_loop,AHR_pro,ARID5B_pro,ARNT_pro,ARNTL_pro,...,MAFA_loop,MEOX1_loop,E2F5_pro,E2F5_loop,ESR2_pro,KLF14_pro,TBX15_pro,ESR2_loop,KLF14_loop,TBX15_loop
A4GALT,Airway,5.435,45.0,3.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A4GNT,Airway,0.0,5060.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAMP,Airway,43.71,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AANAT,Airway,0.0,13.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ABAT,Airway,0.125,21.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
## only tfs

data_all.drop(['tissue','exp','num_loop_counts','num_loops','num_atac_regions_pro','num_atac_regions_loop'],axis=1,inplace=True)

data_all.shape




(137054, 1057)

In [11]:
selector = VarianceThreshold()
data_all_varfilt = selector.fit_transform(data_all)
data_all_varfilt_cols = data_all.columns[selector.get_support()]
print(data_all.shape, data_all_varfilt.shape, len(data_all_varfilt_cols))
scaler = MinMaxScaler()
data_all_norm = scaler.fit_transform(data_all_varfilt)
data_all_norm = pd.DataFrame(data_all_norm, columns = data_all_varfilt_cols)

(137054, 1057) (137054, 980) 980


In [12]:
data_all_norm[:5]

Unnamed: 0,AHR_pro,ARID5B_pro,ARNT_pro,ARNTL_pro,ATF1_pro,ATF2_pro,ATF3_pro,ATF4_pro,ATF6_pro,ATF7_pro,...,TFAP2B_pro,ZNF382_pro,MYOD1_loop,NR4A3_loop,TFAP2B_loop,ZNF382_loop,CUX2_pro,CUX2_loop,E2F5_pro,E2F5_loop
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 0. get common genes

In [99]:
common_genes = []
# counter = 0 
for gene, row in pd.DataFrame(rna_df_norm>THRES).iterrows():
#     if counter>5:
#         break
#     counter +=1
    if np.array(row).all():
        common_genes.append(gene)

In [100]:
len(common_genes)

7106

In [101]:
pd.Series(common_genes).to_csv(os.path.join(save_dir, 'common_genes.csv'),index=False, header=False)

# 1. Per Cell type enrichment of TFs used by common genes
- background is all common genes across all tis
- check signficance of using a TF within the common genes (all common genes) in a particular tissue, so it's like relative significance

A: is in common genes AND tissue of interest AND looking at TF of interest 
B: is in common genes AND NOT in tissue of interest AND looking at TF of interest 
C: is in common genes AND in tissue of interest AND NOT looking at TF of interest 
D: is in common genes AND NOT in tissue of interest AND NOT looking at TF of interest 

In [30]:
#tfs_feat_dict: dict
# key: tf
# value: list of crm features
tfs_feat_dict = defaultdict(list)
for feat in data_all.columns:
    tfs_feat_dict[feat.split('_')[0]].append(feat)

In [38]:
# get common subset
data_all_common = data_all[data_all.index.isin(common_genes)]
tissues_label_common = tissues_label[data_all.index.isin(common_genes)]
data_all_common.shape, tissues_label_common.shape

((60713, 1057), (60713,))

In [155]:
%%time
results_df = pd.DataFrame(columns = ['tf', 'tissue', 'jaccard', 'intersect_over_min','intersection','union', 'num_in_1', 'num_in_2', 'observed', 'expected', 'oddsratio', 'pval' ])
counter = 0
count_all = data_all_common.sum().sum()

for tissue in normal_tissues:
    tissue_crm = data_all_common[tissues_label_common==tissue]
    tissue_crm = tissue_crm[tissue_crm.index.isin(common_genes)]
    count_tissue_gene = tissue_crm.sum().sum() #mat_counts.sum(axis=1)[0], sum first row

    for tf, feat_list in tfs_feat_dict.items():
        if len(feat_list)>0:
#           print(tf)
            tissue_crm_selfeat = tissue_crm[feat_list]
            count_selfeat_tissue = tissue_crm_selfeat.sum().sum() # A
            count_selfeat = data_all_common[feat_list].sum().sum() #mat_counts.sum(axis=0)[0], sum down first col
            count_selfeat_neg = count_selfeat - count_selfeat_tissue # B: not in selected genes but in selected feature
            count_neg_tissue = count_tissue_gene - count_selfeat_tissue #C: not in selected feature but in selected genes
            count_neg_neg = count_all - count_selfeat_tissue- count_selfeat_neg - count_neg_tissue #D
            mat_counts = np.array([[count_selfeat_tissue,count_neg_tissue],
                           [count_selfeat_neg, count_neg_neg]]).reshape((2,2))
            pseudo = 1
            mat_counts_pseudo = mat_counts+pseudo
            num_in_1 = mat_counts.sum(axis=1)[0] #count_KRTgene
            num_in_2 = mat_counts.sum(axis=0)[0] #count_KRTtf
            in_1_and_in_2 = count_selfeat_tissue # A
            in_1_or_in_2 = count_selfeat_tissue +count_selfeat_neg+count_neg_tissue # A+B+C
            in_1 = count_selfeat_tissue+count_neg_tissue # A+C
            in_2 = count_selfeat_tissue+count_selfeat_neg#A+B
            observed_num = mat_counts[0][0] #count_KRTtf_KRTgene
            expected_num = num_in_1*num_in_2/sum(sum(mat_counts))
            oddsratio_pseudo, pvalue_pseudo = stats.fisher_exact(mat_counts_pseudo,alternative='greater')
            jaccard = in_1_and_in_2/in_1_or_in_2
            intersect_over_min = in_1_and_in_2/min(in_1,in_2)

            results_df.at[counter] = {'tf':tf, 'tissue':tissue,
                                        'jaccard':jaccard,'intersect_over_min':intersect_over_min,
                                    'intersection':in_1_and_in_2, 
                                   'union':in_1_or_in_2, 
                                   'num_in_1':num_in_1,'num_in_2':num_in_2,
                                   'observed':observed_num, 'expected':expected_num, 
                                   'oddsratio':oddsratio_pseudo, 'pval':pvalue_pseudo}
            counter+=1
            if (counter %1000)==0:
                print(tf, tissue, counter)
                




GATA4 Astrocytes 1000
PITX3 Colon 2000
NKX3-1 GDSD6 3000
ZNF264 HMEC 4000
STAT3 Ovarian 5000
NR4A2 Prostate 6000
IRX3 Thyroid 7000
CPU times: user 1min 2s, sys: 642 ms, total: 1min 3s
Wall time: 1min 3s


In [184]:
mat_counts_pseudo

array([[8.270000e+02, 3.952610e+05],
       [9.075000e+03, 6.796266e+06]])

In [157]:
results_df['pval_bonf'] = results_df.pval.apply(lambda x: min(1, x* sum(sum(mat_counts))))#result_df.shape[0]))
results_df['log_pval_bonf'] = results_df.pval_bonf.apply(lambda x: min(100,-np.log10(x+1e-100)))
results_df.to_csv(os.path.join(save_dir, 'stats_fisher_all_tfs.csv'))


In [158]:
results_df_filt = results_df[results_df.pval_bonf<0.05]
print(results_df_filt.shape)
results_df_filt

(1486, 14)


Unnamed: 0,tf,tissue,jaccard,intersect_over_min,intersection,union,num_in_1,num_in_2,observed,expected,oddsratio,pval,pval_bonf,log_pval_bonf
1,ARID5B,Airway,0.0145877,0.0361636,4691,321573,196548,129716,4691,3540.33,1.34573,6.59891e-80,4.752159e-73,72.323109
11,BARX2,Airway,0.0167728,0.0729905,4189,249750,196548,57391,4189,1566.37,2.84612,0,0.000000e+00,100.000000
58,ETV3,Airway,0.000743992,0.0585675,148,198927,196548,2527,148,68.9692,2.23213,1.3893e-17,1.000497e-10,9.999784
61,FOS,Airway,0.00506796,0.0412752,1129,222772,196548,27353,1129,746.544,1.53874,1.93131e-40,1.390822e-33,32.856729
62,FOSB,Airway,0.00384768,0.0400886,833,216494,196548,20779,833,567.12,1.49219,1.01957e-26,7.342378e-20,19.134163
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7841,ZNF589,Uterine,0.0281692,0.50599,23483,833641,810714,46410,23483,5224.69,8.28498,0,0.000000e+00,100.000000
7845,ZSCAN16,Uterine,0.000343557,0.168275,279,812093,810714,1658,279,186.652,1.59962,6.3366e-12,4.563258e-05,4.340725
7847,DLX5,Uterine,0.000489317,0.38998,397,811335,810714,1018,397,114.603,5.04597,9.44418e-116,6.801153e-109,100.000000
7850,HOXB8,Uterine,0.000580752,0.607742,471,811018,810714,775,471,87.2471,12.2055,9.30417e-240,6.700326e-233,100.000000


In [159]:
results_df_common = results_df

In [98]:
results_df_filt[results_df_filt.tf=='TP63']

Unnamed: 0,tf,tissue,jaccard,intersect_over_min,intersection,union,num_in_1,num_in_2,observed,expected,oddsratio,pval,pval_bonf,log_pval_bonf
268,TP63,Airway,0.00425534,0.0718869,885,207974,196548,12311,885,336.003,2.77129,1.73959e-142,1.252756e-135,100.0
1326,TP63,Bladder,0.00433312,0.143124,1762,406635,396086,12311,1762,677.118,2.87953,6.062479999999999e-287,4.365851e-280,100.0
2913,TP63,GDSD6,0.00524078,0.187556,2309,440583,430581,12311,2309,736.088,3.64554,0.0,0.0,100.0
3971,TP63,HMEC,0.00436171,0.108277,1333,305614,294636,12311,1333,503.687,2.85667,1.32838e-219,9.566219e-213,100.0
5558,TP63,Pancreas,0.00223143,0.116969,1440,645326,634455,12311,1440,1084.62,1.37273,1.21088e-27,8.720037e-21,20.059482
6087,TP63,Prostate,0.00321616,0.0729429,898,279215,267802,12311,898,457.814,2.04273,6.6898199999999995e-78,4.817627e-71,70.317167
7674,TP63,Uterine,0.0044963,0.299245,3684,819341,810714,12311,3684,1385.93,3.37753,0.0,0.0,100.0


In [65]:
# Counter(results_df_filt.tf)

# 2. Per Cell type enrichment of TFs used by common genes
- background is all genes across all tis
- check signficance of using a TF within the common genes (all common genes) in a particular tissue, so it's like relative significance

A: is in common genes AND tissue of interest AND looking at TF of interest 
B: is in common genes AND NOT in tissue of interest AND looking at TF of interest 
C: is in common genes AND in tissue of interest AND NOT looking at TF of interest 
D: is in common genes AND NOT in tissue of interest AND NOT looking at TF of interest 

In [102]:
# get common subset
data_all_common = data_all[data_all.index.isin(common_genes)]
tissues_label_common = tissues_label[data_all.index.isin(common_genes)]
data_all_common.shape, tissues_label_common.shape

((60713, 1057), (60713,))

In [103]:
%%time
results_df = pd.DataFrame(columns = ['tf', 'tissue', 'jaccard', 'intersect_over_min','intersection','union', 'num_in_1', 'num_in_2', 'observed', 'expected', 'oddsratio', 'pval' ])
counter = 0
count_all = data_all.sum().sum()

for tissue in normal_tissues:
    tissue_crm = data_all_common[tissues_label_common==tissue]
    tissue_crm = tissue_crm[tissue_crm.index.isin(common_genes)]
    count_tissue_gene = tissue_crm.sum().sum() #mat_counts.sum(axis=1)[0], sum first row

    for tf, feat_list in tfs_feat_dict.items():
        if len(feat_list)>0:
#           print(tf)
            tissue_crm_selfeat = tissue_crm[feat_list]
            count_selfeat_tissue = tissue_crm_selfeat.sum().sum() # A
            count_selfeat = data_all_common[feat_list].sum().sum() #mat_counts.sum(axis=0)[0], sum down first col
            count_selfeat_neg = count_selfeat - count_selfeat_tissue # B: not in selected genes but in selected feature
            count_neg_tissue = count_tissue_gene - count_selfeat_tissue #C: not in selected feature but in selected genes
            count_neg_neg = count_all - count_selfeat_tissue - count_selfeat_neg - count_neg_tissue #D
            mat_counts = np.array([[count_selfeat_tissue,count_neg_tissue],
                           [count_selfeat_neg, count_neg_neg]]).reshape((2,2))
            pseudo = 1
            mat_counts_pseudo = mat_counts+pseudo
            num_in_1 = mat_counts.sum(axis=1)[0] #count_KRTgene
            num_in_2 = mat_counts.sum(axis=0)[0] #count_KRTtf
            in_1_and_in_2 = count_selfeat_tissue # A
            in_1_or_in_2 = count_selfeat_tissue +count_selfeat_neg+count_neg_tissue # A+B+C
            in_1 = count_selfeat_tissue+count_neg_tissue # A+C
            in_2 = count_selfeat_tissue+count_selfeat_neg#A+B
            observed_num = mat_counts[0][0] #count_KRTtf_KRTgene
            expected_num = num_in_1*num_in_2/sum(sum(mat_counts))
            oddsratio_pseudo, pvalue_pseudo = stats.fisher_exact(mat_counts_pseudo,alternative='greater')
            jaccard = in_1_and_in_2/in_1_or_in_2
            intersect_over_min = in_1_and_in_2/min(in_1,in_2)

            results_df.at[counter] = {'tf':tf, 'tissue':tissue,
                                        'jaccard':jaccard,'intersect_over_min':intersect_over_min,
                                    'intersection':in_1_and_in_2, 
                                   'union':in_1_or_in_2, 
                                   'num_in_1':num_in_1,'num_in_2':num_in_2,
                                   'observed':observed_num, 'expected':expected_num, 
                                   'oddsratio':oddsratio_pseudo, 'pval':pvalue_pseudo}
            counter+=1
            if (counter %1000)==0:
                print(tf, tissue, counter)
                




GATA4 Astrocytes 1000
PITX3 Colon 2000
NKX3-1 GDSD6 3000
ZNF264 HMEC 4000
STAT3 Ovarian 5000
NR4A2 Prostate 6000
IRX3 Thyroid 7000
CPU times: user 1min 10s, sys: 1.14 s, total: 1min 12s
Wall time: 1min 12s


In [104]:
mat_counts_pseudo

array([[1.0000000e+00, 8.1071500e+05],
       [1.0000000e+00, 1.4428401e+07]])

In [105]:
results_df['pval_bonf'] = results_df.pval.apply(lambda x: min(1, x* sum(sum(mat_counts))))#result_df.shape[0]))
results_df['log_pval_bonf'] = results_df.pval_bonf.apply(lambda x: min(100,-np.log10(x+1e-100)))
results_df.to_csv(os.path.join(save_dir, 'stats_fisher_all_tfs_backall.csv'))


In [106]:
results_df_filt = results_df[results_df.pval_bonf<0.05]
print(results_df_filt.shape)
results_df_filt

(4390, 14)


Unnamed: 0,tf,tissue,jaccard,intersect_over_min,intersection,union,num_in_1,num_in_2,observed,expected,oddsratio,pval,pval_bonf,log_pval_bonf
1,ARID5B,Airway,0.0145877,0.0361636,4691,321573,196548,129716,4691,1673.03,2.91793,0,0.000000e+00,100.000000
2,ARNT,Airway,0.00108708,0.0281369,222,204216,196548,7890,222,101.762,2.22683,8.8612e-26,1.350368e-18,17.869548
3,ARNTL,Airway,0.000619508,0.0233645,125,201773,196548,5350,125,69.0022,1.84577,3.64793e-10,5.559124e-03,2.254994
4,ATF1,Airway,0.000612141,0.0272788,123,200934,196548,4509,123,58.1553,2.16397,2.75776e-14,4.202589e-07,6.376483
5,ATF2,Airway,0.00350769,0.0287786,782,222939,196548,27173,782,350.466,2.27567,1.53028e-89,2.332008e-82,81.632270
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7841,ZNF589,Uterine,0.0281692,0.50599,23483,833641,810714,46410,23483,2468.99,18.7426,0,0.000000e+00,100.000000
7845,ZSCAN16,Uterine,0.000343557,0.168275,279,812093,810714,1658,279,88.2049,3.61191,1.51369e-64,2.306724e-57,56.637004
7847,DLX5,Uterine,0.000489317,0.38998,397,811335,810714,1018,397,54.1571,11.393,6.92264e-228,1.054949e-220,100.000000
7850,HOXB8,Uterine,0.000580752,0.607742,471,811018,810714,775,471,41.2297,27.5572,0,0.000000e+00,100.000000


In [107]:
results_df_filt[results_df_filt.tf=='TP63']

Unnamed: 0,tf,tissue,jaccard,intersect_over_min,intersection,union,num_in_1,num_in_2,observed,expected,oddsratio,pval,pval_bonf,log_pval_bonf
268,TP63,Airway,0.00425534,0.0718869,885,207974,196548,12311,885,158.782,5.95638,0.0,0.0,100.0
1326,TP63,Bladder,0.00433312,0.143124,1762,406635,396086,12311,1762,319.98,6.28578,0.0,0.0,100.0
2913,TP63,GDSD6,0.00524078,0.187556,2309,440583,430581,12311,2309,347.847,7.97957,0.0,0.0,100.0
3971,TP63,HMEC,0.00436171,0.108277,1333,305614,294636,12311,1333,238.023,6.18638,0.0,0.0,100.0
5558,TP63,Pancreas,0.00223143,0.116969,1440,645326,634455,12311,1440,512.548,3.05568,1.00163e-262,1.52639e-255,100.0
6087,TP63,Prostate,0.00321616,0.0729429,898,279215,267802,12311,898,216.345,4.41462,5.39287e-271,8.218257e-264,100.0
7674,TP63,Uterine,0.0044963,0.299245,3684,819341,810714,12311,3684,654.94,7.63125,0.0,0.0,100.0


# 3. Try the similar idea with KRT genes, but change the background to all genes

A: is in KRT genes AND tissue of interest AND looking at TF of interest 
B: is in any genes AND NOT in tissue of interest AND looking at TF of interest 
C: is in KRT genes AND in tissue of interest AND NOT looking at TF of interest 
D: is in any genes AND NOT in tissue of interest AND NOT looking at TF of interest 

In [108]:
KRT_tfs = sorted(set(list(pd.read_csv('../data/external/krt_tfs_063020.csv')['tfs'])))
print(KRT_tfs,len(KRT_tfs))

['ATF1', 'ATF2', 'ATF3', 'ATF4', 'ATF5', 'BARX2', 'CEBPA', 'CEBPG', 'DLX3', 'DLX5', 'E2F1', 'E2F3', 'E2F4', 'E2F7', 'ELF5', 'ESRRA', 'ETS1', 'ETV4', 'FOS', 'FOSL1', 'FOSL2', 'FOXF2', 'FOXK2', 'FOXP1', 'GRHL1', 'GRHL2', 'HES1', 'HES2', 'HOMEZ', 'HOXA1', 'HOXC11', 'HOXC13', 'ID3', 'IRX2', 'IRX4', 'JUN', 'JUNB', 'JUND', 'KER2', 'KLF16', 'KLF4', 'KLF5', 'LDB2', 'MAF', 'MAFB', 'MAML3', 'NR3C1', 'OTX1', 'OVOL1', 'PBX1', 'PBX2', 'POU1F1', 'POU2F1', 'POU2F2', 'POU2F3', 'POU3F1', 'POU3F2', 'POU3F3', 'POU3F4', 'POU4F1', 'POU4F2', 'POU4F3', 'POU5F1', 'POU5F1B', 'POU6F1', 'POU6F2', 'PRDM1', 'PRRX1', 'RARG', 'RELB', 'RORA', 'RUNX1', 'SMAD4', 'SOX11', 'SOX15', 'SOX6', 'SOX7', 'SOX9', 'SP1', 'SP3', 'STAT1', 'STAT6', 'TCF4', 'TCF7L2', 'TFAP2A', 'TFAP2B', 'TFAP2C', 'TFAP2D', 'TFDP1', 'TP63', 'TWIST2', 'VDR', 'VGLL1', 'XBP1', 'ZBTB7B', 'ZEB1', 'ZNF219'] 97


In [140]:
# krt genes: expressed in KRT D6 but not in KRT d0
KRT_genes = []
for gene, TPM in rna_D6_dict.items():
    TPM_d0 = rna_D0_dict[gene]
    if (TPM>THRES)& (TPM_d0 <=THRES):
        KRT_genes.append(gene)
len(KRT_genes)
pd.Series(KRT_genes).to_csv(os.path.join(save_dir, 'KRT_genes.csv'),header=False, index=False)

In [141]:
# # TPM >1 in GDSD6 only genes
# KRT_genes = pd.read_csv('../../rnaseq/unique_gene_lists/'+'GDSD6'+'_genes.txt',header=None).loc[:,0]
# len(KRT_genes)

In [142]:
# get common subset
data_all_KRT = data_all[data_all.index.isin(KRT_genes)]
tissues_label_KRT = tissues_label[data_all.index.isin(KRT_genes)]
data_all_KRT.shape, tissues_label_KRT.shape

((4865, 1057), (4865,))

In [143]:
tissue_crm.shape

(91, 1057)

In [150]:
%%time
results_df = pd.DataFrame(columns = ['tf', 'tissue', 'jaccard', 'intersect_over_min','intersection','union', 'num_in_1', 'num_in_2', 'observed', 'expected', 'oddsratio', 'pval' ])
counter = 0
# count_all = data_all_KRT.sum().sum()
count_all = data_all.sum().sum()

for tissue in normal_tissues:
    tissue_crm = data_all_KRT[tissues_label_KRT==tissue]
    tissue_crm = tissue_crm[tissue_crm.index.isin(KRT_genes)]
    count_tissue_gene = tissue_crm.sum().sum() #mat_counts.sum(axis=1)[0], sum first row

    for tf, feat_list in tfs_feat_dict.items():
        if len(feat_list)>0:
#           print(tf)
            tissue_crm_selfeat = tissue_crm[feat_list]
            count_selfeat_tissue = tissue_crm_selfeat.sum().sum() # A
            count_selfeat = data_all_KRT[feat_list].sum().sum() #mat_counts.sum(axis=0)[0], sum down first col
            count_selfeat_neg = count_selfeat - count_selfeat_tissue # B: not in selected genes but in selected feature
            count_neg_tissue = count_tissue_gene - count_selfeat_tissue #C: not in selected feature but in selected genes
            count_neg_neg = count_all - count_selfeat_tissue - count_selfeat_neg - count_neg_tissue #D
            mat_counts = np.array([[count_selfeat_tissue,count_neg_tissue],
                           [count_selfeat_neg, count_neg_neg]]).reshape((2,2))
            pseudo = 1
            mat_counts_pseudo = mat_counts+pseudo
            num_in_1 = mat_counts.sum(axis=1)[0] #count_KRTgene
            num_in_2 = mat_counts.sum(axis=0)[0] #count_KRTtf
            in_1_and_in_2 = count_selfeat_tissue # A
            in_1_or_in_2 = count_selfeat_tissue +count_selfeat_neg+count_neg_tissue # A+B+C
            in_1 = count_selfeat_tissue+count_neg_tissue # A+C
            in_2 = count_selfeat_tissue+count_selfeat_neg#A+B
            observed_num = mat_counts[0][0] #count_KRTtf_KRTgene
            expected_num = num_in_1*num_in_2/sum(sum(mat_counts))
            oddsratio_pseudo, pvalue_pseudo = stats.fisher_exact(mat_counts_pseudo,alternative='greater')
            jaccard = in_1_and_in_2/in_1_or_in_2
            intersect_over_min = in_1_and_in_2/min(in_1,in_2)

            results_df.at[counter] = {'tf':tf, 'tissue':tissue,
                                        'jaccard':jaccard,'intersect_over_min':intersect_over_min,
                                    'intersection':in_1_and_in_2, 
                                   'union':in_1_or_in_2, 
                                   'num_in_1':num_in_1,'num_in_2':num_in_2,
                                   'observed':observed_num, 'expected':expected_num, 
                                   'oddsratio':oddsratio_pseudo, 'pval':pvalue_pseudo}
            counter+=1
            if (counter %1000)==0:
                print(tf, tissue, counter)
                




GATA4 Astrocytes 1000
PITX3 Colon 2000
NKX3-1 GDSD6 3000
ZNF264 HMEC 4000
STAT3 Ovarian 5000
NR4A2 Prostate 6000
IRX3 Thyroid 7000
CPU times: user 33.7 s, sys: 563 ms, total: 34.2 s
Wall time: 34.3 s


In [152]:
results_df['pval_bonf'] = results_df.pval.apply(lambda x: min(1, x* sum(sum(mat_counts))))#result_df.shape[0]))
results_df['log_pval_bonf'] = results_df.pval_bonf.apply(lambda x: min(100,-np.log10(x+1e-100)))
# results_df.to_csv(os.path.join(save_dir, 'stats_fisher_KRT_tfs.csv'))
results_df.to_csv(os.path.join(save_dir, 'stats_fisher_KRT_tfs_backall.csv'))


In [153]:
results_df_filt = results_df[results_df.pval_bonf<0.05]
print(results_df.shape, results_df_filt.shape)#,results_df_filt_backall.shape)
# results_df_filt

(7935, 14) (4378, 14)


In [154]:
results_df_filt[results_df_filt.tf=='TP63']

Unnamed: 0,tf,tissue,jaccard,intersect_over_min,intersection,union,num_in_1,num_in_2,observed,expected,oddsratio,pval,pval_bonf,log_pval_bonf
268,TP63,Airway,0.00383308,0.0506692,53,13827,12834,1046,53,0.880915,64.7104,5.0599e-76,7.710846000000001e-69,68.112898
1326,TP63,Bladder,0.00399459,0.121415,127,31793,30874,1046,127,2.11917,68.811,6.98103e-179,1.063846e-171,100.0
2913,TP63,GDSD6,0.00569591,0.274379,287,50387,49628,1046,287,3.40642,116.65,0.0,0.0,100.0
3971,TP63,HMEC,0.00508884,0.110899,116,22795,21865,1046,116,1.5008,87.9198,3.1810000000000004e-176,4.8475670000000005e-169,100.0
5558,TP63,Pancreas,0.00227601,0.0889101,93,40861,39908,1046,93,2.73925,37.6111,1.56104e-108,2.37888e-101,99.907319
6087,TP63,Prostate,0.00316238,0.0659656,69,21819,20842,1046,69,1.43058,52.4296,1.6719199999999998e-91,2.547855e-84,83.593825
7674,TP63,Uterine,0.00455398,0.287763,301,66096,65351,1046,301,4.48564,94.4248,0.0,0.0,100.0


In [149]:
results_df_filt[results_df_filt.tf=='TP63']

Unnamed: 0,tf,tissue,jaccard,intersect_over_min,intersection,union,num_in_1,num_in_2,observed,expected,oddsratio,pval,pval_bonf,log_pval_bonf
1326,TP63,Bladder,0.00399459,0.121415,127,31793,30874,1046,127,63.916,2.14238,1.48277e-13,7.491833e-08,7.125412
2913,TP63,GDSD6,0.00569591,0.274379,287,50387,49628,1046,287,102.741,3.49344,4.69333e-59,2.371351e-53,52.625004
3971,TP63,HMEC,0.00508884,0.110899,116,22795,21865,1046,116,45.2654,2.78768,3.5758e-20,1.806707e-14,13.743112
7674,TP63,Uterine,0.00455398,0.287763,301,66096,65351,1046,301,135.291,2.73301,5.15597e-42,2.6051039999999998e-36,35.584175


# tf-tf pairs

first use `results_df_common` to limit # of tfs then run pairwise model



In [199]:
def find_tf_pairs(single_results, tfs_feat_dict=tfs_feat_dict, data_all=data_all):
    tfs_unique = single_results[single_results.pval_bonf<0.05].tf_key.unique()
    print('num single unique TFs', len(tfs_unique))
    TF_pair_dict = defaultdict(list)
    for tf1 in tfs_unique:
        for tf2 in tfs_unique:
            if (tf1<tf2):
                if (tf1 in tfs_feat_dict) and (tf2 in tfs_feat_dict):
                    possible_feats = [tf1+'_pro',tf1+'_loop',tf2+'_pro',tf2+'_loop']
                    for feat in possible_feats:
                        if feat in data_all.columns:
                            TF_pair_dict[tf1+'::'+tf2].append(feat)
    print('num pairs of TFs', len(TF_pair_dict))
    return TF_pair_dict

In [232]:
# TF_feat_dict: dict, key = tf_key (string of tfs sep by ::), val; list of features/columns of data_all
# backgound: relative means the background in the data_all_sel, if 'all' then background is data_all
# def tf_enrichment(geneset, tissues=normal_tissues, data_all=data_all,tissues_label=tissues_label, background='all'):
def tf_enrichment(geneset, TF_feat_dict, tissues=normal_tissues, 
                  data_all=data_all,tissues_label=tissues_label, 
                  background='relative',verbose=True, save_path=None):
    # get subset
    data_all_sel = data_all[data_all.index.isin(geneset)]
    tissues_label_sel = tissues_label[data_all.index.isin(geneset)]
    if verbose:
        print('num genes in geneset',len(geneset))
        print('subsetting data,', data_all_sel.shape, tissues_label_sel.shape)
    
    # set background and initial variables
    results_df = pd.DataFrame(columns = ['tf_key', 'tissue', 'jaccard', 'intersect_over_min','intersection','union', 'num_in_gene', 'num_in_feat', 'observed', 'expected', 'oddsratio', 'pval' ])
    counter = 0
    if background=='all':
        count_all = data_all.sum().sum()
    else:
        count_all = data_all_sel.sum().sum() ## background
    
    # iterate through tissues
    if verbose:
        print('starting iteration')
        print('estimated count', len(TF_feat_dict)*len(tissues))
    for tissue in tissues:
        if verbose:
            print('*****iterating tissue, ', tissue)
        tissue_crm = data_all_sel[tissues_label_sel==tissue]
        tissue_crm = tissue_crm[tissue_crm.index.isin(geneset)]
        count_tissue_gene = tissue_crm.sum().sum() #mat_counts.sum(axis=1)[0], sum first row
        
        # iterate through tfs
        for tf_key, feat_list in sorted(TF_feat_dict.items(),key=lambda x: x[0]):
            if len(feat_list)>0:
                tissue_crm_selfeat = tissue_crm[feat_list]
                count_selfeat_tissue = tissue_crm_selfeat.sum().sum() # A
                count_selfeat = data_all_sel[feat_list].sum().sum() #mat_counts.sum(axis=0)[0], sum down first col
                count_selfeat_neg = count_selfeat - count_selfeat_tissue # B: not in selected genes but in selected feature
                count_neg_tissue = count_tissue_gene - count_selfeat_tissue #C: not in selected feature but in selected genes
                count_neg_neg = count_all - count_selfeat_tissue- count_selfeat_neg - count_neg_tissue #D
                mat_counts = np.array([[count_selfeat_tissue,count_neg_tissue],
                               [count_selfeat_neg, count_neg_neg]]).reshape((2,2))
                pseudo = 1
                mat_counts_pseudo = mat_counts+pseudo
                num_in_1 = mat_counts.sum(axis=1)[0] #count_tissue_gene
                num_in_2 = mat_counts.sum(axis=0)[0] #count_selfeat
                in_1_and_in_2 = count_selfeat_tissue # A
                in_1_or_in_2 = count_selfeat_tissue +count_selfeat_neg+count_neg_tissue # A+B+C
                in_1 = count_selfeat_tissue+count_neg_tissue # A+C
                in_2 = count_selfeat_tissue+count_selfeat_neg#A+B
                observed_num = mat_counts[0][0] #count_KRTtf_KRTgene
                expected_num = num_in_1*num_in_2/sum(sum(mat_counts))
                oddsratio_pseudo, pvalue_pseudo = stats.fisher_exact(mat_counts_pseudo,alternative='greater')
                jaccard = in_1_and_in_2/in_1_or_in_2
                intersect_over_min = in_1_and_in_2/min(in_1,in_2)

                results_df.at[counter] = {'tf_key':tf_key, 'tissue':tissue,
                                            'jaccard':jaccard,'intersect_over_min':intersect_over_min,
                                        'intersection':in_1_and_in_2, 
                                       'union':in_1_or_in_2, 
                                       'num_in_gene':num_in_1,'num_in_feat':num_in_2,
                                       'observed':observed_num, 'expected':expected_num, 
                                       'oddsratio':oddsratio_pseudo, 'pval':pvalue_pseudo}
                counter+=1
                if verbose:
                    if (counter %1000)==0:
                        print(tf_key, tissue, counter)

    
    results_df['pval_bonf'] = results_df.pval.apply(lambda x: min(1, x* sum(sum(mat_counts))))#result_df.shape[0]))
    results_df['log_pval_bonf'] = results_df.pval_bonf.apply(lambda x: min(100,-np.log10(x+1e-100)))
    
    if save_path is not None:
        results_df.to_csv(save_path)
        print('saved file', save_path)
    return results_df

In [None]:
rna

In [249]:
pd.read_csv('../../rnaseq/unique_gene_lists/Colon_genes.txt',header=None).loc[:,0]

0      RP11-290M5.4
1     RP11-344P13.6
2           SLC45A3
3             SYT14
4      RP11-23F23.2
5     RP11-624G17.3
6             FOXM1
7              WNT3
8      RP1-261G23.7
9      RP11-10A14.5
10     RP11-629O1.2
11             RLN1
12             RMRP
13            PHKA1
14    RP11-344P13.6
15            PHKA1
Name: 0, dtype: object

In [251]:
glob.glob('../../rnaseq/unique_gene_lists/*genes.txt')

['../../rnaseq/unique_gene_lists/Colon_genes.txt',
 '../../rnaseq/unique_gene_lists/GDSD0_genes.txt',
 '../../rnaseq/unique_gene_lists/Airway_genes.txt',
 '../../rnaseq/unique_gene_lists/GDSD6_genes.txt',
 '../../rnaseq/unique_gene_lists/Pancreas_genes.txt',
 '../../rnaseq/unique_gene_lists/Uterine_genes.txt',
 '../../rnaseq/unique_gene_lists/Ovarian_genes.txt',
 '../../rnaseq/unique_gene_lists/Astrocytes_genes.txt',
 '../../rnaseq/unique_gene_lists/Bladder_genes.txt',
 '../../rnaseq/unique_gene_lists/Melanocytes_genes.txt',
 '../../rnaseq/unique_gene_lists/HMEC_genes.txt',
 '../../rnaseq/unique_gene_lists/Prostate_genes.txt',
 '../../rnaseq/unique_gene_lists/GM12878_genes.txt',
 '../../rnaseq/unique_gene_lists/Renal_genes.txt',
 '../../rnaseq/unique_gene_lists/GDSD3_genes.txt',
 '../../rnaseq/unique_gene_lists/Esophageal_genes.txt',
 '../../rnaseq/unique_gene_lists/Thyroid_genes.txt']

In [252]:
normal_tissues

['Airway',
 'Astrocytes',
 'Bladder',
 'Colon',
 'Esophageal',
 'GDSD6',
 'GM12878',
 'HMEC',
 'Melanocytes',
 'Ovarian',
 'Pancreas',
 'Prostate',
 'Renal',
 'Thyroid',
 'Uterine']

In [253]:
print(glob.glob('../data/interim/rna/*_genes.csv'))
group_tissue_mapping = {
    'blue':['Astrocytes','Melanocytes'],
    'grey':['GM12878'],
    'green':['Colon','Esophageal','Ovarian','Pancreas','Renal','Thyroid'],
    'purple':['Airway','Bladder', 'GDSD6', 'HMEC', 'Prostate', 'Uterine'] 
}

['../data/interim/rna/purple_genes.csv', '../data/interim/rna/green_genes.csv', '../data/interim/rna/common_genes.csv', '../data/interim/rna/all_genes.csv', '../data/interim/rna/grey_genes.csv', '../data/interim/rna/blue_genes.csv']


In [205]:
save_dir_enrich = '../data/processed/fig4_modelling/pairwise_enrich'

In [254]:
results_df_KRT = tf_enrichment(geneset=KRT_genes, TF_feat_dict=tfs_feat_dict, tissues=normal_tissues, 
                  data_all=data_all,tissues_label=tissues_label, 
                  background='relative',verbose=True, save_path=os.path.join(save_dir_enrich, 'KRT_test_single.csv'))

num genes in geneset 722
subsetting data, (4865, 1057) (4865,)
starting iteration
estimated count 7935
*****iterating tissue,  Airway




*****iterating tissue,  Astrocytes
ZNF143 Astrocytes 1000
*****iterating tissue,  Bladder
*****iterating tissue,  Colon
TEAD3 Colon 2000
*****iterating tissue,  Esophageal
*****iterating tissue,  GDSD6
SALL4 GDSD6 3000
*****iterating tissue,  GM12878
*****iterating tissue,  HMEC
OLIG1 HMEC 4000
*****iterating tissue,  Melanocytes
*****iterating tissue,  Ovarian
MESP1 Ovarian 5000
*****iterating tissue,  Pancreas
*****iterating tissue,  Prostate
HOXD8 Prostate 6000
*****iterating tissue,  Renal
*****iterating tissue,  Thyroid
FUBP1 Thyroid 7000
*****iterating tissue,  Uterine
saved file ../data/processed/fig4_modelling/pairwise_enrich/KRT_test_single.csv


In [258]:
results_df_KRT_filt = results_df_KRT[(results_df_KRT.pval_bonf<0.05 )&( results_df_KRT.tissue=='GDSD6')]
tfs_feat_dict_KRT_pairs = find_tf_pairs(single_results=results_df_KRT_filt, tfs_feat_dict=tfs_feat_dict, data_all=data_all)


num single unique TFs 73
num pairs of TFs 2628


In [259]:
# results_df_KRT_filt.sort_values('oddsratio', ascending=False)

In [260]:
%%time
results_df_KRT_pairs = tf_enrichment(geneset=KRT_genes, TF_feat_dict=tfs_feat_dict_KRT_pairs, tissues=['GDSD6'], 
                  data_all=data_all,tissues_label=tissues_label, 
                  background='relative',verbose=True, save_path=os.path.join(save_dir_enrich, 'KRT_test_pair.csv'))

num genes in geneset 722
subsetting data, (4865, 1057) (4865,)
starting iteration
estimated count 2628
*****iterating tissue,  GDSD6
FOS::MZF1 GDSD6 1000
MAF::MSX2 GDSD6 2000
saved file ../data/processed/fig4_modelling/pairwise_enrich/KRT_test_pair.csv
CPU times: user 9.13 s, sys: 24.9 ms, total: 9.15 s
Wall time: 9.15 s


In [262]:
results_df_KRT_pairs_filt = results_df_KRT_pairs[(results_df_KRT_pairs.oddsratio>1) & (results_df_KRT_pairs.pval_bonf<0.05)]
# results_df_KRT_pairs_filt = results_df_KRT_pairs_filt[results_df_KRT_pairs_filt.num_in_2>=(results_df_KRT_pairs_filt.observed+10)]
print(results_df_KRT_pairs.shape, results_df_KRT_pairs_filt.shape)

(2628, 14) (2628, 14)


In [265]:
results_df_KRT_filt.sort_values('oddsratio', ascending=False)

Unnamed: 0,tf_key,tissue,jaccard,intersect_over_min,intersection,union,num_in_gene,num_in_feat,observed,expected,oddsratio,pval,pval_bonf,log_pval_bonf
2718,ELF5,GDSD6,0.00310309,1,154,49628,49628,154,154,15.1263,1427.45,7.06948e-155,3.571925e-149,100.000000
3066,TFE3,GDSD6,0.00122914,1,61,49628,49628,61,61,5.99158,569.909,1.81267e-61,9.158703e-56,55.038166
2964,POU2F3,GDSD6,0.00118885,1,59,49628,49628,59,59,5.79514,551.503,1.8232e-59,9.211911e-54,53.035650
2765,FOXP2,GDSD6,0.000644797,1,32,49628,49628,32,32,3.14313,303.161,1.68764e-32,8.526995e-27,26.069204
2999,SALL4,GDSD6,0.00519679,0.934783,258,49646,49628,276,258,27.1095,125.798,9.02341e-234,4.559168e-228,100.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2905,NFE2L1,GDSD6,0.0115078,0.133985,617,53616,49628,4605,617,452.316,1.42764,2.47462e-15,1.250325e-09,8.902977
2650,ARID5B,GDSD6,0.0233521,0.131617,1370,58667,49628,10409,1370,1022.4,1.40347,8.00301e-29,4.043599e-23,22.393232
2797,HMGA1,GDSD6,0.0102037,0.123306,546,53510,49628,4428,546,434.93,1.29655,2.37015e-08,1.197543e-02,1.921709
3027,SP2,GDSD6,0.0195806,0.117127,1140,58221,49628,9733,1140,956.002,1.22403,3.27761e-10,1.656043e-04,3.780928


In [267]:
results_df_KRT_pairs_filt.sort_values('oddsratio', ascending=False)[:50]

Unnamed: 0,tf_key,tissue,jaccard,intersect_over_min,intersection,union,num_in_gene,num_in_feat,observed,expected,oddsratio,pval,pval_bonf,log_pval_bonf
903,ELF5::TFE3,GDSD6,0.00433223,1.0,215,49628,49628,215,215,21.1179,1991.68,2.6868e-216,1.357533e-210,100.0
895,ELF5::POU2F3,GDSD6,0.00429193,1.0,213,49628,49628,213,213,20.9214,1973.16,2.78071e-214,1.404982e-208,100.0
863,ELF5::FOXP2,GDSD6,0.00374788,1.0,186,49628,49628,186,186,18.2694,1723.26,4.35001e-187,2.1978880000000003e-181,100.0
2404,POU2F3::TFE3,GDSD6,0.00241799,1.0,120,49628,49628,120,120,11.7867,1113.57,1.10648e-120,5.5906069999999995e-115,100.0
1236,FOXP2::TFE3,GDSD6,0.00187394,1.0,93,49628,49628,93,93,9.13471,864.614,1.47272e-93,7.441052999999999e-88,87.128366
1228,FOXP2::POU2F3,GDSD6,0.00183364,1.0,91,49628,49628,91,91,8.93827,846.183,1.49933e-91,7.575516e-86,85.120588
901,ELF5::TBX19,GDSD6,0.00350601,0.994286,174,49629,49628,175,174,17.189,806.143,4.1949499999999996e-173,2.119541e-167,100.0
877,ELF5::ISL1,GDSD6,0.00358647,0.983425,178,49631,49628,181,178,17.7783,412.316,9.11753e-174,4.606721e-168,100.0
2509,TBX19::TFE3,GDSD6,0.00163211,0.987805,81,49629,49628,82,81,8.05426,377.027,6.16744e-80,3.116162e-74,73.50638
2402,POU2F3::TBX19,GDSD6,0.00159181,0.9875,79,49629,49628,80,79,7.85782,367.816,6.10789e-78,3.086071e-72,71.510594


In [224]:
results_df_KRT_pairs_filt[results_df_KRT_pairs_filt.tf_key.str.contains('TP63')]

Unnamed: 0,tf_key,tissue,jaccard,intersect_over_min,intersection,union,num_in_1,num_in_2,observed,expected,oddsratio,pval,pval_bonf,log_pval_bonf
1286,ASCL2::TP63,GDSD6,0.00784811,0.323002,396,50458,49628,1226,396,120.421,4.41323,2.25979e-104,1.14178e-98,97.93863
2579,ID4::TP63,GDSD6,0.0110067,0.410807,555,50424,49628,1351,555,132.699,6.46578,2.27881e-201,1.151392e-195,100.0
2946,MZF1::TP63,GDSD6,0.00772966,0.32046,390,50455,49628,1217,390,119.537,4.36178,1.2417e-101,6.273815e-96,95.202461
4725,NFATC2::TP63,GDSD6,0.0083816,0.30658,424,50587,49628,1383,424,135.842,4.0908,3.42284e-103,1.729423e-97,96.761848
9338,TP63::ZNF467,GDSD6,0.0239493,0.320943,1252,52277,49628,3901,1252,383.167,4.4274,5e-324,2.496316e-318,100.0
9648,MAF::TP63,GDSD6,0.00950259,0.305644,482,50723,49628,1577,482,154.897,4.07577,1.72351e-116,8.708217e-111,100.0
16922,EGR2::TP63,GDSD6,0.022796,0.471376,1161,50930,49628,2463,1161,241.923,8.35947,0.0,0.0,100.0
17148,EGR3::TP63,GDSD6,0.00960662,0.361132,485,50486,49628,1343,485,131.913,5.23563,4.12044e-149,2.081896e-143,100.0
18093,HOXC13::TP63,GDSD6,0.00719667,0.308936,363,50440,49628,1175,363,115.412,4.13337,1.11023e-89,5.609564e-84,83.251071
18919,TFE3::TP63,GDSD6,0.00690654,0.314363,348,50387,49628,1107,348,108.733,4.23861,2.36783e-88,1.196367e-82,81.922135


In [169]:
# tfs_unique = results_df_common[results_df_common.pval_bonf<0.05].tf.unique()
# print(len(tfs_unique))
# TF_pair_dict = defaultdict(list)
# for tf1 in tfs_unique:
#     for tf2 in tfs_unique:
#         if (tf1<tf2):
#             if (tf1 in tfs_feat_dict) and (tf2 in tfs_feat_dict):
#                 possible_feats = [tf1+'_pro',tf1+'_loop',tf2+'_pro',tf2+'_loop']
#                 for feat in possible_feats:
#                     if feat in data_all.columns:
#                         TF_pair_dict[tf1+'::'+tf2].append(feat)
#     #                 KRT_TF_pair_dict[tf1+'::'+tf2]=[]
# len(TF_pair_dict)

438


95703

In [172]:
len(TF_pair_dict)*len(normal_tissues)

1435545

In [173]:
# get common subset
data_all_common = data_all[data_all.index.isin(common_genes)]
tissues_label_common = tissues_label[data_all.index.isin(common_genes)]
data_all_common.shape, tissues_label_common.shape

In [202]:
# %%time
# results_df = pd.DataFrame(columns = ['tf_key', 'tissue', 'jaccard', 'intersect_over_min','intersection','union', 'num_in_1', 'num_in_2', 'observed', 'expected', 'oddsratio', 'pval' ])
# counter = 0
# count_all = data_all_common.sum().sum()

# for tissue in normal_tissues:
#     tissue_crm = data_all_common[tissues_label_common==tissue]
#     tissue_crm = tissue_crm[tissue_crm.index.isin(common_genes)]
#     count_tissue_gene = tissue_crm.sum().sum() #mat_counts.sum(axis=1)[0], sum first row

#     for tf_key, feat_list in TF_pair_dict.items():
# #         tf1,tf2 = tf_key.split('::')
#         if len(feat_list)>0:
# #           print(tf)
#             tissue_crm_selfeat = tissue_crm[feat_list]
#             count_selfeat_tissue = tissue_crm_selfeat.sum().sum() # A
#             count_selfeat = data_all_common[feat_list].sum().sum() #mat_counts.sum(axis=0)[0], sum down first col
#             count_selfeat_neg = count_selfeat - count_selfeat_tissue # B: not in selected genes but in selected feature
#             count_neg_tissue = count_tissue_gene - count_selfeat_tissue #C: not in selected feature but in selected genes
#             count_neg_neg = count_all - count_selfeat_tissue- count_selfeat_neg - count_neg_tissue #D
#             mat_counts = np.array([[count_selfeat_tissue,count_neg_tissue],
#                            [count_selfeat_neg, count_neg_neg]]).reshape((2,2))
#             pseudo = 1
#             mat_counts_pseudo = mat_counts+pseudo
#             num_in_1 = mat_counts.sum(axis=1)[0] #count_KRTgene
#             num_in_2 = mat_counts.sum(axis=0)[0] #count_KRTtf
#             in_1_and_in_2 = count_selfeat_tissue # A
#             in_1_or_in_2 = count_selfeat_tissue +count_selfeat_neg+count_neg_tissue # A+B+C
#             in_1 = count_selfeat_tissue+count_neg_tissue # A+C
#             in_2 = count_selfeat_tissue+count_selfeat_neg#A+B
#             observed_num = mat_counts[0][0] #count_KRTtf_KRTgene
#             expected_num = num_in_1*num_in_2/sum(sum(mat_counts))
#             oddsratio_pseudo, pvalue_pseudo = stats.fisher_exact(mat_counts_pseudo,alternative='greater')
#             jaccard = in_1_and_in_2/in_1_or_in_2
#             intersect_over_min = in_1_and_in_2/min(in_1,in_2)

#             results_df.at[counter] = {'tf_key':tf_key, 'tissue':tissue,
#                                         'jaccard':jaccard,'intersect_over_min':intersect_over_min,
#                                     'intersection':in_1_and_in_2, 
#                                    'union':in_1_or_in_2, 
#                                    'num_in_1':num_in_1,'num_in_2':num_in_2,
#                                    'observed':observed_num, 'expected':expected_num, 
#                                    'oddsratio':oddsratio_pseudo, 'pval':pvalue_pseudo}
#             counter+=1
#             if (counter %1000)==0:
#                 print(tf_key, tissue, counter)
                


In [181]:
results_df['pval_bonf'] = results_df.pval.apply(lambda x: min(1, x* sum(sum(mat_counts))))#result_df.shape[0]))
results_df['log_pval_bonf'] = results_df.pval_bonf.apply(lambda x: min(100,-np.log10(x+1e-100)))
# results_df.to_csv(os.path.join(save_dir, 'stats_fisher_all_tfs_pairs.csv'))


In [183]:
results_df_filt = results_df[results_df.pval_bonf<0.05]
# results_df_filt.to_csv(os.path.join(save_dir, 'stats_fisher_all_tfs_pairs_filt.csv'))
print(results_df.shape, results_df_filt.shape)
results_df_filt

(236397, 14) (65520, 14)


Unnamed: 0,tf,tissue,jaccard,intersect_over_min,intersection,union,num_in_1,num_in_2,observed,expected,oddsratio,pval,pval_bonf,log_pval_bonf
0,,Airway,0.0236942,0.0474595,8880,374775,196548,187107,8880,5106.7,1.8126,0,0.000000e+00,100.000000
1,,Airway,0.0149374,0.0365917,4839,323952,196548,132243,4839,3609.3,1.36283,6.31291e-89,4.546193e-82,81.342352
2,,Airway,0.0167339,0.0370538,5820,347797,196548,157069,5820,4286.87,1.38295,1.2943e-115,9.320802e-109,100.000000
3,,Airway,0.0161748,0.0367055,5524,341519,196548,150495,5524,4107.45,1.3686,1.72119e-103,1.239500e-96,95.906719
4,,Airway,0.0162077,0.0368756,5524,340825,196548,149801,5524,4088.51,1.37532,1.61544e-106,1.163349e-99,98.898477
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236391,,Bladder,0.0109276,0.0818429,4933,451427,396086,60274,4933,3315.13,1.53851,4.06914e-163,2.930362e-156,100.000000
236392,,Bladder,0.00276132,0.0957619,1123,406690,396086,11727,1123,644.997,1.82335,4.96772e-70,3.577464e-63,62.446425
236394,,Bladder,0.000387716,0.121643,154,397198,396086,1266,154,69.6313,2.39328,4.77013e-20,3.435172e-13,12.464052
236395,,Bladder,0.000387614,0.112409,154,397302,396086,1370,154,75.3515,2.18873,6.25211e-17,4.502413e-10,9.346555
