# 8B2_tf_tf_enrichment_cancer

07/20/2020

getting tf-tf enriched pairs for each tissue for each geneset

cancer - scc



In [1]:
# basic packages
import os, glob
import pandas as pd
import numpy as np; np.random.seed(0)
import itertools
from collections import Counter, defaultdict
import time

# machine learning packages from sklearn
from sklearn.preprocessing import MinMaxScaler #StandardScaler 
from sklearn import preprocessing, metrics
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split, KFold, cross_validate, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression, Lasso, LassoCV
from IPython.display import Image
from scipy import stats

# Import tools needed for visualization
import seaborn as sns; sns.set()
import matplotlib
import matplotlib.pyplot as plt

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
save_dir = '../data/processed/fig4_modelling/tf_tf_pairs_mm/'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

In [4]:
THRES=5

In [5]:
tf_annon_df = pd.read_csv('../data/external/HOCOMOCOv11_annotation.csv',index_col=0)
tf_annon_df['id_trim'] = tf_annon_df['id'] + '.pwm.trim'
tf_name_to_id_dict = pd.Series(tf_annon_df.id_trim.values, index=tf_annon_df.tf.values).to_dict()
tf_id_to_name_dict = pd.Series(tf_annon_df.tf.values, index=tf_annon_df.id_trim.values).to_dict()

In [6]:
rna_df = pd.read_csv('../data/interim/rna/tissue_tpm_sym.csv',index_col=0)
# rna_df_norm = rna_df[normal_tissues]

In [7]:
rna_df.loc["BARX2",:]

A431-CTRLi             1.025
A431-p63i              1.390
Airway                 8.655
Astrocytes             0.065
Bladder                0.750
CAL27-CTRLi            0.615
CAL27-p63i             0.550
COLO_SCR_DMSO          0.000
COLO_SCR_PLX           0.005
COLO_shMITF_DMSO       0.015
COLO_shMITF_PLX        0.000
Colon                  1.775
D0-CTRLi               0.380
D0-p63i                0.220
D3-CTRLi               2.665
D3-p63i                0.640
Esophageal             0.405
GDSD0                  2.800
GDSD3                 14.180
GDSD6                  6.525
GM12878                0.055
HMEC                   0.105
Melanocytes            0.030
Ovarian                0.355
Pancreas               5.415
Prostate               1.625
Renal                  1.175
SCC13-CTRLi            0.330
SCC13-p63i             0.455
SKMEL5_SCR_DMSO        0.000
SKMEL5_SCR_PLX         0.020
SKMEL5_shMITF-DMSO     0.000
SKMEL5_shMITF_PLX      0.005
Thyroid                2.460
Uterine       

In [9]:
print(rna_df.columns)
# cancer_tissues = ['COLO_SCR_DMSO','SKMEL5_SCR_DMSO', 'WM_SCR_DMSO']
cancer_tissues = ['COLO_SCR_DMSO', 'WM_SCR_DMSO']  # SKMEL5_SCR_DMSO doesn't have HiChIP data
normal_tissues = ['Melanocytes']#'D0-CTRLi', 'D3-CTRLi']
sel_tissues = cancer_tissues+normal_tissues
tissues_dict = dict(zip(sel_tissues,range(len(sel_tissues))))
rna_df_sel = rna_df[sel_tissues]

Index(['A431-CTRLi', 'A431-p63i', 'Airway', 'Astrocytes', 'Bladder',
       'CAL27-CTRLi', 'CAL27-p63i', 'COLO_SCR_DMSO', 'COLO_SCR_PLX',
       'COLO_shMITF_DMSO', 'COLO_shMITF_PLX', 'Colon', 'D0-CTRLi', 'D0-p63i',
       'D3-CTRLi', 'D3-p63i', 'Esophageal', 'GDSD0', 'GDSD3', 'GDSD6',
       'GM12878', 'HMEC', 'Melanocytes', 'Ovarian', 'Pancreas', 'Prostate',
       'Renal', 'SCC13-CTRLi', 'SCC13-p63i', 'SKMEL5_SCR_DMSO',
       'SKMEL5_SCR_PLX', 'SKMEL5_shMITF-DMSO', 'SKMEL5_shMITF_PLX', 'Thyroid',
       'Uterine', 'WM_SCR_DMSO', 'WM_SCR_PLX', 'WM_shMITF_DMSO',
       'WM_shMITF_PLX'],
      dtype='object')


# 0. CRM data wrangling and global var setup

In [10]:
%%time
# import
data_all = pd.read_csv('/Users/mguo123/Google Drive/1_khavari/omics_project-LD/pan_omics/data/processed/tissue_crms/all_count_comb_overall.csv',index_col=0,header=0)
data_all = data_all[data_all.tissue.isin(sel_tissues)]
data_all = data_all[data_all.iloc[:,2:].sum(axis=1)>1e-1]

# expression labels
exp_label = list(np.log10(data_all.exp.values+1e-2))
labels_all  = np.array(np.array(exp_label)>THRES)


tissues_label  = data_all.tissue.values#np.array((data_all.exp>THRES).values)
tissue_num_labels =  data_all.tissue.map(tissues_dict).values
is_cancer_label = np.array(['cancer' if x in cancer_tissues  else 'normal' for x in tissues_label ])


genes_all = data_all.index.values
gene_to_num_dict = dict(zip(np.unique(genes_all),range(len(np.unique(genes_all)))))
genes_num_all = np.vectorize(gene_to_num_dict.get)(genes_all)



CPU times: user 1min 27s, sys: 20.6 s, total: 1min 48s
Wall time: 1min 33s


In [11]:
print('files_loaded', data_all.shape)


files_loaded (25770, 1063)


In [12]:
## only tfs

data_all.drop(['tissue','exp','num_loop_counts','num_loops','num_atac_regions_pro','num_atac_regions_loop'],axis=1,inplace=True)

data_all.shape




(25770, 1057)

In [13]:
data_all[:5]

Unnamed: 0,AHR_pro,ARID5B_pro,ARNT_pro,ARNTL_pro,ATF1_pro,ATF2_pro,ATF3_pro,ATF4_pro,ATF6_pro,ATF7_pro,...,MAFA_loop,MEOX1_loop,E2F5_pro,E2F5_loop,ESR2_pro,KLF14_pro,TBX15_pro,ESR2_loop,KLF14_loop,TBX15_loop
A4GALT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A4GNT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAAS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AADACL4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAK1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
#tfs_feat_dict: dict
# key: tf
# value: list of crm features
tfs_feat_dict = defaultdict(list)
for feat in data_all.columns:
    tfs_feat_dict[feat.split('_')[0]].append(feat)

# 1. Functions 

In [15]:
def find_tf_pairs(single_results, tfs_feat_dict=tfs_feat_dict, data_all=data_all):
    tfs_unique = single_results[single_results.pval_bonf<0.05].tf_key.unique()
    print('num single unique TFs', len(tfs_unique))
    TF_pair_dict = defaultdict(list)
    for tf1 in tfs_unique:
        for tf2 in tfs_unique:
            if (tf1<tf2):
                if (tf1 in tfs_feat_dict) and (tf2 in tfs_feat_dict):
                    possible_feats = [tf1+'_pro',tf1+'_loop',tf2+'_pro',tf2+'_loop']
                    for feat in possible_feats:
                        if feat in data_all.columns:
                            TF_pair_dict[tf1+'::'+tf2].append(feat)
    print('num pairs of TFs', len(TF_pair_dict))
    return TF_pair_dict

In [16]:
# TF_feat_dict: dict, key = tf_key (string of tfs sep by ::), val; list of features/columns of data_all
# backgound: relative means the background in the data_all_sel, if 'all' then background is data_all
# def tf_enrichment(geneset, tissues=normal_tissues, data_all=data_all,tissues_label=tissues_label, background='all'):
def tf_enrichment(geneset, TF_feat_dict, tissues=['cancer', 'normal'], 
                  data_all=data_all,tissues_label=is_cancer_label, 
                  background='relative',verbose=True, save_path=None):
    # get subset
    data_all_sel = data_all[data_all.index.isin(geneset)]
    tissues_label_sel = tissues_label[data_all.index.isin(geneset)]
    if verbose:
        print('num genes in geneset',len(geneset))
        print('subsetting data,', data_all_sel.shape, tissues_label_sel.shape)
    
    # set background and initial variables
    results_df = pd.DataFrame(columns = ['tf_key', 'tissue', 'jaccard', 'intersect_over_min','intersection','union', 'num_in_gene', 'num_in_feat', 'observed', 'expected', 'oddsratio', 'pval' ])
    counter = 0
    if background=='all':
        count_all = data_all.sum().sum()
    else:
        count_all = data_all_sel.sum().sum() ## background
    
    # iterate through tissues
    if verbose:
        print('starting iteration')
        print('estimated count', len(TF_feat_dict)*len(tissues))
    for tissue in tissues:
        if verbose:
            print('*****iterating tissue, ', tissue)
        tissue_crm = data_all_sel[tissues_label_sel==tissue]
        tissue_crm = tissue_crm[tissue_crm.index.isin(geneset)]
        count_tissue_gene = tissue_crm.sum().sum() #mat_counts.sum(axis=1)[0], sum first row
        
        # iterate through tfs
        for tf_key, feat_list in sorted(TF_feat_dict.items(),key=lambda x: x[0]):
            if len(feat_list)>0:
                tissue_crm_selfeat = tissue_crm[feat_list]
                count_selfeat_tissue = tissue_crm_selfeat.sum().sum() # A
                count_selfeat = data_all_sel[feat_list].sum().sum() #mat_counts.sum(axis=0)[0], sum down first col
                count_selfeat_neg = count_selfeat - count_selfeat_tissue # B: not in selected genes but in selected feature
                count_neg_tissue = count_tissue_gene - count_selfeat_tissue #C: not in selected feature but in selected genes
                count_neg_neg = count_all - count_selfeat_tissue- count_selfeat_neg - count_neg_tissue #D
                mat_counts = np.array([[count_selfeat_tissue,count_neg_tissue],
                               [count_selfeat_neg, count_neg_neg]]).reshape((2,2))
                pseudo = 1
                mat_counts_pseudo = mat_counts+pseudo
                num_in_1 = mat_counts.sum(axis=1)[0] #count_tissue_gene
                num_in_2 = mat_counts.sum(axis=0)[0] #count_selfeat
                in_1_and_in_2 = count_selfeat_tissue # A
                in_1_or_in_2 = count_selfeat_tissue +count_selfeat_neg+count_neg_tissue # A+B+C
                in_1 = count_selfeat_tissue+count_neg_tissue # A+C
                in_2 = count_selfeat_tissue+count_selfeat_neg#A+B
                observed_num = mat_counts[0][0] #count_KRTtf_KRTgene
                expected_num = num_in_1*num_in_2/sum(sum(mat_counts))
                oddsratio_pseudo, pvalue_pseudo = stats.fisher_exact(mat_counts_pseudo,alternative='greater')
                jaccard = in_1_and_in_2/in_1_or_in_2
                intersect_over_min = in_1_and_in_2/min(in_1,in_2)

                results_df.at[counter] = {'tf_key':tf_key, 'tissue':tissue,
                                            'jaccard':jaccard,'intersect_over_min':intersect_over_min,
                                        'intersection':in_1_and_in_2, 
                                       'union':in_1_or_in_2, 
                                       'num_in_gene':num_in_1,'num_in_feat':num_in_2,
                                       'observed':observed_num, 'expected':expected_num, 
                                       'oddsratio':oddsratio_pseudo, 'pval':pvalue_pseudo}
                counter+=1
                if verbose:
                    if (counter %1000)==0:
                        print(tf_key, tissue, counter)

    
    results_df['pval_bonf'] = results_df.pval.apply(lambda x: min(1, x* results_df.shape[0]))
    results_df['log_pval_bonf'] = results_df.pval_bonf.apply(lambda x: min(100,-np.log10(x+1e-100)))
    
    if save_path is not None:
        results_df.to_csv(save_path)
        print('saved file', save_path)
    return results_df

# 2. get gene sets

create dictionary {tissue: gene_list} for each category

cancer variant (to types of gene sets the differential one and then the normal one)

In [17]:
# 2A. unique gene lists for each cell type
# glob.glob('../../rnaseq/unique_gene_lists/*genes.txt')

In [18]:
# unique_genesets = {}
# for rna_file in glob.glob('../../rnaseq/unique_gene_lists/*genes.txt'):
#     tissue = os.path.basename(rna_file).split('_')[0]
#     geneset = sorted(pd.read_csv(rna_file,header=None).loc[:,0].dropna().unique())
#     print(tissue, len(geneset))
#     unique_genesets[tissue] = geneset
    

2B. differentially expressed gene lists for each cell type

In [19]:
# def t_test(row):
#     t_stat, pval =  stats.ttest_ind(row[cancer_tissues], row[normal_tissues], axis=0, equal_var=True, nan_policy='propagate')
#     fc = 
#     return t_stat, pval, fc

In [42]:
rna_df_sel_high = rna_df_sel[rna_df_sel.max(axis=1)>1]
rna_df_sel_high_pseudo = rna_df_sel_high+1

# ttest_rna_df = pd.DataFrame(stats.ttest_ind(rna_df_sel_high_pseudo[cancer_tissues], rna_df_sel_high_pseudo[normal_tissues], 
#                                             axis=1, equal_var=True, nan_policy='propagate')).T
# ttest_rna_df.columns = ['tstat','pval']
# ttest_rna_df = pd.concat([rna_df_sel_high.reset_index(), ttest_rna_df],axis=1)
# ttest_rna_df.sort_values('tstat',inplace=True)
rna_df_sel_high_pseudo['avg_ratio'] = (rna_df_sel_high_pseudo.COLO_SCR_DMSO+rna_df_sel_high_pseudo.WM_SCR_DMSO)/(2*rna_df_sel_high_pseudo.Melanocytes)
rna_df_sel_high_pseudo['min_ratio'] = rna_df_sel_high_pseudo[['COLO_SCR_DMSO','WM_SCR_DMSO']].min(axis=1)/(rna_df_sel_high_pseudo.Melanocytes)

rna_df_sel_high_pseudo.sort_values('min_ratio',ascending=False, inplace=True)


In [48]:
# for gprofile look: # for x in sorted(rna_df_sel_high_pseudo[rna_df_sel_high_pseudo.min_ratio>2].index):
#     print (x)

organonitrogen compound metabolic process was significant, some cancer type go terms occurs 

In [39]:
rna_df_sel_high_pseudo[['COLO_SCR_DMSO','WM_SCR_DMSO']].min(axis=0)

COLO_SCR_DMSO    1.0
WM_SCR_DMSO      1.0
dtype: float64

In [49]:
# print genes in cancer tissues
cancer_genes = sorted(rna_df_sel_high_pseudo[rna_df_sel_high_pseudo.min_ratio>2].index)
print(len(cancer_genes))
print(cancer_genes)

1143
['AAMP', 'ABCB7', 'ABCB9', 'ABTB2', 'ACOT13', 'ACOX2', 'ACP5', 'ACPP', 'ACSS1', 'ACTN1', 'ACTN4', 'ADAM15', 'ADAM22', 'ADAM23', 'ADAR', 'ADCK2', 'ADCY1', 'ADGRG1', 'ADGRG6', 'ADGRL1', 'AEN', 'AFAP1L1', 'AGMO', 'AK2', 'AK4', 'AKAP6', 'AKAP8L', 'AKT1S1', 'AKT2', 'ALDH1B1', 'ALDH4A1', 'ALDOA', 'ALG3', 'ALKBH4', 'ALYREF', 'AMACR', 'ANAPC1', 'ANGPT1', 'ANGPTL4', 'ANK3', 'ANKMY2', 'ANKRD27', 'ANKRD52', 'ANKZF1', 'AP1S1', 'AP2A2', 'APH1A', 'APOC1', 'APOC4', 'APOD', 'APP', 'ARHGAP15', 'ARHGAP27P1-BPTFP1-KPNA2P3', 'ARHGAP5', 'ARHGEF25', 'ARHGEF3', 'ARIH2', 'ARMC10', 'ARNT2', 'ARPC3', 'ART3', 'ASB9', 'ASPHD1', 'ATG10', 'ATG2A', 'ATOX1', 'ATP5F1E', 'ATP5MC2', 'ATP5MGL', 'ATP6V0A4', 'ATP6V0E2-AS1', 'ATP6V1B1-AS1', 'ATP6V1F', 'ATP8', 'ATP8B2', 'ATP8B5P', 'ATXN2L', 'AUP1', 'AURKB', 'AVL9', 'AZGP1', 'B3GNT7', 'B4GALNT1', 'B4GALT3', 'B4GAT1', 'BAALC-AS2', 'BACE2', 'BAGE2', 'BAK1', 'BAMBI', 'BAP1', 'BATF2', 'BAX', 'BBC3', 'BCAS3', 'BCKDHA', 'BCL2L11', 'BFSP1', 'BHLHE40', 'BIRC7', 'BORCS6', 'BPHL',

In [50]:
# print genes in normal tissues
normal_genes = sorted(rna_df_sel_high_pseudo[rna_df_sel_high_pseudo.min_ratio<0.5].index)
print(len(normal_genes))
print(normal_genes)

3128
['A2M', 'AAK1', 'AAMDC', 'AASS', 'ABCA5', 'ABCA7', 'ABCB4', 'ABCB5', 'ABCC1', 'ABCC10', 'ABCC3', 'ABCD3', 'ABCD4', 'ABI1', 'ABR', 'ACAA2', 'ACAD10', 'ACADM', 'ACBD5', 'ACO1', 'ACOT2', 'ACOT4', 'ACP2', 'ACSF3', 'ACSL1', 'ACSL4', 'ACTG1', 'ACTR1A', 'ACTR2', 'ACTR3', 'ACTRT3', 'ACVRL1', 'ADA2', 'ADAM10', 'ADAM19', 'ADAM8', 'ADAM9', 'ADAMTS1', 'ADAMTS12', 'ADAMTS2', 'ADAMTSL1', 'ADCY9', 'ADD1', 'ADGRL4', 'ADI1', 'ADIRF', 'ADK', 'ADORA2B', 'ADRB2', 'ADSS1', 'AEBP1', 'AFDN', 'AFF3', 'AFF4', 'AGAP2-AS1', 'AGL', 'AGPAT2', 'AGT', 'AGTPBP1', 'AHCYL1', 'AHDC1', 'AHI1', 'AHNAK', 'AIDA', 'AIG1', 'AIP', 'AJUBA', 'AK5', 'AKAP12', 'AKAP17A', 'AKR1A1', 'AKR1B1', 'AKR1B10', 'AKR1C3', 'ALAD', 'ALCAM', 'ALDH1A3', 'ALDH1L2', 'ALDH2', 'ALDH3B1', 'ALDH5A1', 'ALG2', 'AMIGO2', 'AMN1', 'AMOTL2', 'AMPD3', 'AMZ2P1', 'ANGPTL2', 'ANK1', 'ANK2', 'ANKRD1', 'ANKRD11', 'ANKRD12', 'ANKRD13C', 'ANKRD13D', 'ANKRD28', 'ANKRD40', 'ANKRD42', 'ANKS6', 'ANLN', 'ANO10', 'ANP32A', 'ANP32E', 'ANPEP', 'ANTKMT', 'ANXA11', 'ANX

In [51]:
diff_genesets = {
    'normal':normal_genes,
    'cancer':cancer_genes,
}
# for tissue in rna_df_norm.columns:
#     geneset = list(rna_df_norm[tissue][rna_df_norm[tissue]>THRES].index)
#     print(tissue, len(geneset))
#     expr_genesets[tissue] = geneset

2C. highly expressed gene lists for each cell type

In [52]:
THRES=10

In [53]:
cancer_genes = rna_df_sel.index[rna_df_sel[cancer_tissues].min(axis=1)>THRES].values
print(len(cancer_genes))

4655


In [54]:
normal_genes = rna_df_sel.index[rna_df_sel[normal_tissues].min(axis=1)>THRES].values
print(len(normal_genes))

5502


In [55]:
expr_genesets = {
    'normal':normal_genes,
    'cancer':cancer_genes,
}

In [56]:
# t_test(rna_df_sel.iloc[1,:])[1]

In [57]:
# # 2D. group genesets
# print(glob.glob('../data/interim/rna/*_genes.csv'))
# group_tissue_mapping = {
#     'blue':['Astrocytes','Melanocytes'],
#     'grey':['GM12878'],
#     'green':['Colon','Esophageal','Ovarian','Pancreas','Renal','Thyroid'],
#     'purple':['Airway','Bladder', 'GDSD6', 'HMEC', 'Prostate', 'Uterine'] 
# }

In [58]:
# group_genesets = {}
# for group, tissues in group_tissue_mapping.items():
#     geneset = sorted(pd.read_csv('../data/interim/rna/'+group+'_genes.csv',header=None).loc[:,0].dropna().unique())
#     for tissue in tissues:
#         group_genesets[tissue] = geneset
#         print(tissue, len(geneset))
        

# 3. running pairwise enrichments

In [59]:
def run_pair_enrichment(tissue, save_prefix, geneset_dict,sel_tissues=['cancer','normal'],
                       tissue_labels = is_cancer_label):
    if tissue not in geneset_dict:
        print(tissue, 'not found in geneset_dict')
        return
    results_df = tf_enrichment(geneset=geneset_dict[tissue], TF_feat_dict=tfs_feat_dict, tissues=sel_tissues, 
                      data_all=data_all,tissues_label=tissue_labels, 
                      background='relative',verbose=True, save_path=os.path.join(save_dir, save_prefix+tissue + '_single.csv'))
    # filter for cell type
    results_df_filt = results_df[(results_df.pval_bonf<0.05 )&( results_df.tissue==tissue)]
    # downsample # of tf tf pairs to consider
    tfs_feat_dict_pairs = find_tf_pairs(single_results=results_df_filt, tfs_feat_dict=tfs_feat_dict, data_all=data_all)

    results_df_pairs = tf_enrichment(geneset=geneset_dict[tissue], TF_feat_dict=tfs_feat_dict_pairs, tissues=[tissue], 
                      data_all=data_all,tissues_label=tissue_labels, 
                      background='relative',verbose=True, save_path=os.path.join(save_dir,  save_prefix+tissue + '_pair.csv'))

    results_df_pairs_filt = results_df_pairs[(results_df_pairs.oddsratio>1) & (results_df_pairs.pval_bonf<0.05)]
    results_df_pairs_filt.to_csv(os.path.join(save_dir,  save_prefix+tissue + '_pair_filt.csv'))
    # results_df_KRT_pairs_filt = results_df_KRT_pairs_filt[results_df_KRT_pairs_filt.num_in_2>=(results_df_KRT_pairs_filt.observed+10)]
    print('num sig tf-tf pair in ', tissue, results_df_pairs_filt.shape)

### testing

In [60]:
# TF_feat_dict=tfs_feat_dict
# tissues=['cancer','normal']
# # get subset
# geneset = expr_genesets['cancer']
# data_all_sel = data_all[data_all.index.isin(geneset)]
# tissues_label_sel = is_cancer_label[data_all.index.isin(geneset)]
# print('num genes in geneset',len(geneset))
# print('orig data', data_all.shape)
# print('subsetting data,', data_all_sel.shape, tissues_label_sel.shape)
# print(Counter(tissues_label_sel))

In [61]:
# # set background and initial variables
# results_df = pd.DataFrame(columns = ['tf_key', 'tissue', 'jaccard', 'intersect_over_min','intersection','union', 'num_in_gene', 'num_in_feat', 'observed', 'expected', 'oddsratio', 'pval' ])
# counter = 0
# count_all = data_all_sel.sum().sum() ## background

# print('starting iteration')
# print('estimated count', len(TF_feat_dict)*len(tissues))
# for tissue in tissues:
#     print('*****iterating tissue, ', tissue)
#     tissue_crm = data_all_sel[tissues_label_sel==tissue]
#     tissue_crm = tissue_crm[tissue_crm.index.isin(geneset)]
#     count_tissue_gene = tissue_crm.sum().sum() #mat_counts.sum(axis=1)[0], sum first row

#     # iterate through tfs
#     for tf_key, feat_list in sorted(TF_feat_dict.items(),key=lambda x: x[0]):
#         if len(feat_list)>0:
#             tissue_crm_selfeat = tissue_crm[feat_list]
#             count_selfeat_tissue = tissue_crm_selfeat.sum().sum() # A
#             count_selfeat = data_all_sel[feat_list].sum().sum() #mat_counts.sum(axis=0)[0], sum down first col
#             count_selfeat_neg = count_selfeat - count_selfeat_tissue # B: not in selected genes but in selected feature
#             count_neg_tissue = count_tissue_gene - count_selfeat_tissue #C: not in selected feature but in selected genes
#             count_neg_neg = count_all - count_selfeat_tissue- count_selfeat_neg - count_neg_tissue #D
#             mat_counts = np.array([[count_selfeat_tissue,count_neg_tissue],
#                            [count_selfeat_neg, count_neg_neg]]).reshape((2,2))
#             pseudo = 1
#             print(tf_key)
#             print(mat_counts)
#             mat_counts_pseudo = mat_counts+pseudo
#             num_in_1 = mat_counts.sum(axis=1)[0] #count_tissue_gene
#             num_in_2 = mat_counts.sum(axis=0)[0] #count_selfeat
#             in_1_and_in_2 = count_selfeat_tissue # A
#             in_1_or_in_2 = count_selfeat_tissue +count_selfeat_neg+count_neg_tissue # A+B+C
#             in_1 = count_selfeat_tissue+count_neg_tissue # A+C
#             in_2 = count_selfeat_tissue+count_selfeat_neg#A+B
#             observed_num = mat_counts[0][0] #count_KRTtf_KRTgene
#             expected_num = num_in_1*num_in_2/sum(sum(mat_counts))
#             oddsratio_pseudo, pvalue_pseudo = stats.fisher_exact(mat_counts_pseudo,alternative='greater')
#             jaccard = in_1_and_in_2/in_1_or_in_2
#             intersect_over_min = in_1_and_in_2/min(in_1,in_2)

#             results_df.at[counter] = {'tf_key':tf_key, 'tissue':tissue,
#                                         'jaccard':jaccard,'intersect_over_min':intersect_over_min,
#                                     'intersection':in_1_and_in_2, 
#                                    'union':in_1_or_in_2, 
#                                    'num_in_gene':num_in_1,'num_in_feat':num_in_2,
#                                    'observed':observed_num, 'expected':expected_num, 
#                                    'oddsratio':oddsratio_pseudo, 'pval':pvalue_pseudo}
#             counter+=1
#             if (counter %1000)==0:
#                 print(tf_key, tissue, counter)


# results_df['pval_bonf'] = results_df.pval.apply(lambda x: min(1, x* results_df.shape[0]))
# results_df['log_pval_bonf'] = results_df.pval_bonf.apply(lambda x: min(100,-np.log10(x+1e-100)))


In [62]:
# results_df = tf_enrichment(geneset=expr_genesets['cancer'], TF_feat_dict=tfs_feat_dict, tissues=['cancer','normal'], 
# #                   data_all=data_all,tissues_label=is_cancer_label, 
#                   background='relative',verbose=True, save_path=os.path.join(save_dir, 'expr_'+'cancer' + '_single.csv'))

In [63]:
# # filter for cell type
# results_df_filt = results_df[(results_df.pval_bonf<0.05 )&( results_df.tissue==tissue)]
# print(results_df.shape, results_df_filt.shape)
# # downsample # of tf tf pairs to consider
# tfs_feat_dict_pairs = find_tf_pairs(single_results=results_df_filt, tfs_feat_dict=tfs_feat_dict, data_all=data_all)

# results_df_pairs = tf_enrichment(geneset=expr_genesets['cancer'], TF_feat_dict=tfs_feat_dict_pairs, tissues=['cancer'], 
#                   data_all=data_all,tissues_label=is_cancer_label, 
#                   background='relative',verbose=True, save_path=os.path.join(save_dir,  'expr_'+tissue + '_pair.csv'))


In [64]:
# # pairs testing
# TF_feat_dict=tfs_feat_dict_pairs
# tissues=['cancer','normal']
# # get subset
# geneset = expr_genesets['cancer']
# data_all_sel = data_all[data_all.index.isin(geneset)]
# tissues_label_sel = is_cancer_label[data_all.index.isin(geneset)]
# print('num genes in geneset',len(geneset))
# print('orig data', data_all.shape)
# print('subsetting data,', data_all_sel.shape, tissues_label_sel.shape)
# # set background and initial variables
# results_df = pd.DataFrame(columns = ['tf_key', 'tissue', 'jaccard', 'intersect_over_min','intersection','union', 'num_in_gene', 'num_in_feat', 'observed', 'expected', 'oddsratio', 'pval' ])
# counter = 0
# count_all = data_all_sel.sum().sum() ## background

# print('starting iteration')
# print('estimated count', len(TF_feat_dict)*len(tissues))
# for tissue in tissues:
#     print('*****iterating tissue, ', tissue)
#     tissue_crm = data_all_sel[tissues_label_sel==tissue]
#     tissue_crm = tissue_crm[tissue_crm.index.isin(geneset)]
#     count_tissue_gene = tissue_crm.sum().sum() #mat_counts.sum(axis=1)[0], sum first row

#     # iterate through tfs
#     for tf_key, feat_list in sorted(TF_feat_dict.items(),key=lambda x: x[0]):
#         if len(feat_list)>0:
#             tissue_crm_selfeat = tissue_crm[feat_list]
#             count_selfeat_tissue = tissue_crm_selfeat.sum().sum() # A
#             count_selfeat = data_all_sel[feat_list].sum().sum() #mat_counts.sum(axis=0)[0], sum down first col
#             count_selfeat_neg = count_selfeat - count_selfeat_tissue # B: not in selected genes but in selected feature
#             count_neg_tissue = count_tissue_gene - count_selfeat_tissue #C: not in selected feature but in selected genes
#             count_neg_neg = count_all - count_selfeat_tissue- count_selfeat_neg - count_neg_tissue #D
#             mat_counts = np.array([[count_selfeat_tissue,count_neg_tissue],
#                            [count_selfeat_neg, count_neg_neg]]).reshape((2,2))
#             pseudo = 1
#             print(tf_key)
#             print(mat_counts)
#             mat_counts_pseudo = mat_counts+pseudo
#             num_in_1 = mat_counts.sum(axis=1)[0] #count_tissue_gene
#             num_in_2 = mat_counts.sum(axis=0)[0] #count_selfeat
#             in_1_and_in_2 = count_selfeat_tissue # A
#             in_1_or_in_2 = count_selfeat_tissue +count_selfeat_neg+count_neg_tissue # A+B+C
#             in_1 = count_selfeat_tissue+count_neg_tissue # A+C
#             in_2 = count_selfeat_tissue+count_selfeat_neg#A+B
#             observed_num = mat_counts[0][0] #count_KRTtf_KRTgene
#             expected_num = num_in_1*num_in_2/sum(sum(mat_counts))
#             oddsratio_pseudo, pvalue_pseudo = stats.fisher_exact(mat_counts_pseudo,alternative='greater')
#             jaccard = in_1_and_in_2/in_1_or_in_2
#             intersect_over_min = in_1_and_in_2/min(in_1,in_2)
            
#             results_df.at[counter] = {'tf_key':tf_key, 'tissue':tissue,
#                                         'jaccard':jaccard,'intersect_over_min':intersect_over_min,
#                                     'intersection':in_1_and_in_2, 
#                                    'union':in_1_or_in_2, 
#                                    'num_in_gene':num_in_1,'num_in_feat':num_in_2,
#                                    'observed':observed_num, 'expected':expected_num, 
#                                    'oddsratio':oddsratio_pseudo, 'pval':pvalue_pseudo}
# #             print(results_df.loc[counter])
#             counter+=1
#             if (counter %1000)==0:
#                 print(tf_key, tissue, counter)


# results_df['pval_bonf'] = results_df.pval.apply(lambda x: min(1, x* results_df.shape[0]))
# results_df['log_pval_bonf'] = results_df.pval_bonf.apply(lambda x: min(100,-np.log10(x+1e-100)))


In [65]:
# results_df

### running

In [66]:
# %%time
# # get unique tissue geneset enrichments
# for tissue in normal_tissues:
#     print('============ Running unique', tissue)
#     run_pair_enrichment(tissue, save_prefix='unique_', geneset_dict = unique_genesets)

In [67]:
# %%time
# # get expressed tissue geneset enrichments
# for tissue in normal_tissues:
#     print('============ Running expressed', tissue)
#     run_pair_enrichment(tissue, save_prefix='expr_', geneset_dict = expr_genesets)

In [68]:
%%time
# get expressed tissue geneset enrichments
for disease_state in ['normal', 'cancer']:
    print('============ Running expressed', disease_state)
    run_pair_enrichment(disease_state, save_prefix='expr_', geneset_dict = expr_genesets)

num genes in geneset 5502
subsetting data, (9165, 1057) (9165,)
starting iteration
estimated count 1058
*****iterating tissue,  cancer




*****iterating tissue,  normal
ZNF143 normal 1000
saved file ../data/processed/fig4_modelling/tf_tf_pairs_mm/expr_normal_single.csv
num single unique TFs 111
num pairs of TFs 6105
num genes in geneset 5502
subsetting data, (9165, 1057) (9165,)
starting iteration
estimated count 6105
*****iterating tissue,  normal
CREB3L1::NRF1 normal 1000
EGR1::ZNF320 normal 2000
FOXQ1::ZBTB14 normal 3000
KLF5::VEZF1 normal 4000
RFX1::ZBTB14 normal 5000
ZNF148::ZNF770 normal 6000
saved file ../data/processed/fig4_modelling/tf_tf_pairs_mm/expr_normal_pair.csv
num sig tf-tf pair in  normal (6105, 14)
num genes in geneset 4655
subsetting data, (7929, 1057) (7929,)
starting iteration
estimated count 1058
*****iterating tissue,  cancer
*****iterating tissue,  normal
ZNF143 normal 1000
saved file ../data/processed/fig4_modelling/tf_tf_pairs_mm/expr_cancer_single.csv
num single unique TFs 122
num pairs of TFs 7381
num genes in geneset 4655
subsetting data, (7929, 1057) (7929,)
starting iteration
estimated cou

In [69]:
%%time
# get expressed tissue geneset enrichments
for disease_state in ['normal', 'cancer']:
    print('============ Running expressed', disease_state)
    run_pair_enrichment(disease_state, save_prefix='diff_', geneset_dict = diff_genesets)

num genes in geneset 3128
subsetting data, (4709, 1057) (4709,)
starting iteration
estimated count 1058
*****iterating tissue,  cancer




*****iterating tissue,  normal
ZNF143 normal 1000
saved file ../data/processed/fig4_modelling/tf_tf_pairs_mm/diff_normal_single.csv
num single unique TFs 92
num pairs of TFs 4186
num genes in geneset 3128
subsetting data, (4709, 1057) (4709,)
starting iteration
estimated count 4186
*****iterating tissue,  normal
E2F7::THAP1 normal 1000
HOXC11::RORA normal 2000
PATZ1::ZNF320 normal 3000
ZFP64::ZNF148 normal 4000
saved file ../data/processed/fig4_modelling/tf_tf_pairs_mm/diff_normal_pair.csv
num sig tf-tf pair in  normal (4186, 14)
num genes in geneset 1143
subsetting data, (1849, 1057) (1849,)
starting iteration
estimated count 1058
*****iterating tissue,  cancer
*****iterating tissue,  normal
ZNF143 normal 1000
saved file ../data/processed/fig4_modelling/tf_tf_pairs_mm/diff_cancer_single.csv
num single unique TFs 66
num pairs of TFs 2145
num genes in geneset 1143
subsetting data, (1849, 1057) (1849,)
starting iteration
estimated count 2145
*****iterating tissue,  cancer
GATA3::SOX15 ca

In [70]:
# %%time
# # get group tissue geneset enrichments
# for tissue in normal_tissues:
#     print('============ Running group', tissue)
#     run_pair_enrichment(tissue, save_prefix='group_', geneset_dict = group_genesets)

# 4. Curating list of tf-tf pairs

# 5. filter curating list to find combinations that are close together

In [71]:
for filepath in sorted(glob.glob(os.path.join(save_dir, '*_filt.csv'))):
    df = pd.read_csv(filepath, index_col=0)
    print(os.path.basename(filepath), df.shape)

diff_cancer_pair_filt.csv (2145, 14)
diff_normal_pair_filt.csv (4186, 14)
expr_cancer_pair_filt.csv (7381, 14)
expr_normal_pair_filt.csv (6105, 14)


In [79]:
# get tissue specific tf-tf and global tf-tf pairs useful for expr
expr_all_tissue_df = pd.DataFrame()
for filepath in sorted(glob.glob(os.path.join(save_dir, 'expr*_filt.csv'))):
    df = pd.read_csv(filepath, index_col=0)
    print(os.path.basename(filepath), df.shape)
    expr_all_tissue_df = pd.concat([expr_all_tissue_df, df], axis=0,sort=False)


expr_cancer_pair_filt.csv (7381, 14)
expr_normal_pair_filt.csv (6105, 14)


In [80]:
expr_all_tissue_df.expected.describe()

count     13486.000000
mean       6054.566442
std       13302.562399
min           3.646342
25%         439.664709
50%        1853.317890
75%        5195.267031
max      151198.800575
Name: expected, dtype: float64

In [81]:
print(expr_all_tissue_df.shape)
expr_all_tissue_df[:5]

(13486, 14)


Unnamed: 0,tf_key,tissue,jaccard,intersect_over_min,intersection,union,num_in_gene,num_in_feat,observed,expected,oddsratio,pval,pval_bonf,log_pval_bonf
0,ALX1::ARID3A,cancer,0.002277,0.977969,3418.0,1501003.0,1500926.0,3495.0,3418.0,3263.968771,3.107479,3.330506e-33,2.458246e-29,28.609375
1,ALX1::ARID5B,cancer,0.028894,0.965806,43412.0,1502463.0,1500926.0,44949.0,43412.0,41977.720255,2.027727,3.47893e-200,2.5677980000000004e-196,100.0
2,ALX1::BACH2,cancer,0.001116,0.988201,1675.0,1500946.0,1500926.0,1695.0,1675.0,1582.954812,5.654398,3.725644e-27,2.7498980000000005e-23,22.560683
3,ALX1::BHLHE40,cancer,0.001287,0.989754,1932.0,1500946.0,1500926.0,1952.0,1932.0,1822.966249,6.522569,1.6282030000000003e-33,1.201777e-29,28.920176
4,ALX1::BPTF,cancer,0.00229,0.960861,3437.0,1501066.0,1500926.0,3577.0,3437.0,3340.548296,1.72758,2.995143e-12,2.210715e-08,7.655467


In [82]:
# number of vocabularies for cancer
tf_tf_pair_counter = expr_all_tissue_df.tf_key.value_counts()
len(set(tf_tf_pair_counter.index))

13486

In [83]:
# create filtered expression datasets (vocabs that are responsible for expression that are tissue specific
for filepath in sorted(glob.glob(os.path.join(save_dir, 'diff*_filt.csv'))):
    tissue = os.path.basename(filepath).split('_')[1]
    df = pd.read_csv(filepath, index_col=0)
    df_filt = df[df.tf_key.isin(tf_tf_pair_counter[tf_tf_pair_counter==1].index)]
    print(tissue, df.shape[0], df_filt.shape[0])
    df_filt.to_csv(os.path.join(save_dir, 'diff_'+tissue+'_pair_filt_unique.csv'))

cancer 2145 2145
normal 4186 4186


In [84]:
# create filtered expression datasets (vocabs that are responsible for expression that are tissue specific
for filepath in sorted(glob.glob(os.path.join(save_dir, 'expr*_filt.csv'))):
    tissue = os.path.basename(filepath).split('_')[1]
    df = pd.read_csv(filepath, index_col=0)
    df_filt = df[df.tf_key.isin(tf_tf_pair_counter[tf_tf_pair_counter==1].index)]
    print(tissue, df.shape[0], df_filt.shape[0])
    df_filt.to_csv(os.path.join(save_dir, 'expr_'+tissue+'_pair_filt_unique.csv'))

cancer 7381 7381
normal 6105 6105


In [85]:
# def check_tf_tf_loop_occur(tf_tf_pair, tissue, genesets_dict, data_all=data_all, tissues_label=tissues_label):  #suffixes='loop', 'pro_loop'
#     tf_1, tf_2 = tf_tf_pair.split('::')
    
#     # get features to look at together, list of lists
#     feat_list_type = {
#         'pro_pro':[tf_1 + '_pro', tf_2+'_pro'],
#         'loop_loop':[tf_1 + '_loop', tf_2+'_loop'],
#         'pro_loop':[tf_1 + '_pro', tf_2+'_loop'],
#         'loop_pro':[tf_1 + '_loop', tf_2+'_pro']}
  
#     data_all_sel = data_all[data_all.index.isin(genesets_dict[tissue]) & (tissues_label==tissue)]
#     feat_type_df = {}
#     for type_feat, feat_list in feat_list_type.items():
#         try:
#             check_series = pd.DataFrame(data_all_sel[feat_list]>0).all(axis=1)
#             feat_type_df[type_feat+"_count"] = check_series.value_counts()[True]  
#             feat_type_df[type_feat+"_genes"] = '|'.join(check_series.index[check_series])
            
#         except:
#             feat_type_df[type_feat+"_count"] = 0
#             feat_type_df[type_feat+"_genes"] = ''
            
#     return feat_type_df

def check_tf_tf_loop_occur_cancer(tf_tf_pair, tissue, genesets_dict, data_all=data_all, cancer_label = is_cancer_label, tissues_label=tissues_label):  #suffixes='loop', 'pro_loop'
    tf_1, tf_2 = tf_tf_pair.split('::')
    
    # get features to look at together, list of lists
    feat_list_type = {
        'pro_pro':[tf_1 + '_pro', tf_2+'_pro'],
        'loop_loop':[tf_1 + '_loop', tf_2+'_loop'],
        'pro_loop':[tf_1 + '_pro', tf_2+'_loop'],
        'loop_pro':[tf_1 + '_loop', tf_2+'_pro']}
  
    data_all_sel = data_all[data_all.index.isin(genesets_dict['cancer']) & (tissues_label==tissue)]
    feat_type_df = {}
    for type_feat, feat_list in feat_list_type.items():
        try:
            check_series = pd.DataFrame(data_all_sel[feat_list]>0).all(axis=1)
            feat_type_df[type_feat+"_count"] = check_series.value_counts()[True]  
            feat_type_df[type_feat+"_genes"] = '|'.join(check_series.index[check_series])
            
        except:
            feat_type_df[type_feat+"_count"] = 0
            feat_type_df[type_feat+"_genes"] = ''
            
    return feat_type_df

# 2. get gene sets

create dictionary {tissue: gene_list} for each category

2A. unique gene lists for each cell type

In [31]:
glob.glob('../../rnaseq/unique_gene_lists/*genes.txt')

['../../rnaseq/unique_gene_lists/Colon_genes.txt',
 '../../rnaseq/unique_gene_lists/GDSD0_genes.txt',
 '../../rnaseq/unique_gene_lists/Airway_genes.txt',
 '../../rnaseq/unique_gene_lists/GDSD6_genes.txt',
 '../../rnaseq/unique_gene_lists/Pancreas_genes.txt',
 '../../rnaseq/unique_gene_lists/Uterine_genes.txt',
 '../../rnaseq/unique_gene_lists/Ovarian_genes.txt',
 '../../rnaseq/unique_gene_lists/Astrocytes_genes.txt',
 '../../rnaseq/unique_gene_lists/Bladder_genes.txt',
 '../../rnaseq/unique_gene_lists/Melanocytes_genes.txt',
 '../../rnaseq/unique_gene_lists/HMEC_genes.txt',
 '../../rnaseq/unique_gene_lists/Prostate_genes.txt',
 '../../rnaseq/unique_gene_lists/GM12878_genes.txt',
 '../../rnaseq/unique_gene_lists/Renal_genes.txt',
 '../../rnaseq/unique_gene_lists/GDSD3_genes.txt',
 '../../rnaseq/unique_gene_lists/Esophageal_genes.txt',
 '../../rnaseq/unique_gene_lists/Thyroid_genes.txt']

In [32]:
unique_genesets = {}
for rna_file in glob.glob('../../rnaseq/unique_gene_lists/*genes.txt'):
    tissue = os.path.basename(rna_file).split('_')[0]
    geneset = sorted(pd.read_csv(rna_file,header=None).loc[:,0].dropna().unique())
    print(tissue, len(geneset))
    unique_genesets[tissue] = geneset
    

Colon 14
GDSD0 14
Airway 41
GDSD6 240
Pancreas 17
Uterine 37
Ovarian 27
Astrocytes 465
Bladder 37
Melanocytes 226
HMEC 49
Prostate 312
GM12878 447
Renal 27
GDSD3 61
Esophageal 19
Thyroid 44


2B. expressed gene lists for each cell type

In [33]:
THRES=5

In [34]:
expr_genesets = {}
for tissue in rna_df_norm.columns:
    geneset = list(rna_df_norm[tissue][rna_df_norm[tissue]>THRES].index)
    print(tissue, len(geneset))
    expr_genesets[tissue] = geneset

Airway 6815
Astrocytes 7303
Bladder 6642
Colon 6826
Esophageal 6651
GDSD6 6442
GM12878 6166
HMEC 6972
Melanocytes 7359
Ovarian 6578
Pancreas 7055
Prostate 6516
Renal 6901
Thyroid 7003
Uterine 6569


2C. group genesets

In [35]:
print(glob.glob('../data/interim/rna/*_genes.csv'))
group_tissue_mapping = {
    'blue':['Astrocytes','Melanocytes'],
    'grey':['GM12878'],
    'green':['Colon','Esophageal','Ovarian','Pancreas','Renal','Thyroid'],
    'purple':['Airway','Bladder', 'GDSD6', 'HMEC', 'Prostate', 'Uterine'] 
}

['../data/interim/rna/purple_genes.csv', '../data/interim/rna/green_genes.csv', '../data/interim/rna/common_genes.csv', '../data/interim/rna/all_genes.csv', '../data/interim/rna/grey_genes.csv', '../data/interim/rna/blue_genes.csv']


In [36]:
group_genesets = {}
for group, tissues in group_tissue_mapping.items():
    geneset = sorted(pd.read_csv('../data/interim/rna/'+group+'_genes.csv',header=None).loc[:,0].dropna().unique())
    for tissue in tissues:
        group_genesets[tissue] = geneset
        print(tissue, len(geneset))
        

Astrocytes 954
Melanocytes 954
GM12878 626
Colon 759
Esophageal 759
Ovarian 759
Pancreas 759
Renal 759
Thyroid 759
Airway 796
Bladder 796
GDSD6 796
HMEC 796
Prostate 796
Uterine 796


# testing 

In [117]:
%%time
df = pd.read_csv(os.path.join(save_dir, 'diff_cancer_pair_filt.csv'), index_col=0)
tf_tf_pair_feat = {}
for tf_tf_pair in df.tf_key:
    tf_tf_pair_feat[tf_tf_pair] = check_tf_tf_loop_occur_cancer(tf_tf_pair, tissue='COLO_SCR_DMSO',genesets_dict=diff_genesets)
df_tf_key_type = pd.DataFrame.from_dict(tf_tf_pair_feat,orient='index')


CPU times: user 37.4 s, sys: 817 ms, total: 38.3 s
Wall time: 38.3 s


In [118]:
Counter(tissues_label)

Counter({'COLO_SCR_DMSO': 8544, 'Melanocytes': 6887, 'WM_SCR_DMSO': 10339})

In [119]:
df_tf_key_type.loop_loop_count.describe()

count    2145.000000
mean       53.967366
std        68.794666
min         0.000000
25%         0.000000
50%        34.000000
75%        74.000000
max       524.000000
Name: loop_loop_count, dtype: float64

In [120]:
row = df_tf_key_type[df_tf_key_type.pro_pro_count>0].iloc[0]
row

pro_pro_count                                                      3
pro_pro_genes                                        CPM|MDFIC|PFDN4
loop_loop_count                                                  110
loop_loop_genes    AFAP1L1|AGMO|AKAP6|ALDH1B1|ANGPT1|ANKMY2|ARHGA...
pro_loop_count                                                     4
pro_loop_genes                                  CPM|LRR1|MDFIC|PFDN4
loop_pro_count                                                    14
loop_pro_genes     AKAP6|ALDH1B1|COL19A1|COPS6|CPM|DLX1|DUSP4|MDF...
Name: ARID3A::ARID5B, dtype: object

In [90]:
# list(df_tf_key_type[df_tf_key_type.pro_pro_count>0].index)

In [91]:
# glob.glob('../data/interim/annon/anchor_motif_scan/*bed')

In [130]:
sorted(glob.glob('../data/interim/annon/promoter_motif_scan/*bed'))

['../data/interim/annon/promoter_motif_scan/promoter_A431-CTRLi_annon.bed',
 '../data/interim/annon/promoter_motif_scan/promoter_Airway_annon.bed',
 '../data/interim/annon/promoter_motif_scan/promoter_Astrocytes_annon.bed',
 '../data/interim/annon/promoter_motif_scan/promoter_Bladder_annon.bed',
 '../data/interim/annon/promoter_motif_scan/promoter_CAL27-CTRLi_annon.bed',
 '../data/interim/annon/promoter_motif_scan/promoter_COLO-SCR-DMSO_annon.bed',
 '../data/interim/annon/promoter_motif_scan/promoter_Colon_annon.bed',
 '../data/interim/annon/promoter_motif_scan/promoter_Esophageal_annon.bed',
 '../data/interim/annon/promoter_motif_scan/promoter_GDSD0_annon.bed',
 '../data/interim/annon/promoter_motif_scan/promoter_GDSD3_annon.bed',
 '../data/interim/annon/promoter_motif_scan/promoter_GDSD6_annon.bed',
 '../data/interim/annon/promoter_motif_scan/promoter_GM12878_annon.bed',
 '../data/interim/annon/promoter_motif_scan/promoter_HMEC_annon.bed',
 '../data/interim/annon/promoter_motif_scan/

In [93]:
# %%time
# # data_all_sel = data_all[data_all.index.isin(unique_genesets['Prostate']) & (tissues_label=='Prostate')]
# # vocab_tissue_df = pd.DataFrame()
# # foot_df = pd.read_csv(os.path.join('../data/interim/annon/anchor_motif_scan/','SCC13-CTRLi_annon.bed'),sep='\t',header=None)

# foot_df = pd.read_csv(os.path.join('../data/interim/annon/promoter_motif_scan/','promoter_SCC13-CTRLi_annon.bed'),sep='\t',header=None)
# foot_df.columns = ['chr','start','stop','TSS','chr_m','start_m','stop_m','id_trim','score','strand']

In [94]:
# foot_df

In [95]:
# tf_pair = row.name
# tf_arr = tf_pair.split('::')
# # genes_pro_only = row['pro_pro_genes'].split('|')
# genes_pro_only = row['loop_loop_genes'].split('|')
# print(genes_pro_only)
# foot_df_sel = foot_df[foot_df.TSS.isin(genes_pro_only)]
# print(foot_df_sel.shape)
# foot_df_sel['tf'] = foot_df_sel.id_trim.map(tf_id_to_name_dict)
# print(foot_df_sel.shape)
# foot_df_sel = foot_df_sel[foot_df_sel.tf.isin(tf_arr)].drop_duplicates()
# print(tf_pair, row['pro_pro_genes'])
# # print(tf_pair, row['pro_pro_genes'])
# print(foot_df_sel)

In [96]:
# # filter by those that have pairs 
# # foot_df = pd.read_csv(os.path.join('../data/interim/annon/promoter_motif_scan/','promoter_Prostate_annon.bed'),sep='\t',header=None)
                      
# # foot_df.columns = ['chr','start','stop','TSS','chr_m','start_m','stop_m','id_trim','score','strand']
# foot_df = foot_df[foot_df.TSS.isin(['CYP4B1'])]
# foot_df['tf'] = foot_df.id_trim.map(tf_id_to_name_dict)
# foot_df[foot_df.tf.isin(['NFATC4','ZNF667'])].drop_duplicates()


In [97]:
# foot_df = foot_df[foot_df.tf.isin(['NFATC4','ZNF667'])].drop_duplicates()
# foot_df


# loop through all of the tf-tf pairs and find the locations in the promoter regions

In [100]:
# df_tf_key_type[:5]

In [122]:
def get_vocab_cancer_mc(tf_tf_pair_file, tissues, genesets_dict, type_loop='pro_pro', savepath=None, save_loop_count=None):
    if savepath is None: #savepath is save each tissue separately
        vocab_df = pd.DataFrame()
    for tissue in sorted(tissues):
        print('====== STARTING ',tissue, tf_tf_pair_file)

        # read file
        df = pd.read_csv(tf_tf_pair_file, index_col=0)
        if df.shape[0]==0:
            print('EXIT: no data', tf_tf_pair_file)
            continue
        else:
            print('num vocab found:', df.shape[0])

        # get tf tf pair types
        tf_tf_pair_feat = {}
        for tf_tf_pair in df.tf_key: 
            tf_tf_pair_feat[tf_tf_pair] = check_tf_tf_loop_occur_cancer(tf_tf_pair, tissue, genesets_dict)
        df_tf_key_type = pd.DataFrame.from_dict(tf_tf_pair_feat,orient='index')
        
        ###ADDED IN TISSUE MAPPER
        tissue_new = '-'.join(tissue.split('_'))

        # get pro_pro OR loop_loop or looponly (PART TO CHANGE IN FUTURE)
        if type_loop in ['pro_pro','loop_loop','pro_loop']:
            if save_loop_count is not None:
                print('saving loop type info', save_loop_count+tissue_new+'_loop_type.csv')
                df_tf_key_type.to_csv(save_loop_count+tissue_new+'_loop_type.csv')
        else:
            print('vocab type not correct', type_loop)
            raise            
        df_tf_filt = df_tf_key_type[df_tf_key_type[type_loop+'_count']>0]    
        
        
        # get locations of vocab points
        if df_tf_filt.shape[0]==0:
            print('EXIT: no tf tf pairs in loop type: ', type_loop, filepath)
            continue   
        else:
            print('num tf-tf pairs in loop type found', type_loop,  df_tf_filt.shape[0])

        vocab_tissue_df = pd.DataFrame()
        foot_df = pd.read_csv(os.path.join('../data/interim/annon/promoter_motif_scan/','promoter_'+tissue_new+'_annon.bed'),sep='\t',header=None)
        foot_df.columns = ['chr','start','stop','TSS','chr_m','start_m','stop_m','id_trim','score','strand']
        for tf_pair, row in df_tf_filt.iterrows():
            tf_arr = tf_pair.split('::')
#             print(row)
            genes_pro_only = row[type_loop+'_genes'].split('|')
            foot_df_sel = foot_df[foot_df.TSS.isin(genes_pro_only)]
            foot_df_sel['tf'] = foot_df_sel.id_trim.map(tf_id_to_name_dict)
            foot_df_sel = foot_df_sel[foot_df_sel.tf.isin(tf_arr)].drop_duplicates()
            foot_df_sel['vocab'] = tf_pair
            foot_df_sel['genes'] = row[type_loop+'_genes']
            vocab_tissue_df = pd.concat([vocab_tissue_df, foot_df_sel],axis=0,sort=False)

        vocab_tissue_df['tissue'] = tissue
        print('vocab_tissue_df length: ', vocab_tissue_df.shape[0], len(vocab_tissue_df.vocab.unique()))
        if savepath is not None:
            print('saved', savepath+tissue+"_"+type_loop+'_vocab_info.csv')
            vocab_tissue_df.to_csv(savepath+tissue_new+"_"+type_loop+'_vocab_info.csv')
        
        if savepath is None:
            vocab_df = pd.concat([vocab_df,vocab_tissue_df ],axis=0,sort=False)
    if savepath is None:
        return vocab_df
    else:
        return None

unique stuff

In [123]:
# %%time 
# vocab_df = get_vocab(glob.glob(os.path.join(save_dir, 'unique*_filt.csv')),unique_genesets,
#                      type_loop='pro_pro',save_loop_count=os.path.join(save_dir, 'unique_'))
# vocab_df.to_csv(os.path.join(save_dir, 'unique_alltissue_pro_pro_vocab_info.csv'))
# vocab_df.shape

In [124]:
# %%time 
# vocab_df = get_vocab(glob.glob(os.path.join(save_dir, 'unique*_filt.csv')),unique_genesets,type_loop='pro_loop')
# vocab_df.to_csv(os.path.join(save_dir, 'unique_alltissue_pro_loop_vocab_info.csv'))
# vocab_df.shape


In [125]:
# %%time 
# vocab_df = get_vocab(glob.glob(os.path.join(save_dir, 'unique*_filt.csv')),unique_genesets,type_loop='loop_loop')
# vocab_df.to_csv(os.path.join(save_dir, 'unique_alltissue_loop_loop_vocab_info.csv'))
# vocab_df.shape


In [126]:
glob.glob(os.path.join(save_dir, '*_filt_unique.csv'))

['../data/processed/fig4_modelling/tf_tf_pairs_mm/diff_cancer_pair_filt_unique.csv',
 '../data/processed/fig4_modelling/tf_tf_pairs_mm/diff_normal_pair_filt_unique.csv',
 '../data/processed/fig4_modelling/tf_tf_pairs_mm/expr_cancer_pair_filt_unique.csv',
 '../data/processed/fig4_modelling/tf_tf_pairs_mm/expr_normal_pair_filt_unique.csv']

In [127]:
cancer_tissues

['COLO_SCR_DMSO', 'WM_SCR_DMSO']

In [135]:
%%time 
vocab_df_diff = get_vocab_cancer_mc(os.path.join(save_dir, 'diff_cancer_pair_filt_unique.csv'), cancer_tissues, diff_genesets, 
                          savepath=os.path.join(save_dir, 'diff_'))

vocab_df_diff_loop_loop = get_vocab_cancer_mc(os.path.join(save_dir, 'diff_cancer_pair_filt_unique.csv'), cancer_tissues, diff_genesets, 
                                            savepath=os.path.join(save_dir, 'diff_'),
                                            type_loop='loop_loop', save_loop_count=os.path.join(save_dir, 'diff_'))
# vocab_df_expr.to_csv(os.path.join(save_dir, 'expr_alltissue_vocab_info.csv'))

num vocab found: 2145
num tf-tf pairs in loop type found pro_pro 665


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


vocab_tissue_df length:  18328 665
saved ../data/processed/fig4_modelling/tf_tf_pairs_mm/diff_COLO_SCR_DMSO_pro_pro_vocab_info.csv
num vocab found: 2145
num tf-tf pairs in loop type found pro_pro 612
vocab_tissue_df length:  30218 612
saved ../data/processed/fig4_modelling/tf_tf_pairs_mm/diff_WM_SCR_DMSO_pro_pro_vocab_info.csv
num vocab found: 2145
saving loop type info ../data/processed/fig4_modelling/tf_tf_pairs_mm/diff_COLO-SCR-DMSO_loop_type.csv
num tf-tf pairs in loop type found loop_loop 1540
vocab_tissue_df length:  99627 1496
saved ../data/processed/fig4_modelling/tf_tf_pairs_mm/diff_COLO_SCR_DMSO_loop_loop_vocab_info.csv
num vocab found: 2145
saving loop type info ../data/processed/fig4_modelling/tf_tf_pairs_mm/diff_WM-SCR-DMSO_loop_type.csv
num tf-tf pairs in loop type found loop_loop 1653
vocab_tissue_df length:  224652 1633
saved ../data/processed/fig4_modelling/tf_tf_pairs_mm/diff_WM_SCR_DMSO_loop_loop_vocab_info.csv
CPU times: user 22min 44s, sys: 51.6 s, total: 23min 36s

In [136]:
%%time 
vocab_df_expr = get_vocab_cancer_mc(os.path.join(save_dir, 'expr_cancer_pair_filt_unique.csv'), cancer_tissues, expr_genesets, 
                          savepath=os.path.join(save_dir, 'expr_'))

vocab_df_expr_loop_loop = get_vocab_cancer_mc('../data/processed/fig4_modelling/tf_tf_pairs_scc/expr_cancer_pair_filt_unique.csv', cancer_tissues, expr_genesets, 
                                            savepath=os.path.join(save_dir, 'expr_'),
                                            type_loop='loop_loop', save_loop_count=os.path.join(save_dir, 'expr_'))


num vocab found: 7381
num tf-tf pairs in loop type found pro_pro 2971


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


vocab_tissue_df length:  113039 2971
saved ../data/processed/fig4_modelling/tf_tf_pairs_mm/expr_COLO_SCR_DMSO_pro_pro_vocab_info.csv
num vocab found: 7381
num tf-tf pairs in loop type found pro_pro 2679
vocab_tissue_df length:  165082 2679
saved ../data/processed/fig4_modelling/tf_tf_pairs_mm/expr_WM_SCR_DMSO_pro_pro_vocab_info.csv
num vocab found: 15400
saving loop type info ../data/processed/fig4_modelling/tf_tf_pairs_mm/expr_COLO-SCR-DMSO_loop_type.csv
num tf-tf pairs in loop type found loop_loop 7750
vocab_tissue_df length:  1712965 7655
saved ../data/processed/fig4_modelling/tf_tf_pairs_mm/expr_COLO_SCR_DMSO_loop_loop_vocab_info.csv
num vocab found: 15400
saving loop type info ../data/processed/fig4_modelling/tf_tf_pairs_mm/expr_WM-SCR-DMSO_loop_type.csv
num tf-tf pairs in loop type found loop_loop 8128
vocab_tissue_df length:  2914671 8108
saved ../data/processed/fig4_modelling/tf_tf_pairs_mm/expr_WM_SCR_DMSO_loop_loop_vocab_info.csv
CPU times: user 19h 49min 52s, sys: 39min 49s,