# 8B2_tf_tf_enrichment_cancer

07/20/2020

getting tf-tf enriched pairs for each tissue for each geneset

cancer - scc



In [1]:
# basic packages
import os, glob
import pandas as pd
import numpy as np; np.random.seed(0)
import itertools
from collections import Counter, defaultdict
import time

# machine learning packages from sklearn
from sklearn.preprocessing import MinMaxScaler #StandardScaler 
from sklearn import preprocessing, metrics
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split, KFold, cross_validate, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression, Lasso, LassoCV
from IPython.display import Image
from scipy import stats

# Import tools needed for visualization
import seaborn as sns; sns.set()
import matplotlib
import matplotlib.pyplot as plt

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# Directory that receives every CSV written by this notebook.
save_dir = '../data/processed/fig4_modelling/tf_tf_pairs_scc/'
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create race.
os.makedirs(save_dir, exist_ok=True)

In [4]:
# Threshold compared against log10(exp + 1e-2) when `labels_all` is built in the CRM loading cell.
# NOTE(review): `THRES` is reassigned to 10 in section 2C, where it is compared against raw
# `rna_df_sel` values instead -- running cells out of order changes which value is in effect.
THRES=5

In [5]:
# HOCOMOCO v11 annotation table, read with its first column as the index.
tf_annon_df = pd.read_csv('../data/external/HOCOMOCOv11_annotation.csv',index_col=0)
# `id_trim` is the `id` column with the literal suffix '.pwm.trim' appended.
tf_annon_df['id_trim'] = tf_annon_df['id'] + '.pwm.trim'
# Lookups in both directions between the `tf` and `id_trim` columns.
tf_name_to_id_dict = pd.Series(tf_annon_df.id_trim.values, index=tf_annon_df.tf.values).to_dict()
tf_id_to_name_dict = pd.Series(tf_annon_df.tf.values, index=tf_annon_df.id_trim.values).to_dict()

In [6]:
# Expression table (presumably TPM, going by the file name), read with its first column as the index;
# the lookup in the next cell addresses rows by gene symbol.
rna_df = pd.read_csv('../data/interim/rna/tissue_tpm_sym.csv',index_col=0)
# rna_df_norm = rna_df[normal_tissues]

In [7]:
rna_df.loc["BARX2",:]

A431-CTRLi             1.025
A431-p63i              1.390
Airway                 8.655
Astrocytes             0.065
Bladder                0.750
CAL27-CTRLi            0.615
CAL27-p63i             0.550
COLO_SCR_DMSO          0.000
COLO_SCR_PLX           0.005
COLO_shMITF_DMSO       0.015
COLO_shMITF_PLX        0.000
Colon                  1.775
D0-CTRLi               0.380
D0-p63i                0.220
D3-CTRLi               2.665
D3-p63i                0.640
Esophageal             0.405
GDSD0                  2.800
GDSD3                 14.180
GDSD6                  6.525
GM12878                0.055
HMEC                   0.105
Melanocytes            0.030
Ovarian                0.355
Pancreas               5.415
Prostate               1.625
Renal                  1.175
SCC13-CTRLi            0.330
SCC13-p63i             0.455
SKMEL5_SCR_DMSO        0.000
SKMEL5_SCR_PLX         0.020
SKMEL5_shMITF-DMSO     0.000
SKMEL5_shMITF_PLX      0.005
Thyroid                2.460
Uterine       

In [8]:
print(rna_df.columns)
# Samples treated as the 'cancer' and 'normal' groups for the rest of the notebook.
cancer_tissues = ['A431-CTRLi','CAL27-CTRLi', 'SCC13-CTRLi']
normal_tissues = ['GDSD0','GDSD3']#'D0-CTRLi', 'D3-CTRLi']
sel_tissues = cancer_tissues+normal_tissues
# Sample name -> integer code, numbered in list order (cancer samples first).
tissues_dict = dict(zip(sel_tissues,range(len(sel_tissues))))
# Expression table restricted to the selected samples.
rna_df_sel = rna_df[sel_tissues]

Index(['A431-CTRLi', 'A431-p63i', 'Airway', 'Astrocytes', 'Bladder',
       'CAL27-CTRLi', 'CAL27-p63i', 'COLO_SCR_DMSO', 'COLO_SCR_PLX',
       'COLO_shMITF_DMSO', 'COLO_shMITF_PLX', 'Colon', 'D0-CTRLi', 'D0-p63i',
       'D3-CTRLi', 'D3-p63i', 'Esophageal', 'GDSD0', 'GDSD3', 'GDSD6',
       'GM12878', 'HMEC', 'Melanocytes', 'Ovarian', 'Pancreas', 'Prostate',
       'Renal', 'SCC13-CTRLi', 'SCC13-p63i', 'SKMEL5_SCR_DMSO',
       'SKMEL5_SCR_PLX', 'SKMEL5_shMITF-DMSO', 'SKMEL5_shMITF_PLX', 'Thyroid',
       'Uterine', 'WM_SCR_DMSO', 'WM_SCR_PLX', 'WM_shMITF_DMSO',
       'WM_shMITF_PLX'],
      dtype='object')


# 0. CRM data wrangling and global var setup

In [9]:
%%time
# import
# NOTE(review): absolute, user-specific path -- every other input in this notebook is read
# relative to '../data/'. Confirm the equivalent relative location before sharing.
data_all = pd.read_csv('/Users/mguo123/Google Drive/1_khavari/omics_project-LD/pan_omics/data/processed/tissue_crms/all_count_comb_overall.csv',index_col=0,header=0)
# Keep only rows that belong to the selected samples.
data_all = data_all[data_all.tissue.isin(sel_tissues)]
# Keep only rows whose columns from position 2 onward sum to more than 0.1.
data_all = data_all[data_all.iloc[:,2:].sum(axis=1)>1e-1]

# expression labels
# log10 of `exp` with a 1e-2 pseudocount, then a boolean flag for values above THRES.
exp_label = list(np.log10(data_all.exp.values+1e-2))
labels_all  = np.array(np.array(exp_label)>THRES)


# Per-row sample name, its integer code from `tissues_dict`, and a 'cancer'/'normal' tag.
tissues_label  = data_all.tissue.values#np.array((data_all.exp>THRES).values)
tissue_num_labels =  data_all.tissue.map(tissues_dict).values
is_cancer_label = np.array(['cancer' if x in cancer_tissues  else 'normal' for x in tissues_label ])


# Row index values and an integer code for each distinct value (codes follow np.unique order).
genes_all = data_all.index.values
gene_to_num_dict = dict(zip(np.unique(genes_all),range(len(np.unique(genes_all)))))
genes_num_all = np.vectorize(gene_to_num_dict.get)(genes_all)



CPU times: user 1min 24s, sys: 24.4 s, total: 1min 48s
Wall time: 1min 32s


In [39]:
# Report the table dimensions as a sanity check.
# NOTE(review): the recorded output (58225, 1057) equals the shape shown after the column drop in
# the following cell, so this cell was last executed after that drop rather than in notebook order.
print('files_loaded', data_all.shape)


files_loaded (58225, 1057)


In [11]:
## only tfs
# Remove the metadata/count columns so that only the '<TF>_pro' / '<TF>_loop' features remain.
# Rebinding the name (rather than inplace=True) together with errors='ignore' keeps the cell
# idempotent: re-running it no longer raises KeyError once the columns are already gone.
data_all = data_all.drop(
    columns=['tissue','exp','num_loop_counts','num_loops','num_atac_regions_pro','num_atac_regions_loop'],
    errors='ignore',
)

data_all.shape




(58225, 1057)

In [12]:
data_all[:5]

Unnamed: 0,AHR_pro,ARID5B_pro,ARNT_pro,ARNTL_pro,ATF1_pro,ATF2_pro,ATF3_pro,ATF4_pro,ATF6_pro,ATF7_pro,...,MAFA_loop,MEOX1_loop,E2F5_pro,E2F5_loop,ESR2_pro,KLF14_pro,TBX15_pro,ESR2_loop,KLF14_loop,TBX15_loop
A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A3GALT2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A4GALT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A4GNT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAAS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
#tfs_feat_dict: dict
# key: tf
# value: list of crm features
# Feature columns are named '<TF>_pro' / '<TF>_loop', so the TF key is everything before the
# LAST underscore. Splitting on the first underscore would truncate a TF name that itself
# contains '_', and the '<TF>_pro' / '<TF>_loop' names rebuilt in find_tf_pairs would not match.
tfs_feat_dict = defaultdict(list)
for feat in data_all.columns:
    tfs_feat_dict[feat.rsplit('_', 1)[0]].append(feat)

# 1. Functions 

In [14]:
def find_tf_pairs(single_results, tfs_feat_dict=tfs_feat_dict, data_all=data_all):
    """Build the feature lists for every pair of individually significant TFs.

    Parameters
    ----------
    single_results : DataFrame with `tf_key` and `pval_bonf` columns; rows with
        pval_bonf < 0.05 define the TFs that are paired up.
    tfs_feat_dict : mapping whose keys are the TFs that have features at all.
    data_all : DataFrame whose columns are checked for '<TF>_pro' / '<TF>_loop'.

    Returns
    -------
    defaultdict(list) mapping 'TF1::TF2' (TF1 < TF2 as strings) to the existing
    promoter/loop columns of both TFs. A pair with no existing column gets no entry.
    """
    significant_rows = single_results[single_results.pval_bonf<0.05]
    tfs_unique = significant_rows.tf_key.unique()
    print('num single unique TFs', len(tfs_unique))

    TF_pair_dict = defaultdict(list)
    # product() walks the same (outer, inner) order as two nested loops over tfs_unique.
    for first, second in itertools.product(tfs_unique, repeat=2):
        if not (first<second):
            continue  # keep each unordered pair once and skip self-pairs
        if (first not in tfs_feat_dict) or (second not in tfs_feat_dict):
            continue
        candidates = [first+'_pro', first+'_loop', second+'_pro', second+'_loop']
        present = [feat for feat in candidates if feat in data_all.columns]
        if present:
            TF_pair_dict[first+'::'+second].extend(present)

    print('num pairs of TFs', len(TF_pair_dict))
    return TF_pair_dict

In [15]:
# TF_feat_dict: dict, key = tf_key (string of TFs separated by '::'), val = list of features/columns of data_all
# background: 'relative' means the background is data_all_sel (the geneset rows only); if 'all', the background is data_all
# def tf_enrichment(geneset, tissues=normal_tissues, data_all=data_all,tissues_label=tissues_label, background='all'):
def tf_enrichment(geneset, TF_feat_dict, tissues=['cancer', 'normal'], 
                  data_all=data_all,tissues_label=is_cancer_label, 
                  background='relative',verbose=True, save_path=None):
    """Score every TF feature set against every tissue label with a one-sided Fisher's exact test.

    Only rows of ``data_all`` whose index is in ``geneset`` are used. For each
    (tissue, tf_key) a 2x2 table of summed feature values is built:

        A = selected features, selected tissue     C = remaining features, selected tissue
        B = selected features, other tissues       D = background total - A - B - C

    A pseudocount of 1 is added to every cell before ``stats.fisher_exact`` is called
    with ``alternative='greater'``.

    Parameters
    ----------
    geneset : collection of index labels of ``data_all`` to keep.
    TF_feat_dict : dict mapping tf_key -> list of ``data_all`` column names; entries with
        an empty list are skipped. Keys are processed in sorted order.
    tissues : labels (values found in ``tissues_label``) to test. Never mutated.
    data_all : DataFrame of numeric feature values.
    tissues_label : array with one label per row of ``data_all``, aligned by position.
    background : 'all' takes the grand total of ``data_all`` as the table total; any other
        value takes the total of the geneset subset.
    verbose : print progress information.
    save_path : when not None, the result is also written to this CSV path.

    Returns
    -------
    DataFrame with one row per (tissue, tf_key) and the columns tf_key, tissue, jaccard,
    intersect_over_min, intersection, union, num_in_gene, num_in_feat, observed, expected,
    oddsratio, pval, pval_bonf, log_pval_bonf. ``pval_bonf`` multiplies ``pval`` by the number
    of rows produced by this call (capped at 1); ``log_pval_bonf`` is -log10 capped at 100.

    NOTE(review): the table cells are sums of feature values, not counts of genes, so table
    totals can reach the millions; with tables that large very small effects become
    significant -- confirm this is the intended unit of observation.
    """
    result_columns = ['tf_key', 'tissue', 'jaccard', 'intersect_over_min','intersection','union',
                      'num_in_gene', 'num_in_feat', 'observed', 'expected', 'oddsratio', 'pval' ]

    # get subset (the membership mask is computed once and reused for rows and labels)
    in_geneset = data_all.index.isin(geneset)
    data_all_sel = data_all[in_geneset]
    tissues_label_sel = tissues_label[in_geneset]
    if verbose:
        print('num genes in geneset',len(geneset))
        print('subsetting data,', data_all_sel.shape, tissues_label_sel.shape)

    # set background
    if background=='all':
        count_all = data_all.sum().sum()
    else:
        count_all = data_all_sel.sum().sum() ## background

    if verbose:
        print('starting iteration')
        print('estimated count', len(TF_feat_dict)*len(tissues))

    # Rows are collected in a list and turned into a DataFrame once at the end; growing the
    # DataFrame one row at a time is quadratic in the number of tests.
    records = []
    pseudo = 1
    sorted_tf_items = sorted(TF_feat_dict.items(),key=lambda x: x[0])
    for tissue in tissues:
        if verbose:
            print('*****iterating tissue, ', tissue)
        # data_all_sel is already restricted to the geneset, so only the tissue filter is needed.
        tissue_crm = data_all_sel[tissues_label_sel==tissue]
        count_tissue_gene = tissue_crm.sum().sum() # A + C

        for tf_key, feat_list in sorted_tf_items:
            if len(feat_list)==0:
                continue
            count_selfeat_tissue = tissue_crm[feat_list].sum().sum() # A
            count_selfeat = data_all_sel[feat_list].sum().sum() # A + B
            count_selfeat_neg = count_selfeat - count_selfeat_tissue # B: selected features, other tissues
            count_neg_tissue = count_tissue_gene - count_selfeat_tissue # C: other features, selected tissue
            count_neg_neg = count_all - count_selfeat_tissue - count_selfeat_neg - count_neg_tissue # D
            mat_counts = np.array([[count_selfeat_tissue, count_neg_tissue],
                                   [count_selfeat_neg, count_neg_neg]]).reshape((2,2))

            num_in_gene = mat_counts.sum(axis=1)[0] # A + C
            num_in_feat = mat_counts.sum(axis=0)[0] # A + B
            union = count_selfeat_tissue + count_selfeat_neg + count_neg_tissue # A + B + C
            oddsratio_pseudo, pvalue_pseudo = stats.fisher_exact(mat_counts+pseudo,alternative='greater')

            # An all-zero margin yields nan/inf for the ratios below; that is kept as the result,
            # only the numpy divide warnings are silenced.
            with np.errstate(divide='ignore', invalid='ignore'):
                expected_num = num_in_gene*num_in_feat/mat_counts.sum()
                jaccard = count_selfeat_tissue/union
                intersect_over_min = count_selfeat_tissue/min(num_in_gene, num_in_feat)

            records.append({'tf_key':tf_key, 'tissue':tissue,
                            'jaccard':jaccard,'intersect_over_min':intersect_over_min,
                            'intersection':count_selfeat_tissue,
                            'union':union,
                            'num_in_gene':num_in_gene,'num_in_feat':num_in_feat,
                            'observed':mat_counts[0][0], 'expected':expected_num,
                            'oddsratio':oddsratio_pseudo, 'pval':pvalue_pseudo})
            if verbose and (len(records) %1000)==0:
                print(tf_key, tissue, len(records))

    results_df = pd.DataFrame(records, columns=result_columns)

    # Bonferroni correction over the tests performed in this call.
    num_tests = results_df.shape[0]
    pvals = results_df['pval'].astype(float)
    results_df['pval_bonf'] = np.minimum(1, pvals*num_tests)
    results_df['log_pval_bonf'] = np.minimum(100, -np.log10(results_df['pval_bonf']+1e-100))

    if save_path is not None:
        results_df.to_csv(save_path)
        print('saved file', save_path)
    return results_df

# 2. get gene sets

create dictionary {tissue: gene_list} for each category

cancer variant (two types of gene sets: the differential one and then the normal one)

In [16]:
# 2A. unique gene lists for each cell type
# glob.glob('../../rnaseq/unique_gene_lists/*genes.txt')

In [17]:
# unique_genesets = {}
# for rna_file in glob.glob('../../rnaseq/unique_gene_lists/*genes.txt'):
#     tissue = os.path.basename(rna_file).split('_')[0]
#     geneset = sorted(pd.read_csv(rna_file,header=None).loc[:,0].dropna().unique())
#     print(tissue, len(geneset))
#     unique_genesets[tissue] = geneset
    

2B. differentially expressed gene lists for each cell type

In [18]:
# def t_test(row):
#     t_stat, pval =  stats.ttest_ind(row[cancer_tissues], row[normal_tissues], axis=0, equal_var=True, nan_policy='propagate')
#     fc = 
#     return t_stat, pval, fc

In [19]:
# Keep genes whose largest value across the selected samples exceeds 1, then add a pseudocount of 1.
rna_df_sel_high = rna_df_sel.loc[rna_df_sel.max(axis=1)>1]
rna_df_sel_high_pseudo = rna_df_sel_high+1

# Row-wise two-sample t-test (equal variances assumed): cancer columns vs. normal columns.
_tstat, _pval = stats.ttest_ind(
    rna_df_sel_high_pseudo[cancer_tissues],
    rna_df_sel_high_pseudo[normal_tissues],
    axis=1, equal_var=True, nan_policy='propagate',
)

# Attach the statistics positionally to the expression rows (gene labels move into the
# 'index' column) and order the table by t statistic, most negative first.
ttest_rna_df = (
    rna_df_sel_high.reset_index()
    .assign(tstat=np.asarray(_tstat), pval=np.asarray(_pval))
    .sort_values('tstat')
)

  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


In [20]:
# print genes in cancer tissues
# Selection: t-test p-value < 0.001 (no multiple-testing correction is applied in this cell)
# and a positive t statistic, i.e. the cancer-sample mean is the larger one.
cancer_genes = sorted(ttest_rna_df[(ttest_rna_df.pval < 0.001)& (ttest_rna_df.tstat > 0)]['index'])
print(len(cancer_genes))
print(cancer_genes)

48
['ABCB7', 'AP5S1', 'ARGLU1', 'BAX', 'BCL9', 'CASP2', 'CCDC142', 'CHMP3', 'CTCF', 'CTDSP1', 'CUEDC1', 'DNAJA2', 'DNAJC7', 'DYNC1LI2', 'FUNDC2', 'GDE1', 'GOLGA2', 'IFI16', 'IRF3', 'KLHL9', 'MAPK12', 'MIDN', 'MLEC', 'MXRA7', 'NDUFB11', 'NRIP3', 'POU2F1', 'PRDM2', 'RBMX', 'RHPN2', 'RNU4-1', 'RNU5A-1', 'SDHAF2', 'SERF1A', 'SMG1', 'SNORA31B', 'SPATA33', 'STAT3', 'SUOX', 'SUPT4H1', 'SUPT6H', 'SUV39H1', 'TMCO4', 'TSR2', 'VAC14', 'WDR45B', 'ZC3H18', 'ZNF618']


In [21]:
# print genes in normal tissues
# Selection: t-test p-value < 0.001 (no multiple-testing correction is applied in this cell)
# and a negative t statistic, i.e. the normal-sample mean is the larger one.
normal_genes = sorted(ttest_rna_df[(ttest_rna_df.pval < 0.001)& (ttest_rna_df.tstat < 0)]['index'])
print(len(normal_genes))
print(normal_genes)

221
['ABCD4', 'ACSL3', 'ADPRM', 'AGAP1-IT1', 'AMN1', 'AMY2B', 'AMZ2', 'ANKRA2', 'ANTKMT', 'ARMT1', 'ARRDC1', 'AS3MT', 'ASPRV1', 'ATP10D', 'ATP6AP2', 'ATP6V1C2', 'AZI2', 'BBS9', 'BEX4', 'BOLA2', 'C16orf95', 'C18orf21', 'C2orf74', 'C4orf3', 'C4orf33', 'C9orf72', 'CAMTA1', 'CBR4', 'CCDC122', 'CCND2', 'CD9', 'CDIP1', 'CFAP36', 'CHM', 'CHMP5', 'CNOT8', 'COL16A1', 'COL4A6', 'COMMD6', 'COX6C', 'CTSK', 'CUL5', 'CYP2J2', 'CYP51A1', 'CYYR1', 'DCAF8', 'DCTN6', 'DEPDC5', 'DFFB', 'DLEC1', 'DNAJC3', 'DST', 'E4F1', 'EAF1', 'EBAG9', 'ENOX2', 'EVA1C', 'FADS2', 'FAM104B', 'FAM168A', 'FAM169A', 'FAM24B', 'FAN1', 'FBXL3', 'GABARAPL2', 'GALNT11', 'GET1', 'GGACT', 'GNB2', 'GNPTG', 'GOLGA7', 'GTF2B', 'GTF2H1', 'GYG2P1', 'H1-0', 'HDDC2', 'HHAT', 'HIPK1', 'HIVEP2', 'HMGCR', 'IDH1-AS1', 'IFT43', 'IMPACT', 'INPP5D', 'ITM2A', 'JKAMP', 'KBTBD3', 'KCNK6', 'KDM5D', 'KIAA1191', 'KIF21A', 'KLHL20', 'KPRP', 'LCE1C', 'LCE1F', 'LGALS9C', 'LHFPL2', 'LINC00659', 'LINC01133', 'LMAN2', 'LNX2', 'LOXL3', 'LRIG3', 'LXN', 'MAN1B

In [22]:
# Differential gene sets keyed by group name. The list objects are captured here, so this dict
# keeps them even though `normal_genes` / `cancer_genes` are rebound to other gene sets in section 2C.
diff_genesets = {
    'normal':normal_genes,
    'cancer':cancer_genes,
}
# for tissue in rna_df_norm.columns:
#     geneset = list(rna_df_norm[tissue][rna_df_norm[tissue]>THRES].index)
#     print(tissue, len(geneset))
#     expr_genesets[tissue] = geneset

2C. highly expressed gene lists for each cell type

In [23]:
# Minimum value a gene must exceed in every sample of a group (used by the next two cells).
# NOTE(review): this rebinds the THRES set to 5 near the top of the notebook, which the CRM
# loading cell compares against log10 expression -- the two thresholds are on different scales.
THRES=10

In [24]:
# Genes whose value exceeds THRES in every cancer sample (row-wise minimum > THRES).
# NOTE(review): rebinds `cancer_genes`, which earlier held the differential gene list.
cancer_genes = rna_df_sel.index[rna_df_sel[cancer_tissues].min(axis=1)>THRES].values
print(len(cancer_genes))

3916


In [25]:
# Genes whose value exceeds THRES in every normal sample (row-wise minimum > THRES).
# NOTE(review): rebinds `normal_genes`, which earlier held the differential gene list.
normal_genes = rna_df_sel.index[rna_df_sel[normal_tissues].min(axis=1)>THRES].values
print(len(normal_genes))

4669


In [26]:
# Highly expressed gene sets keyed by group name (same keys as `diff_genesets`).
expr_genesets = {
    'normal':normal_genes,
    'cancer':cancer_genes,
}

In [27]:
# t_test(rna_df_sel.iloc[1,:])[1]

In [28]:
# # 2D. group genesets
# print(glob.glob('../data/interim/rna/*_genes.csv'))
# group_tissue_mapping = {
#     'blue':['Astrocytes','Melanocytes'],
#     'grey':['GM12878'],
#     'green':['Colon','Esophageal','Ovarian','Pancreas','Renal','Thyroid'],
#     'purple':['Airway','Bladder', 'GDSD6', 'HMEC', 'Prostate', 'Uterine'] 
# }

In [29]:
# group_genesets = {}
# for group, tissues in group_tissue_mapping.items():
#     geneset = sorted(pd.read_csv('../data/interim/rna/'+group+'_genes.csv',header=None).loc[:,0].dropna().unique())
#     for tissue in tissues:
#         group_genesets[tissue] = geneset
#         print(tissue, len(geneset))
        

# 3. running pairwise enrichments

In [85]:
def run_pair_enrichment(tissue, save_prefix, geneset_dict,sel_tissues=['cancer','normal'],
                       tissue_labels = is_cancer_label):
    """Run the two-stage enrichment for one group and write three CSV files.

    Stage 1 scores every single TF (features from the global ``tfs_feat_dict``) against all
    ``sel_tissues``. Stage 2 pairs up the TFs that are Bonferroni-significant for ``tissue``
    and scores the pairs against ``tissue`` only.

    Files written into the global ``save_dir`` (names start with ``save_prefix + tissue``):
    '_single.csv', '_pair.csv' and '_pair_filt.csv' (pairs with oddsratio > 1 and
    pval_bonf < 0.05).

    Parameters
    ----------
    tissue : key of ``geneset_dict`` and label value in ``tissue_labels``.
    save_prefix : file-name prefix.
    geneset_dict : dict mapping group name -> gene list.
    sel_tissues : labels scored in stage 1.
    tissue_labels : per-row labels aligned with the global ``data_all``.

    Returns None; prints a message and returns early when ``tissue`` has no gene set.
    """
    if tissue not in geneset_dict:
        print(tissue, 'not found in geneset_dict')
        return

    def _csv_path(suffix):
        # Output location for one of the three result files.
        return os.path.join(save_dir, save_prefix+tissue + suffix)

    # Arguments shared by both enrichment passes.
    shared_kwargs = dict(geneset=geneset_dict[tissue], data_all=data_all,
                         tissues_label=tissue_labels, background='relative', verbose=True)

    # Stage 1: single TFs against every label.
    single_df = tf_enrichment(TF_feat_dict=tfs_feat_dict, tissues=sel_tissues,
                              save_path=_csv_path('_single.csv'), **shared_kwargs)
    is_significant = single_df.pval_bonf<0.05
    is_this_tissue = single_df.tissue==tissue
    single_sig_df = single_df[is_significant & is_this_tissue]

    # Stage 2: only pairs of TFs that passed stage 1 are considered.
    pair_feat_dict = find_tf_pairs(single_results=single_sig_df, tfs_feat_dict=tfs_feat_dict, data_all=data_all)
    pair_df = tf_enrichment(TF_feat_dict=pair_feat_dict, tissues=[tissue],
                            save_path=_csv_path('_pair.csv'), **shared_kwargs)

    keep_rows = (pair_df.oddsratio>1) & (pair_df.pval_bonf<0.05)
    pair_sig_df = pair_df[keep_rows]
    pair_sig_df.to_csv(_csv_path('_pair_filt.csv'))
    print('num sig tf-tf pair in ', tissue, pair_sig_df.shape)

### testing

In [40]:
# TF_feat_dict=tfs_feat_dict
# tissues=['cancer','normal']
# # get subset
# geneset = expr_genesets['cancer']
# data_all_sel = data_all[data_all.index.isin(geneset)]
# tissues_label_sel = is_cancer_label[data_all.index.isin(geneset)]
# print('num genes in geneset',len(geneset))
# print('orig data', data_all.shape)
# print('subsetting data,', data_all_sel.shape, tissues_label_sel.shape)
# print(Counter(tissues_label_sel))

In [48]:
# # set background and initial variables
# results_df = pd.DataFrame(columns = ['tf_key', 'tissue', 'jaccard', 'intersect_over_min','intersection','union', 'num_in_gene', 'num_in_feat', 'observed', 'expected', 'oddsratio', 'pval' ])
# counter = 0
# count_all = data_all_sel.sum().sum() ## background

# print('starting iteration')
# print('estimated count', len(TF_feat_dict)*len(tissues))
# for tissue in tissues:
#     print('*****iterating tissue, ', tissue)
#     tissue_crm = data_all_sel[tissues_label_sel==tissue]
#     tissue_crm = tissue_crm[tissue_crm.index.isin(geneset)]
#     count_tissue_gene = tissue_crm.sum().sum() #mat_counts.sum(axis=1)[0], sum first row

#     # iterate through tfs
#     for tf_key, feat_list in sorted(TF_feat_dict.items(),key=lambda x: x[0]):
#         if len(feat_list)>0:
#             tissue_crm_selfeat = tissue_crm[feat_list]
#             count_selfeat_tissue = tissue_crm_selfeat.sum().sum() # A
#             count_selfeat = data_all_sel[feat_list].sum().sum() #mat_counts.sum(axis=0)[0], sum down first col
#             count_selfeat_neg = count_selfeat - count_selfeat_tissue # B: not in selected genes but in selected feature
#             count_neg_tissue = count_tissue_gene - count_selfeat_tissue #C: not in selected feature but in selected genes
#             count_neg_neg = count_all - count_selfeat_tissue- count_selfeat_neg - count_neg_tissue #D
#             mat_counts = np.array([[count_selfeat_tissue,count_neg_tissue],
#                            [count_selfeat_neg, count_neg_neg]]).reshape((2,2))
#             pseudo = 1
#             print(tf_key)
#             print(mat_counts)
#             mat_counts_pseudo = mat_counts+pseudo
#             num_in_1 = mat_counts.sum(axis=1)[0] #count_tissue_gene
#             num_in_2 = mat_counts.sum(axis=0)[0] #count_selfeat
#             in_1_and_in_2 = count_selfeat_tissue # A
#             in_1_or_in_2 = count_selfeat_tissue +count_selfeat_neg+count_neg_tissue # A+B+C
#             in_1 = count_selfeat_tissue+count_neg_tissue # A+C
#             in_2 = count_selfeat_tissue+count_selfeat_neg#A+B
#             observed_num = mat_counts[0][0] #count_KRTtf_KRTgene
#             expected_num = num_in_1*num_in_2/sum(sum(mat_counts))
#             oddsratio_pseudo, pvalue_pseudo = stats.fisher_exact(mat_counts_pseudo,alternative='greater')
#             jaccard = in_1_and_in_2/in_1_or_in_2
#             intersect_over_min = in_1_and_in_2/min(in_1,in_2)

#             results_df.at[counter] = {'tf_key':tf_key, 'tissue':tissue,
#                                         'jaccard':jaccard,'intersect_over_min':intersect_over_min,
#                                     'intersection':in_1_and_in_2, 
#                                    'union':in_1_or_in_2, 
#                                    'num_in_gene':num_in_1,'num_in_feat':num_in_2,
#                                    'observed':observed_num, 'expected':expected_num, 
#                                    'oddsratio':oddsratio_pseudo, 'pval':pvalue_pseudo}
#             counter+=1
#             if (counter %1000)==0:
#                 print(tf_key, tissue, counter)


# results_df['pval_bonf'] = results_df.pval.apply(lambda x: min(1, x* results_df.shape[0]))
# results_df['log_pval_bonf'] = results_df.pval_bonf.apply(lambda x: min(100,-np.log10(x+1e-100)))


In [91]:
# results_df = tf_enrichment(geneset=expr_genesets['cancer'], TF_feat_dict=tfs_feat_dict, tissues=['cancer','normal'], 
# #                   data_all=data_all,tissues_label=is_cancer_label, 
#                   background='relative',verbose=True, save_path=os.path.join(save_dir, 'expr_'+'cancer' + '_single.csv'))

In [82]:
# # filter for cell type
# results_df_filt = results_df[(results_df.pval_bonf<0.05 )&( results_df.tissue==tissue)]
# print(results_df.shape, results_df_filt.shape)
# # downsample # of tf tf pairs to consider
# tfs_feat_dict_pairs = find_tf_pairs(single_results=results_df_filt, tfs_feat_dict=tfs_feat_dict, data_all=data_all)

# results_df_pairs = tf_enrichment(geneset=expr_genesets['cancer'], TF_feat_dict=tfs_feat_dict_pairs, tissues=['cancer'], 
#                   data_all=data_all,tissues_label=is_cancer_label, 
#                   background='relative',verbose=True, save_path=os.path.join(save_dir,  'expr_'+tissue + '_pair.csv'))


In [92]:
# # pairs testing
# TF_feat_dict=tfs_feat_dict_pairs
# tissues=['cancer','normal']
# # get subset
# geneset = expr_genesets['cancer']
# data_all_sel = data_all[data_all.index.isin(geneset)]
# tissues_label_sel = is_cancer_label[data_all.index.isin(geneset)]
# print('num genes in geneset',len(geneset))
# print('orig data', data_all.shape)
# print('subsetting data,', data_all_sel.shape, tissues_label_sel.shape)
# # set background and initial variables
# results_df = pd.DataFrame(columns = ['tf_key', 'tissue', 'jaccard', 'intersect_over_min','intersection','union', 'num_in_gene', 'num_in_feat', 'observed', 'expected', 'oddsratio', 'pval' ])
# counter = 0
# count_all = data_all_sel.sum().sum() ## background

# print('starting iteration')
# print('estimated count', len(TF_feat_dict)*len(tissues))
# for tissue in tissues:
#     print('*****iterating tissue, ', tissue)
#     tissue_crm = data_all_sel[tissues_label_sel==tissue]
#     tissue_crm = tissue_crm[tissue_crm.index.isin(geneset)]
#     count_tissue_gene = tissue_crm.sum().sum() #mat_counts.sum(axis=1)[0], sum first row

#     # iterate through tfs
#     for tf_key, feat_list in sorted(TF_feat_dict.items(),key=lambda x: x[0]):
#         if len(feat_list)>0:
#             tissue_crm_selfeat = tissue_crm[feat_list]
#             count_selfeat_tissue = tissue_crm_selfeat.sum().sum() # A
#             count_selfeat = data_all_sel[feat_list].sum().sum() #mat_counts.sum(axis=0)[0], sum down first col
#             count_selfeat_neg = count_selfeat - count_selfeat_tissue # B: not in selected genes but in selected feature
#             count_neg_tissue = count_tissue_gene - count_selfeat_tissue #C: not in selected feature but in selected genes
#             count_neg_neg = count_all - count_selfeat_tissue- count_selfeat_neg - count_neg_tissue #D
#             mat_counts = np.array([[count_selfeat_tissue,count_neg_tissue],
#                            [count_selfeat_neg, count_neg_neg]]).reshape((2,2))
#             pseudo = 1
#             print(tf_key)
#             print(mat_counts)
#             mat_counts_pseudo = mat_counts+pseudo
#             num_in_1 = mat_counts.sum(axis=1)[0] #count_tissue_gene
#             num_in_2 = mat_counts.sum(axis=0)[0] #count_selfeat
#             in_1_and_in_2 = count_selfeat_tissue # A
#             in_1_or_in_2 = count_selfeat_tissue +count_selfeat_neg+count_neg_tissue # A+B+C
#             in_1 = count_selfeat_tissue+count_neg_tissue # A+C
#             in_2 = count_selfeat_tissue+count_selfeat_neg#A+B
#             observed_num = mat_counts[0][0] #count_KRTtf_KRTgene
#             expected_num = num_in_1*num_in_2/sum(sum(mat_counts))
#             oddsratio_pseudo, pvalue_pseudo = stats.fisher_exact(mat_counts_pseudo,alternative='greater')
#             jaccard = in_1_and_in_2/in_1_or_in_2
#             intersect_over_min = in_1_and_in_2/min(in_1,in_2)
            
#             results_df.at[counter] = {'tf_key':tf_key, 'tissue':tissue,
#                                         'jaccard':jaccard,'intersect_over_min':intersect_over_min,
#                                     'intersection':in_1_and_in_2, 
#                                    'union':in_1_or_in_2, 
#                                    'num_in_gene':num_in_1,'num_in_feat':num_in_2,
#                                    'observed':observed_num, 'expected':expected_num, 
#                                    'oddsratio':oddsratio_pseudo, 'pval':pvalue_pseudo}
# #             print(results_df.loc[counter])
#             counter+=1
#             if (counter %1000)==0:
#                 print(tf_key, tissue, counter)


# results_df['pval_bonf'] = results_df.pval.apply(lambda x: min(1, x* results_df.shape[0]))
# results_df['log_pval_bonf'] = results_df.pval_bonf.apply(lambda x: min(100,-np.log10(x+1e-100)))


In [93]:
# results_df

### running

In [177]:
# %%time
# # get unique tissue geneset enrichments
# for tissue in normal_tissues:
#     print('============ Running unique', tissue)
#     run_pair_enrichment(tissue, save_prefix='unique_', geneset_dict = unique_genesets)

In [178]:
# %%time
# # get expressed tissue geneset enrichments
# for tissue in normal_tissues:
#     print('============ Running expressed', tissue)
#     run_pair_enrichment(tissue, save_prefix='expr_', geneset_dict = expr_genesets)

In [86]:
%%time
# get expressed tissue geneset enrichments
# Runs the single-TF and TF-pair enrichment for each group on the highly expressed gene sets;
# the result CSVs are written to save_dir with the 'expr_' prefix.
for disease_state in ['normal', 'cancer']:
    print('============ Running expressed', disease_state)
    run_pair_enrichment(disease_state, save_prefix='expr_', geneset_dict = expr_genesets)

num genes in geneset 4669
subsetting data, (17301, 1057) (17301,)
starting iteration
estimated count 1058
*****iterating tissue,  cancer




*****iterating tissue,  normal
ZNF143 normal 1000
saved file ../data/processed/fig4_modelling/tf_tf_pairs_scc/expr_normal_single.csv
num single unique TFs 122
num pairs of TFs 7381
num genes in geneset 4669
subsetting data, (17301, 1057) (17301,)
starting iteration
estimated count 7381
*****iterating tissue,  normal
BCL11A::NR3C1 normal 1000
DLX1::ZBTB7B normal 2000
FOSL2::ZNF449 normal 3000
HOXA10::NFATC1 normal 4000
JUNB::TBX3 normal 5000
NR3C1::ZNF680 normal 6000
ZBTB18::ZNF680 normal 7000
saved file ../data/processed/fig4_modelling/tf_tf_pairs_scc/expr_normal_pair.csv
num sig tf-tf pair in  normal (7381, 14)
num genes in geneset 3916
subsetting data, (14682, 1057) (14682,)
starting iteration
estimated count 1058
*****iterating tissue,  cancer
*****iterating tissue,  normal
ZNF143 normal 1000
saved file ../data/processed/fig4_modelling/tf_tf_pairs_scc/expr_cancer_single.csv
num single unique TFs 176
num pairs of TFs 15400
num genes in geneset 3916
subsetting data, (14682, 1057) (146

In [87]:
%%time
# get differentially expressed geneset enrichments
# Same pipeline as the 'expr_' run, but on the differential gene sets; the result CSVs are
# written to save_dir with the 'diff_' prefix. The progress message names the right gene set.
for disease_state in ['normal', 'cancer']:
    print('============ Running differential', disease_state)
    run_pair_enrichment(disease_state, save_prefix='diff_', geneset_dict = diff_genesets)

num genes in geneset 221
subsetting data, (660, 1057) (660,)
starting iteration
estimated count 1058
*****iterating tissue,  cancer




*****iterating tissue,  normal
ZNF143 normal 1000
saved file ../data/processed/fig4_modelling/tf_tf_pairs_scc/diff_normal_single.csv
num single unique TFs 59
num pairs of TFs 1711
num genes in geneset 221
subsetting data, (660, 1057) (660,)
starting iteration
estimated count 1711
*****iterating tissue,  normal
KLF8::ZNF449 normal 1000
saved file ../data/processed/fig4_modelling/tf_tf_pairs_scc/diff_normal_pair.csv
num sig tf-tf pair in  normal (1711, 14)
num genes in geneset 48
subsetting data, (173, 1057) (173,)
starting iteration
estimated count 1058
*****iterating tissue,  cancer
*****iterating tissue,  normal
ZNF143 normal 1000
saved file ../data/processed/fig4_modelling/tf_tf_pairs_scc/diff_cancer_single.csv
num single unique TFs 20
num pairs of TFs 190
num genes in geneset 48
subsetting data, (173, 1057) (173,)
starting iteration
estimated count 190
*****iterating tissue,  cancer
saved file ../data/processed/fig4_modelling/tf_tf_pairs_scc/diff_cancer_pair.csv
num sig tf-tf pair i

In [94]:
# %%time
# # get group tissue geneset enrichments
# for tissue in normal_tissues:
#     print('============ Running group', tissue)
#     run_pair_enrichment(tissue, save_prefix='group_', geneset_dict = group_genesets)

# 4. Curating list of tf-tf pairs

# 5. filter curated list to find combinations that are close together

In [95]:
# List every '*_filt.csv' result in save_dir with its (rows, columns) shape.
for filepath in sorted(glob.glob(os.path.join(save_dir, '*_filt.csv'))):
    df = pd.read_csv(filepath, index_col=0)
    print(os.path.basename(filepath), df.shape)

diff_cancer_pair_filt.csv (190, 14)
diff_normal_pair_filt.csv (1711, 14)
expr_cancer_pair_filt.csv (15400, 14)
expr_normal_pair_filt.csv (7381, 14)


In [96]:
# get tissue specific tf-tf and global tf-tf pairs useful for expr
# Restored from commented-out code: the two '*_pair_filt_unique.csv' cells further down read
# `tf_tf_pair_counter`, which no live cell defined, so a fresh kernel failed with NameError.
expr_filt_frames = []
for filepath in sorted(glob.glob(os.path.join(save_dir, 'expr*_filt.csv'))):
    df = pd.read_csv(filepath, index_col=0)
    print(os.path.basename(filepath), df.shape)
    expr_filt_frames.append(df)

# Concatenate once instead of growing a DataFrame inside the loop; fall back to an empty
# frame with a `tf_key` column when no file matched, since pd.concat([]) raises.
if expr_filt_frames:
    expr_all_tissue_df = pd.concat(expr_filt_frames, axis=0, sort=False)
else:
    expr_all_tissue_df = pd.DataFrame(columns=['tf_key'])

# Number of rows per tf_key across all expr result files.
tf_tf_pair_counter = expr_all_tissue_df.tf_key.value_counts()


expr_cancer_pair_filt.csv (15400, 14)
expr_normal_pair_filt.csv (7381, 14)


In [99]:
# expr_all_tissue_df.expected.describe()

In [100]:
# print(expr_all_tissue_df.shape)
# expr_all_tissue_df[:5]

(22781, 14)


Unnamed: 0,tf_key,tissue,jaccard,intersect_over_min,intersection,union,num_in_gene,num_in_feat,observed,expected,oddsratio,pval,pval_bonf,log_pval_bonf
0,ALX4::ARNT,cancer,0.001243,0.93358,8321.0,6693667.0,6693075.0,8913.0,8321.0,8132.117685,1.348018,1.343369e-13,2.068788e-09,8.684284
1,ALX4::ARNTL,cancer,0.00092,0.941149,6157.0,6693460.0,6693075.0,6542.0,6157.0,5968.844821,1.532408,2.913817e-18,4.487279e-14,13.348017
2,ALX4::BATF3,cancer,0.000307,1.0,2052.0,6693075.0,6693075.0,2052.0,2052.0,1872.220968,197.198979,3.121098e-80,4.8064910000000004e-76,75.318172
3,ALX4::CBFB,cancer,0.003064,0.925374,20510.0,6694729.0,6693075.0,22164.0,20510.0,20222.176188,1.190653,1.531203e-12,2.358052e-08,7.627447
4,ALX4::CEBPZ,cancer,0.001969,0.928763,13181.0,6694086.0,6693075.0,14192.0,13181.0,12948.615974,1.251284,7.686594e-13,1.183735e-08,7.926745


In [101]:
# # number of vocabularies for cancer
# tf_tf_pair_counter = expr_all_tissue_df.tf_key.value_counts()
# len(set(tf_tf_pair_counter.index))

22781

In [102]:
# Create the filtered differential datasets: keep only the tf_key values that occur exactly
# once in `tf_tf_pair_counter`, and write them as 'diff_<group>_pair_filt_unique.csv'.
# NOTE(review): `tf_tf_pair_counter` must already exist in the kernel. Judging by the recorded
# outputs earlier in this section it was built from the expr*_filt.csv files, so differential
# pairs that are absent from those files are dropped here as well -- confirm this is intended.
for filepath in sorted(glob.glob(os.path.join(save_dir, 'diff*_filt.csv'))):
    tissue = os.path.basename(filepath).split('_')[1]
    df = pd.read_csv(filepath, index_col=0)
    df_filt = df[df.tf_key.isin(tf_tf_pair_counter[tf_tf_pair_counter==1].index)]
    print(tissue, df.shape[0], df_filt.shape[0])
    df_filt.to_csv(os.path.join(save_dir, 'diff_'+tissue+'_pair_filt_unique.csv'))

cancer 190 190
normal 1711 1278


In [103]:
# Create the filtered expression datasets: keep only the tf_key values that occur exactly
# once in `tf_tf_pair_counter`, and write them as 'expr_<group>_pair_filt_unique.csv'.
# NOTE(review): `tf_tf_pair_counter` must already exist in the kernel before this cell runs.
for filepath in sorted(glob.glob(os.path.join(save_dir, 'expr*_filt.csv'))):
    tissue = os.path.basename(filepath).split('_')[1]
    df = pd.read_csv(filepath, index_col=0)
    df_filt = df[df.tf_key.isin(tf_tf_pair_counter[tf_tf_pair_counter==1].index)]
    print(tissue, df.shape[0], df_filt.shape[0])
    df_filt.to_csv(os.path.join(save_dir, 'expr_'+tissue+'_pair_filt_unique.csv'))

cancer 15400 15400
normal 7381 7381


In [116]:
# def check_tf_tf_loop_occur(tf_tf_pair, tissue, genesets_dict, data_all=data_all, tissues_label=tissues_label):  #suffixes='loop', 'pro_loop'
#     tf_1, tf_2 = tf_tf_pair.split('::')
    
#     # get features to look at together, list of lists
#     feat_list_type = {
#         'pro_pro':[tf_1 + '_pro', tf_2+'_pro'],
#         'loop_loop':[tf_1 + '_loop', tf_2+'_loop'],
#         'pro_loop':[tf_1 + '_pro', tf_2+'_loop'],
#         'loop_pro':[tf_1 + '_loop', tf_2+'_pro']}
  
#     data_all_sel = data_all[data_all.index.isin(genesets_dict[tissue]) & (tissues_label==tissue)]
#     feat_type_df = {}
#     for type_feat, feat_list in feat_list_type.items():
#         try:
#             check_series = pd.DataFrame(data_all_sel[feat_list]>0).all(axis=1)
#             feat_type_df[type_feat+"_count"] = check_series.value_counts()[True]  
#             feat_type_df[type_feat+"_genes"] = '|'.join(check_series.index[check_series])
            
#         except:
#             feat_type_df[type_feat+"_count"] = 0
#             feat_type_df[type_feat+"_genes"] = ''
            
#     return feat_type_df

def check_tf_tf_loop_occur_cancer(tf_tf_pair, tissue, genesets_dict, data_all=data_all, cancer_label = is_cancer_label, tissues_label=tissues_label):  #suffixes='loop', 'pro_loop'
    """Count the rows of one tissue in which both TFs of a pair have a positive feature value.

    Only rows of ``data_all`` whose index is in ``genesets_dict['cancer']`` and whose
    entry in ``tissues_label`` equals ``tissue`` are considered.

    Parameters
    ----------
    tf_tf_pair : str
        Pair key of the form ``'TF1::TF2'``.
    tissue : str
        Label compared against ``tissues_label`` to select rows.
    genesets_dict : dict
        Must contain the key ``'cancer'``; its value is used with ``Index.isin``.
    data_all : pandas.DataFrame
        Feature table with ``'<TF>_pro'`` and ``'<TF>_loop'`` columns
        (presumably indexed by gene symbol -- TODO confirm against the cell that builds it).
    cancer_label :
        Accepted for interface compatibility; not used by this function.
    tissues_label :
        Per-row tissue labels, compared element-wise with ``tissue``
        (assumed to be aligned with the rows of ``data_all``).

    Returns
    -------
    dict
        For each of ``'pro_pro'``, ``'loop_loop'``, ``'pro_loop'``, ``'loop_pro'``:
        ``'<type>_count'`` (number of selected rows where both features are > 0) and
        ``'<type>_genes'`` (the index labels of those rows joined with ``'|'``).
        Count is 0 and genes is ``''`` when no row qualifies or when either feature
        column is absent from ``data_all``.

    Notes
    -----
    The default values of ``data_all``, ``cancer_label`` and ``tissues_label`` are
    evaluated once, when this ``def`` is executed; re-run the definition cell after
    rebuilding those globals.
    """
    tf_1, tf_2 = tf_tf_pair.split('::')

    # get features to look at together, list of lists
    feat_list_type = {
        'pro_pro':[tf_1 + '_pro', tf_2+'_pro'],
        'loop_loop':[tf_1 + '_loop', tf_2+'_loop'],
        'pro_loop':[tf_1 + '_pro', tf_2+'_loop'],
        'loop_pro':[tf_1 + '_loop', tf_2+'_pro']}

    data_all_sel = data_all[data_all.index.isin(genesets_dict['cancer']) & (tissues_label==tissue)]
    feat_type_df = {}
    for type_feat, feat_list in feat_list_type.items():
        count, genes = 0, ''
        # A TF with no '_pro'/'_loop' column cannot co-occur with its partner; this is
        # one of the two cases the former bare `except:` was silently covering.
        if all(feat in data_all_sel.columns for feat in feat_list):
            both_present = (data_all_sel[feat_list] > 0).all(axis=1)
            # .sum() is 0 when no row qualifies, whereas value_counts()[True] raised
            # KeyError (the other case the bare `except:` was covering).
            count = int(both_present.sum())
            genes = '|'.join(both_present.index[both_present])
        feat_type_df[type_feat+"_count"] = count
        feat_type_df[type_feat+"_genes"] = genes

    return feat_type_df

# testing 

In [122]:
# Trial run on a single tissue: for every tf-tf pair listed in diff_cancer_pair_filt.csv,
# collect the pro/loop co-occurrence counts and gene lists in SCC13-CTRLi.
df = pd.read_csv(os.path.join(save_dir, 'diff_cancer_pair_filt.csv'), index_col=0)
tf_tf_pair_feat = {}
for tf_tf_pair in df.tf_key:
    tf_tf_pair_feat[tf_tf_pair] = check_tf_tf_loop_occur_cancer(tf_tf_pair, tissue='SCC13-CTRLi',genesets_dict=diff_genesets)
# one row per tf-tf pair, one column per '<type>_count' / '<type>_genes' entry
df_tf_key_type = pd.DataFrame.from_dict(tf_tf_pair_feat,orient='index')


In [121]:
Counter(tissues_label)

Counter({'A431-CTRLi': 12448,
         'CAL27-CTRLi': 13213,
         'GDSD0': 9430,
         'GDSD3': 9587,
         'SCC13-CTRLi': 13547})

In [123]:
df_tf_key_type.loop_loop_count.describe()

count    190.000000
mean       7.615789
std        8.309793
min        0.000000
25%        0.000000
50%        6.000000
75%       12.000000
max       33.000000
Name: loop_loop_count, dtype: float64

In [124]:
# Pick the first tf-tf pair that has at least one pro_pro gene as a worked example;
# `row` (a Series named after the pair) is read again by a later inspection cell.
row = df_tf_key_type[df_tf_key_type.pro_pro_count>0].iloc[0]
row

pro_pro_count                                                      2
pro_pro_genes                                        DYNC1LI2|MAPK12
loop_loop_count                                                   33
loop_loop_genes    AP5S1|ARGLU1|BAX|CASP2|CCDC142|CHMP3|CTCF|CTDS...
pro_loop_count                                                     2
pro_loop_genes                                       DYNC1LI2|MAPK12
loop_pro_count                                                     2
loop_pro_genes                                       DYNC1LI2|MAPK12
Name: SP2::SP4, dtype: object

In [125]:
list(df_tf_key_type[df_tf_key_type.pro_pro_count>0].index)

['SP2::SP4',
 'SP2::WT1',
 'SP2::ZNF341',
 'SP4::WT1',
 'SP4::ZNF341',
 'WT1::ZNF324',
 'WT1::ZNF341']

In [138]:
glob.glob('../data/interim/annon/anchor_motif_scan/*bed')

['../data/interim/annon/anchor_motif_scan/Esophageal_annon.bed',
 '../data/interim/annon/anchor_motif_scan/Thyroid_annon.bed',
 '../data/interim/annon/anchor_motif_scan/A431-CTRLi_annon.bed',
 '../data/interim/annon/anchor_motif_scan/GDSD3_annon.bed',
 '../data/interim/annon/anchor_motif_scan/Renal_annon.bed',
 '../data/interim/annon/anchor_motif_scan/GM12878_annon.bed',
 '../data/interim/annon/anchor_motif_scan/Prostate_annon.bed',
 '../data/interim/annon/anchor_motif_scan/HMEC_annon.bed',
 '../data/interim/annon/anchor_motif_scan/Ovarian_annon.bed',
 '../data/interim/annon/anchor_motif_scan/Pancreas_annon.bed',
 '../data/interim/annon/anchor_motif_scan/Uterine_annon.bed',
 '../data/interim/annon/anchor_motif_scan/Melanocytes_annon.bed',
 '../data/interim/annon/anchor_motif_scan/Bladder_annon.bed',
 '../data/interim/annon/anchor_motif_scan/Astrocytes_annon.bed',
 '../data/interim/annon/anchor_motif_scan/SCC13-CTRLi_annon.bed',
 '../data/interim/annon/anchor_motif_scan/CAL27-CTRLi_anno

In [137]:
glob.glob('../data/interim/annon/promoter_motif_scan/*bed')

['../data/interim/annon/promoter_motif_scan/promoter_Melanocytes_annon.bed',
 '../data/interim/annon/promoter_motif_scan/promoter_Astrocytes_annon.bed',
 '../data/interim/annon/promoter_motif_scan/promoter_WM2664-SCR-DMSO_annon.bed',
 '../data/interim/annon/promoter_motif_scan/promoter_GM12878_annon.bed',
 '../data/interim/annon/promoter_motif_scan/promoter_CAL27-CTRLi_annon.bed',
 '../data/interim/annon/promoter_motif_scan/promoter_Thyroid_annon.bed',
 '../data/interim/annon/promoter_motif_scan/promoter_HMEC_annon.bed',
 '../data/interim/annon/promoter_motif_scan/promoter_SCC13-CTRLi_annon.bed',
 '../data/interim/annon/promoter_motif_scan/promoter_Renal_annon.bed',
 '../data/interim/annon/promoter_motif_scan/promoter_Prostate_annon.bed',
 '../data/interim/annon/promoter_motif_scan/promoter_GDSD3_annon.bed',
 '../data/interim/annon/promoter_motif_scan/promoter_Pancreas_annon.bed',
 '../data/interim/annon/promoter_motif_scan/promoter_Colon_annon.bed',
 '../data/interim/annon/promoter_mo

In [144]:
%%time
# data_all_sel = data_all[data_all.index.isin(unique_genesets['Prostate']) & (tissues_label=='Prostate')]
# vocab_tissue_df = pd.DataFrame()
# foot_df = pd.read_csv(os.path.join('../data/interim/annon/anchor_motif_scan/','SCC13-CTRLi_annon.bed'),sep='\t',header=None)

# Load the SCC13-CTRLi promoter motif-scan BED file (tab-separated, no header row)
# and name its ten columns; `foot_df` is the global that the inspection cells read.
foot_df = pd.read_csv(os.path.join('../data/interim/annon/promoter_motif_scan/','promoter_SCC13-CTRLi_annon.bed'),sep='\t',header=None)
foot_df.columns = ['chr','start','stop','TSS','chr_m','start_m','stop_m','id_trim','score','strand']

CPU times: user 3.98 s, sys: 140 ms, total: 4.13 s
Wall time: 2.71 s


In [141]:
foot_df

Unnamed: 0,chr,start,stop,TSS,chr_m,start_m,stop_m,id_trim,score,strand
0,chr1,710000,715000,chr1_710000_715000,chr1,713913,713918,MYBB_HUMAN.H11MO.0.D.pwm.trim,5.824923,-
1,chr1,710000,715000,chr1_710000_715000,chr1,713913,713917,ZN589_HUMAN.H11MO.0.D.pwm.trim,4.334007,-
2,chr1,710000,715000,chr1_710000_715000,chr1,713913,713918,MYBB_HUMAN.H11MO.0.D.pwm.trim,5.824923,-
3,chr1,710000,715000,chr1_710000_715000,chr1,713913,713917,ZN589_HUMAN.H11MO.0.D.pwm.trim,4.334007,-
4,chr1,710000,715000,chr1_710000_715000,chr1,713916,713931,RXRA_HUMAN.H11MO.0.A.pwm.trim,10.867212,+
...,...,...,...,...,...,...,...,...,...,...
74165985,chrY,59030000,59035000,chrY_59030000_59035000,chrY,59030117,59030127,RFX1_HUMAN.H11MO.1.B.pwm.trim,10.920844,-
74165986,chrY,59030000,59035000,chrY_59030000_59035000,chrY,59030117,59030127,RFX1_HUMAN.H11MO.1.B.pwm.trim,10.920844,-
74165987,chrY,59030000,59035000,chrY_59030000_59035000,chrY,59030120,59030123,KLF8_HUMAN.H11MO.0.C.pwm.trim,3.409391,+
74165988,chrY,59030000,59035000,chrY_59030000_59035000,chrY,59030120,59030123,KLF8_HUMAN.H11MO.0.C.pwm.trim,3.409391,+


In [143]:
# Inspect the motif hits of one tf-tf pair (`row`) within the genes of its loop_loop set.
tf_pair = row.name
tf_arr = tf_pair.split('::')
# genes_pro_only = row['pro_pro_genes'].split('|')
genes_pro_only = row['loop_loop_genes'].split('|')
print(genes_pro_only)
# .copy() gives an independent frame, so adding the 'tf' column below does not write
# into a slice of foot_df (which is what raises pandas' SettingWithCopyWarning).
foot_df_sel = foot_df[foot_df.TSS.isin(genes_pro_only)].copy()
print(foot_df_sel.shape)
foot_df_sel['tf'] = foot_df_sel.id_trim.map(tf_id_to_name_dict)
print(foot_df_sel.shape)
# keep only the motif rows that belong to one of the two TFs of the pair
foot_df_sel = foot_df_sel[foot_df_sel.tf.isin(tf_arr)].drop_duplicates()
# print the same gene set that was used for the selection above
print(tf_pair, row['loop_loop_genes'])
# print(tf_pair, row['pro_pro_genes'])
print(foot_df_sel)

['AP5S1', 'ARGLU1', 'BAX', 'CASP2', 'CCDC142', 'CHMP3', 'CTCF', 'CTDSP1', 'CUEDC1', 'DNAJA2', 'DNAJC7', 'DYNC1LI2', 'FUNDC2', 'GOLGA2', 'IFI16', 'KLHL9', 'MAPK12', 'MIDN', 'MXRA7', 'NDUFB11', 'NRIP3', 'POU2F1', 'RHPN2', 'SMG1', 'SPATA33', 'STAT3', 'SUOX', 'SUPT4H1', 'SUPT6H', 'SUV39H1', 'TMCO4', 'WDR45B', 'ZC3H18']
(0, 10)
(0, 11)
SP2::SP4 AP5S1|ARGLU1|BAX|CASP2|CCDC142|CHMP3|CTCF|CTDSP1|CUEDC1|DNAJA2|DNAJC7|DYNC1LI2|FUNDC2|GOLGA2|IFI16|KLHL9|MAPK12|MIDN|MXRA7|NDUFB11|NRIP3|POU2F1|RHPN2|SMG1|SPATA33|STAT3|SUOX|SUPT4H1|SUPT6H|SUV39H1|TMCO4|WDR45B|ZC3H18
Empty DataFrame
Columns: [chr, start, stop, TSS, chr_m, start_m, stop_m, id_trim, score, strand, tf]
Index: []


In [131]:
# filter by those that have pairs 
# foot_df = pd.read_csv(os.path.join('../data/interim/annon/promoter_motif_scan/','promoter_Prostate_annon.bed'),sep='\t',header=None)
                      
# foot_df.columns = ['chr','start','stop','TSS','chr_m','start_m','stop_m','id_trim','score','strand']
# Narrow foot_df to the CYP4B1 rows. NOTE: this overwrites foot_df, so the cell is not
# idempotent -- reload the motif-scan file before inspecting another gene.
# .copy() makes the subset an independent frame so the 'tf' column can be added
# without pandas' SettingWithCopyWarning.
foot_df = foot_df[foot_df.TSS.isin(['CYP4B1'])].copy()
foot_df['tf'] = foot_df.id_trim.map(tf_id_to_name_dict)
# show the de-duplicated motif hits of the two TFs of interest
foot_df[foot_df.tf.isin(['NFATC4','ZNF667'])].drop_duplicates()


Unnamed: 0,chr,start,stop,TSS,chr_m,start_m,stop_m,id_trim,score,strand,tf
132457,chr1,47211510,47214010,CYP4B1,chr1,47213569,47213574,NFAC4_HUMAN.H11MO.0.C.pwm.trim,8.701374,+,NFATC4
132461,chr1,47211510,47214010,CYP4B1,chr1,47213580,47213597,ZN667_HUMAN.H11MO.0.C.pwm.trim,11.091512,+,ZNF667


In [132]:
# Keep only the de-duplicated NFATC4 / ZNF667 motif rows. This overwrites foot_df and
# requires that its 'tf' column has already been added.
foot_df = foot_df[foot_df.tf.isin(['NFATC4','ZNF667'])].drop_duplicates()
foot_df


Unnamed: 0,chr,start,stop,TSS,chr_m,start_m,stop_m,id_trim,score,strand,tf
132457,chr1,47211510,47214010,CYP4B1,chr1,47213569,47213574,NFAC4_HUMAN.H11MO.0.C.pwm.trim,8.701374,+,NFATC4
132461,chr1,47211510,47214010,CYP4B1,chr1,47213580,47213597,ZN667_HUMAN.H11MO.0.C.pwm.trim,11.091512,+,ZNF667


# loop through all of the tf-tf pairs and find the locations in the promoter regions

In [133]:
df_tf_key_type[:5]

Unnamed: 0,pro_pro_count,pro_pro_genes,loop_loop_count,loop_loop_genes,pro_loop_count,pro_loop_genes,loop_pro_count,loop_pro_genes
EGR2::EGR3,0,,0,,0,,0,
EGR2::EGR4,0,,0,,0,,0,
EGR2::ELF3,0,,0,,0,,0,
EGR2::FLI1,0,,0,,0,,0,
EGR2::FOXJ3,0,,0,,0,,0,


In [149]:
def get_vocab_cancer(tf_tf_pair_file, tissues, genesets_dict, type_loop='pro_pro', savepath=None, save_loop_count=None):
    """Locate, per tissue, the promoter motif hits of every tf-tf pair (vocab) in a file.

    For each tissue (processed in sorted order) the function
      1. reads the pair table ``tf_tf_pair_file`` (needs a ``tf_key`` column),
      2. calls ``check_tf_tf_loop_occur_cancer`` for every pair,
      3. keeps the pairs whose ``'<type_loop>_count'`` is > 0,
      4. reads ``promoter_<tissue>_annon.bed`` from the promoter motif-scan folder and,
         for each kept pair, selects the de-duplicated motif rows whose ``TSS`` is in
         the pair's ``'<type_loop>_genes'`` and whose TF is one of the two TFs.

    Parameters
    ----------
    tf_tf_pair_file : str
        Path of the CSV with the tf-tf pairs.
    tissues : iterable of str
        Tissue names; each is used as the tissue label and in the BED file name.
    genesets_dict : dict
        Passed through to ``check_tf_tf_loop_occur_cancer``.
    type_loop : {'pro_pro', 'loop_loop', 'pro_loop'}
        Which co-occurrence type selects the pairs and genes.
    savepath : str or None
        Path prefix. When given, one ``<savepath><tissue>_<type_loop>_vocab_info.csv``
        is written per tissue and nothing is returned.
    save_loop_count : str or None
        Path prefix. When given, the per-pair co-occurrence table is written to
        ``<save_loop_count><tissue>_loop_type.csv``.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when ``savepath`` is given; otherwise the motif rows of all tissues
        stacked together (an empty DataFrame if no tissue produced any).

    Raises
    ------
    ValueError
        If ``type_loop`` is not one of the accepted values (raised after the pair
        table of the first non-empty tissue has been processed, as before).

    Notes
    -----
    NOTE(review): the motif locations always come from the *promoter* motif scan,
    whatever ``type_loop`` is -- confirm this is intended for 'loop_loop'.
    """
    tissue_frames = []  # per-tissue results, only filled when savepath is None
    for tissue in sorted(tissues):
        print('====== STARTING ',tissue, tf_tf_pair_file)

        # read file
        df = pd.read_csv(tf_tf_pair_file, index_col=0)
        if df.shape[0]==0:
            print('EXIT: no data', tf_tf_pair_file)
            continue
        print('num vocab found:', df.shape[0])

        # get tf tf pair types
        tf_tf_pair_feat = {}
        for tf_tf_pair in df.tf_key:
            tf_tf_pair_feat[tf_tf_pair] = check_tf_tf_loop_occur_cancer(tf_tf_pair, tissue, genesets_dict)
        df_tf_key_type = pd.DataFrame.from_dict(tf_tf_pair_feat,orient='index')

        # get pro_pro OR loop_loop or looponly (PART TO CHANGE IN FUTURE)
        if type_loop not in ['pro_pro','loop_loop','pro_loop']:
            print('vocab type not correct', type_loop)
            # a bare `raise` here had no active exception to re-raise
            raise ValueError('vocab type not correct: {!r}'.format(type_loop))
        if save_loop_count is not None:
            print('saving loop type info', save_loop_count+tissue+'_loop_type.csv')
            df_tf_key_type.to_csv(save_loop_count+tissue+'_loop_type.csv')
        df_tf_filt = df_tf_key_type[df_tf_key_type[type_loop+'_count']>0]

        # get locations of vocab points
        if df_tf_filt.shape[0]==0:
            # report the function's own input file (previously an unrelated global `filepath`)
            print('EXIT: no tf tf pairs in loop type: ', type_loop, tf_tf_pair_file)
            continue
        print('num tf-tf pairs in loop type found', type_loop,  df_tf_filt.shape[0])

        foot_df = pd.read_csv(os.path.join('../data/interim/annon/promoter_motif_scan/','promoter_'+tissue+'_annon.bed'),sep='\t',header=None)
        foot_df.columns = ['chr','start','stop','TSS','chr_m','start_m','stop_m','id_trim','score','strand']
        # Map motif ids to TF names once per tissue instead of once per pair on a slice
        # (the per-slice assignment was the source of the SettingWithCopyWarning).
        foot_df['tf'] = foot_df['id_trim'].map(tf_id_to_name_dict)

        # Collect the per-pair frames and concatenate once: growing a DataFrame with
        # pd.concat inside the loop copies all previous rows on every iteration.
        pair_frames = []
        for tf_pair, row in df_tf_filt.iterrows():
            tf_arr = tf_pair.split('::')
            pair_genes = row[type_loop+'_genes']
            in_pair = foot_df['TSS'].isin(pair_genes.split('|')) & foot_df['tf'].isin(tf_arr)
            foot_df_sel = foot_df[in_pair].drop_duplicates().copy()
            foot_df_sel['vocab'] = tf_pair
            foot_df_sel['genes'] = pair_genes
            pair_frames.append(foot_df_sel)
        # df_tf_filt is non-empty here, so pair_frames holds at least one frame
        vocab_tissue_df = pd.concat(pair_frames,axis=0,sort=False)

        vocab_tissue_df['tissue'] = tissue
        print('vocab_tissue_df length: ', vocab_tissue_df.shape[0], len(vocab_tissue_df.vocab.unique()))
        if savepath is not None:
            print('saved', savepath+tissue+"_"+type_loop+'_vocab_info.csv')
            vocab_tissue_df.to_csv(savepath+tissue+"_"+type_loop+'_vocab_info.csv')
        else:
            tissue_frames.append(vocab_tissue_df)

    if savepath is not None:
        return None
    if not tissue_frames:
        # every tissue was skipped: keep returning an empty frame (pd.concat([]) would raise)
        return pd.DataFrame()
    return pd.concat(tissue_frames,axis=0,sort=False)

Vocab locations for the unique tf-tf pairs (the `*_filt_unique.csv` files)

In [150]:
# %%time 
# vocab_df = get_vocab(glob.glob(os.path.join(save_dir, 'unique*_filt.csv')),unique_genesets,
#                      type_loop='pro_pro',save_loop_count=os.path.join(save_dir, 'unique_'))
# vocab_df.to_csv(os.path.join(save_dir, 'unique_alltissue_pro_pro_vocab_info.csv'))
# vocab_df.shape

In [151]:
# %%time 
# vocab_df = get_vocab(glob.glob(os.path.join(save_dir, 'unique*_filt.csv')),unique_genesets,type_loop='pro_loop')
# vocab_df.to_csv(os.path.join(save_dir, 'unique_alltissue_pro_loop_vocab_info.csv'))
# vocab_df.shape


In [152]:
# %%time 
# vocab_df = get_vocab(glob.glob(os.path.join(save_dir, 'unique*_filt.csv')),unique_genesets,type_loop='loop_loop')
# vocab_df.to_csv(os.path.join(save_dir, 'unique_alltissue_loop_loop_vocab_info.csv'))
# vocab_df.shape


In [153]:
glob.glob(os.path.join(save_dir, '*_filt_unique.csv'))

['../data/processed/fig4_modelling/tf_tf_pairs_scc/diff_cancer_pair_filt_unique.csv',
 '../data/processed/fig4_modelling/tf_tf_pairs_scc/diff_normal_pair_filt_unique.csv',
 '../data/processed/fig4_modelling/tf_tf_pairs_scc/expr_cancer_pair_filt_unique.csv',
 '../data/processed/fig4_modelling/tf_tf_pairs_scc/expr_normal_pair_filt_unique.csv']

In [154]:
cancer_tissues

['A431-CTRLi', 'CAL27-CTRLi', 'SCC13-CTRLi']

In [156]:
%%time 
# Motif locations for the pairs in diff_cancer_pair_filt_unique.csv, using the default
# type_loop ('pro_pro'). One '<tissue>_pro_pro_vocab_info.csv' is written per cancer
# tissue under the 'diff_' prefix; because savepath is given the call returns None,
# so vocab_df_diff ends up holding None.
vocab_df_diff = get_vocab_cancer(
    tf_tf_pair_file='../data/processed/fig4_modelling/tf_tf_pairs_scc/diff_cancer_pair_filt_unique.csv',
    tissues=cancer_tissues,
    genesets_dict=diff_genesets,
    savepath=os.path.join(save_dir, 'diff_'),
)
# vocab_df_expr.to_csv(os.path.join(save_dir, 'expr_alltissue_vocab_info.csv'))

num vocab found: 190
num tf-tf pairs in loop type found pro_pro 5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


vocab_tissue_df length:  39 5
saved ../data/processed/fig4_modelling/tf_tf_pairs_scc/diff_A431-CTRLi_pro_pro_vocab_info.csv
num vocab found: 190
num tf-tf pairs in loop type found pro_pro 21
vocab_tissue_df length:  322 21
saved ../data/processed/fig4_modelling/tf_tf_pairs_scc/diff_CAL27-CTRLi_pro_pro_vocab_info.csv
num vocab found: 190
num tf-tf pairs in loop type found pro_pro 7
vocab_tissue_df length:  158 7
saved ../data/processed/fig4_modelling/tf_tf_pairs_scc/diff_SCC13-CTRLi_pro_pro_vocab_info.csv
CPU times: user 21.4 s, sys: 1.36 s, total: 22.7 s
Wall time: 18.2 s


In [157]:
%%time 
# Same pair file as the pro_pro run, but selecting pairs and genes by 'loop_loop'
# co-occurrence. The per-pair co-occurrence table is also saved per tissue
# ('diff_<tissue>_loop_type.csv'). Because savepath is given the call returns None.
vocab_df_diff_loop_loop = get_vocab_cancer(
    tf_tf_pair_file='../data/processed/fig4_modelling/tf_tf_pairs_scc/diff_cancer_pair_filt_unique.csv',
    tissues=cancer_tissues,
    genesets_dict=diff_genesets,
    type_loop='loop_loop',
    savepath=os.path.join(save_dir, 'diff_'),
    save_loop_count=os.path.join(save_dir, 'diff_'),
)
# vocab_df_expr.to_csv(os.path.join(save_dir, 'expr_alltissue_vocab_info.csv'))

num vocab found: 190
saving loop type info ../data/processed/fig4_modelling/tf_tf_pairs_scc/diff_A431-CTRLi_loop_type.csv
num tf-tf pairs in loop type found loop_loop 136


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


vocab_tissue_df length:  3170 130
saved ../data/processed/fig4_modelling/tf_tf_pairs_scc/diff_A431-CTRLi_loop_loop_vocab_info.csv
num vocab found: 190
saving loop type info ../data/processed/fig4_modelling/tf_tf_pairs_scc/diff_CAL27-CTRLi_loop_type.csv
num tf-tf pairs in loop type found loop_loop 91
vocab_tissue_df length:  1747 84
saved ../data/processed/fig4_modelling/tf_tf_pairs_scc/diff_CAL27-CTRLi_loop_loop_vocab_info.csv
num vocab found: 190
saving loop type info ../data/processed/fig4_modelling/tf_tf_pairs_scc/diff_SCC13-CTRLi_loop_type.csv
num tf-tf pairs in loop type found loop_loop 120
vocab_tissue_df length:  1376 104
saved ../data/processed/fig4_modelling/tf_tf_pairs_scc/diff_SCC13-CTRLi_loop_loop_vocab_info.csv
CPU times: user 48 s, sys: 772 ms, total: 48.8 s
Wall time: 44.7 s


In [160]:
%%time 
# Motif locations for the pairs in expr_cancer_pair_filt_unique.csv, using the default
# type_loop ('pro_pro'). Results are written per tissue under the 'expr_' prefix;
# because savepath is given the call returns None.
vocab_df_expr = get_vocab_cancer(
    tf_tf_pair_file='../data/processed/fig4_modelling/tf_tf_pairs_scc/expr_cancer_pair_filt_unique.csv',
    tissues=cancer_tissues,
    genesets_dict=expr_genesets,
    savepath=os.path.join(save_dir, 'expr_'),
)



num vocab found: 15400
num tf-tf pairs in loop type found pro_pro 6247


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


vocab_tissue_df length:  443772 6247
saved ../data/processed/fig4_modelling/tf_tf_pairs_scc/expr_A431-CTRLi_pro_pro_vocab_info.csv
num vocab found: 15400
num tf-tf pairs in loop type found pro_pro 6583
vocab_tissue_df length:  385874 6583
saved ../data/processed/fig4_modelling/tf_tf_pairs_scc/expr_CAL27-CTRLi_pro_pro_vocab_info.csv
num vocab found: 15400
num tf-tf pairs in loop type found pro_pro 6612
vocab_tissue_df length:  441040 6612
saved ../data/processed/fig4_modelling/tf_tf_pairs_scc/expr_SCC13-CTRLi_pro_pro_vocab_info.csv
num vocab found: 15400
saving loop type info ../data/processed/fig4_modelling/tf_tf_pairs_scc/expr_A431-CTRLi_loop_type.csv
num tf-tf pairs in loop type found loop_loop 10440


KeyboardInterrupt: 

In [161]:
%%time
# 'loop_loop' variant of the expression run. The per-pair co-occurrence table is also
# saved per tissue ('expr_<tissue>_loop_type.csv'). Because savepath is given the call
# returns None.
vocab_df_expr_loop_loop = get_vocab_cancer(
    tf_tf_pair_file='../data/processed/fig4_modelling/tf_tf_pairs_scc/expr_cancer_pair_filt_unique.csv',
    tissues=cancer_tissues,
    genesets_dict=expr_genesets,
    type_loop='loop_loop',
    savepath=os.path.join(save_dir, 'expr_'),
    save_loop_count=os.path.join(save_dir, 'expr_'),
)


num vocab found: 15400
saving loop type info ../data/processed/fig4_modelling/tf_tf_pairs_scc/expr_A431-CTRLi_loop_type.csv
num tf-tf pairs in loop type found loop_loop 10440


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

KeyboardInterrupt

