### Find TF-co-occurences for cellcluster/celltypes of WP3 (TOBIAS)

constant variables and imports

In [2]:
from tfcomb import CombObj
import os
import pathlib

'''
Constants for this script
'''

genome_path="../testdaten/hg19_masked.fa"
main_jaspar_file="../testdaten/JASPAR2020_CORE_vertebrates.meme" 

# path where market basket analyses for cluster are put.
result_path="./results/wp3/"

# folder of wp3, where clusters are
path_to_clusters="/mnt/workspace_stud/stud8/testordner/"
# identifier for the folder names, we need 
cluster_folder_identifier="cluster"

# create result folders 
if not os.path.exists(result_path):
     pathlib.Path(result_path).mkdir(parents=True, exist_ok=True)


Function definitions for Market Basket analyses + saving

In [3]:
def do_market_basket_analyses_for_cell_cluster(cell_cluster_name: str, bindetect_path: str, condition: str):
    '''
        Does market basket analyses with bindetect output of Tobias (WP3)
    '''
    comb = CombObj()
    # Condition wird bei WP3 festgelegt , nochmal abstimmen. Wie oft lassen sie Tobias laufen 
    comb.TFBS_from_TOBIAS(bindetect_path=bindetect_path, condition=condition, overwrite=False)
    
    print(f'Start market basket analyses for cell-cluster/type: {cell_cluster_name}')
    comb.market_basket(threads=10)
    if len(comb.rules) <= 0:
        print(f'Could not find TF-cooccurences for cell-cluster/type: {cell_cluster_name}')
        return
    print(f'Finished market basket analyses for cell-cluster/type: {cell_cluster_name}')
    print(f'Found rules: {len(comb.rules)}')
    comb.to_pickle(f'{result_path}{cell_cluster_name}.pkl')
    print(f'Saved: {result_path}{cell_cluster_name}.pkl')
   
    

In [3]:
comb = CombObj()
    # Condition wird bei WP3 festgelegt , nochmal abstimmen. Wie oft lassen sie Tobias laufen 
comb.TFBS_from_TOBIAS(bindetect_path='/mnt/workspace_stud/allstud/wp3/liver/snakemakeout/TFBS', condition='cluster6', overwrite=False)


INFO: Read 3095126 sites (838 unique names) from condition 'cluster6'


In [4]:
comb.market_basket(threads=10)
comb.rules

INFO: Setting up binding sites for counting
INFO: Counting co-occurrences within sites
INFO: Counting co-occurrence within background
INFO: Progress: 10%
INFO: Progress: 20%
INFO: Progress: 30%
INFO: Progress: 40%
INFO: Progress: 50%
INFO: Progress: 60%
INFO: Progress: 70%
INFO: Progress: 80%
INFO: Progress: 90%
INFO: Finished!
INFO: Done finding co-occurrences! Run .market_basket() to estimate significant pairs
INFO: Market basket analysis is done! Results are found in <CombObj>.rules


Unnamed: 0,TF1,TF2,TF1_TF2_count,TF1_count,TF2_count,cosine,zscore
PATZ1_MA1866.1-SP2_MA0516.3,PATZ1_MA1866.1,SP2_MA0516.3,39345,32341,27215,1.326199,205.385778
SP2_MA0516.3-PATZ1_MA1866.1,SP2_MA0516.3,PATZ1_MA1866.1,39345,27215,32341,1.326199,205.385778
KLF12_MA0742.2-SP2_MA0516.3,KLF12_MA0742.2,SP2_MA0516.3,33397,23427,27215,1.322650,245.585607
SP2_MA0516.3-KLF12_MA0742.2,SP2_MA0516.3,KLF12_MA0742.2,33397,27215,23427,1.322650,245.585607
KLF15_MA1513.1-SP2_MA0516.3,KLF15_MA1513.1,SP2_MA0516.3,34270,24890,27215,1.316732,258.646126
...,...,...,...,...,...,...,...
ZFP57_MA1583.1-VSX2_MA0726.1,ZFP57_MA1583.1,VSX2_MA0726.1,1,1137,921,0.000977,-3.123055
Lhx1_MA1518.2-ZBTB33_MA0527.1,Lhx1_MA1518.2,ZBTB33_MA0527.1,1,673,2271,0.000809,-4.736544
ZBTB33_MA0527.1-Lhx1_MA1518.2,ZBTB33_MA0527.1,Lhx1_MA1518.2,1,2271,673,0.000809,-4.736544
PHOX2A_MA0713.1-YY2_MA0748.2,PHOX2A_MA0713.1,YY2_MA0748.2,1,1482,3316,0.000451,-7.921224


In [7]:
comb2 = CombObj()
    # Condition wird bei WP3 festgelegt , nochmal abstimmen. Wie oft lassen sie Tobias laufen 
comb2.TFBS_from_TOBIAS(bindetect_path='/mnt/workspace_stud/allstud/wp3/liver/snakemakeout/TFBS', condition='cluster14', overwrite=False)
comb2.market_basket(threads=10)

INFO: Read 3971175 sites (838 unique names) from condition 'cluster14'
INFO: Setting up binding sites for counting
INFO: Counting co-occurrences within sites
INFO: Counting co-occurrence within background
INFO: Progress: 10%
INFO: Progress: 20%
INFO: Progress: 30%
INFO: Progress: 40%
INFO: Progress: 50%
INFO: Progress: 60%
INFO: Progress: 70%
INFO: Progress: 80%
INFO: Progress: 90%
INFO: Finished!
INFO: Done finding co-occurrences! Run .market_basket() to estimate significant pairs
INFO: Market basket analysis is done! Results are found in <CombObj>.rules


In [5]:
comb.TFBS

[chr1	827411	827423	KLF4_MA0039.4	7.31418	-,
 chr1	827415	827427	ZNF263_MA0528.2	7.09345	+,
 chr1	827422	827428	Foxn1_MA1684.1	6.85621	+,
 chr1	827423	827437	TFAP2C_MA0814.2	7.64654	-,
 chr1	827425	827436	TFAP2E_MA1569.1	8.04684	-,
 chr1	827428	827445	ZNF528_MA1597.1	4.42287	+,
 chr1	827429	827441	PATZ1_MA1866.1	7.22259	+,
 chr1	827430	827440	SP5_MA2032.1	9.23843	-,
 chr1	827430	827442	KLF4_MA0039.4	7.27117	-,
 chr1	827430	827442	ZNF148_MA1653.1	8.77817	-,
 chr1	827431	827440	KLF1_MA0493.2	8.27045	+,
 chr1	827431	827440	SP4_MA0685.2	8.55849	+,
 chr1	827431	827441	KLF5_MA0599.1	8.78328	-,
 chr1	827431	827442	MAZ_MA1522.1	10.18353	-,
 chr1	827442	827448	AhrArnt_MA0006.1	7.30452	+,
 chr1	827463	827477	SPIC_MA0687.1	7.69999	+,
 chr1	827468	827480	THAP1_MA0597.2	8.57619	+,
 chr1	827471	827486	NR1H4RXRA_MA1146.1	8.22879	+,
 chr1	827471	827486	NR2F1_MA1538.1	5.51047	-,
 chr1	827471	827486	NR2F6_MA1539.1	5.47065	+,
 chr1	827471	827486	NR4A2RXRA_MA1147.1	7.74239	+,
 chr1	827473	827483	MSANTD3_M

In [6]:
comb.TF_names

['ALX3_MA0634.1',
 'ARGFX_MA1463.1',
 'ARNT2_MA1464.1',
 'ARNTHIF1A_MA0259.1',
 'ASCL1_MA1100.2',
 'ASCL1_MA1631.1',
 'ATF2_MA1632.1',
 'ATF3_MA0605.2',
 'ATF4_MA0833.2',
 'ATF6_MA1466.1',
 'ATF7_MA0834.1',
 'ATOH7_MA1468.1',
 'AhrArnt_MA0006.1',
 'Alx1_MA0854.1',
 'Alx4_MA0853.1',
 'Ar_MA0007.3',
 'Arid3a_MA0151.1',
 'Arid3b_MA0601.1',
 'Arid5a_MA0602.1',
 'Arnt_MA0004.1',
 'Arntl_MA0603.1',
 'Arx_MA0874.1',
 'Ascl2_MA0816.1',
 'Atf1_MA0604.1',
 'Atf3_MA2036.1',
 'Atoh1_MA0461.2',
 'Atoh1_MA1467.2',
 'BACH1_MA1633.2',
 'BACH2_MA1101.2',
 'BACH2_MA1470.1',
 'BARHL1_MA0877.3',
 'BARHL2_MA0635.1',
 'BARX1_MA0875.1',
 'BARX2_MA1471.1',
 'BATF3_MA0835.2',
 'BATFJUN_MA0462.2',
 'BATF_MA1634.1',
 'BCL6B_MA0731.1',
 'BCL6_MA0463.2',
 'BHLHA15_MA0607.2',
 'BHLHE22_MA0818.2',
 'BHLHE22_MA1635.1',
 'BHLHE23_MA0817.1',
 'BHLHE40_MA0464.2',
 'BHLHE41_MA0636.1',
 'BNC2_MA2022.1',
 'BSX_MA0876.1',
 'Bach1Mafk_MA0591.1',
 'Bcl11B_MA2034.1',
 'Bhlha15_MA1472.2',
 'CDX1_MA0878.3',
 'CDX2_MA0465.2',
 'C

In [4]:
def read_in_folder_names_for_cluster():
    #glob()
    dirlist = [ item for item in os.listdir(path_to_clusters) if os.path.isdir(os.path.join(path_to_clusters, item))]
    cluster_names = []
    for folder in dirlist:
        if cluster_folder_identifier in folder:
            cluster_names.append(folder)
    return cluster_names




In [6]:
# Identifing the clusters to read from
# bsp. ["cluster10","cluster20" ....]
cluster_names = read_in_folder_names_for_cluster()
print(f"Clusters:{cluster_names}")

# bsp. ["/mnt/workspace_stud/stud8/testordner/cluster10/BINDetect/", ...]
cluster_bindetect_paths = []
for name in cluster_names:
    cluster_path = f"{path_to_clusters}{name}/BINDetect/"
    cluster_bindetect_paths.append(cluster_path)
    
print(f"Paths to bindetect folders of the clusters: {cluster_bindetect_paths}")


Clusters:['cluster10', 'cluster9', 'cluster3', 'cluster5', 'cluster7']
Paths to bindetect folders of the clusters: ['/mnt/workspace_stud/stud8/testordner/cluster10/BINDetect/', '/mnt/workspace_stud/stud8/testordner/cluster9/BINDetect/', '/mnt/workspace_stud/stud8/testordner/cluster3/BINDetect/', '/mnt/workspace_stud/stud8/testordner/cluster5/BINDetect/', '/mnt/workspace_stud/stud8/testordner/cluster7/BINDetect/']


In [9]:
for idx, path in enumerate(cluster_bindetect_paths):
    
    if not os.path.exists(path):
        print(f"No Bindetect folder for cluster: {cluster_names[idx]}. Continue with next.")
        continue
        
    do_market_basket_analyses_for_cell_cluster(cell_cluster_name=cluster_names[idx],
                                               bindetect_path=path,
                                               condition=f"{cluster_names[idx]}Scores")
    print(f"MB Done for cluster: {cluster_names[idx]}")
print("End market basket analyses")

INFO: Read 1008547 sites (746 unique names) from condition 'cluster10Scores'
Start market basket analyses for cell-cluster/type: cluster10
INFO: Setting up binding sites for counting
INFO: Counting co-occurrences within sites
INFO: Counting co-occurrence within background
INFO: Progress: 10%
INFO: Progress: 20%
INFO: Progress: 30%
INFO: Progress: 40%
INFO: Progress: 50%
INFO: Progress: 60%
INFO: Progress: 70%
INFO: Progress: 80%
INFO: Progress: 90%
INFO: Finished!
INFO: Done finding co-occurrences! Run .market_basket() to estimate significant pairs
INFO: Market basket analysis is done! Results are found in <CombObj>.rules
Finished market basket analyses for cell-cluster/type: cluster10
Found rules: 555800
Saved: ./results/wp3/cluster10.pkl
MB Done for cluster: cluster10
No Bindetect folder for cluster: cluster9. Continue with next.
No Bindetect folder for cluster: cluster3. Continue with next.
No Bindetect folder for cluster: cluster5. Continue with next.
No Bindetect folder for cluste