### Find TF co-occurrences for cell clusters/cell types of WP3 (TOBIAS)

Constants and imports

In [2]:
from tfcomb import CombObj, DiffCombObj
import os
import pathlib
import pandas as pd
import numpy as np

'''
Constants for this script
'''

genome_path="../testdaten/homo_sapiens.104.mainChr.fa"
main_jaspar_file="../testdaten/JASPAR2020_CORE_vertebrates.meme" 

# path where market basket analyses for cluster are put.
result_path="./results/wp3/"

# folder of wp3, where clusters are
path_to_tissues="/mnt/workspace_stud/allstud/wp3/"
#path_to_clusters="/mnt/workspace_stud/stud8/testordner/"

# identifier for the folder names, we need 
#cluster_folder_identifier="cluster"
bindetect_path_snippet = "/snakemakeout/TFBS/"
conditions_path_snippet = "/clusterBams/"

main_analysis_path=f"{result_path}main/"
differential_analysis_path=f"{result_path}diff_analysis/"

differential_analysis_selection_path=f"{differential_analysis_path}selection/"
answers_path=f"{result_path}answers/"


# create result folders 
if not os.path.exists(result_path):
     pathlib.Path(result_path).mkdir(parents=True, exist_ok=True)

if not os.path.exists(main_analysis_path):
     pathlib.Path(main_analysis_path).mkdir(parents=True, exist_ok=True)

if not os.path.exists(differential_analysis_path):
     pathlib.Path(differential_analysis_path).mkdir(parents=True, exist_ok=True)

if not os.path.exists(answers_path):
     pathlib.Path(answers_path).mkdir(parents=True, exist_ok=True)

if not os.path.exists(differential_analysis_selection_path):
     pathlib.Path(differential_analysis_selection_path).mkdir(parents=True, exist_ok=True)

if not os.path.exists(genome_path):
    print(f"ERROR: path {genome_path} does not exist")

if not os.path.exists(main_jaspar_file):
    print(f"ERROR: path {main_jaspar_file} does not exist")

if not os.path.exists(path_to_tissues):
    print(f"ERROR: path {path_to_tissues} does not exist")

Function definitions for running the market basket analyses and saving the results

In [3]:
def do_market_basket_analyses(tissue: str, bindetect_path: str, condition: str, threads: int = 4) -> None:
    '''
        Does a market basket analysis with the bindetect output of Tobias (WP3)
        for one condition and pickles the resulting CombObj.

        Parameters:
            tissue: name of the tissue; only used as prefix of the output file name.
            bindetect_path: handed to CombObj.TFBS_from_TOBIAS as bindetect_path.
            condition: condition to read from the bindetect output; also used in
                the output file name.
            threads: number of threads handed to CombObj.market_basket
                (defaults to 4, the value that was hard-coded before).

        Returns:
            None. If rules were found, the CombObj is saved to
            "<main_analysis_path><tissue>_<condition>.pkl"; if no rules were
            found, a message is printed and nothing is saved.
    '''
    comb = CombObj()
    # The condition is defined by WP3 -- TODO: coordinate with WP3 again (how often do they run Tobias?).
    comb.TFBS_from_TOBIAS(bindetect_path=bindetect_path, condition=condition, overwrite=False)

    print(f'Start market basket analyses for condition: {condition}')
    comb.market_basket(threads=threads)
    if len(comb.rules) <= 0:
        print(f'Could not find TF-cooccurences for condition: {condition}')
        return
    print(f'Finished market basket analyses for condition: {condition}')
    print(f'Found rules: {len(comb.rules)}')

    # Build the target path once so the saved file and the log message cannot diverge.
    pickle_path = f'{main_analysis_path}{tissue}_{condition}.pkl'
    comb.to_pickle(pickle_path)
    print(f'Saved: {pickle_path}')
   
    

In [4]:
def get_folder_names_in_folder(main_folder_path: str):
    '''
        Returns the names (not the full paths) of all entries directly inside
        main_folder_path that are directories, in the order os.listdir yields them.
    '''
    folder_names = []
    for entry in os.listdir(main_folder_path):
        entry_path = os.path.join(main_folder_path, entry)
        if os.path.isdir(entry_path):
            folder_names.append(entry)
    return folder_names

def read_in_file_names_of_folder(folder_path: str):
    '''
        Returns the names (not the full paths) of all entries directly inside
        folder_path that are files; sub-folders are left out.
    '''
    def _is_file(entry_name):
        return os.path.isfile(os.path.join(folder_path, entry_name))

    return list(filter(_is_file, os.listdir(folder_path)))


In [8]:
# Identifying the tissues to read from: every sub-folder of path_to_tissues counts as a tissue.
tissue_names = get_folder_names_in_folder(main_folder_path=path_to_tissues)

# remove this as soon as wp3 has changed the folder structure
folder_to_filter = ['output','output_data','presentations','testdata']
tissue_names = [name for name in tissue_names if name not in folder_to_filter]

print(f"Tissues:{tissue_names}")

# Only files ending in this suffix define a condition. A substring test would also
# match side files such as "cluster1.bam.bai" and run the same condition twice.
BAM_SUFFIX = '.bam'

# NOTE(review): cluster_bindetect_paths is never filled in this cell; it is kept in case
# other cells read it. conditions_path is re-bound to a string inside the loop.
cluster_bindetect_paths = []
conditions_path = []
for tissue_name in tissue_names:
    bindetect_path = f"{path_to_tissues}{tissue_name}{bindetect_path_snippet}"
    conditions_path = f"{path_to_tissues}{tissue_name}{conditions_path_snippet}"

    # One BAM file per cluster; the file name without the suffix is the condition name.
    cluster_file_names = [
        file_name
        for file_name in read_in_file_names_of_folder(folder_path=conditions_path)
        if file_name.endswith(BAM_SUFFIX)
    ]
    print(cluster_file_names)

    conditions = [file_name[:-len(BAM_SUFFIX)] for file_name in cluster_file_names]
    print(conditions)

    for condition in conditions:
        do_market_basket_analyses(tissue=tissue_name, bindetect_path=bindetect_path, condition=condition)

print("Done Market basket analyses for all tissues")


Tissues:['liver']
INFO: Read 4038178 sites (838 unique names) from condition 'cluster12'
Start market basket analyses for condition: cluster12
INFO: Setting up binding sites for counting
INFO: Counting co-occurrences within sites
INFO: Counting co-occurrence within background
INFO: Progress: 10%
INFO: Progress: 20%
INFO: Progress: 30%
INFO: Progress: 40%
INFO: Progress: 50%
INFO: Progress: 60%
INFO: Progress: 70%
INFO: Progress: 80%
INFO: Progress: 90%
INFO: Finished!
INFO: Done finding co-occurrences! Run .market_basket() to estimate significant pairs
INFO: Market basket analysis is done! Results are found in <CombObj>.rules
Finished market basket analyses for condition: cluster12
Found rules: 702228
Saved: ./results/wp3/main/liver_cluster12.pkl
INFO: Read 3095126 sites (838 unique names) from condition 'cluster6'
Start market basket analyses for condition: cluster6
INFO: Setting up binding sites for counting
INFO: Counting co-occurrences within sites
INFO: Counting co-occurrence with

Finished market basket analyses for condition: cluster4
Found rules: 701896
Saved: ./results/wp3/main/liver_cluster4.pkl
INFO: Read 3792246 sites (838 unique names) from condition 'cluster7'
Start market basket analyses for condition: cluster7
INFO: Setting up binding sites for counting
INFO: Counting co-occurrences within sites
INFO: Counting co-occurrence within background
INFO: Progress: 10%
INFO: Progress: 20%
INFO: Progress: 30%
INFO: Progress: 40%
INFO: Progress: 52%
INFO: Progress: 60%
INFO: Progress: 70%
INFO: Progress: 80%
INFO: Progress: 90%
INFO: Finished!
INFO: Done finding co-occurrences! Run .market_basket() to estimate significant pairs
INFO: Market basket analysis is done! Results are found in <CombObj>.rules
Finished market basket analyses for condition: cluster7
Found rules: 702199
Saved: ./results/wp3/main/liver_cluster7.pkl
INFO: Read 1379460 sites (838 unique names) from condition 'cluster9'
Start market basket analyses for condition: cluster9
INFO: Setting up bind

INFO: Market basket analysis is done! Results are found in <CombObj>.rules
Finished market basket analyses for condition: cluster5
Found rules: 701812
Saved: ./results/wp3/main/liver_cluster5.pkl
Done Market basket analyses for all tissues
