In [1]:
from tfcomb import CombObj, DiffCombObj
import os
import pathlib
import pandas as pd
import numpy as np
'''
Constants for this script
'''

#genome_path="../testdaten/hg19_masked.fa"
genome_path="../testdaten/homo_sapiens.104.mainChr.fa"

main_jaspar_file="../testdaten/JASPAR2020_CORE_vertebrates.meme" 

# path where market basket analyses for cluster are put.
result_path="./results/wp2/"
main_analysis_path=f"{result_path}main/"
differential_analysis_path=f"{result_path}diff_analysis/"
answers_path=f"{result_path}answers/"

### folder of wp2, where the clusters are
#path_to_clusters="../testdaten/wp2/"
#path_to_clusters="/mnt/workspace_stud/stud3/WP6_data/"
path_to_clusters="/mnt/workspace_stud/stud4/WP6_data/"

# create result folders 
if not os.path.exists(result_path):
     pathlib.Path(result_path).mkdir(parents=True, exist_ok=True)

if not os.path.exists(main_analysis_path):
     pathlib.Path(main_analysis_path).mkdir(parents=True, exist_ok=True)

if not os.path.exists(differential_analysis_path):
     pathlib.Path(differential_analysis_path).mkdir(parents=True, exist_ok=True)

if not os.path.exists(answers_path):
     pathlib.Path(answers_path).mkdir(parents=True, exist_ok=True)

if not os.path.exists(genome_path):
    print(f"ERROR: path {genome_path} does not exist")

if not os.path.exists(main_jaspar_file):
    print(f"ERROR: path {main_jaspar_file} does not exist")

if not os.path.exists(path_to_clusters):
    print(f"ERROR: path {path_to_clusters} does not exist")


In [2]:
def do_market_basket_analyses_for_cell_cluster(cell_cluster_name: str, cell_cluster_path:str):
    '''
        Does market basket analyses.
    '''
    comb = CombObj()
    comb.TFBS_from_motifs(regions= cell_cluster_path,
                   motifs=main_jaspar_file,
                   genome=genome_path,
                   threads=4)
    
    print(f'Start market basket analyses for cell-cluster/type: {cell_cluster_name}')
    comb.market_basket(threads=10)
    if len(comb.rules) <= 0:
        print(f'Could not find TF-cooccurences for cell-cluster/type: {cell_cluster_name}')
        return
    print(f'Finished market basket analyses for cell-cluster/type: {cell_cluster_name}')
    print(f'Found rules: {len(comb.rules)}')
    comb.to_pickle(f'{main_analysis_path}{cell_cluster_name}.pkl')
    print(f'Saved: {main_analysis_path}{cell_cluster_name}.pkl')

In [3]:
def read_in_file_names_of_folder(rel_path:str):
    return [f for f in os.listdir(rel_path) if os.path.isfile(os.path.join(rel_path, f))]

cluster_file_names = read_in_file_names_of_folder(rel_path=path_to_clusters)
print(cluster_file_names)





['right-lobe-of-liver.10.bed', 'right-lobe-of-liver.11.bed', 'right-lobe-of-liver.12.bed', 'right-lobe-of-liver.14.bed', 'right-lobe-of-liver.15.bed', 'right-lobe-of-liver.16.bed', 'right-lobe-of-liver.17.bed', 'right-lobe-of-liver.18.bed', 'right-lobe-of-liver.1.bed', 'right-lobe-of-liver.2.bed', 'right-lobe-of-liver.3.bed', 'right-lobe-of-liver.4.bed', 'right-lobe-of-liver.5.bed', 'right-lobe-of-liver.6.bed', 'right-lobe-of-liver.7.bed', 'right-lobe-of-liver.8.bed']


In [None]:
# Has to be tested as soon as wp2 generates new .bed files
for file_name in cluster_file_names:
    cluster_name = file_name.split('.bed')[0]
    print(cluster_name)
    print(file_name)
    cluster_path=f"{path_to_clusters}{file_name}"
    do_market_basket_analyses_for_cell_cluster(cell_cluster_name=cluster_name, cell_cluster_path=cluster_path)

    

# Analysis

### Differential Analysis

In [4]:
# mb market basket analysis
files_main_mb= read_in_file_names_of_folder(rel_path=main_analysis_path)
print(f"Count of Files: {len(files_main_mb)}")
print(f"Files: {files_main_mb}")


Count of Files: 16
Files: ['right-lobe-of-liver.10.pkl', 'right-lobe-of-liver.11.pkl', 'right-lobe-of-liver.12.pkl', 'right-lobe-of-liver.14.pkl', 'right-lobe-of-liver.15.pkl', 'right-lobe-of-liver.16.pkl', 'right-lobe-of-liver.17.pkl', 'right-lobe-of-liver.18.pkl', 'right-lobe-of-liver.1.pkl', 'right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.3.pkl', 'right-lobe-of-liver.4.pkl', 'right-lobe-of-liver.5.pkl', 'right-lobe-of-liver.6.pkl', 'right-lobe-of-liver.7.pkl', 'right-lobe-of-liver.8.pkl']


In [17]:
# Diff analysis between each cluster:
for i, file in enumerate(files_main_mb):
    print(file)
    name_i = file.split('.pkl')[0]
    
    for j in range(i + 1, len(files_main_mb), 1):
        file_j = files_main_mb[j]
        name_j = file_j.split('.pkl')[0]
        print(j)
        print(name_j)
        A = CombObj().from_pickle(f"{main_analysis_path}{file}")
        print(A)
        A.set_prefix(name_i)
        B = CombObj().from_pickle(f"{main_analysis_path}{file_j}")
        print(B)
        B.set_prefix(name_j)
        compare_obj = A.compare(B)
        compare_obj.to_pickle(f'{differential_analysis_path}{name_i}__{name_j}.pkl')
        
        
print("Done differential analysis")

right-lobe-of-liver.10.pkl
1
right-lobe-of-liver.11
<CombObj: 308349 TFBS (746 unique names) | Market basket analysis: 338172 rules>
<CombObj: 320596 TFBS (746 unique names) | Market basket analysis: 344416 rules>
INFO: Calculating foldchange for contrast: right-lobe-of-liver.10 / right-lobe-of-liver.11
INFO: The calculated log2fc's are found in the rules table (<DiffCombObj>.rules)
2
right-lobe-of-liver.12
<CombObj: 308349 TFBS (746 unique names) | Market basket analysis: 338172 rules>
<CombObj: 176578 TFBS (746 unique names) | Market basket analysis: 235437 rules>
INFO: Calculating foldchange for contrast: right-lobe-of-liver.10 / right-lobe-of-liver.12
INFO: The calculated log2fc's are found in the rules table (<DiffCombObj>.rules)
3
right-lobe-of-liver.14
<CombObj: 308349 TFBS (746 unique names) | Market basket analysis: 338172 rules>
<CombObj: 22521 TFBS (741 unique names) | Market basket analysis: 59422 rules>
INFO: Calculating foldchange for contrast: right-lobe-of-liver.10 / ri

INFO: Calculating foldchange for contrast: right-lobe-of-liver.11 / right-lobe-of-liver.1
INFO: The calculated log2fc's are found in the rules table (<DiffCombObj>.rules)
9
right-lobe-of-liver.2
<CombObj: 320596 TFBS (746 unique names) | Market basket analysis: 344416 rules>
<CombObj: 498126 TFBS (746 unique names) | Market basket analysis: 407056 rules>
INFO: Calculating foldchange for contrast: right-lobe-of-liver.11 / right-lobe-of-liver.2
INFO: The calculated log2fc's are found in the rules table (<DiffCombObj>.rules)
10
right-lobe-of-liver.3
<CombObj: 320596 TFBS (746 unique names) | Market basket analysis: 344416 rules>
<CombObj: 291110 TFBS (746 unique names) | Market basket analysis: 327759 rules>
INFO: Calculating foldchange for contrast: right-lobe-of-liver.11 / right-lobe-of-liver.3
INFO: The calculated log2fc's are found in the rules table (<DiffCombObj>.rules)
11
right-lobe-of-liver.4
<CombObj: 320596 TFBS (746 unique names) | Market basket analysis: 344416 rules>
<CombObj

5
right-lobe-of-liver.16
<CombObj: 22521 TFBS (741 unique names) | Market basket analysis: 59422 rules>
<CombObj: 263810 TFBS (746 unique names) | Market basket analysis: 315033 rules>
INFO: Calculating foldchange for contrast: right-lobe-of-liver.14 / right-lobe-of-liver.16
INFO: The calculated log2fc's are found in the rules table (<DiffCombObj>.rules)
6
right-lobe-of-liver.17
<CombObj: 22521 TFBS (741 unique names) | Market basket analysis: 59422 rules>
<CombObj: 238282 TFBS (746 unique names) | Market basket analysis: 302671 rules>
INFO: Calculating foldchange for contrast: right-lobe-of-liver.14 / right-lobe-of-liver.17
INFO: The calculated log2fc's are found in the rules table (<DiffCombObj>.rules)
7
right-lobe-of-liver.18
<CombObj: 22521 TFBS (741 unique names) | Market basket analysis: 59422 rules>
<CombObj: 287765 TFBS (746 unique names) | Market basket analysis: 325985 rules>
INFO: Calculating foldchange for contrast: right-lobe-of-liver.14 / right-lobe-of-liver.18
INFO: The 

INFO: The calculated log2fc's are found in the rules table (<DiffCombObj>.rules)
10
right-lobe-of-liver.3
<CombObj: 266767 TFBS (746 unique names) | Market basket analysis: 316204 rules>
<CombObj: 291110 TFBS (746 unique names) | Market basket analysis: 327759 rules>
INFO: Calculating foldchange for contrast: right-lobe-of-liver.15 / right-lobe-of-liver.3
INFO: The calculated log2fc's are found in the rules table (<DiffCombObj>.rules)
11
right-lobe-of-liver.4
<CombObj: 266767 TFBS (746 unique names) | Market basket analysis: 316204 rules>
<CombObj: 299770 TFBS (746 unique names) | Market basket analysis: 334129 rules>
INFO: Calculating foldchange for contrast: right-lobe-of-liver.15 / right-lobe-of-liver.4
INFO: The calculated log2fc's are found in the rules table (<DiffCombObj>.rules)
12
right-lobe-of-liver.5
<CombObj: 266767 TFBS (746 unique names) | Market basket analysis: 316204 rules>
<CombObj: 166885 TFBS (746 unique names) | Market basket analysis: 226900 rules>
INFO: Calculatin

INFO: The calculated log2fc's are found in the rules table (<DiffCombObj>.rules)
14
right-lobe-of-liver.7
<CombObj: 238282 TFBS (746 unique names) | Market basket analysis: 302671 rules>
<CombObj: 107304 TFBS (746 unique names) | Market basket analysis: 182464 rules>
INFO: Calculating foldchange for contrast: right-lobe-of-liver.17 / right-lobe-of-liver.7
INFO: The calculated log2fc's are found in the rules table (<DiffCombObj>.rules)
15
right-lobe-of-liver.8
<CombObj: 238282 TFBS (746 unique names) | Market basket analysis: 302671 rules>
<CombObj: 103339 TFBS (746 unique names) | Market basket analysis: 177038 rules>
INFO: Calculating foldchange for contrast: right-lobe-of-liver.17 / right-lobe-of-liver.8
INFO: The calculated log2fc's are found in the rules table (<DiffCombObj>.rules)
right-lobe-of-liver.18.pkl
8
right-lobe-of-liver.1
<CombObj: 287765 TFBS (746 unique names) | Market basket analysis: 325985 rules>
<CombObj: 408518 TFBS (746 unique names) | Market basket analysis: 3808

INFO: Calculating foldchange for contrast: right-lobe-of-liver.2 / right-lobe-of-liver.8
INFO: The calculated log2fc's are found in the rules table (<DiffCombObj>.rules)
right-lobe-of-liver.3.pkl
11
right-lobe-of-liver.4
<CombObj: 291110 TFBS (746 unique names) | Market basket analysis: 327759 rules>
<CombObj: 299770 TFBS (746 unique names) | Market basket analysis: 334129 rules>
INFO: Calculating foldchange for contrast: right-lobe-of-liver.3 / right-lobe-of-liver.4
INFO: The calculated log2fc's are found in the rules table (<DiffCombObj>.rules)
12
right-lobe-of-liver.5
<CombObj: 291110 TFBS (746 unique names) | Market basket analysis: 327759 rules>
<CombObj: 166885 TFBS (746 unique names) | Market basket analysis: 226900 rules>
INFO: Calculating foldchange for contrast: right-lobe-of-liver.3 / right-lobe-of-liver.5
INFO: The calculated log2fc's are found in the rules table (<DiffCombObj>.rules)
13
right-lobe-of-liver.6
<CombObj: 291110 TFBS (746 unique names) | Market basket analysis

## Specific analysis:
### Question:
Does the cluster 'right-lobe-of-liver.1.pkl' have specific tf-co-occurences, which can´t be found in the other clusters of the right-lobe-of-liver. ? Which are those tf-co and how many are there?
####  Steps:
    - Take all differential mb-analysis of cluster one. Merge them to a pandas dataframe.
    - Remove All diff tf-co´s which are found twice in the analysis.
    

In [None]:
# Find the specific tf_cooccurences of a tissue that unique for the specific cluster in the tissue.
# 1. Diff analyse , 
# 2. Read in Diffanalyse for the specific cluster
# 3. Find tf-cooccurence of the diffob , which are occuring in each cluster
# Read in file Names of all analysis
files_main_mb= read_in_file_names_of_folder(rel_path=main_analysis_path)
print(f"Count of Files: {len(files_main_mb)}")
#print(f"Files: {files_main_mb}")

files_diff= read_in_file_names_of_folder(rel_path=differential_analysis_path)
print(f"Count of Files: {len(files_diff)}")
#print(f"Files: {files_diff}")
test = ""
for file_mb in files_main_mb:
    cluster_name = file_mb.split('.pkl')[0]
    print(cluster_name)
    diffs = list(filter(lambda x: cluster_name in x, files_diff))
    print(len(diffs))
    print(diffs)
    
    # Keeps the read in DiffCombObj diff_objects:
    diff_objects = []
    
    for diff in diffs:
        diff_obj = DiffCombObj().from_pickle(f"{differential_analysis_path}{diff}")
        selection = diff_obj.select_rules()
        diff_objects.append(selection)
    
    # erste DiffObj dataframe 
    df_unified = diff_objects[0].rules
    for i in range(len(diff_objects)-1):
        
        # concat row the dataframes
        df_diff = pd.concat([df_unified, diff_objects[i + 1].rules])
        
        # take all TF1-TF2 combination that are duplicated and only keep the first
        unified_duplicates = df_diff[df_diff.duplicated(subset=['TF1', 'TF2'], keep='first')]
        
        df_unified = unified_duplicates
    
    df_unified.to_pickle(path=f"{answers_path}{cluster_name}.pkl")
    test = df_unified


Count of Files: 16
Count of Files: 120
right-lobe-of-liver.10
15
['right-lobe-of-liver.10__right-lobe-of-liver.11.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.12.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.14.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.15.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.16.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.17.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.18.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.1.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.3.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.4.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.5.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.6.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.7.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.8.pkl']
INFO: Selecting rules for contrast: ('right-lobe-of-liver.10', 'right-lobe-of-liver.11')
INFO: measure_threshold is None; trying to 

In [19]:
test

test2 = pd.read_pickle(f"{answers_path}right-lobe-of-liver.10.pkl")  
test2

Unnamed: 0,TF1,TF2,right-lobe-of-liver.10_cosine,right-lobe-of-liver.11_cosine,right-lobe-of-liver.10/right-lobe-of-liver.11_cosine_log2fc,right-lobe-of-liver.12_cosine,right-lobe-of-liver.10/right-lobe-of-liver.12_cosine_log2fc,right-lobe-of-liver.14_cosine,right-lobe-of-liver.10/right-lobe-of-liver.14_cosine_log2fc,right-lobe-of-liver.15_cosine,...,right-lobe-of-liver.4_cosine,right-lobe-of-liver.10/right-lobe-of-liver.4_cosine_log2fc,right-lobe-of-liver.5_cosine,right-lobe-of-liver.10/right-lobe-of-liver.5_cosine_log2fc,right-lobe-of-liver.6_cosine,right-lobe-of-liver.10/right-lobe-of-liver.6_cosine_log2fc,right-lobe-of-liver.7_cosine,right-lobe-of-liver.10/right-lobe-of-liver.7_cosine_log2fc,right-lobe-of-liver.8_cosine,right-lobe-of-liver.10/right-lobe-of-liver.8_cosine_log2fc


In [10]:
#mb_obj = CombObj().from_pickle(f"{main_analysis_path}right-lobe-of-liver.10.pkl")
diff_obj_1_1 = DiffCombObj().from_pickle(f"{differential_analysis_path}right-lobe-of-liver.10__right-lobe-of-liver.1.pkl")
diff_obj1_2 = DiffCombObj().from_pickle(f"{differential_analysis_path}right-lobe-of-liver.10__right-lobe-of-liver.1.pkl")
diff_obj2 = DiffCombObj().from_pickle(f"{differential_analysis_path}right-lobe-of-liver.10__right-lobe-of-liver.2.pkl")
#mb_obj.rules
#type(diff_obj.rules)
#diff_obj2.rules
df_diff = pd.concat([diff_obj_1_1.rules,diff_obj2.rules])

#df_diff2 = pd.concat([diff_obj_1_1.rules,diff_obj2.rules, diff_obj1_2.rules])

unified_duplicates = df_diff[df_diff.duplicated(subset=['TF1', 'TF2'], keep='first')]

df_diff2 = pd.concat([unified_duplicates, diff_obj1_2.rules])

unified_duplicates2 = df_diff2[df_diff2.duplicated(subset=['TF1', 'TF2'], keep='first')]

#df_diff = df_diff.drop_duplicates(subset=['TF1', 'TF2'])
#unified_duplicates
#unified_duplicates
#unified_duplicates2
#diff_obj_1_1.rules
df_diff




Unnamed: 0,TF1,TF2,right-lobe-of-liver.10_cosine,right-lobe-of-liver.1_cosine,right-lobe-of-liver.10/right-lobe-of-liver.1_cosine_log2fc,right-lobe-of-liver.2_cosine,right-lobe-of-liver.10/right-lobe-of-liver.2_cosine_log2fc
ZEB1-TCF3,ZEB1,TCF3,0.032443,0.070394,-0.656047,,
TCF3-ZEB1,TCF3,ZEB1,0.032443,0.070394,-0.656047,,
FOSL1JUNDvar.2-CREM,FOSL1JUNDvar.2,CREM,0.037223,0.073415,-0.596413,,
CREM-FOSL1JUNDvar.2,CREM,FOSL1JUNDvar.2,0.037223,0.073415,-0.596413,,
ZEB1-TCF4,ZEB1,TCF4,0.035606,0.070648,-0.591947,,
...,...,...,...,...,...,...,...
MLXIPL-MLX,MLXIPL,MLX,0.086869,,,0.031730,0.897921
MLX-MITF,MLX,MITF,0.078215,,,0.026030,0.923858
MITF-MLX,MITF,MLX,0.078215,,,0.026030,0.923858
MITF-MLXIPL,MITF,MLXIPL,0.098281,,,0.030302,1.062647


In [None]:
diff_obj_1_1.rules.loc['Foxd3-ONECUT2']
diff_obj2.rules.loc['Foxd3-ONECUT2']

In [None]:
top30C = selectedC.select_top_rules(n=30)
top30C.rules.head(31)

In [None]:
top30C.plot_heatmap()

In [None]:
top30C.plot_bubble()

In [None]:
top30C.plot_network()

In [None]:
selectedC.rules