In [50]:
from tfcomb import CombObj, DiffCombObj
import os
import pathlib
import pandas as pd
import numpy as np
# Constants for this script.

# Reference genome passed as `genome=` to the motif scan.
#genome_path="../testdaten/hg19_masked.fa"
genome_path = "../testdaten/homo_sapiens.104.mainChr.fa"

# Motif collection passed as `motifs=` to the motif scan.
main_jaspar_file = "../testdaten/JASPAR2020_CORE_vertebrates.meme"

# Output layout: every result folder lives below result_path.
# All folder constants end with "/" because file names are appended by plain
# string concatenation throughout the notebook.
result_path = "./results/wp2/"
main_analysis_path = result_path + "main/"
differential_analysis_path = result_path + "diff_analysis/"
differential_analysis_selection_path = differential_analysis_path + "selection/"
answers_path = result_path + "answers/"

# Folder of wp2 that holds the cluster files.
#path_to_clusters="../testdaten/wp2/"
#path_to_clusters="/mnt/workspace_stud/stud3/WP6_data/"
path_to_clusters = "/mnt/workspace_stud/stud4/WP6_data/"

# Create the result folders that do not exist yet.
for _result_dir in (
    result_path,
    main_analysis_path,
    differential_analysis_path,
    answers_path,
    differential_analysis_selection_path,
):
    if not os.path.exists(_result_dir):
        pathlib.Path(_result_dir).mkdir(parents=True, exist_ok=True)

# Inputs are only reported when missing; the notebook keeps running.
for _input_path in (genome_path, main_jaspar_file, path_to_clusters):
    if not os.path.exists(_input_path):
        print(f"ERROR: path {_input_path} does not exist")


In [2]:
def do_market_basket_analyses_for_cell_cluster(cell_cluster_name: str, cell_cluster_path:str):
    '''
        Runs the market basket analysis for one cell cluster and pickles the result.

        cell_cluster_name: label used in the log messages and as the stem of
            the pickle written to main_analysis_path.
        cell_cluster_path: path handed to CombObj.TFBS_from_motifs as `regions`.

        Returns None. Nothing is written when the analysis yields no rules.
    '''
    comb = CombObj()
    comb.TFBS_from_motifs(
        regions=cell_cluster_path,
        motifs=main_jaspar_file,
        genome=genome_path,
        threads=4,
    )

    print(f'Start market basket analyses for cell-cluster/type: {cell_cluster_name}')
    comb.market_basket(threads=10)

    rule_count = len(comb.rules)
    if rule_count == 0:
        # An empty rules table is reported and not saved.
        print(f'Could not find TF-cooccurences for cell-cluster/type: {cell_cluster_name}')
        return

    print(f'Finished market basket analyses for cell-cluster/type: {cell_cluster_name}')
    print(f'Found rules: {rule_count}')

    pickle_path = f'{main_analysis_path}{cell_cluster_name}.pkl'
    comb.to_pickle(pickle_path)
    print(f'Saved: {pickle_path}')

In [3]:
def read_in_file_names_of_folder(rel_path:str):
    '''
        Returns the names (not full paths) of the regular files directly
        inside rel_path. Sub-directories are left out. The order is whatever
        os.listdir yields.
    '''
    file_names = []
    for entry in os.listdir(rel_path):
        if os.path.isfile(os.path.join(rel_path, entry)):
            file_names.append(entry)
    return file_names

# Collect the names of the files found directly in path_to_clusters and show them.
cluster_file_names = read_in_file_names_of_folder(rel_path=path_to_clusters)
print(cluster_file_names)





['right-lobe-of-liver.10.bed', 'right-lobe-of-liver.11.bed', 'right-lobe-of-liver.12.bed', 'right-lobe-of-liver.14.bed', 'right-lobe-of-liver.15.bed', 'right-lobe-of-liver.16.bed', 'right-lobe-of-liver.17.bed', 'right-lobe-of-liver.18.bed', 'right-lobe-of-liver.1.bed', 'right-lobe-of-liver.2.bed', 'right-lobe-of-liver.3.bed', 'right-lobe-of-liver.4.bed', 'right-lobe-of-liver.5.bed', 'right-lobe-of-liver.6.bed', 'right-lobe-of-liver.7.bed', 'right-lobe-of-liver.8.bed']


In [None]:
# To be re-tested as soon as wp2 generates new .bed files.
# Runs the market basket analysis once per cluster file.
for file_name in cluster_file_names:
    # Cluster name = everything in front of the first ".bed".
    cluster_name = file_name.partition('.bed')[0]
    print(cluster_name)
    print(file_name)
    # path_to_clusters ends with "/", so plain concatenation yields the file path.
    cluster_path = path_to_clusters + file_name
    do_market_basket_analyses_for_cell_cluster(
        cell_cluster_name=cluster_name,
        cell_cluster_path=cluster_path,
    )

    

# Analysis

### Differential Analysis

In [4]:
# "mb" stands for market basket analysis.
# List the per-cluster result pickles stored in main_analysis_path.
files_main_mb= read_in_file_names_of_folder(rel_path=main_analysis_path)
print(f"Count of Files: {len(files_main_mb)}")
print(f"Files: {files_main_mb}")


Count of Files: 16
Files: ['right-lobe-of-liver.10.pkl', 'right-lobe-of-liver.11.pkl', 'right-lobe-of-liver.12.pkl', 'right-lobe-of-liver.14.pkl', 'right-lobe-of-liver.15.pkl', 'right-lobe-of-liver.16.pkl', 'right-lobe-of-liver.17.pkl', 'right-lobe-of-liver.18.pkl', 'right-lobe-of-liver.1.pkl', 'right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.3.pkl', 'right-lobe-of-liver.4.pkl', 'right-lobe-of-liver.5.pkl', 'right-lobe-of-liver.6.pkl', 'right-lobe-of-liver.7.pkl', 'right-lobe-of-liver.8.pkl']


In [None]:
# Differential analysis for every unordered pair of clusters: each file is
# compared with all files that come after it in files_main_mb.
for i, file in enumerate(files_main_mb):
    print(file)
    name_i = file.split('.pkl')[0]

    for j, file_j in enumerate(files_main_mb[i + 1:], start=i + 1):
        name_j = file_j.split('.pkl')[0]
        print(j)
        print(name_j)

        # Both objects are loaded fresh for every pair and labelled with
        # their cluster name before the comparison.
        A = CombObj().from_pickle(main_analysis_path + file)
        print(A)
        A.set_prefix(name_i)

        B = CombObj().from_pickle(main_analysis_path + file_j)
        print(B)
        B.set_prefix(name_j)

        # The full comparison and the rule selection derived from it are
        # stored under the same "<name_i>__<name_j>.pkl" file name.
        pair_file_name = f'{name_i}__{name_j}.pkl'

        compare_obj = A.compare(B)
        compare_obj.to_pickle(differential_analysis_path + pair_file_name)

        selected = compare_obj.select_rules()
        selected.to_pickle(differential_analysis_selection_path + pair_file_name)


print("Done differential analysis")

right-lobe-of-liver.10.pkl
1
right-lobe-of-liver.11
<CombObj: 308349 TFBS (746 unique names) | Market basket analysis: 338172 rules>
<CombObj: 320596 TFBS (746 unique names) | Market basket analysis: 344416 rules>
INFO: Calculating foldchange for contrast: right-lobe-of-liver.10 / right-lobe-of-liver.11
INFO: The calculated log2fc's are found in the rules table (<DiffCombObj>.rules)
INFO: Selecting rules for contrast: ('right-lobe-of-liver.10', 'right-lobe-of-liver.11')
INFO: measure_threshold is None; trying to calculate optimal threshold
INFO: mean_threshold is None; trying to calculate optimal threshold
INFO: Creating subset of rules using thresholds
2
right-lobe-of-liver.12
<CombObj: 308349 TFBS (746 unique names) | Market basket analysis: 338172 rules>
<CombObj: 176578 TFBS (746 unique names) | Market basket analysis: 235437 rules>
INFO: Calculating foldchange for contrast: right-lobe-of-liver.10 / right-lobe-of-liver.12
INFO: The calculated log2fc's are found in the rules table (

## Specific analysis:
### Question:
Does the cluster 'right-lobe-of-liver.1' (file 'right-lobe-of-liver.1.pkl') have specific TF co-occurrences that cannot be found in the other clusters of the right lobe of liver? Which TF co-occurrences are these, and how many are there?
####  Steps:
    - Take all differential market basket (mb) analyses of cluster one and merge them into one pandas DataFrame.
    - Remove all differential TF co-occurrences that are found twice in the analysis.
    

In [96]:
# Find, for every cluster, the TF co-occurrences that appear in the selected
# differential rules of every pairwise comparison the cluster takes part in.
# 1. Run the differential analysis (done in the cell above).
# 2. Read in the selected differential rules that involve the cluster.
# 3. Inner-merge their rules tables on the index, so that only rows present
#    in all of them remain.

# Names of all per-cluster market basket results.
files_main_mb = read_in_file_names_of_folder(rel_path=main_analysis_path)
print(f"Count of Files: {len(files_main_mb)}")
#print(f"Files: {files_main_mb}")

# List the folder the pickles are loaded from further down (the selection
# folder), so that listing and loading cannot drift apart.
files_diff = read_in_file_names_of_folder(rel_path=differential_analysis_selection_path)
print(f"Count of Files: {len(files_diff)}")
#print(f"Files: {files_diff}")

# Holds the merged table of the cluster processed last (kept for inspection).
test = ""
for file_mb in files_main_mb:
    cluster_name = file_mb.split('.pkl')[0]
    print(cluster_name)

    # Differential files are named "<name_i>__<name_j>.pkl". Compare the two
    # encoded names exactly: a substring test would also match
    # "right-lobe-of-liver.10" when looking for "right-lobe-of-liver.1".
    diffs = [
        diff for diff in files_diff
        if cluster_name in diff.split('.pkl')[0].split('__')
    ]
    print(len(diffs))
    print(diffs)

    if not diffs:
        # Nothing to merge for this cluster; avoid indexing an empty list.
        print(f"No differential analyses found for {cluster_name}; skipping")
        continue

    # The DiffCombObj objects read in for this cluster.
    diff_objects = [
        DiffCombObj().from_pickle(f"{differential_analysis_selection_path}{diff}")
        for diff in diffs
    ]

    # Start with the rules table of the first DiffCombObj.
    df_unified = diff_objects[0].rules
    for i in range(len(diff_objects)-1):
        obj_1 = diff_objects[i]
        obj_2 = diff_objects[i + 1]
        # Merge the rules tables by index (the TF pair). The default inner
        # join keeps only pairs present on both sides; the suffixes are only
        # applied to column names that occur on both sides.
        obj_3 = df_unified.merge(
            obj_2.rules,
            left_index=True,
            right_index=True,
            suffixes=(
                f"_{obj_1.prefixes[0]}_{obj_1.prefixes[1]}",
                f"_{obj_2.prefixes[0]}_{obj_2.prefixes[1]}",
            ),
        )
        df_unified = obj_3.copy(deep=True)

    df_unified.to_pickle(path=f"{answers_path}{cluster_name}.pkl")
    test = df_unified


Count of Files: 16
Count of Files: 120
right-lobe-of-liver.10
15
['right-lobe-of-liver.10__right-lobe-of-liver.11.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.12.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.14.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.15.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.16.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.17.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.18.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.1.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.3.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.4.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.5.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.6.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.7.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.8.pkl']
right-lobe-of-liver.11
15
['right-lobe-of-liver.10__right-lobe-of-liver.11.pkl', 'right-lobe-of-liver.11__right-lobe-of-liver.12.pkl

right-lobe-of-liver.2
15
['right-lobe-of-liver.10__right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.11__right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.12__right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.14__right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.15__right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.16__right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.17__right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.18__right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.1__right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.2__right-lobe-of-liver.3.pkl', 'right-lobe-of-liver.2__right-lobe-of-liver.4.pkl', 'right-lobe-of-liver.2__right-lobe-of-liver.5.pkl', 'right-lobe-of-liver.2__right-lobe-of-liver.6.pkl', 'right-lobe-of-liver.2__right-lobe-of-liver.7.pkl', 'right-lobe-of-liver.2__right-lobe-of-liver.8.pkl']
right-lobe-of-liver.3
15
['right-lobe-of-liver.10__right-lobe-of-liver.3.pkl', 'right-lobe-of-liver.11__right-lobe-of-liver.3.pkl', 'right-lobe-of-liver.12__right-lobe-of-liver.3.pkl', '

In [108]:
# Summarise the per-cluster answer tables written to answers_path.
answer_file_names = read_in_file_names_of_folder(rel_path=answers_path)
print(answer_file_names)

# Keep the answer files whose table is not empty.
cluster_with_unique_tfs = [
    name
    for name in answer_file_names
    if pd.read_pickle(f"{answers_path}{name}").size > 0
]
print(f"ANSWER:{cluster_with_unique_tfs}")

# Report the number of rows for each non-empty answer table.
for cluster in cluster_with_unique_tfs:
    row_count = pd.read_pickle(f"{answers_path}{cluster}").shape[0]
    print(f"cluster_name: {cluster}, count_tf_cos: {row_count}")

# Spot check: row count of one answer table, shown as the cell output.
df = pd.read_pickle(f"{answers_path}right-lobe-of-liver.6.pkl")
df.shape[0]


['right-lobe-of-liver.10.pkl', 'right-lobe-of-liver.11.pkl', 'right-lobe-of-liver.12.pkl', 'right-lobe-of-liver.14.pkl', 'right-lobe-of-liver.15.pkl', 'right-lobe-of-liver.16.pkl', 'right-lobe-of-liver.17.pkl', 'right-lobe-of-liver.18.pkl', 'right-lobe-of-liver.1.pkl', 'right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.3.pkl', 'right-lobe-of-liver.4.pkl', 'right-lobe-of-liver.5.pkl', 'right-lobe-of-liver.6.pkl', 'right-lobe-of-liver.7.pkl', 'right-lobe-of-liver.8.pkl']
ANSWER:['right-lobe-of-liver.11.pkl', 'right-lobe-of-liver.14.pkl', 'right-lobe-of-liver.6.pkl']
cluster_name: right-lobe-of-liver.11.pkl, count_tf_cos: 38
cluster_name: right-lobe-of-liver.14.pkl, count_tf_cos: 22
cluster_name: right-lobe-of-liver.6.pkl, count_tf_cos: 6


6

### Trial and Error section

In [70]:
# Scratch cell: stack the rules of six contrasts of cluster 16 and look for
# TF pairs that occur more than once.
#mb_obj = CombObj().from_pickle(f"{main_analysis_path}right-lobe-of-liver.10.pkl")
# Note: these are read from differential_analysis_path (the full comparisons),
# not from the selection folder.
obj_1 = DiffCombObj().from_pickle(f"{differential_analysis_path}right-lobe-of-liver.16__right-lobe-of-liver.1.pkl")
obj_2 = DiffCombObj().from_pickle(f"{differential_analysis_path}right-lobe-of-liver.16__right-lobe-of-liver.2.pkl")
obj_3 = DiffCombObj().from_pickle(f"{differential_analysis_path}right-lobe-of-liver.16__right-lobe-of-liver.3.pkl")
obj_4 = DiffCombObj().from_pickle(f"{differential_analysis_path}right-lobe-of-liver.16__right-lobe-of-liver.4.pkl")
obj_5 = DiffCombObj().from_pickle(f"{differential_analysis_path}right-lobe-of-liver.16__right-lobe-of-liver.5.pkl")
obj_6 = DiffCombObj().from_pickle(f"{differential_analysis_path}right-lobe-of-liver.16__right-lobe-of-liver.6.pkl")
#mb_obj.rules
#type(diff_obj.rules)
#diff_obj2.rules
# Row-wise concatenation; join="inner" keeps only the columns shared by all six tables.
df_diff = pd.concat([obj_1.rules, obj_2.rules, obj_3.rules, obj_4.rules, obj_5.rules, obj_6.rules], join="inner")

#df_diff2 = pd.concat([diff_obj_1_1.rules,diff_obj2.rules, diff_obj1_2.rules])

# duplicated(keep='first') flags every repeat of a (TF1, TF2) pair except its
# first occurrence, so this selects the second and later occurrences.
unified_duplicates = df_diff[df_diff.duplicated(subset=['TF1', 'TF2'], keep='first')]

# NOTE(review): `diff_obj1_2` is not assigned in any cell visible in this
# notebook, so this line presumably raises NameError on a fresh kernel —
# confirm which object was meant.
df_diff2 = pd.concat([unified_duplicates, diff_obj1_2.rules])

unified_duplicates2 = df_diff2[df_diff2.duplicated(subset=['TF1', 'TF2'], keep='first')]

#df_diff = df_diff.drop_duplicates(subset=['TF1', 'TF2'])
#unified_duplicates
#unified_duplicates
#unified_duplicates2
#diff_obj_1_1.rules
#unified_duplicates



In [89]:
# Merge the selected rules of two contrasts of cluster 16 on their index to
# see which TF pairs occur in both.
obj_1 = DiffCombObj().from_pickle(
    differential_analysis_selection_path + "right-lobe-of-liver.16__right-lobe-of-liver.1.pkl"
)
obj_2 = DiffCombObj().from_pickle(
    differential_analysis_selection_path + "right-lobe-of-liver.16__right-lobe-of-liver.2.pkl"
)

for diff_obj in (obj_1, obj_2):
    diff_obj.simplify_rules()

# One suffix per contrast, built from the two prefixes of each object; pandas
# applies them only to column names present in both tables.
left_suffix = f"_{obj_1.prefixes[0]}_{obj_1.prefixes[1]}"
right_suffix = f"_{obj_2.prefixes[0]}_{obj_2.prefixes[1]}"

obj3 = obj_1.rules.merge(
    obj_2.rules,
    left_index=True,
    right_index=True,
    suffixes=(left_suffix, right_suffix),
)
obj3


Unnamed: 0,TF1_right-lobe-of-liver.16_right-lobe-of-liver.1,TF2_right-lobe-of-liver.16_right-lobe-of-liver.1,right-lobe-of-liver.16_cosine_right-lobe-of-liver.16_right-lobe-of-liver.1,right-lobe-of-liver.1_cosine,right-lobe-of-liver.16/right-lobe-of-liver.1_cosine_log2fc,TF1_right-lobe-of-liver.16_right-lobe-of-liver.2,TF2_right-lobe-of-liver.16_right-lobe-of-liver.2,right-lobe-of-liver.16_cosine_right-lobe-of-liver.16_right-lobe-of-liver.2,right-lobe-of-liver.2_cosine,right-lobe-of-liver.16/right-lobe-of-liver.2_cosine_log2fc
MLXIPL-MLX,MLXIPL,MLX,0.016039,0.049144,-1.365381,MLXIPL,MLX,0.015862,0.031589,-0.813708
ETV5-EWSR1-FLI1,ETV5,EWSR1-FLI1,0.017486,0.043775,-1.118498,ETV5,EWSR1-FLI1,0.017295,0.041284,-1.05747
MEF2D-TBX1,MEF2D,TBX1,0.021175,0.051722,-1.116865,MEF2D,TBX1,0.020906,0.052591,-1.155862
MEF2D-TBX15,MEF2D,TBX15,0.021175,0.051722,-1.116865,MEF2D,TBX15,0.020906,0.052591,-1.155862
Nr2f6var.2-MEF2B,Nr2f6var.2,MEF2B,0.018306,0.043053,-1.045135,Nr2f6var.2,MEF2B,0.018095,0.036975,-0.864971
ZBTB7A-EWSR1-FLI1,ZBTB7A,EWSR1-FLI1,0.018895,0.040685,-0.936127,ZBTB7A,EWSR1-FLI1,0.018666,0.040533,-0.947275
MEF2C-KLF2,MEF2C,KLF2,0.027347,0.054701,-0.884938,MEF2C,KLF2,0.027347,0.051211,-0.79926
MEF2C-KLF6,MEF2C,KLF6,0.029429,0.05814,-0.87568,MEF2C,KLF6,0.029532,0.056594,-0.836844
TBX1-MEF2C,TBX1,MEF2C,0.022007,0.043663,-0.850429,TBX1,MEF2C,0.021741,0.047505,-0.976269
TBX5-MEF2C,TBX5,MEF2C,0.022007,0.043663,-0.850429,TBX5,MEF2C,0.021741,0.047505,-0.976269


In [None]:
# Look up one TF pair in the rules tables of two objects.
# NOTE(review): neither `diff_obj_1_1` nor `diff_obj2` is assigned in any cell
# visible in this notebook — presumably leftovers from an earlier kernel
# session; confirm before re-running.
# Only the value of the last expression is displayed by the notebook.
diff_obj_1_1.rules.loc['Foxd3-ONECUT2']
diff_obj2.rules.loc['Foxd3-ONECUT2']

In [None]:
# Take the top 30 rules of `selectedC` and show the head of their rules table.
# NOTE(review): `selectedC` is not assigned in any cell visible in this
# notebook — confirm where it is supposed to come from.
top30C = selectedC.select_top_rules(n=30)
top30C.rules.head(31)

In [85]:
# Toy example: how DataFrame.merge behaves when joining on the index and how
# the suffixes are applied to the column name both frames share ('value').
left_keys = ['foo', 'bar', 'baz', 'foo', 'fehler']
left_values = [1, 2, 3, 5, 7]
right_keys = ['foo', 'bar', 'baz', 'foo', 'test']
right_values = [5, 6, 7, 8, 9]

df1 = pd.DataFrame(dict(lkey=left_keys, value=left_values))
df2 = pd.DataFrame(dict(rkey=right_keys, value=right_values))

# Alternative: join on the key columns instead of the index.
#df1.merge(df2, left_on='lkey', right_on='rkey')
df1.merge(df2, left_index=True, right_index=True, suffixes=("_test", "_test2"))


Unnamed: 0,lkey,value_test,rkey,value_test2
0,foo,1,foo,5
1,bar,2,bar,6
2,baz,3,baz,7
3,foo,5,foo,8
4,fehler,7,test,9


In [65]:
# Display the left demo frame.
# NOTE(review): the recorded output of this cell lists four rows, while the
# cell that builds df1 supplies five values per column — the output looks
# stale; re-run to refresh it.
df1

Unnamed: 0,lkey,value
0,foo,1
1,bar,2
2,baz,3
3,foo,5


In [66]:
# Display the right demo frame.
# NOTE(review): the recorded output of this cell lists four rows, while the
# cell that builds df2 supplies five values per column — the output looks
# stale; re-run to refresh it.
df2

Unnamed: 0,rkey,value
0,foo,5
1,bar,6
2,baz,7
3,foo,8


In [None]:
# Bubble plot of the object held in `top30C`.
# NOTE(review): `top30C` is derived from `selectedC`, which is not assigned in
# any cell visible in this notebook — confirm before re-running.
top30C.plot_bubble()

In [None]:
# Network plot of the object held in `top30C`.
# NOTE(review): `top30C` is derived from `selectedC`, which is not assigned in
# any cell visible in this notebook — confirm before re-running.
top30C.plot_network()

In [None]:
# Display the rules table of `selectedC`.
# NOTE(review): `selectedC` is not assigned in any cell visible in this
# notebook — confirm where it is supposed to come from.
selectedC.rules