In [130]:
from tfcomb import CombObj, DiffCombObj
import os
import pathlib
import pandas as pd
import numpy as np
# ---------------------------------------------------------------------------
# Constants for this script
# ---------------------------------------------------------------------------

# Genome FASTA file (alternative: "../testdaten/hg19_masked.fa")
genome_path = "../testdaten/homo_sapiens.104.mainChr.fa"

# Motif file in MEME format
main_jaspar_file = "../testdaten/JASPAR2020_CORE_vertebrates.meme"

# Folders where the market basket analyses of the clusters and all derived results are put.
result_path = "./results/wp2/"
main_analysis_path = result_path + "main/"
differential_analysis_path = result_path + "diff_analysis/"
differential_analysis_selection_path = differential_analysis_path + "selection/"
answers_path = result_path + "answers/"

# Folder of wp2, where the clusters are.
# Alternatives: "../testdaten/wp2/", "/mnt/workspace_stud/stud3/WP6_data/"
path_to_clusters = "/mnt/workspace_stud/stud4/WP6_data/"

# Create the result folders (same order as they are declared in the pipeline).
for _result_dir in (
    result_path,
    main_analysis_path,
    differential_analysis_path,
    answers_path,
    differential_analysis_selection_path,
):
    if not os.path.exists(_result_dir):
        pathlib.Path(_result_dir).mkdir(parents=True, exist_ok=True)

# Report missing inputs; the notebook only prints a message and keeps going.
for _required_input in (genome_path, main_jaspar_file, path_to_clusters):
    if not os.path.exists(_required_input):
        print(f"ERROR: path {_required_input} does not exist")


In [2]:
def do_market_basket_analyses_for_cell_cluster(cell_cluster_name: str, cell_cluster_path:str):
    '''
        Runs a market basket analysis for a single cell cluster and pickles the result.

        The binding sites are derived from the regions in `cell_cluster_path` using the
        notebook-level `main_jaspar_file` and `genome_path`. When the analysis yields at
        least one rule, the CombObj is written to
        `<main_analysis_path><cell_cluster_name>.pkl`; otherwise a message is printed and
        nothing is saved.

        :param cell_cluster_name: name of the cluster; used for log messages and the pickle name
        :param cell_cluster_path: path of the region file handed to `TFBS_from_motifs`
        :return: None
    '''
    cluster_comb = CombObj()
    cluster_comb.TFBS_from_motifs(
        regions=cell_cluster_path,
        motifs=main_jaspar_file,
        genome=genome_path,
        threads=4,
    )

    print(f'Start market basket analyses for cell-cluster/type: {cell_cluster_name}')
    cluster_comb.market_basket(threads=10)

    rule_count = len(cluster_comb.rules)
    if rule_count <= 0:
        # Nothing to store for this cluster.
        print(f'Could not find TF-cooccurences for cell-cluster/type: {cell_cluster_name}')
        return

    print(f'Finished market basket analyses for cell-cluster/type: {cell_cluster_name}')
    print(f'Found rules: {rule_count}')

    pickle_path = f'{main_analysis_path}{cell_cluster_name}.pkl'
    cluster_comb.to_pickle(pickle_path)
    print(f'Saved: {pickle_path}')

In [3]:
def read_in_file_names_of_folder(rel_path:str):
    """
    Return the names of all regular files located directly inside `rel_path`.

    Sub-directories are skipped and the names are returned without the folder part,
    in the order `os.listdir` reports them.

    :param rel_path: folder to list
    :return: list of file names (str)
    """
    file_names = []
    for entry_name in os.listdir(rel_path):
        if os.path.isfile(os.path.join(rel_path, entry_name)):
            file_names.append(entry_name)
    return file_names

# Collect the file names found in the cluster folder and print them as a quick check.
cluster_file_names = read_in_file_names_of_folder(path_to_clusters)
print(cluster_file_names)





['right-lobe-of-liver.10.bed', 'right-lobe-of-liver.11.bed', 'right-lobe-of-liver.12.bed', 'right-lobe-of-liver.14.bed', 'right-lobe-of-liver.15.bed', 'right-lobe-of-liver.16.bed', 'right-lobe-of-liver.17.bed', 'right-lobe-of-liver.18.bed', 'right-lobe-of-liver.1.bed', 'right-lobe-of-liver.2.bed', 'right-lobe-of-liver.3.bed', 'right-lobe-of-liver.4.bed', 'right-lobe-of-liver.5.bed', 'right-lobe-of-liver.6.bed', 'right-lobe-of-liver.7.bed', 'right-lobe-of-liver.8.bed']


In [None]:
# Has to be tested as soon as wp2 generates new .bed files
# Runs the market basket analysis once per .bed file of the cluster folder.
for file_name in cluster_file_names:
    # splitext only strips the final extension, so a ".bed" inside the name is kept.
    cluster_name, extension = os.path.splitext(file_name)
    if extension.lower() != '.bed':
        # Only .bed files are analysed; anything else in the folder is skipped.
        print(f"Skipping non-.bed file: {file_name}")
        continue

    print(cluster_name)
    print(file_name)
    # os.path.join works with and without a trailing slash on path_to_clusters.
    cluster_path = os.path.join(path_to_clusters, file_name)
    do_market_basket_analyses_for_cell_cluster(cell_cluster_name=cluster_name, cell_cluster_path=cluster_path)

    

# Analysis

### Differential Analysis

In [4]:
# mb = market basket analysis: list the pickles stored in the main analysis folder.
files_main_mb = read_in_file_names_of_folder(main_analysis_path)
print(f"Count of Files: {len(files_main_mb)}", f"Files: {files_main_mb}", sep="\n")


Count of Files: 16
Files: ['right-lobe-of-liver.10.pkl', 'right-lobe-of-liver.11.pkl', 'right-lobe-of-liver.12.pkl', 'right-lobe-of-liver.14.pkl', 'right-lobe-of-liver.15.pkl', 'right-lobe-of-liver.16.pkl', 'right-lobe-of-liver.17.pkl', 'right-lobe-of-liver.18.pkl', 'right-lobe-of-liver.1.pkl', 'right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.3.pkl', 'right-lobe-of-liver.4.pkl', 'right-lobe-of-liver.5.pkl', 'right-lobe-of-liver.6.pkl', 'right-lobe-of-liver.7.pkl', 'right-lobe-of-liver.8.pkl']


In [None]:
# Diff analysis between each cluster:
# every unordered pair (i, j) with i < j is compared exactly once. The full comparison and
# the automatically selected rules are pickled under '<name_i>__<name_j>.pkl' in
# differential_analysis_path and differential_analysis_selection_path respectively.
for i, file in enumerate(files_main_mb):
    print(file)
    name_i = file.split('.pkl')[0]

    # No early exit here: all partners j > i have to be processed, otherwise only the
    # directly following cluster would be compared.
    for j in range(i + 1, len(files_main_mb)):
        file_j = files_main_mb[j]
        name_j = file_j.split('.pkl')[0]
        print(j)
        print(name_j)

        # Both objects are loaded fresh for every pair before their prefix is set.
        A = CombObj().from_pickle(f"{main_analysis_path}{file}")
        print(A)
        A.set_prefix(name_i)
        B = CombObj().from_pickle(f"{main_analysis_path}{file_j}")
        print(B)
        B.set_prefix(name_j)

        compare_obj = A.compare(B)
        pair_file_name = f'{name_i}__{name_j}.pkl'
        compare_obj.to_pickle(f'{differential_analysis_path}{pair_file_name}')

        # select_rules() is called without thresholds, so they are chosen automatically.
        selected_std = compare_obj.select_rules()

        # TODO: Save automatically generated thresholds
        # utils.get_threshold(new.rules.iloc[:,4], 'both', percent=0.05)
        # logfc threshold (-xxx , +xxx)
        #  utils.get_threshold(new.rules.iloc[:,2:4].mean(axis=1), 'upper', percent=0.05)
        # cosine threshold
        selected_std.to_pickle(f'{differential_analysis_selection_path}{pair_file_name}')

print("Done differential analysis")

right-lobe-of-liver.10.pkl
1
right-lobe-of-liver.11
<CombObj: 308349 TFBS (746 unique names) | Market basket analysis: 338172 rules>
<CombObj: 320596 TFBS (746 unique names) | Market basket analysis: 344416 rules>
INFO: Calculating foldchange for contrast: right-lobe-of-liver.10 / right-lobe-of-liver.11
INFO: The calculated log2fc's are found in the rules table (<DiffCombObj>.rules)
INFO: Selecting rules for contrast: ('right-lobe-of-liver.10', 'right-lobe-of-liver.11')
INFO: measure_threshold is None; trying to calculate optimal threshold
INFO: mean_threshold is None; trying to calculate optimal threshold
INFO: Creating subset of rules using thresholds
2
right-lobe-of-liver.12
<CombObj: 308349 TFBS (746 unique names) | Market basket analysis: 338172 rules>
<CombObj: 176578 TFBS (746 unique names) | Market basket analysis: 235437 rules>
INFO: Calculating foldchange for contrast: right-lobe-of-liver.10 / right-lobe-of-liver.12
INFO: The calculated log2fc's are found in the rules table (

## Specific analysis:
### Question:
Does the cluster 'right-lobe-of-liver.1.pkl' have specific TF co-occurrences which can't be found in the other clusters of the right-lobe-of-liver? Which TF co-occurrences are those, and how many are there?
####  Steps:
    - Take all differential market basket (mb) analyses of cluster one and merge them into a pandas dataframe.
    - Remove all differential TF co-occurrences which are found twice in the analysis.
    
    TODO:
    - integrate simplify_rules into the selection
    - classify neg_pos foldchanges as 'positiv' / 'negativ'
    - Merge the diff analyses of an entire cluster into one large dataframe (outer join, not inner join as it is now) in order to evaluate it afterwards
    

In [267]:
def prepare_diff_obj_dataframe(diff_obj: "DiffCombObj") -> pd.DataFrame:
    """
    Build a merge-ready copy of the rules table of a differential analysis.

    A `log2fc_class` column ('negativ' for a log2fc below 0, otherwise 'positiv') is added
    and every column name gets a suffix derived from the two prefixes of `diff_obj`, e.g.
    prefixes ['right-lobe-of-liver.10', 'right-lobe-of-liver.16'] give the suffix
    '_right-lobe-of-liver_10_16'.

    The rules table of `diff_obj` itself is left untouched; all changes are made on a copy.

    :param diff_obj: object providing `.rules` (DataFrame whose fifth column holds the
        log2fc) and `.prefixes` (two names of the form '<tissue>.<cluster number>')
    :return: new DataFrame with the additional class column and suffixed column names
    """
    # possible prefix names ['right-lobe-of-liver.10', 'right-lobe-of-liver.16']
    # rsplit on the last dot so that tissue names containing dots still unpack into two parts.
    tissue_name_c1, cluster_nr_c1 = diff_obj.prefixes[0].rsplit('.', 1)
    tissue_name_c2, cluster_nr_c2 = diff_obj.prefixes[1].rsplit('.', 1)

    suff = ""
    if tissue_name_c1 == tissue_name_c2:
        suff += f"_{tissue_name_c1}"
    else:
        suff += f"_{tissue_name_c1}_{tissue_name_c2}"

    if cluster_nr_c1 == cluster_nr_c2:
        suff += f"_{cluster_nr_c1}"
    else:
        suff += f"_{cluster_nr_c1}_{cluster_nr_c2}"

    # Work on a copy: adding and renaming columns must not alter diff_obj.rules.
    df = diff_obj.rules.copy(deep=True)

    # The log2fc is addressed by position (fifth column) via iloc; plain integer keys on a
    # label-indexed row would depend on the deprecated positional fallback.
    log2fc = df.iloc[:, 4]
    df['log2fc_class'] = np.where(log2fc < 0, 'negativ', 'positiv')
    df.columns = [f'{x}{suff}' for x in df.columns]

    return df


# Find the TF co-occurrences of a tissue that are unique for a specific cluster in the tissue.
# 1. Run the differential analysis,
# 2. read in the selected differential analyses that involve the specific cluster,
# 3. merge their rule tables (outer join on the rule index) into one dataframe per cluster
#    and pickle it to '<answers_path><cluster_name>.pkl'.
# Read in file names of all analyses
files_main_mb = read_in_file_names_of_folder(rel_path=main_analysis_path)
print(f"Count of Files: {len(files_main_mb)}")
#print(f"Files: {files_main_mb}")

# List the folder the objects are actually loaded from further down.
files_diff = read_in_file_names_of_folder(rel_path=differential_analysis_selection_path)
print(f"Count of Files: {len(files_diff)}")

test = None
for file_mb in files_main_mb:
    cluster_name = file_mb.split('.pkl')[0]
    print(cluster_name)

    # Diff files are named '<cluster_a>__<cluster_b>.pkl'. Compare whole cluster names:
    # a plain substring test would also match e.g. '...liver.1' inside '...liver.10'.
    diffs = [name for name in files_diff if cluster_name in name.split('.pkl')[0].split('__')]
    print(len(diffs))
    print(diffs)

    if not diffs:
        # Nothing to merge or save for this cluster.
        print(f"No differential analyses found for cluster: {cluster_name}")
        continue

    # Keeps the read in DiffCombObj diff_objects:
    diff_objects = [
        DiffCombObj().from_pickle(f"{differential_analysis_selection_path}{diff}")
        for diff in diffs
    ]

    # Start with the first rules table and outer-join every further one on the index,
    # so a single diff object is handled as well.
    erg = None
    for diff_obj in diff_objects:
        pair_df = prepare_diff_obj_dataframe(diff_obj=diff_obj)
        if erg is None:
            erg = pair_df
        else:
            erg = erg.merge(pair_df, how='outer', left_index=True, right_index=True)

    test = erg
    erg.to_pickle(path=f"{answers_path}{cluster_name}.pkl")

print("Done")
# Show the columns of the last merged dataframe (None when nothing was merged).
None if test is None else test.columns

Count of Files: 16
Count of Files: 120
right-lobe-of-liver.10
15
['right-lobe-of-liver.10__right-lobe-of-liver.11.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.12.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.14.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.15.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.16.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.17.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.18.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.1.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.3.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.4.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.5.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.6.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.7.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.8.pkl']
right-lobe-of-liver.11
15
['right-lobe-of-liver.10__right-lobe-of-liver.11.pkl', 'right-lobe-of-liver.11__right-lobe-of-liver.12.pkl

right-lobe-of-liver.2
15
['right-lobe-of-liver.10__right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.11__right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.12__right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.14__right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.15__right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.16__right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.17__right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.18__right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.1__right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.2__right-lobe-of-liver.3.pkl', 'right-lobe-of-liver.2__right-lobe-of-liver.4.pkl', 'right-lobe-of-liver.2__right-lobe-of-liver.5.pkl', 'right-lobe-of-liver.2__right-lobe-of-liver.6.pkl', 'right-lobe-of-liver.2__right-lobe-of-liver.7.pkl', 'right-lobe-of-liver.2__right-lobe-of-liver.8.pkl']
right-lobe-of-liver.3
15
['right-lobe-of-liver.10__right-lobe-of-liver.3.pkl', 'right-lobe-of-liver.11__right-lobe-of-liver.3.pkl', 'right-lobe-of-liver.12__right-lobe-of-liver.3.pkl', '

Index(['TF1_right-lobe-of-liver_10_8', 'TF2_right-lobe-of-liver_10_8',
       'right-lobe-of-liver.10_cosine_right-lobe-of-liver_10_8',
       'right-lobe-of-liver.8_cosine_right-lobe-of-liver_10_8',
       'right-lobe-of-liver.10/right-lobe-of-liver.8_cosine_log2fc_right-lobe-of-liver_10_8',
       'log2fc_class_right-lobe-of-liver_10_8', 'TF1_right-lobe-of-liver_11_8',
       'TF2_right-lobe-of-liver_11_8',
       'right-lobe-of-liver.11_cosine_right-lobe-of-liver_11_8',
       'right-lobe-of-liver.8_cosine_right-lobe-of-liver_11_8',
       'right-lobe-of-liver.11/right-lobe-of-liver.8_cosine_log2fc_right-lobe-of-liver_11_8',
       'log2fc_class_right-lobe-of-liver_11_8', 'TF1_right-lobe-of-liver_12_8',
       'TF2_right-lobe-of-liver_12_8',
       'right-lobe-of-liver.12_cosine_right-lobe-of-liver_12_8',
       'right-lobe-of-liver.8_cosine_right-lobe-of-liver_12_8',
       'right-lobe-of-liver.12/right-lobe-of-liver.8_cosine_log2fc_right-lobe-of-liver_12_8',
       'log2fc_class_r

In [283]:
# List the answer pickles and inspect the first one only.
answer_file_names = read_in_file_names_of_folder(answers_path)
print(answer_file_names)

cluster_dfs = []
df = None
for name in answer_file_names[:1]:
    df = pd.read_pickle(f"{answers_path}{name}")
    # The file name (not the dataframe) is what gets recorded here.
    cluster_dfs.append(name)

# Keep only the rows that have a value in every 'log2fc_class' column.
filter_columns = [column for column in df.columns if 'log2fc_class' in column]
has_all_classes = df[filter_columns].notna().all(axis=1)
filtered_df = df[has_all_classes]
filtered_df
#df3.iloc[:, 2:3]
#df = pd.read_pickle(f"{answers_path}right-lobe-of-liver.6.pkl")
#df = pd.read_pickle(f"{differential_analysis_selection_path}{right-lobe-of-liver.6.pkl}")


#original = CombObj().from_pickle(f"{main_analysis_path}right-lobe-of-liver.6.pkl")
#original.rules.loc[df.index]


['right-lobe-of-liver.10.pkl', 'right-lobe-of-liver.11.pkl', 'right-lobe-of-liver.12.pkl', 'right-lobe-of-liver.14.pkl', 'right-lobe-of-liver.15.pkl', 'right-lobe-of-liver.16.pkl', 'right-lobe-of-liver.17.pkl', 'right-lobe-of-liver.18.pkl', 'right-lobe-of-liver.1.pkl', 'right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.3.pkl', 'right-lobe-of-liver.4.pkl', 'right-lobe-of-liver.5.pkl', 'right-lobe-of-liver.6.pkl', 'right-lobe-of-liver.7.pkl', 'right-lobe-of-liver.8.pkl']


Unnamed: 0,TF1_right-lobe-of-liver_10_11,TF2_right-lobe-of-liver_10_11,right-lobe-of-liver.10_cosine_right-lobe-of-liver_10_11,right-lobe-of-liver.11_cosine_right-lobe-of-liver_10_11,right-lobe-of-liver.10/right-lobe-of-liver.11_cosine_log2fc_right-lobe-of-liver_10_11,log2fc_class_right-lobe-of-liver_10_11,TF1_right-lobe-of-liver_10_12,TF2_right-lobe-of-liver_10_12,right-lobe-of-liver.10_cosine_right-lobe-of-liver_10_12,right-lobe-of-liver.12_cosine_right-lobe-of-liver_10_12,...,right-lobe-of-liver.10_cosine_right-lobe-of-liver_10_7,right-lobe-of-liver.7_cosine_right-lobe-of-liver_10_7,right-lobe-of-liver.10/right-lobe-of-liver.7_cosine_log2fc_right-lobe-of-liver_10_7,log2fc_class_right-lobe-of-liver_10_7,TF1_right-lobe-of-liver_10_8,TF2_right-lobe-of-liver_10_8,right-lobe-of-liver.10_cosine_right-lobe-of-liver_10_8,right-lobe-of-liver.8_cosine_right-lobe-of-liver_10_8,right-lobe-of-liver.10/right-lobe-of-liver.8_cosine_log2fc_right-lobe-of-liver_10_8,log2fc_class_right-lobe-of-liver_10_8


In [243]:
# Load the merged answer table of cluster 6 plus the selected and the full differential
# analysis of the pair 10 / 16 for manual inspection.
df = pd.read_pickle(f"{answers_path}right-lobe-of-liver.6.pkl")
selection = DiffCombObj().from_pickle(f"{differential_analysis_selection_path}right-lobe-of-liver.10__right-lobe-of-liver.16.pkl")
selection_orig = DiffCombObj().from_pickle(f"{differential_analysis_path}right-lobe-of-liver.10__right-lobe-of-liver.16.pkl")
# Bare expressions that are not the last statement of the cell are evaluated but not shown.
selection_orig
selection

original = CombObj().from_pickle(f"{main_analysis_path}right-lobe-of-liver.6.pkl")
# Looks up the rules of the answer table in the original cluster-6 rules (result discarded).
original.rules.loc[df.index]
# Only this last expression is displayed as the cell output.
selection.prefixes

['right-lobe-of-liver.10', 'right-lobe-of-liver.16']

In [182]:
# Load the merged answer tables of clusters 6, 11 and 14 and display the one of cluster 14.
df_6, df_11, df_14 = (
    pd.read_pickle(f"{answers_path}right-lobe-of-liver.{cluster_nr}.pkl")
    for cluster_nr in (6, 11, 14)
)
df_14

Unnamed: 0,TF1_right-lobe-of-liver.10_right-lobe-of-liver.14,TF2_right-lobe-of-liver.10_right-lobe-of-liver.14,right-lobe-of-liver.10_cosine,right-lobe-of-liver.14_cosine_right-lobe-of-liver.10_right-lobe-of-liver.14,right-lobe-of-liver.10/right-lobe-of-liver.14_cosine_log2fc,TF1_right-lobe-of-liver.11_right-lobe-of-liver.14,TF2_right-lobe-of-liver.11_right-lobe-of-liver.14,right-lobe-of-liver.11_cosine,right-lobe-of-liver.14_cosine_right-lobe-of-liver.11_right-lobe-of-liver.14,right-lobe-of-liver.11/right-lobe-of-liver.14_cosine_log2fc,...,TF1_right-lobe-of-liver.14_right-lobe-of-liver.7,TF2_right-lobe-of-liver.14_right-lobe-of-liver.7,right-lobe-of-liver.14_cosine_right-lobe-of-liver.14_right-lobe-of-liver.7,right-lobe-of-liver.7_cosine,right-lobe-of-liver.14/right-lobe-of-liver.7_cosine_log2fc,TF1,TF2,right-lobe-of-liver.14_cosine,right-lobe-of-liver.8_cosine,right-lobe-of-liver.14/right-lobe-of-liver.8_cosine_log2fc
RXRBvar.2-ESRRB,RXRBvar.2,ESRRB,0.018847,0.172958,-2.446904,RXRBvar.2,ESRRB,0.018536,0.178292,-2.493541,...,RXRBvar.2,ESRRB,0.174534,0.030006,2.03857,RXRBvar.2,ESRRB,0.182861,0.025756,2.23683
ESRRB-RXRBvar.2,ESRRB,RXRBvar.2,0.018847,0.172958,-2.446904,ESRRB,RXRBvar.2,0.018536,0.178292,-2.493541,...,ESRRB,RXRBvar.2,0.174534,0.030006,2.03857,ESRRB,RXRBvar.2,0.182861,0.025756,2.23683
MEF2A-MEIS2var.2,MEF2A,MEIS2var.2,0.046449,0.273918,-2.219153,MEF2A,MEIS2var.2,0.049461,0.289161,-2.221594,...,MEF2A,MEIS2var.2,0.26996,0.076923,1.616588,MEF2A,MEIS2var.2,0.285203,0.076233,1.70144
MEIS2var.2-MEF2A,MEIS2var.2,MEF2A,0.046449,0.273918,-2.219153,MEIS2var.2,MEF2A,0.049461,0.289161,-2.221594,...,MEIS2var.2,MEF2A,0.26996,0.076923,1.616588,MEIS2var.2,MEF2A,0.285203,0.076233,1.70144
TFAP4-IRF4,TFAP4,IRF4,0.022615,0.16098,-2.203226,TFAP4,IRF4,0.021202,0.167671,-2.305012,...,TFAP4,IRF4,0.16427,0.061366,1.215195,TFAP4,IRF4,0.171509,0.030669,1.993759
IRF4-TFAP4,IRF4,TFAP4,0.022615,0.16098,-2.203226,IRF4,TFAP4,0.021202,0.167671,-2.305012,...,IRF4,TFAP4,0.16427,0.061366,1.215195,IRF4,TFAP4,0.171509,0.030669,1.993759
TFAP4-IRF9,TFAP4,IRF9,0.018785,0.136496,-2.139804,TFAP4,IRF9,0.017687,0.14065,-2.218702,...,TFAP4,IRF9,0.140227,0.049413,1.24974,TFAP4,IRF9,0.148155,0.027582,1.900026
IRF9-TFAP4,IRF9,TFAP4,0.018785,0.136496,-2.139804,IRF9,TFAP4,0.017687,0.14065,-2.218702,...,IRF9,TFAP4,0.140227,0.049413,1.24974,IRF9,TFAP4,0.148155,0.027582,1.900026
HOXA6-Crx,HOXA6,Crx,0.028562,0.149944,-1.902399,HOXA6,Crx,0.028015,0.156802,-1.974105,...,HOXA6,Crx,0.15326,0.047705,1.402865,HOXA6,Crx,0.161341,0.045076,1.529402
Crx-HOXA6,Crx,HOXA6,0.028562,0.149944,-1.902399,Crx,HOXA6,0.028015,0.156802,-1.974105,...,Crx,HOXA6,0.15326,0.047705,1.402865,Crx,HOXA6,0.161341,0.045076,1.529402


In [203]:
#original.rules.describe()
# Display the answer table of cluster 14 (df_14 is loaded in an earlier cell).
df_14

Unnamed: 0,TF1_right-lobe-of-liver.10_right-lobe-of-liver.14,TF2_right-lobe-of-liver.10_right-lobe-of-liver.14,right-lobe-of-liver.10_cosine,right-lobe-of-liver.14_cosine_right-lobe-of-liver.10_right-lobe-of-liver.14,right-lobe-of-liver.10/right-lobe-of-liver.14_cosine_log2fc,TF1_right-lobe-of-liver.11_right-lobe-of-liver.14,TF2_right-lobe-of-liver.11_right-lobe-of-liver.14,right-lobe-of-liver.11_cosine,right-lobe-of-liver.14_cosine_right-lobe-of-liver.11_right-lobe-of-liver.14,right-lobe-of-liver.11/right-lobe-of-liver.14_cosine_log2fc,...,TF1_right-lobe-of-liver.14_right-lobe-of-liver.7,TF2_right-lobe-of-liver.14_right-lobe-of-liver.7,right-lobe-of-liver.14_cosine_right-lobe-of-liver.14_right-lobe-of-liver.7,right-lobe-of-liver.7_cosine,right-lobe-of-liver.14/right-lobe-of-liver.7_cosine_log2fc,TF1,TF2,right-lobe-of-liver.14_cosine,right-lobe-of-liver.8_cosine,right-lobe-of-liver.14/right-lobe-of-liver.8_cosine_log2fc
RXRBvar.2-ESRRB,RXRBvar.2,ESRRB,0.018847,0.172958,-2.446904,RXRBvar.2,ESRRB,0.018536,0.178292,-2.493541,...,RXRBvar.2,ESRRB,0.174534,0.030006,2.03857,RXRBvar.2,ESRRB,0.182861,0.025756,2.23683
ESRRB-RXRBvar.2,ESRRB,RXRBvar.2,0.018847,0.172958,-2.446904,ESRRB,RXRBvar.2,0.018536,0.178292,-2.493541,...,ESRRB,RXRBvar.2,0.174534,0.030006,2.03857,ESRRB,RXRBvar.2,0.182861,0.025756,2.23683
MEF2A-MEIS2var.2,MEF2A,MEIS2var.2,0.046449,0.273918,-2.219153,MEF2A,MEIS2var.2,0.049461,0.289161,-2.221594,...,MEF2A,MEIS2var.2,0.26996,0.076923,1.616588,MEF2A,MEIS2var.2,0.285203,0.076233,1.70144
MEIS2var.2-MEF2A,MEIS2var.2,MEF2A,0.046449,0.273918,-2.219153,MEIS2var.2,MEF2A,0.049461,0.289161,-2.221594,...,MEIS2var.2,MEF2A,0.26996,0.076923,1.616588,MEIS2var.2,MEF2A,0.285203,0.076233,1.70144
TFAP4-IRF4,TFAP4,IRF4,0.022615,0.16098,-2.203226,TFAP4,IRF4,0.021202,0.167671,-2.305012,...,TFAP4,IRF4,0.16427,0.061366,1.215195,TFAP4,IRF4,0.171509,0.030669,1.993759
IRF4-TFAP4,IRF4,TFAP4,0.022615,0.16098,-2.203226,IRF4,TFAP4,0.021202,0.167671,-2.305012,...,IRF4,TFAP4,0.16427,0.061366,1.215195,IRF4,TFAP4,0.171509,0.030669,1.993759
TFAP4-IRF9,TFAP4,IRF9,0.018785,0.136496,-2.139804,TFAP4,IRF9,0.017687,0.14065,-2.218702,...,TFAP4,IRF9,0.140227,0.049413,1.24974,TFAP4,IRF9,0.148155,0.027582,1.900026
IRF9-TFAP4,IRF9,TFAP4,0.018785,0.136496,-2.139804,IRF9,TFAP4,0.017687,0.14065,-2.218702,...,IRF9,TFAP4,0.140227,0.049413,1.24974,IRF9,TFAP4,0.148155,0.027582,1.900026
HOXA6-Crx,HOXA6,Crx,0.028562,0.149944,-1.902399,HOXA6,Crx,0.028015,0.156802,-1.974105,...,HOXA6,Crx,0.15326,0.047705,1.402865,HOXA6,Crx,0.161341,0.045076,1.529402
Crx-HOXA6,Crx,HOXA6,0.028562,0.149944,-1.902399,Crx,HOXA6,0.028015,0.156802,-1.974105,...,Crx,HOXA6,0.15326,0.047705,1.402865,Crx,HOXA6,0.161341,0.045076,1.529402


### Trial and Error section

In [70]:
# Experiment: stack the rule tables of six differential analyses of cluster 16 and look
# for TF pairs that occur more than once.
#mb_obj = CombObj().from_pickle(f"{main_analysis_path}right-lobe-of-liver.10.pkl")
obj_1 = DiffCombObj().from_pickle(f"{differential_analysis_path}right-lobe-of-liver.16__right-lobe-of-liver.1.pkl")
obj_2 = DiffCombObj().from_pickle(f"{differential_analysis_path}right-lobe-of-liver.16__right-lobe-of-liver.2.pkl")
obj_3 = DiffCombObj().from_pickle(f"{differential_analysis_path}right-lobe-of-liver.16__right-lobe-of-liver.3.pkl")
obj_4 = DiffCombObj().from_pickle(f"{differential_analysis_path}right-lobe-of-liver.16__right-lobe-of-liver.4.pkl")
obj_5 = DiffCombObj().from_pickle(f"{differential_analysis_path}right-lobe-of-liver.16__right-lobe-of-liver.5.pkl")
obj_6 = DiffCombObj().from_pickle(f"{differential_analysis_path}right-lobe-of-liver.16__right-lobe-of-liver.6.pkl")
#mb_obj.rules
#type(diff_obj.rules)
#diff_obj2.rules
# Rows are stacked; join="inner" keeps only the columns that all six rule tables share.
df_diff = pd.concat([obj_1.rules, obj_2.rules, obj_3.rules, obj_4.rules, obj_5.rules, obj_6.rules], join="inner")



#df_diff2 = pd.concat([diff_obj_1_1.rules,diff_obj2.rules, diff_obj1_2.rules])

# keep='first' flags every repeated (TF1, TF2) pair except its first occurrence.
unified_duplicates = df_diff[df_diff.duplicated(subset=['TF1', 'TF2'], keep='first')]

# NOTE(review): `diff_obj1_2` is not defined anywhere in the visible notebook, so this line
# presumably relies on leftover kernel state and fails on a fresh kernel — confirm or remove.
df_diff2 = pd.concat([unified_duplicates, diff_obj1_2.rules])

unified_duplicates2 = df_diff2[df_diff2.duplicated(subset=['TF1', 'TF2'], keep='first')]

#df_diff = df_diff.drop_duplicates(subset=['TF1', 'TF2'])
#unified_duplicates
#unified_duplicates
#unified_duplicates2
#diff_obj_1_1.rules
#unified_duplicates



In [89]:
# Load two selected differential analyses of cluster 16, simplify their rules and
# join the two rule tables on their shared index.
obj_1 = DiffCombObj().from_pickle(
    f"{differential_analysis_selection_path}right-lobe-of-liver.16__right-lobe-of-liver.1.pkl"
)
obj_2 = DiffCombObj().from_pickle(
    f"{differential_analysis_selection_path}right-lobe-of-liver.16__right-lobe-of-liver.2.pkl"
)

obj_1.simplify_rules()
obj_2.simplify_rules()

# Clashing column names are told apart by the prefixes of the pair they come from.
left_suffix = f"_{obj_1.prefixes[0]}_{obj_1.prefixes[1]}"
right_suffix = f"_{obj_2.prefixes[0]}_{obj_2.prefixes[1]}"
obj3 = obj_1.rules.merge(
    obj_2.rules,
    left_index=True,
    right_index=True,
    suffixes=(left_suffix, right_suffix),
)
obj3


Unnamed: 0,TF1_right-lobe-of-liver.16_right-lobe-of-liver.1,TF2_right-lobe-of-liver.16_right-lobe-of-liver.1,right-lobe-of-liver.16_cosine_right-lobe-of-liver.16_right-lobe-of-liver.1,right-lobe-of-liver.1_cosine,right-lobe-of-liver.16/right-lobe-of-liver.1_cosine_log2fc,TF1_right-lobe-of-liver.16_right-lobe-of-liver.2,TF2_right-lobe-of-liver.16_right-lobe-of-liver.2,right-lobe-of-liver.16_cosine_right-lobe-of-liver.16_right-lobe-of-liver.2,right-lobe-of-liver.2_cosine,right-lobe-of-liver.16/right-lobe-of-liver.2_cosine_log2fc
MLXIPL-MLX,MLXIPL,MLX,0.016039,0.049144,-1.365381,MLXIPL,MLX,0.015862,0.031589,-0.813708
ETV5-EWSR1-FLI1,ETV5,EWSR1-FLI1,0.017486,0.043775,-1.118498,ETV5,EWSR1-FLI1,0.017295,0.041284,-1.05747
MEF2D-TBX1,MEF2D,TBX1,0.021175,0.051722,-1.116865,MEF2D,TBX1,0.020906,0.052591,-1.155862
MEF2D-TBX15,MEF2D,TBX15,0.021175,0.051722,-1.116865,MEF2D,TBX15,0.020906,0.052591,-1.155862
Nr2f6var.2-MEF2B,Nr2f6var.2,MEF2B,0.018306,0.043053,-1.045135,Nr2f6var.2,MEF2B,0.018095,0.036975,-0.864971
ZBTB7A-EWSR1-FLI1,ZBTB7A,EWSR1-FLI1,0.018895,0.040685,-0.936127,ZBTB7A,EWSR1-FLI1,0.018666,0.040533,-0.947275
MEF2C-KLF2,MEF2C,KLF2,0.027347,0.054701,-0.884938,MEF2C,KLF2,0.027347,0.051211,-0.79926
MEF2C-KLF6,MEF2C,KLF6,0.029429,0.05814,-0.87568,MEF2C,KLF6,0.029532,0.056594,-0.836844
TBX1-MEF2C,TBX1,MEF2C,0.022007,0.043663,-0.850429,TBX1,MEF2C,0.021741,0.047505,-0.976269
TBX5-MEF2C,TBX5,MEF2C,0.022007,0.043663,-0.850429,TBX5,MEF2C,0.021741,0.047505,-0.976269


In [None]:
# Look up the rule 'Foxd3-ONECUT2' in two rule tables.
# NOTE(review): `diff_obj_1_1` and `diff_obj2` are not defined anywhere in the visible
# notebook, so this cell presumably relies on leftover kernel state — confirm or remove.
diff_obj_1_1.rules.loc['Foxd3-ONECUT2']
diff_obj2.rules.loc['Foxd3-ONECUT2']

In [None]:
# NOTE(review): `selectedC` is not defined anywhere in the visible notebook, so this cell
# presumably relies on leftover kernel state — confirm or remove.
top30C = selectedC.select_top_rules(n=30)
# head(31) asks for up to 31 rows although n=30 rules were selected — TODO confirm intent.
top30C.rules.head(31)

In [237]:
# Toy frames for trying out index-based outer merges and the effect of the suffixes.
df1 = pd.DataFrame(
    {
        'lkey': ['foo', 'bar', 'baz', 'foo', 'fehler', 'nocheiner'],
        'value': [1, 2, 3, 5, 7, 7],
    },
    index=[f'my{number}' for number in range(1, 7)],
)
df2 = pd.DataFrame(
    {
        'rkey': ['foo', 'bar', 'baz', 'foo', 'test'],
        'value': [5, 6, 7, 8, 9],
    },
    index=['my1', 'not2', 'my3', 'not4', 'my5'],
)
df3 = pd.DataFrame(
    {
        'rkey': ['new', 'lol'],
        'value': [5, 6],
    },
    index=['my1', 'not2'],
)

# Both merges are outer joins on the index; only the suffixes differ.
index_join = dict(how='outer', left_index=True, right_index=True)
erg = df1.merge(df2, suffixes=("_x", "_y"), **index_join)
erg = erg.merge(df3, suffixes=("_x", "_"), **index_join)
erg


Unnamed: 0,lkey,value_x,rkey_x,value_y,rkey_,value
my1,foo,1.0,foo,5.0,new,5.0
my2,bar,2.0,,,,
my3,baz,3.0,baz,7.0,,
my4,foo,5.0,,,,
my5,fehler,7.0,test,9.0,,
my6,nocheiner,7.0,,,,
not2,,,bar,6.0,lol,6.0
not4,,,foo,8.0,,


In [229]:
# Display the first example frame.
df1

Unnamed: 0,lkey,value
my1,foo,1
my2,bar,2
my3,baz,3
my4,foo,5
my5,fehler,7
my6,nocheiner,7


In [239]:
# Append '_df2' to every column name of df2 in place.
# NOTE(review): not idempotent — running this cell again appends the suffix a second time.
df2.columns = [f'{x}_df2' for x in df2.columns]
df2

Unnamed: 0,rkey_df2,value_df2
my1,foo,5
not2,bar,6
my3,baz,7
not4,foo,8
my5,test,9


In [242]:
# NOTE(review): the column names of df2 are strings, so `x + 1` raises a TypeError here; the
# recorded SyntaxError output presumably stems from an earlier version of this cell — confirm intent.
[x+ 1 for x in df2.columns]

SyntaxError: invalid syntax (2074870395.py, line 1)

In [221]:
# Classify each row by its 'value' column: below 5 -> 'niedrig', otherwise 'hoch'.
# The column is addressed by label: an integer key on a label-indexed row (x[1]) depends on
# the deprecated positional fallback of Series.__getitem__. np.where also avoids the
# row-wise apply.
df1['class'] = np.where(df1['value'] < 5, 'niedrig', 'hoch')

In [223]:
# Show only the rows whose class is 'hoch'.
df1[df1['class'] == 'hoch']

Unnamed: 0,lkey,value,class
my4,foo,5,hoch
my5,fehler,7,hoch
my6,nocheiner,7,hoch


In [211]:
# Count the rows for every (class, value) combination.
df1.groupby(['class', 'value']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,lkey
class,value,Unnamed: 2_level_1
hoch,1,1
hoch,2,1
hoch,3,1
hoch,5,1
niedrig,7,2


In [171]:
# Display the second example frame.
df2

Unnamed: 0,rkey,value
my1,foo,5
not2,bar,6
my3,baz,7
not4,foo,8
my5,test,9


In [197]:
# Select the third column (position 2) of df3.
# NOTE(review): the df3 built in the visible notebook has only two columns ('rkey', 'value');
# the recorded output (column 'drei') presumably comes from an older df3 — confirm.
df3.iloc[:, 2:3]

my1     5
not2    6
test    1
Name: drei, dtype: int64

In [None]:
# NOTE(review): `top30C` is derived from `selectedC`, which is not defined in the visible notebook.
top30C.plot_bubble()

In [None]:
# NOTE(review): `top30C` is derived from `selectedC`, which is not defined in the visible notebook.
top30C.plot_network()

In [None]:
# NOTE(review): `selectedC` is not defined anywhere in the visible notebook — confirm or remove.
selectedC.rules

In [147]:
#### OLD
# Earlier version of the per-cluster evaluation. For every cluster it inner-joins the rule
# tables of its selected differential analyses three times (all rules, only positive
# fold changes, only negative fold changes) and pickles the three results.
#
# Find the specific TF co-occurrences of a tissue that are unique for the specific cluster in the tissue.
# 1. Differential analysis,
# 2. Read in the differential analyses for the specific cluster
# 3. Find the TF co-occurrences of the diff objects which occur in each cluster
# Read in file names of all analyses
files_main_mb= read_in_file_names_of_folder(rel_path=main_analysis_path)
print(f"Count of Files: {len(files_main_mb)}")
#print(f"Files: {files_main_mb}")

# The names are listed from differential_analysis_path, but the objects are loaded from
# differential_analysis_selection_path below.
files_diff= read_in_file_names_of_folder(rel_path=differential_analysis_path)
print(f"Count of Files: {len(files_diff)}")
#print(f"Files: {files_diff}")
# NOTE(review): `test` is never reassigned in this cell, so the final expression is "".
test = ""
for file_mb in files_main_mb:
    cluster_name = file_mb.split('.pkl')[0]
    print(cluster_name)
    # NOTE(review): substring test — 'right-lobe-of-liver.1' is also contained in
    # 'right-lobe-of-liver.10__...' and similar names, so cluster 1 picks up foreign files.
    diffs = list(filter(lambda x: cluster_name in x, files_diff))
    print(len(diffs))
    print(diffs)
    
    # Keeps the read in DiffCombObj diff_objects:
    diff_objects = []
    
    for diff in diffs:
        diff_obj = DiffCombObj().from_pickle(f"{differential_analysis_selection_path}{diff}")
        diff_objects.append(diff_obj)
    
    # first DiffObj dataframe (raises IndexError when no diff file matched)
    initial_df = diff_objects[0].rules
    
    # has neg and pos foldchange
    cross_product_merged = initial_df
    
    # only pos foldchange (fifth column of the rules table)
    pos_merged = initial_df[initial_df.iloc[:,4] > 0.00]
    
    # only neg foldchange
    neg_merged = initial_df[initial_df.iloc[:,4] < 0.00]
    for i in range(len(diff_objects)-1):
        # obj_1 only provides the left suffix; the left frame of each merge is the
        # accumulated result, not obj_1.rules.
        obj_1= diff_objects[i]
        obj_2 = diff_objects[i + 1]
            
        # cross_product: merge rules dataframes by index (TFs); no `how` given, so merge's default join applies
        cross_product = cross_product_merged.merge(obj_2.rules, left_index=True, right_index=True, suffixes=(f"_{obj_1.prefixes[0]}_{obj_1.prefixes[1]}", f"_{obj_2.prefixes[0]}_{obj_2.prefixes[1]}"))
        cross_product_merged = cross_product.copy(deep=True)
        
        # pos: merge rules dataframes by index (TFs)
        obj2_df_pos = obj_2.rules[obj_2.rules.iloc[:,4] > 0.00]
        df_pos_merged = pos_merged.merge(obj2_df_pos, left_index=True, right_index=True, suffixes=(f"_{obj_1.prefixes[0]}_{obj_1.prefixes[1]}", f"_{obj_2.prefixes[0]}_{obj_2.prefixes[1]}"))
        pos_merged = df_pos_merged.copy(deep=True)
        
        # neg: merge rules dataframes by index (TFs)
        obj2_df_neg = obj_2.rules[obj_2.rules.iloc[:,4] < 0.00]
        df_neg_merged = neg_merged.merge(obj2_df_neg, left_index=True, right_index=True, suffixes=(f"_{obj_1.prefixes[0]}_{obj_1.prefixes[1]}", f"_{obj_2.prefixes[0]}_{obj_2.prefixes[1]}"))
        neg_merged = df_neg_merged.copy(deep=True)
        
        
    # The three results are written into answers_path with the suffixes _cross, _pos and _neg.
    cross_product_merged.to_pickle(path=f"{answers_path}{cluster_name}_cross.pkl")
    pos_merged.to_pickle(path=f"{answers_path}{cluster_name}_pos.pkl")
    neg_merged.to_pickle(path=f"{answers_path}{cluster_name}_neg.pkl")
    
print("Done")    
test


Count of Files: 16
Count of Files: 120
right-lobe-of-liver.10
15
['right-lobe-of-liver.10__right-lobe-of-liver.11.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.12.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.14.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.15.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.16.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.17.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.18.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.1.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.3.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.4.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.5.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.6.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.7.pkl', 'right-lobe-of-liver.10__right-lobe-of-liver.8.pkl']
Done


Unnamed: 0,TF1_right-lobe-of-liver.10_right-lobe-of-liver.11,TF2_right-lobe-of-liver.10_right-lobe-of-liver.11,right-lobe-of-liver.10_cosine_right-lobe-of-liver.10_right-lobe-of-liver.11,right-lobe-of-liver.11_cosine,right-lobe-of-liver.10/right-lobe-of-liver.11_cosine_log2fc,TF1_right-lobe-of-liver.10_right-lobe-of-liver.12,TF2_right-lobe-of-liver.10_right-lobe-of-liver.12,right-lobe-of-liver.10_cosine_right-lobe-of-liver.10_right-lobe-of-liver.12,right-lobe-of-liver.12_cosine,right-lobe-of-liver.10/right-lobe-of-liver.12_cosine_log2fc
BHLHE40-ZFP42,BHLHE40,ZFP42,0.038881,0.021341,0.74001,BHLHE40,ZFP42,0.040016,0.019084,0.879717
ZFP42-BHLHE40,ZFP42,BHLHE40,0.038881,0.021341,0.74001,ZFP42,BHLHE40,0.040016,0.019084,0.879717
DMRTC2-LMX1B,DMRTC2,LMX1B,0.044013,0.023768,0.772256,DMRTC2,LMX1B,0.044961,0.010286,1.655811
LMX1B-DMRTC2,LMX1B,DMRTC2,0.044013,0.023768,0.772256,LMX1B,DMRTC2,0.044961,0.010286,1.655811
LMX1B-DMRT3,LMX1B,DMRT3,0.042973,0.022586,0.802055,LMX1B,DMRT3,0.043955,0.012021,1.479514
DMRT3-LMX1B,DMRT3,LMX1B,0.042973,0.022586,0.802055,DMRT3,LMX1B,0.043955,0.012021,1.479514
MITF-USF1,MITF,USF1,0.054894,0.027921,0.866774,MITF,USF1,0.056648,0.027051,0.926033
USF1-MITF,USF1,MITF,0.054894,0.027921,0.866774,USF1,MITF,0.056648,0.027051,0.926033
PKNOX2-PAX5,PKNOX2,PAX5,0.058527,0.029318,0.891688,PKNOX2,PAX5,0.060783,0.009267,2.141307
PAX5-PKNOX2,PAX5,PKNOX2,0.058527,0.029318,0.891688,PAX5,PKNOX2,0.060783,0.009267,2.141307
