In [1]:
from tfcomb import CombObj, DiffCombObj
import os
import pathlib
import pandas as pd
import numpy as np
'''
Constants for this script
'''

#genome_path="../testdaten/hg19_masked.fa"
genome_path="../testdaten/homo_sapiens.104.mainChr.fa"

main_jaspar_file="../testdaten/JASPAR2020_CORE_vertebrates.meme" 

# path where market basket analyses for cluster are put.
result_path="./results/wp2/"
main_analysis_path=f"{result_path}main/"
differential_analysis_path=f"{result_path}diff_analysis/"

differential_analysis_selection_path=f"{differential_analysis_path}selection/"
answers_path=f"{result_path}answers/"

### folder of wp2, where the clusters are
#path_to_clusters="../testdaten/wp2/"
#path_to_clusters="/mnt/workspace_stud/stud3/WP6_data/"
path_to_clusters="/mnt/workspace_stud/stud4/WP6_data/"

# create result folders 
if not os.path.exists(result_path):
     pathlib.Path(result_path).mkdir(parents=True, exist_ok=True)

if not os.path.exists(main_analysis_path):
     pathlib.Path(main_analysis_path).mkdir(parents=True, exist_ok=True)

if not os.path.exists(differential_analysis_path):
     pathlib.Path(differential_analysis_path).mkdir(parents=True, exist_ok=True)

if not os.path.exists(answers_path):
     pathlib.Path(answers_path).mkdir(parents=True, exist_ok=True)

if not os.path.exists(differential_analysis_selection_path):
     pathlib.Path(differential_analysis_selection_path).mkdir(parents=True, exist_ok=True)

if not os.path.exists(genome_path):
    print(f"ERROR: path {genome_path} does not exist")

if not os.path.exists(main_jaspar_file):
    print(f"ERROR: path {main_jaspar_file} does not exist")

if not os.path.exists(path_to_clusters):
    print(f"ERROR: path {path_to_clusters} does not exist")


In [2]:
def do_market_basket_analyses_for_cell_cluster(cell_cluster_name: str, cell_cluster_path:str):
    '''
        Does market basket analyses.
    '''
    comb = CombObj()
    comb.TFBS_from_motifs(regions= cell_cluster_path,
                   motifs=main_jaspar_file,
                   genome=genome_path,
                   threads=4)
    
    print(f'Start market basket analyses for cell-cluster/type: {cell_cluster_name}')
    comb.market_basket(threads=10)
    if len(comb.rules) <= 0:
        print(f'Could not find TF-cooccurences for cell-cluster/type: {cell_cluster_name}')
        return
    print(f'Finished market basket analyses for cell-cluster/type: {cell_cluster_name}')
    print(f'Found rules: {len(comb.rules)}')
    comb.to_pickle(f'{main_analysis_path}{cell_cluster_name}.pkl')
    print(f'Saved: {main_analysis_path}{cell_cluster_name}.pkl')

In [3]:
def read_in_file_names_of_folder(rel_path:str):
    return [f for f in os.listdir(rel_path) if os.path.isfile(os.path.join(rel_path, f))]

cluster_file_names = read_in_file_names_of_folder(rel_path=path_to_clusters)
print(cluster_file_names)





['right-lobe-of-liver.10.bed', 'right-lobe-of-liver.11.bed', 'right-lobe-of-liver.12.bed', 'right-lobe-of-liver.14.bed', 'right-lobe-of-liver.15.bed', 'right-lobe-of-liver.16.bed', 'right-lobe-of-liver.17.bed', 'right-lobe-of-liver.18.bed', 'right-lobe-of-liver.1.bed', 'right-lobe-of-liver.2.bed', 'right-lobe-of-liver.3.bed', 'right-lobe-of-liver.4.bed', 'right-lobe-of-liver.5.bed', 'right-lobe-of-liver.6.bed', 'right-lobe-of-liver.7.bed', 'right-lobe-of-liver.8.bed']


In [None]:
# Has to be tested as soon as wp2 generates new .bed files
for file_name in cluster_file_names:
    cluster_name = file_name.split('.bed')[0]
    print(cluster_name)
    print(file_name)
    cluster_path=f"{path_to_clusters}{file_name}"
    do_market_basket_analyses_for_cell_cluster(cell_cluster_name=cluster_name, cell_cluster_path=cluster_path)

    

# Analysis

### Differential Analysis

In [4]:
# mb market basket analysis
files_main_mb= read_in_file_names_of_folder(rel_path=main_analysis_path)
print(f"Count of Files: {len(files_main_mb)}")
print(f"Files: {files_main_mb}")


Count of Files: 16
Files: ['right-lobe-of-liver.10.pkl', 'right-lobe-of-liver.11.pkl', 'right-lobe-of-liver.12.pkl', 'right-lobe-of-liver.14.pkl', 'right-lobe-of-liver.15.pkl', 'right-lobe-of-liver.16.pkl', 'right-lobe-of-liver.17.pkl', 'right-lobe-of-liver.18.pkl', 'right-lobe-of-liver.1.pkl', 'right-lobe-of-liver.2.pkl', 'right-lobe-of-liver.3.pkl', 'right-lobe-of-liver.4.pkl', 'right-lobe-of-liver.5.pkl', 'right-lobe-of-liver.6.pkl', 'right-lobe-of-liver.7.pkl', 'right-lobe-of-liver.8.pkl']


In [5]:
# Diff analysis for all clusters of a tissue:
# TODO: what should be compared ? All of a Tissue? or All? Naming?
tissue_name = 'right-lobe-of-liver'
combObj_to_compare = []
for i, file in enumerate(files_main_mb):
    print(file)
    name_i = file.split('.pkl')[0]
    tissue_name = file.split('.')[0]
    obj = CombObj().from_pickle(f"{main_analysis_path}{file}")
    obj.set_prefix(name_i)
    #print(obj)
    combObj_to_compare.append(obj)
    
compare_obj = DiffCombObj(combObj_to_compare, measure="cosine", join="outer", fillna=True)
compare_obj.to_pickle(f'{differential_analysis_path}{tissue_name}.pkl')
compare_obj.normalize()
compare_obj.calculate_foldchanges()
compare_obj.to_pickle(f'{differential_analysis_path}{tissue_name}_normalized.pkl')
#selection does not work?
#selected_std = compare_obj.select_rules()
#selected_std.to_pickle(f'{differential_analysis_selection_path}{tissue_name}.pkl')
print("Done differential analysis")

right-lobe-of-liver.10.pkl
right-lobe-of-liver.11.pkl
right-lobe-of-liver.12.pkl
right-lobe-of-liver.14.pkl
right-lobe-of-liver.15.pkl
right-lobe-of-liver.16.pkl
right-lobe-of-liver.17.pkl
right-lobe-of-liver.18.pkl
right-lobe-of-liver.1.pkl
right-lobe-of-liver.2.pkl
right-lobe-of-liver.3.pkl
right-lobe-of-liver.4.pkl
right-lobe-of-liver.5.pkl
right-lobe-of-liver.6.pkl
right-lobe-of-liver.7.pkl
right-lobe-of-liver.8.pkl
INFO: Calculating foldchange for contrast: right-lobe-of-liver.10 / right-lobe-of-liver.11
INFO: Calculating foldchange for contrast: right-lobe-of-liver.10 / right-lobe-of-liver.12
INFO: Calculating foldchange for contrast: right-lobe-of-liver.10 / right-lobe-of-liver.14
INFO: Calculating foldchange for contrast: right-lobe-of-liver.10 / right-lobe-of-liver.15
INFO: Calculating foldchange for contrast: right-lobe-of-liver.10 / right-lobe-of-liver.16
INFO: Calculating foldchange for contrast: right-lobe-of-liver.10 / right-lobe-of-liver.17
INFO: Calculating foldchange f

INFO: Calculating foldchange for contrast: right-lobe-of-liver.18 / right-lobe-of-liver.4
INFO: Calculating foldchange for contrast: right-lobe-of-liver.18 / right-lobe-of-liver.5
INFO: Calculating foldchange for contrast: right-lobe-of-liver.18 / right-lobe-of-liver.6
INFO: Calculating foldchange for contrast: right-lobe-of-liver.18 / right-lobe-of-liver.7
INFO: Calculating foldchange for contrast: right-lobe-of-liver.18 / right-lobe-of-liver.8
INFO: Calculating foldchange for contrast: right-lobe-of-liver.1 / right-lobe-of-liver.2
INFO: Calculating foldchange for contrast: right-lobe-of-liver.1 / right-lobe-of-liver.3
INFO: Calculating foldchange for contrast: right-lobe-of-liver.1 / right-lobe-of-liver.4
INFO: Calculating foldchange for contrast: right-lobe-of-liver.1 / right-lobe-of-liver.5
INFO: Calculating foldchange for contrast: right-lobe-of-liver.1 / right-lobe-of-liver.6
INFO: Calculating foldchange for contrast: right-lobe-of-liver.1 / right-lobe-of-liver.7
INFO: Calculatin

  self.rules[log2_col] = np.log2((p1_values + pseudo) / (p2_values + pseudo))


INFO: Calculating foldchange for contrast: right-lobe-of-liver.6 / right-lobe-of-liver.7
INFO: Calculating foldchange for contrast: right-lobe-of-liver.6 / right-lobe-of-liver.8
INFO: Calculating foldchange for contrast: right-lobe-of-liver.7 / right-lobe-of-liver.8
INFO: The calculated log2fc's are found in the rules table (<DiffCombObj>.rules)
Done differential analysis


In [None]:
compare_obj

In [None]:
compare_obj.rules

In [None]:
selected = compare_obj.select_rules()
selected.rules

## Old: Self implemented - Differential analysis - comparing each cluster

In [None]:
# Diff analysis between each cluster:
for i, file in enumerate(files_main_mb):
    print(file)
    name_i = file.split('.pkl')[0]
    
    for j in range(i + 1, len(files_main_mb), 1):
        file_j = files_main_mb[j]
        name_j = file_j.split('.pkl')[0]
        print(j)
        print(name_j)
        A = CombObj().from_pickle(f"{main_analysis_path}{file}")
        print(A)
        A.set_prefix(name_i)
        B = CombObj().from_pickle(f"{main_analysis_path}{file_j}")
        print(B)
        B.set_prefix(name_j)
        compare_obj = A.compare(B)
        compare_obj.to_pickle(f'{differential_analysis_path}{name_i}__{name_j}.pkl')
        
        selected_std = compare_obj.select_rules()
        
        #TODO: Save autamatically generated thresholds
        # utils.get_threshold(new.rules.iloc[:,4], 'both', percent=0.05)
        # logfc threshold (-xxx , +xxx)
        #  utils.get_threshold(new.rules.iloc[:,2:4].mean(axis=1), 'upper', percent=0.05)
        # cosine threshold
        selected_std.to_pickle(f'{differential_analysis_selection_path}{name_i}__{name_j}.pkl')
        
        break;
        
        
print("Done differential analysis")

## Specific analysis:
### Question:
Does the cluster 'right-lobe-of-liver.1.pkl' have specific tf-co-occurences, which can´t be found in the other clusters of the right-lobe-of-liver. ? Which are those tf-co and how many are there?
####  Steps:
    - Take all differential mb-analysis of cluster one. Merge them to a pandas dataframe.
    - Remove All diff tf-co´s which are found twice in the analysis.
    
    TODO:
    - simplify_rules für die selection einbauen
    - classify neg_pos foldchanges 'positiv', 'negativ'
    - Merge Diff_analysis für einen gesamten cluster zu großen dataframe (outer join, nicht inner join wie jetzt), um das dann auszuwerten 
    

In [6]:
# Analyses with whole diffcombj
diff_file_names=read_in_file_names_of_folder(rel_path=differential_analysis_path)

normalized_diff_objects = []
diff_objects = []

for file in diff_file_names:
    obj = DiffCombObj().from_pickle(f"{differential_analysis_path}{file}")
    if "normalized" in file:
        normalized_diff_objects.append(obj)
    else:
        diff_objects.append(obj)

print(normalized_diff_objects)
print(diff_objects) 

normalized_dfs = []
for obj in normalized_diff_objects:
    normalized_dfs.append(obj.rules)
print("Done: Preparing rules of DiffObj")    

[<tfcomb.objects.DiffCombObj object at 0x7f32f3bae150>]
[<tfcomb.objects.DiffCombObj object at 0x7f32f02ad110>]
Done: Preparing rules of DIFFObj


In [16]:
test = ''
for df in normalized_dfs:
    test= df
    for i, file in enumerate(files_main_mb):
        # print(file)
        cluster_name = file.split('.pkl')[0]
        print(cluster_name)
        
        cluster_cols = list(filter(lambda x: f'{cluster_name}' in x , df.columns))
        # NOT WORKING: logfc_cluster_cols = list(filter(lambda x: (f'{cluster_name}/' || f'/{cluster_name}') in x , cluster_cols)) 
        # This is important: for right-lob-of-liver-1 
        logfc_cluster_cols = []
        for entry in cluster_cols:
            if (f'{cluster_name}/' in entry) or (f'/{cluster_name}_cosine_log2fc' in entry):
                logfc_cluster_cols.append(entry)
        
        
        #print(logfc_cluster_cols)
        print(len(logfc_cluster_cols))
        #print(logfc_cluster_cols)
        tmp = df[logfc_cluster_cols]
        print(f'Initial Count: {tmp.shape}')
        tmp_val_counts = tmp[~tmp.isin([0])].count(axis=1).sort_values()
        #print(tmp_val_counts)

        # Selection if tf-tf occures in each cluster etc.
        tmp_tfs_occ = tmp_val_counts[tmp_val_counts >= len(logfc_cluster_cols)].index

        result = tmp.loc[tmp_tfs_occ]
        print(f'Specific: {result.shape}')
        test = result
test    


right-lobe-of-liver.10
15
Initial Count: (483427, 15)
Specific: (337985, 15)
right-lobe-of-liver.11
15
Initial Count: (483427, 15)
Specific: (344294, 15)
right-lobe-of-liver.12
15
Initial Count: (483427, 15)
Specific: (235404, 15)
right-lobe-of-liver.14
15
Initial Count: (483427, 15)
Specific: (79392, 15)
right-lobe-of-liver.15
15
Initial Count: (483427, 15)
Specific: (315999, 15)
right-lobe-of-liver.16
15
Initial Count: (483427, 15)
Specific: (314860, 15)
right-lobe-of-liver.17
15
Initial Count: (483427, 15)
Specific: (302572, 15)
right-lobe-of-liver.18
15
Initial Count: (483427, 15)
Specific: (325746, 15)
right-lobe-of-liver.1
15
Initial Count: (483427, 15)
Specific: (380570, 15)
right-lobe-of-liver.2
15
Initial Count: (483427, 15)
Specific: (406842, 15)
right-lobe-of-liver.3
15
Initial Count: (483427, 15)
Specific: (327597, 15)
right-lobe-of-liver.4
15
Initial Count: (483427, 15)
Specific: (333885, 15)
right-lobe-of-liver.5
15
Initial Count: (483427, 15)
Specific: (226936, 15)
right

Unnamed: 0,right-lobe-of-liver.10/right-lobe-of-liver.8_cosine_log2fc,right-lobe-of-liver.11/right-lobe-of-liver.8_cosine_log2fc,right-lobe-of-liver.12/right-lobe-of-liver.8_cosine_log2fc,right-lobe-of-liver.14/right-lobe-of-liver.8_cosine_log2fc,right-lobe-of-liver.15/right-lobe-of-liver.8_cosine_log2fc,right-lobe-of-liver.16/right-lobe-of-liver.8_cosine_log2fc,right-lobe-of-liver.17/right-lobe-of-liver.8_cosine_log2fc,right-lobe-of-liver.18/right-lobe-of-liver.8_cosine_log2fc,right-lobe-of-liver.1/right-lobe-of-liver.8_cosine_log2fc,right-lobe-of-liver.2/right-lobe-of-liver.8_cosine_log2fc,right-lobe-of-liver.3/right-lobe-of-liver.8_cosine_log2fc,right-lobe-of-liver.4/right-lobe-of-liver.8_cosine_log2fc,right-lobe-of-liver.5/right-lobe-of-liver.8_cosine_log2fc,right-lobe-of-liver.6/right-lobe-of-liver.8_cosine_log2fc,right-lobe-of-liver.7/right-lobe-of-liver.8_cosine_log2fc
MEF2D-FOXK1,-0.930217,-1.849058,-1.849058,-1.849058,-1.165643,-1.849058,-1.849058,-0.390519,-0.914829,-1.724589,-0.822692,-1.849058,-1.849058,-1.849058,-1.849058
TFAP2C-RUNX1,-1.300122,-0.116842,-0.037797,1.413944,-1.023880,-0.471980,-0.916108,0.099978,0.267854,-0.431514,-1.211854,-0.596265,-1.821412,-1.821412,-1.821412
HOXB8-ZBTB18,-2.596582,-2.596582,-2.596582,-2.596582,-2.596582,-2.596582,-2.596582,-2.596582,-2.596582,-2.438224,-1.599236,-2.596582,-2.596582,-2.596582,-2.596582
MYF6-Stat5aStat5b,-1.477228,-2.392833,-0.459531,-2.392833,-1.336506,-2.392833,-1.141378,-1.436349,-0.400940,-0.479083,-0.767633,-1.490741,-2.392833,-2.392833,-2.392833
YY2-ZNF684,-1.015791,-0.639873,0.016091,0.724576,-0.006440,-0.992740,-0.755732,-0.995560,-0.916533,-0.715343,-0.487917,-0.184143,-0.347450,-2.066589,-0.056121
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
JUN-Znf281,0.452020,0.173372,0.200049,-1.640524,1.123623,0.999286,0.335483,1.196616,0.660971,-0.028626,0.341278,0.814001,0.331817,-1.640524,0.181488
Znf281-JUN,0.452020,0.173372,0.200049,-1.640524,1.123623,0.999286,0.335483,1.196616,0.660971,-0.028626,0.341278,0.814001,0.331817,-1.640524,0.181488
KLF5-GSC,-0.198431,-0.477087,-0.256832,0.609366,-1.024764,-0.164804,-0.663931,-0.481866,-1.206133,-0.469252,-0.765911,-0.960209,-0.971205,-0.020593,-0.948081
EOMES-LMX1A,-0.664641,0.065572,0.380048,2.479553,-1.449079,-1.290389,-0.318553,0.163570,-0.419069,-0.390058,-0.618654,-1.520193,-0.547126,1.934999,1.086078


In [None]:
# filtering if, nan values occure
filtered = df[logfc_cluster_cols]

val_counts = filtered.count(axis=1).sort_values()
#print(val_counts)
tfs_occ = val_counts[val_counts >=16].index
final = filtered.loc[tfs_occ]

In [None]:
cluster_name = "right-lobe-of-liver.1"
cluster_cols = list(filter(lambda x: cluster_name in x , df.columns))
logfc_cluster_cols = list(filter(lambda x: '/' in x , cluster_cols)) 
tmp = df[logfc_cluster_cols]
# wtf, tmp[~tmp.isin([0])]??
tmp_val_counts = tmp[~tmp.isin([0])].count(axis=1).sort_values()
print(tmp_val_counts)
tmp_tfs_occ = tmp_val_counts[tmp_val_counts == len(logfc_cluster_cols)].index
result = tmp.loc[tmp_tfs_occ]
print(result.shape)
result

In [None]:
tmp = filtered.fillna(0)
tmp_val_counts = tmp[~tmp.isin([0])].count(axis=1).sort_values()

print(tmp_val_counts)
tmp_tfs_occ = tmp_val_counts[tmp_val_counts >=16].index
result = filtered.loc[tmp_tfs_occ]

m1 = pd.DataFrame(result.mean(axis=1), columns=['mean'])
m1['sum'] = result.sum(axis=1)
m1.plot.hist(by='mean', bins=100)

## OLD implementation

In [None]:
def prepare_diff_obj_dataframe(diff_obj: DiffCombObj) -> pd.DataFrame:
    
    # possible prefix names ['right-lobe-of-liver.10', 'right-lobe-of-liver.16']
    df = diff_obj.rules
    tissue_name_c1 , cluster_nr_c1  = diff_obj.prefixes[0].split('.')
    tissue_name_c2 , cluster_nr_c2  = diff_obj.prefixes[1].split('.')
    suff = ""
    if tissue_name_c1 == tissue_name_c2:
        suff += f"_{tissue_name_c1}"
    else:
        suff += f"_{tissue_name_c1}_{tissue_name_c2}"

    if cluster_nr_c1 == cluster_nr_c2:
        suff += f"_{cluster_nr_c1}"
    else:
        suff += f"_{cluster_nr_c1}_{cluster_nr_c2}"

    df['log2fc_class'] = df.apply(lambda x: 'negativ' if x[4] < 0 else 'positiv', axis=1)
    df.columns = [f'{x}{suff}' for x in df.columns]
    
    return df.copy(deep=True)


# Find the specific tf_cooccurences of a tissue that unique for the specific cluster in the tissue.
# 1. Diff analyse , 
# 2. Read in Diffanalyse for the specific cluster
# 3. Find tf-cooccurence of the diffob , which are occuring in each cluster
# Read in file Names of all analysis
files_main_mb= read_in_file_names_of_folder(rel_path=main_analysis_path)
print(f"Count of Files: {len(files_main_mb)}")
#print(f"Files: {files_main_mb}")

files_diff= read_in_file_names_of_folder(rel_path=differential_analysis_path)
print(f"Count of Files: {len(files_diff)}")

test = ""
for file_mb in files_main_mb:
    cluster_name = file_mb.split('.pkl')[0]
    print(cluster_name)
    diffs = list(filter(lambda x: cluster_name in x, files_diff))
    print(len(diffs))
    print(diffs)
    
    # Keeps the read in DiffCombObj diff_objects:
    diff_objects = []
    
    for diff in diffs:
        diff_obj = DiffCombObj().from_pickle(f"{differential_analysis_selection_path}{diff}")
        diff_objects.append(diff_obj)
    
    erg = None
    for i in range(len(diff_objects)-1):
        
        if erg is None:
            obj_1= diff_objects[i]
            obj_2 = diff_objects[i + 1]
            df1 = prepare_diff_obj_dataframe(diff_obj = obj_1)
            df2 = prepare_diff_obj_dataframe(diff_obj = obj_2)
            
            erg = df1.merge(df2, how='outer', left_index=True, right_index=True)
        else:
            obj_2 = diff_objects[i + 1] 
            df2 = prepare_diff_obj_dataframe(diff_obj = obj_2)
            erg = erg.merge(df2, how='outer', left_index=True, right_index=True)
       
    test = erg
    erg.to_pickle(path=f"{answers_path}{cluster_name}.pkl")
    
print("Done")    
test.columns

In [None]:
answer_file_names=read_in_file_names_of_folder(rel_path=answers_path)
print(answer_file_names)
cluster_dfs = []
df = None
for name in answer_file_names:
    df = pd.read_pickle(f"{answers_path}{name}")
    cluster_dfs.append(name)
    df = df

    #df.groupby(['class', 'value']).count()
    break;
filter_columns = list(filter(lambda x: 'log2fc_class' in x , df.columns))
#len(filter_columns)
filtered_df = df[df[filter_columns].notna().all(1)] #
filtered_df
df
#df3.iloc[:, 2:3]
#df = pd.read_pickle(f"{answers_path}right-lobe-of-liver.6.pkl")
#df = pd.read_pickle(f"{differential_analysis_selection_path}{right-lobe-of-liver.6.pkl}")


#original = CombObj().from_pickle(f"{main_analysis_path}right-lobe-of-liver.6.pkl")
#original.rules.loc[df.index]


In [None]:
df = pd.read_pickle(f"{answers_path}right-lobe-of-liver.6.pkl")
selection = DiffCombObj().from_pickle(f"{differential_analysis_selection_path}right-lobe-of-liver.10__right-lobe-of-liver.16.pkl")
selection_orig = DiffCombObj().from_pickle(f"{differential_analysis_path}right-lobe-of-liver.10__right-lobe-of-liver.16.pkl")
selection_orig
selection

original = CombObj().from_pickle(f"{main_analysis_path}right-lobe-of-liver.6.pkl")
original.rules.loc[df.index]
selection.prefixes

### Try and Error section

In [None]:
#mb_obj = CombObj().from_pickle(f"{main_analysis_path}right-lobe-of-liver.10.pkl")
obj_1 = DiffCombObj().from_pickle(f"{differential_analysis_path}right-lobe-of-liver.16__right-lobe-of-liver.1.pkl")
obj_2 = DiffCombObj().from_pickle(f"{differential_analysis_path}right-lobe-of-liver.16__right-lobe-of-liver.2.pkl")
obj_3 = DiffCombObj().from_pickle(f"{differential_analysis_path}right-lobe-of-liver.16__right-lobe-of-liver.3.pkl")
obj_4 = DiffCombObj().from_pickle(f"{differential_analysis_path}right-lobe-of-liver.16__right-lobe-of-liver.4.pkl")
obj_5 = DiffCombObj().from_pickle(f"{differential_analysis_path}right-lobe-of-liver.16__right-lobe-of-liver.5.pkl")
obj_6 = DiffCombObj().from_pickle(f"{differential_analysis_path}right-lobe-of-liver.16__right-lobe-of-liver.6.pkl")
#mb_obj.rules
#type(diff_obj.rules)
#diff_obj2.rules
df_diff = pd.concat([obj_1.rules, obj_2.rules, obj_3.rules, obj_4.rules, obj_5.rules, obj_6.rules], join="inner")



#df_diff2 = pd.concat([diff_obj_1_1.rules,diff_obj2.rules, diff_obj1_2.rules])

unified_duplicates = df_diff[df_diff.duplicated(subset=['TF1', 'TF2'], keep='first')]

df_diff2 = pd.concat([unified_duplicates, diff_obj1_2.rules])

unified_duplicates2 = df_diff2[df_diff2.duplicated(subset=['TF1', 'TF2'], keep='first')]

#df_diff = df_diff.drop_duplicates(subset=['TF1', 'TF2'])
#unified_duplicates
#unified_duplicates
#unified_duplicates2
#diff_obj_1_1.rules
#unified_duplicates



In [None]:
obj_1 = DiffCombObj().from_pickle(f"{differential_analysis_selection_path}right-lobe-of-liver.16__right-lobe-of-liver.1.pkl")
obj_2 = DiffCombObj().from_pickle(f"{differential_analysis_selection_path}right-lobe-of-liver.16__right-lobe-of-liver.2.pkl")

obj_1.simplify_rules()
obj_2.simplify_rules()
obj3 = obj_1.rules.merge(obj_2.rules, left_index=True, right_index=True, suffixes=(f"_{obj_1.prefixes[0]}_{obj_1.prefixes[1]}", f"_{obj_2.prefixes[0]}_{obj_2.prefixes[1]}"))
obj3


In [None]:
#df['log2fc_class'] = df.apply(lambda x: 'negativ' if x[4] < 0 else 'positiv', axis=1)

#removedNAN = df[df.notna().all(1)]

#df2 = removedNAN[(removedNAN[filter_columns] > 0.0) | (removedNAN[filter_columns] < 0.0)]
#df2[df2.notna().all(1)]
#filtered_df = df[df[filter_columns].notna().all(1)]

In [None]:
diff_obj_1_1.rules.loc['Foxd3-ONECUT2']
diff_obj2.rules.loc['Foxd3-ONECUT2']

In [None]:
top30C = selectedC.select_top_rules(n=30)
top30C.rules.head(31)

In [None]:
df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo', 'fehler', 'nocheiner'],
                    'value': [1, 2, 3, 5,7,7]},
                   index=['my1', 'my2', 'my3', 'my4', 'my5', 'my6'])
df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo','test'],
                    'value': [5, 6, 7, 8, 9]},
                  index=['my1', 'not2', 'my3', 'not4', 'my5'])
df3 = pd.DataFrame({'rkey': ['new', 'lol'],
                    'value': [5, 6]},
                  index=['my1', 'not2'])


#df1.merge(df2, left_on='lkey', right_on='rkey')
#erg = df1.merge(df2, left_index=True, right_index=True, suffixes=("_test", "_test2"))
erg = df1.merge(df2, how='outer', left_index=True, right_index=True, suffixes=("_x", "_y"))
#erg = erg.merge(df3, how='outer', left_index=True, right_index=True, suffixes=("_x", "_"))
erg


In [None]:
df1

In [None]:
df2

In [None]:
df2.columns = [f'{x}_df2' for x in df2.columns]
df2

In [None]:
[x+ 1 for x in df2.columns]

In [None]:
df1['class'] = df1.apply(lambda x: 'niedrig' if x[1] < 5 else 'hoch', axis=1) 

In [None]:
df1[df1['class'] == 'hoch']

In [None]:
df1.groupby(['class', 'value']).count()

In [None]:
df2

In [None]:
df3.iloc[:, 2:3]

In [None]:
top30C.plot_bubble()

In [None]:
top30C.plot_network()

In [None]:
selectedC.rules

In [None]:
#### OLD 
# Find the specific tf_cooccurences of a tissue that unique for the specific cluster in the tissue.
# 1. Diff analyse , 
# 2. Read in Diffanalyse for the specific cluster
# 3. Find tf-cooccurence of the diffob , which are occuring in each cluster
# Read in file Names of all analysis
files_main_mb= read_in_file_names_of_folder(rel_path=main_analysis_path)
print(f"Count of Files: {len(files_main_mb)}")
#print(f"Files: {files_main_mb}")

files_diff= read_in_file_names_of_folder(rel_path=differential_analysis_path)
print(f"Count of Files: {len(files_diff)}")
#print(f"Files: {files_diff}")
test = ""
for file_mb in files_main_mb:
    cluster_name = file_mb.split('.pkl')[0]
    print(cluster_name)
    diffs = list(filter(lambda x: cluster_name in x, files_diff))
    print(len(diffs))
    print(diffs)
    
    # Keeps the read in DiffCombObj diff_objects:
    diff_objects = []
    
    for diff in diffs:
        diff_obj = DiffCombObj().from_pickle(f"{differential_analysis_selection_path}{diff}")
        diff_objects.append(diff_obj)
    
    # erste DiffObj dataframe 
    initial_df = diff_objects[0].rules
    
    #has neg and pos foldchange
    cross_product_merged = initial_df
    
    # only pos foldchange
    pos_merged = initial_df[initial_df.iloc[:,4] > 0.00]
    
    # only neg foldchange
    neg_merged = initial_df[initial_df.iloc[:,4] < 0.00]
    for i in range(len(diff_objects)-1):
        obj_1= diff_objects[i]
        obj_2 = diff_objects[i + 1]
            
        # cross_product merge rules-dataframe by index (TF´s)
        cross_product = cross_product_merged.merge(obj_2.rules, left_index=True, right_index=True, suffixes=(f"_{obj_1.prefixes[0]}_{obj_1.prefixes[1]}", f"_{obj_2.prefixes[0]}_{obj_2.prefixes[1]}"))
        cross_product_merged = cross_product.copy(deep=True)
        
        # pos merge rules-dataframe by index (TF´s)
        obj2_df_pos = obj_2.rules[obj_2.rules.iloc[:,4] > 0.00]
        df_pos_merged = pos_merged.merge(obj2_df_pos, left_index=True, right_index=True, suffixes=(f"_{obj_1.prefixes[0]}_{obj_1.prefixes[1]}", f"_{obj_2.prefixes[0]}_{obj_2.prefixes[1]}"))
        pos_merged = df_pos_merged.copy(deep=True)
        
        # neg merge rules-dataframe by index (TF´s)
        obj2_df_neg = obj_2.rules[obj_2.rules.iloc[:,4] < 0.00]
        df_neg_merged = neg_merged.merge(obj2_df_neg, left_index=True, right_index=True, suffixes=(f"_{obj_1.prefixes[0]}_{obj_1.prefixes[1]}", f"_{obj_2.prefixes[0]}_{obj_2.prefixes[1]}"))
        neg_merged = df_neg_merged.copy(deep=True)
        
        
    cross_product_merged.to_pickle(path=f"{answers_path}{cluster_name}_cross.pkl")
    pos_merged.to_pickle(path=f"{answers_path}{cluster_name}_pos.pkl")
    neg_merged.to_pickle(path=f"{answers_path}{cluster_name}_neg.pkl")
    
print("Done")    
test
