# Checking the Affinity of TFs to Loci in Regulatory Regions 
In this notebook, I take loci from the MPRA data and check the affinity of the TFs to their ancestral and derived DNA sequences. 

Written by Matanya Wiener

In [1]:
# First let's import the libraries we will use:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import os
import json
import seaborn as sns
import pandas as pd
from functools import reduce
import utils_matanya as um; 
import subprocess

In [2]:
# Useful constants

# Name of the current run; its results go to a sub-folder of RESULTS_DIR with this name.
CURRENT_RUN_NAME = "run_2025_04_09"
# CURRENT_RUN_NAME = "run_non_DA"

# All of my own folders live under a single home directory.
LAB_HOME_DIR = "/home/labs/davidgo/matanyaw"
MY_DATA_DIR = os.path.join(LAB_HOME_DIR, "data")
RESULTS_DIR = os.path.join(LAB_HOME_DIR, "results")
JOBS_OUT_DIR = os.path.join(LAB_HOME_DIR, "jobs_outputs")
JOBS_ERR_DIR = os.path.join(LAB_HOME_DIR, "jobs_errors")

# Input files: the full MPRA table (shared collaboration folder) and the aggregated 8-mer PBM table.
FULL_MPRA_FILE = "/home/labs/davidgo/Collaboration/humanMPRA/chondrocytes/comparative_analysis_combined/humanMPRA_with_seq_final2.csv"
PBM_FILE = os.path.join(MY_DATA_DIR, "pbm_8mer_aggregated_data.csv")

CURRENT_RUN_RESULTS_DIR = os.path.join(RESULTS_DIR, CURRENT_RUN_NAME)

# Work from the home directory
# (presumably so that the later `backup....` import resolves from here -- TODO confirm).
os.chdir(LAB_HOME_DIR)

# Make sure the result folders exist before anything is written to them.
for _results_path in (RESULTS_DIR, CURRENT_RUN_RESULTS_DIR):
    os.makedirs(_results_path, exist_ok=True)


## Loading MPRA Data

You can change this block to see the differences in affinity for whatever MPRA lines you like! 
Just make sure you change the CURRENT_RUN_NAME variable so that the results go to a different folder. 

The default MPRA lines I take are the Differentially Active (DA) ones. 
Nadav's idea was to also check the lines that are active but not DA, to see the difference in results. This is a great idea! 
To do that, just change the MPRA lines that are read here. 

**Comment this block to run Nadav's Null Hypothesis**


In [3]:
# Read only the columns used downstream; with index_col=0 the 'oligo' column ends up as the index.
columns_needed = [
    'oligo',
    'sequence_ancestral',
    'sequence_derived',
    'logFC_derived_vs_ancestral',
    'differential_activity',
]
full_mpra_df = pd.read_csv(FULL_MPRA_FILE, usecols=columns_needed, index_col=0)

# Keep only the lines flagged as differentially active (missing values compare as False).
is_differentially_active = full_mpra_df['differential_activity'].eq(True)
mpra_df = full_mpra_df[is_differentially_active]

# Save the selected lines; this file is the MPRA input handed to the jobs below.
MPRA_LINES_TO_PROCESS_FILE = os.path.join(MY_DATA_DIR, "humanMPRA_with_seq_final2_differential_active.csv")
mpra_df.to_csv(MPRA_LINES_TO_PROCESS_FILE, index=True)

mpra_df

Unnamed: 0_level_0,logFC_derived_vs_ancestral,differential_activity,sequence_ancestral,sequence_derived
oligo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
seq_100038_chr6:4358790-4359059_SCREEN_a3_L1,0.299749,True,cagggatgggcacacctggccggctgaggggagcctacagccaggc...,cagggatgggcacacctggccggctgaggggagcctacagccaggc...
seq_100065_chr7:138979123-138979392_SCREEN_a3_L1,0.316918,True,tccattttaccaatcacgaaactgaatcagagagctaagcaacttg...,tccattttaccaatcacgaaactgaatcagagagctaagcaacttg...
seq_100070_chr7:79861027-79861296_SCREEN_a3_L1,-0.275615,True,ctgtttattgttatactcaagagcacttggtattgttaaagtcgta...,ctgtttattgttatactcaagagcacttggtattgttaaagtcgta...
seq_100075_chr16:54376420-54376689_SCREEN_a3_L1,0.318055,True,aggaaggaggatctctgtgtgtgtgggtcgcccagcctcctgccag...,aggaaggaggatctctgtgtgtgtgggtcgcccagcctcctgccag...
seq_100090_chr20:31380149-31380418_SCREEN_a3_L1,-0.281553,True,atgtctgccttttgagtgggaggtgaccactgacattccctagggg...,atgtctgccttttgagtgggaggtgaccactgacattccctagggg...
...,...,...,...,...
seq_99921_chr11:34262393-34262662_SCREEN_a3_L1,0.286606,True,tggcctcccaaagtgctgggattacaggtgtgagccaccgtacctg...,tggcctcccaaagtgctgggattacaggtgtgagccaccgtacctg...
seq_99930_chr10:128329049-128329318_SCREEN_a3_L1,-0.770237,True,ccctgagcttggaaacatagagatgagaaaataagttccctacact...,ccctgagcttggaaacatagagatgagaaaataagttccctacact...
seq_99966_chr21:35967796-35968065_SCREEN_a3_L1,-0.230129,True,gtgtacagacccatctattcatcatagacttcactgtctgggaatg...,gtgtacagacccatctattcatcatagacttcactgtctgggaatg...
seq_99973_chr14:22846595-22846864_SCREEN_a3_L1,-0.305473,True,gtcaagttggaacttagattctttttcttgcggaagtgggaggcat...,gtcaagttggaacttagattctttttcttgcggaagtgggaggcat...


# Nadav's Idea: Create a Null Hypothesis
**Leave this in comment unless you want to run it!**

Nadav suggested comparing the heatmaps we get with the heatmaps obtained when the loci are active, yet not "Differentially Active", to create a null hypothesis. 
We can then see whether the binding differences look different when the suspected enhancers are truly differentially active and when they are not. 

In [None]:
# # Creating the csv file of the active, yet not differential active loci
# columns_needed = ['oligo', 'sequence_ancestral', 'sequence_derived', 'logFC_derived_vs_ancestral', 'differential_activity']
# full_mpra_df = pd.read_csv(FULL_MPRA_FILE, index_col=0, usecols=columns_needed)

# active_loci = full_mpra_df[full_mpra_df["differential_activity"].notna()]
# mpra_df = active_loci[active_loci["differential_activity"] == False].sample(n=15000, random_state=42)
# MPRA_LINES_TO_PROCESS_FILE = os.path.join(MY_DATA_DIR, "humanMPRA_with_seq_final2_non_DA.csv")

# mpra_df.to_csv(MPRA_LINES_TO_PROCESS_FILE, index=True)
# mpra_df


## Loading PBM data
I will use the z-score of the TFs to decide the affinity, and the e-score to decide how certain I am in this result. I will use e-score limit of 0.35 to decide whether I use this z-score value or not (per TF per 8-mer).

In [4]:
# The PBM table has a two-row header: (measurement, TF label).
all_8mer_pbm_df = pd.read_csv(PBM_FILE, header=[0, 1], index_col=0)

# Keep only the part of each TF label that precedes the first underscore.
shortened_columns = [
    (measurement, tf_label.split('_')[0])
    for measurement, tf_label in all_8mer_pbm_df.columns
]
all_8mer_pbm_df.columns = pd.MultiIndex.from_tuples(shortened_columns)

# One table (8-mers x TFs) per measurement.
escore_df = all_8mer_pbm_df['E-score']
median_df = all_8mer_pbm_df['Median']
zscore_df = all_8mer_pbm_df['Z-score']
zscore_df

Unnamed: 0_level_0,ARX,Ahctf1,Alx3,Alx4,Ar,Arid3a,Arid5a,Ascl2,Atf3,BCL11A,...,TFAP2A,Tbx2,Tef,VAX2,VENTX,VSX1,VSX2,WT1,ZNF200,ZNF655
8-mer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAAAAAAA,-2.5642,9.0057,2.3401,1.7044,1.1648,2.423686,6.037755,5.100156,2.0726,0.7592,...,0.8049,-0.2041,0.4316,0.4226,-4.0103,1.1738,1.9972,1.3045,1.6922,0.1415
AAAAAAAC,-2.1900,7.2302,1.3416,1.4467,1.1031,-0.021227,3.549164,2.741533,1.5885,-0.9601,...,-0.3715,0.7683,1.4607,0.1454,-2.8133,0.2560,1.8681,0.8530,1.7340,0.0316
AAAAAAAG,-2.9855,6.1877,1.3386,1.0929,-0.0242,1.853928,2.381357,0.994396,2.8981,-0.7073,...,1.5246,0.8301,0.6015,-0.5753,-3.8743,-0.0920,0.8050,1.4324,1.8098,0.5059
AAAAAAAT,-1.8606,8.8501,2.1334,2.3737,0.1240,3.591318,2.863760,2.641339,2.4990,-0.6869,...,1.7388,1.1866,1.4478,1.1230,-1.8313,1.9065,2.6485,1.1602,1.4194,0.9966
AAAAAACA,-2.2716,5.3682,1.6949,1.1549,1.7486,1.395083,3.334386,1.987293,1.7003,-0.2942,...,-1.8648,1.1769,1.4901,0.5406,-1.8409,0.2393,1.8914,1.5113,1.7164,0.2976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTCCAAA,-0.2410,2.9411,0.5581,0.2975,-0.0945,-0.002882,1.680976,-0.630805,0.9508,-0.2414,...,1.2079,1.0096,0.2931,1.0831,-0.7304,1.0992,1.2829,0.7699,0.8522,0.6617
TTTCGAAA,-2.3461,1.3296,0.8202,-0.0643,1.5874,0.995079,1.286309,-0.143761,1.7645,-1.5172,...,0.1766,0.7694,1.6737,1.4748,-2.8707,-0.2231,2.3837,0.7933,0.0525,0.4264
TTTGAAAA,-1.9434,3.5111,1.6207,1.2021,1.5820,1.298207,3.294636,0.557449,1.6849,0.0998,...,1.3062,1.7171,1.8915,0.9733,-3.1528,1.5440,2.3192,1.2091,1.5615,0.6381
TTTGCAAA,-0.4407,1.5736,1.0773,-0.3050,0.3458,-1.077195,-0.764979,0.412647,2.1186,-0.7710,...,0.7302,1.9076,0.5129,2.8273,-0.5586,1.1500,1.5334,0.6524,0.1814,2.7454


In [5]:
def run_single_job(mpra_csv_file, pbm_csv_file, output_dir, window_size, start_index, end_index,
                   job_name="process_TF_to_loci_files", 
                   jobs_outputs_dir=JOBS_OUT_DIR, 
                   jobs_errors_dir=JOBS_ERR_DIR, 
                   print_info=True):
    """
    Submit a single cluster job (through `bsub`) that creates the TF to loci files.

    The job runs create_TF_to_loci_script.py on the MPRA lines [start_index, end_index)
    (whether end_index is exclusive is decided by that script -- TODO confirm).
    This function only submits the job; it does not wait for the job to finish.

    args:
    mpra_csv_file: str, path to the MPRA CSV file
    pbm_csv_file: str, path to the PBM CSV file
    output_dir: str, path to the output directory
    window_size: int, the window size to use (for finding sequences in the PBM file)
    start_index: int, the start index to use (for the MPRA file, which can be split into multiple jobs)
    end_index: int, the end index to use (for the MPRA file, which can be split into multiple jobs)
    job_name: str, the name of the job
    jobs_outputs_dir: str, path to the jobs outputs directory (created if missing)
    jobs_errors_dir: str, path to the jobs errors directory (created if missing)
    print_info: bool, whether to print a confirmation line after a successful submission

    returns:
    bool, True if `bsub` exited with status 0 (the submission was accepted),
    False if it exited with a non-zero status (its stderr is printed in that case).

    raises:
    FileNotFoundError: propagated from subprocess.run when the `bsub` executable is not found.
    """

    os.makedirs(jobs_outputs_dir, exist_ok=True)
    os.makedirs(jobs_errors_dir, exist_ok=True)

    TF_2_LOCI_SCRIPT_FILE = "/home/labs/davidgo/matanyaw/backup/create_TF_to_loci_script.py"

    # NOTE(review): -o / -e are given directories rather than file paths -- confirm that the
    # cluster's bsub writes per-job files into them.
    command = [
        "bsub",  # Executable
        "-q", "short", 
        "-R", "rusage[mem=2000]",   # Resource requirements in Mb
        "-J", job_name,
        "-o", jobs_outputs_dir,
        "-e", jobs_errors_dir,
        "python", TF_2_LOCI_SCRIPT_FILE,  # The Python script to run
        "--mpra_file", mpra_csv_file,
        "--pbm_file", pbm_csv_file,
        "--output_dir", output_dir,
        "--window_size", str(window_size),
        "--start_index", str(start_index),
        "--end_index", str(end_index),
    ]

    try:
        # check=True turns a non-zero bsub exit status into CalledProcessError.
        subprocess.run(command, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running the job: {e.stderr}")
        # Report the failure to the caller instead of the former indistinguishable None.
        return False

    if print_info:
        print(f"Job {job_name} submitted.")
    return True

Optional: Test run:

In [6]:
# Optional Test Run
from backup.create_TF_to_loci_script import create_tf_to_loci_files

# A handful of MPRA lines is enough to check that everything works.
start_index, end_index = 35, 40
job_name = f"daily_test_{CURRENT_RUN_NAME}"

# Option 1: submit the slice as a cluster job.
run_single_job(
    mpra_csv_file=MPRA_LINES_TO_PROCESS_FILE,
    pbm_csv_file=PBM_FILE,
    output_dir=os.path.join(CURRENT_RUN_RESULTS_DIR, f"{job_name}_job"),
    window_size=8,
    start_index=start_index,
    end_index=end_index,
    job_name=f"{job_name}_job",
)

# Option 2: process the same slice inside this kernel with the imported function.
create_tf_to_loci_files(
    MPRA_LINES_TO_PROCESS_FILE,
    PBM_FILE,
    os.path.join(CURRENT_RUN_RESULTS_DIR, job_name),
    8,
    start_index,
    end_index,
)

Job daily_test_run_2025_04_09_job submitted.
Done with 36/15077
Done with 37/15077
Done with 38/15077
Done with 39/15077
Done with 40/15077


Here, we send the jobs for all 15K sequences.

In [7]:
# In case I want to run it multiple times
# session_id identifies one batch of submitted jobs: it is embedded in the job names and in
# the per-session job output/error directories. Re-running this cell resets it to 0.
session_id = 0

Full Run - Split the 15K loci into jobs (as many as I like, I do 1500). Takes a while to run. 

In [8]:
# Output directory handed to every job of the full run, and later scanned for per-locus folders.
PER_LOCUS_DIR = os.path.join(CURRENT_RUN_RESULTS_DIR, "per_locus_dir") # ? Change this to the desired output directory


In [9]:
# Split the MPRA lines into `num_jobs` consecutive chunks and submit one job per chunk.
num_jobs = 1500
total_loci = mpra_df.shape[0]
chunk_size = total_loci // num_jobs
for i in range(num_jobs):
    start_index = i * chunk_size
    # The last job also takes the remainder left over by the integer division.
    end_index = total_loci if i == num_jobs - 1 else (i + 1) * chunk_size
    if start_index >= end_index:
        # Empty range: happens when there are fewer loci than jobs (chunk_size == 0).
        # Submitting it would only waste a cluster slot.
        continue
    job_name = f"job_{i}_sessions_{session_id}"
    run_single_job(mpra_csv_file=MPRA_LINES_TO_PROCESS_FILE,
                   pbm_csv_file=PBM_FILE,
                   output_dir=PER_LOCUS_DIR,
                   window_size=8,
                   start_index=start_index,
                   end_index=end_index,
                   job_name=job_name,
                   jobs_outputs_dir=os.path.join(JOBS_OUT_DIR, f"session_{session_id}"),           # I want each 1500 jobs to have its own output directory
                   jobs_errors_dir=os.path.join(JOBS_ERR_DIR, f"session_{session_id}"),
                   print_info=False)
    print(f"Sent job {i}/{num_jobs} for lines: {start_index} to {end_index}, session {session_id}")

print(f"All {num_jobs} Jobs are Sent!")
# The next submission round gets a new session id (and therefore new job output/error folders).
session_id += 1

Sent job 0/1500 for lines: 0 to 10, session 0
Sent job 1/1500 for lines: 10 to 20, session 0
Sent job 2/1500 for lines: 20 to 30, session 0
Sent job 3/1500 for lines: 30 to 40, session 0
Sent job 4/1500 for lines: 40 to 50, session 0
Sent job 5/1500 for lines: 50 to 60, session 0
Sent job 6/1500 for lines: 60 to 70, session 0
Sent job 7/1500 for lines: 70 to 80, session 0
Sent job 8/1500 for lines: 80 to 90, session 0
Sent job 9/1500 for lines: 90 to 100, session 0
Sent job 10/1500 for lines: 100 to 110, session 0
Sent job 11/1500 for lines: 110 to 120, session 0
Sent job 12/1500 for lines: 120 to 130, session 0
Sent job 13/1500 for lines: 130 to 140, session 0
Sent job 14/1500 for lines: 140 to 150, session 0
Sent job 15/1500 for lines: 150 to 160, session 0
Sent job 16/1500 for lines: 160 to 170, session 0
Sent job 17/1500 for lines: 170 to 180, session 0
Sent job 18/1500 for lines: 180 to 190, session 0
Sent job 19/1500 for lines: 190 to 200, session 0
Sent job 20/1500 for lines: 20

**Stop! Make sure all relevant running jobs are done before you continue down here**

## Looking at the Overall Differences and Tendencies

Now we will go over all the loci and look at the differences in TF affinity. 
Here I will create a figure that shows, for every locus, the biggest differences in TF binding between the Ancestral and Derived variants. 

In [None]:
# Directory for the outputs of the cross-locus analysis: the overall binding-differences CSV,
# the heatmap and the clustermap created further down are all written here.

OVERALL_TF_BINDING_DIR = os.path.join(CURRENT_RUN_RESULTS_DIR, "overall_tf_binding_differences")
os.makedirs(OVERALL_TF_BINDING_DIR, exist_ok=True)


Collecting all relevant files from the 15K folders

In [None]:
def collect_difference_files(per_locus_dir):
    """ Returns a list of tuples with the paths to the difference PBM files and the metadata files. 
        Will be used to draw overall TF binding affinity conclusions.

        Every sub-directory of `per_locus_dir` is checked for "Difference_PBM_filtered.csv" and
        "metadata.json". Sub-directories missing either file are reported and skipped; entries
        that are not directories are ignored.

        args:
        per_locus_dir: str, path to the directory that holds one sub-directory per locus

        returns:
        list of (diff_csv_path, metadata_path) tuples, ordered by sub-directory name so that the
        result is the same on every run (os.listdir itself returns entries in arbitrary order).
    """
    diff_pbm_and_metadata_file_paths = []
    print("Collecting files from directory: ", per_locus_dir)
    for locus_dir in sorted(os.listdir(per_locus_dir)):
        locus_dir_path = os.path.join(per_locus_dir, locus_dir)
        if not os.path.isdir(locus_dir_path):
            continue

        metadata_path = os.path.join(locus_dir_path, "metadata.json")
        diff_csv_path = os.path.join(locus_dir_path, "Difference_PBM_filtered.csv")
        if not os.path.exists(metadata_path) or not os.path.exists(diff_csv_path):
            print(f"Missing files in {locus_dir_path}")
            continue
        diff_pbm_and_metadata_file_paths.append((diff_csv_path, metadata_path))

    # The count used to be computed and silently discarded; report it instead.
    print(f"Done. Collected files for {len(diff_pbm_and_metadata_file_paths)} loci.")
    return diff_pbm_and_metadata_file_paths


In [None]:
# Undo the increment done at the end of the job-submission cell, so that session_id again
# refers to the session whose jobs have just run.
# NOTE: this is not idempotent -- every re-run of this cell decrements session_id once more.
session_id -= 1
# List of (difference CSV path, metadata JSON path) tuples, one per complete locus folder.
diff_pbm_and_metadata_file_paths = collect_difference_files(PER_LOCUS_DIR)
diff_pbm_and_metadata_file_paths

Creating CSVs that store the Min Max Difference in binding values for every TF in every Locus. 
This takes like 9 minutes to run.

In [None]:
# Creating the DataFrames that contain the most significant TF binding difference per locus
# (rows: the row labels of the difference files, i.e. the TFs; columns: loci).
#
# One Series per locus is collected in a list and everything is concatenated once at the end.
# Growing a DataFrame with pd.concat inside the loop copies all the data gathered so far on
# every iteration, which is quadratic in the number of loci.
strongest_positive_diffs = []
strongest_negative_diffs = []

for diff_csv_path, metadata_path in diff_pbm_and_metadata_file_paths:
    df = pd.read_csv(diff_csv_path, index_col=0)
    
    with open(metadata_path, "r") as f:
        metadata = json.load(f)
    
    locus = metadata["original_locus"]
    # Most positive / most negative value in every row of the difference table.
    max_pos_diff_per_tf = df.max(axis=1)
    max_neg_diff_per_tf = df.min(axis=1)
    # The Series name becomes the column label after the concatenation.
    max_pos_diff_per_tf.name = locus
    max_neg_diff_per_tf.name = locus
    strongest_positive_diffs.append(max_pos_diff_per_tf)
    strongest_negative_diffs.append(max_neg_diff_per_tf)

# pd.concat raises on an empty list, so fall back to an empty DataFrame when nothing was collected.
TF_binding_stronger_in_derived = pd.concat(strongest_positive_diffs, axis=1) if strongest_positive_diffs else pd.DataFrame()
TF_binding_weaker_in_derived = pd.concat(strongest_negative_diffs, axis=1) if strongest_negative_diffs else pd.DataFrame()

# TF_binding_stronger_in_derived.to_csv(os.path.join(OVERALL_TF_BINDING_DIR, f"TF_binding_stronger_in_derived_all_loci_v{session_id}.csv"))
# TF_binding_weaker_in_derived.to_csv(os.path.join(OVERALL_TF_BINDING_DIR, f"TF_binding_weaker_in_derived_all_loci_v{session_id}.csv"))

TF_binding_stronger_in_derived

Creating a csv file with the main difference in binding of every TF in every locus. This means I take for every TF the value that is most extreme in binding difference (per locus).

In [None]:
# For every (TF, locus) cell keep whichever extreme has the larger absolute value;
# when the comparison is not True (ties, NaN) the "weaker" value is taken.
positive_extreme_dominates = TF_binding_stronger_in_derived.abs() > TF_binding_weaker_in_derived.abs()
dominant_extreme_values = np.where(
    positive_extreme_dominates,
    TF_binding_stronger_in_derived,
    TF_binding_weaker_in_derived,
)

# np.where returns a bare array: restore the labels, then limit the values to [-10, 10].
overall_binding_differences_df = (
    pd.DataFrame(
        dominant_extreme_values,
        index=TF_binding_weaker_in_derived.index,
        columns=TF_binding_weaker_in_derived.columns,
    )
    .astype(float)
    .clip(lower=-10, upper=10)
)

TF_OVERALL_BINDING_DIFFERENCES_FILE = os.path.join(OVERALL_TF_BINDING_DIR, "TF_overall_binding_differences.csv")
overall_binding_differences_df.to_csv(TF_OVERALL_BINDING_DIFFERENCES_FILE)
overall_binding_differences_df

In [None]:
def create_overall_binding_heatmap(df, title, output_dir):
    """
    Draw `df` as a heatmap and save it as a PNG file in `output_dir`.

    args:
    df: table to plot; its values are clipped to [-10, 10] before plotting
    title: str, the figure title; with spaces replaced by underscores it is also the file name
    output_dir: str, path to the directory in which "<title>.png" is saved

    Nothing is returned; the figure is closed once it has been saved.
    """
    sns.set(style="whitegrid")

    clipped_values = np.clip(df, -10, 10)
    output_path = os.path.join(output_dir, f"{title.replace(' ' , '_')}.png")

    # --- Plot & Save Derived Heatmap ---
    fig, ax = plt.subplots(figsize=(16, 10))
    sns.heatmap(
        clipped_values,
        cmap="coolwarm",
        center=0,               # colour scale centred on zero
        xticklabels=False,
        ax=ax,
    )
    ax.set_title(title, fontsize=14)
    ax.set_xlabel("Locus")
    ax.set_ylabel("Transcription Factor")
    fig.tight_layout()
    fig.savefig(output_path)
    plt.close(fig)



In [None]:
# Painting the Heatmaps
# The two one-sided heatmaps are kept for reference; uncomment them to draw them as well.
# create_overall_binding_heatmap(TF_binding_weaker_in_derived, "TF_binding_weaker_in_derived_filtered", output_dir=OVERALL_TF_BINDING_DIR)
# create_overall_binding_heatmap(TF_binding_stronger_in_derived, "TF_binding_stronger_in_derived_filtered", output_dir=OVERALL_TF_BINDING_DIR)
create_overall_binding_heatmap(
    df=overall_binding_differences_df,
    title="overall_binding_differences_heatmap",
    output_dir=OVERALL_TF_BINDING_DIR,
)

##### Create Clustermap 
Note: This takes about 15 minutes.

In [None]:
sns.set(style="white")

# Options for the clustered heatmap
clustermap_options = dict(
    cmap="coolwarm",
    center=0,
    figsize=(16, 10),
    xticklabels=False,        # Hide x labels if too many loci
    yticklabels=True,         # Show TF names
    cbar_kws={'label': 'Binding Strength'},
    method='average',         # Clustering method (can be 'ward', 'single', etc.)
    metric='euclidean',       # Distance metric
)

# Create clustered heatmap (TFs = rows, loci = columns)
clustermap = sns.clustermap(overall_binding_differences_df, **clustermap_options)

clustermap.fig.suptitle("overall_binding_differences_clustermap".replace("_", " "), fontsize=16)

# Save the figure, then close it
clustermap.savefig(os.path.join(OVERALL_TF_BINDING_DIR, "clustermap_job.png"))
plt.close()


Sending the jobs to create the many files

In [None]:
# PER_LOCUS_DIR = os.path.join(MY_DATA_DIR, f"TF_to_locus_zscored_non_DA") # ? Change this to the desired output directory
# session_id = "non_DA"
# num_jobs = 1500
# chunk_size = mpra_df.shape[0] // num_jobs
# for i in range(num_jobs):
#         start_index = i * chunk_size
#         end_index = (i + 1) * chunk_size
#         if i == num_jobs - 1:
#             end_index = mpra_df.shape[0]
#         job_name = f"job_{i}_sessions_{session_id}"
#         run_single_job(mpra_csv_file=NON_DIFFERENTIAL_ACTIVE_MPRA_FILE,
#                 pbm_csv_file=PBM_FILE,
#                 output_dir=PER_LOCUS_DIR,
#                 window_size=8,
#                 start_index=start_index,
#                 end_index=end_index, 
#                 job_name=job_name,
#                 jobs_outputs_dir=os.path.join(JOBS_OUT_DIR, f"session_{session_id}"),           # I want each 1500 jobs to have its own output directory
#                 jobs_errors_dir=os.path.join(JOBS_ERR_DIR, f"session_{session_id}"))
#         print(f"Sent job {i}/{num_jobs} for lines: {start_index} to {end_index}, session {session_id}")

# print(f"All {num_jobs} Jobs are Sent!")

In [None]:
# session_id = "non_DA"

In [None]:
# diff_pbm_and_metadata_file_paths_non_diff = collect_difference_files(PER_LOCUS_DIR)

# len(diff_pbm_and_metadata_file_paths_non_diff)

In [None]:
# # Creating the DataFrames that contain the most significant TF binding difference per locus
# TF_binding_stronger_in_derived_non_diff = pd.DataFrame()
# TF_binding_weaker_in_derived_non_diff = pd.DataFrame()

# for diff_csv_path, metadata_path in diff_pbm_and_metadata_file_paths_non_diff:
#     df = pd.read_csv(diff_csv_path, index_col=0)
    
#     with open(metadata_path, "r") as f:
#         metadata = json.load(f)
    
#     locus = metadata["original_locus"]
#     max_pos_diff_per_tf = df.max(axis=1)
#     max_neg_diff_per_tf = df.min(axis=1)
#     max_pos_diff_per_tf.name = locus
#     max_neg_diff_per_tf.name = locus
#     TF_binding_stronger_in_derived_non_diff = pd.concat([TF_binding_stronger_in_derived_non_diff, max_pos_diff_per_tf], axis=1)
#     TF_binding_weaker_in_derived_non_diff = pd.concat([TF_binding_weaker_in_derived_non_diff, max_neg_diff_per_tf], axis=1)

# TF_binding_stronger_in_derived_non_diff.to_csv(os.path.join(OVERALL_TF_BINDING_CONCLUSION_DIR, f"TF_binding_stronger_in_derived_all_loci_{session_id}.csv"))
# TF_binding_weaker_in_derived_non_diff.to_csv(os.path.join(OVERALL_TF_BINDING_CONCLUSION_DIR, f"TF_binding_weaker_in_derived_all_loci_{session_id}.csv"))

# TF_binding_stronger_in_derived_non_diff

In [None]:
# combined_df_non_diff = np.where(TF_binding_stronger_in_derived_non_diff.abs() > TF_binding_weaker_in_derived_non_diff.abs(),
#                                 TF_binding_stronger_in_derived_non_diff, 
#                                 TF_binding_weaker_in_derived_non_diff)

# combined_df_non_diff = pd.DataFrame(combined_df_non_diff, index=TF_binding_weaker_in_derived_non_diff.index,
#                                     columns=TF_binding_weaker_in_derived_non_diff.columns).astype(float).clip(lower=-10, upper=10)
# combined_df_non_diff.to_csv(os.path.join(OVERALL_TF_BINDING_DIR, "TF_binding_all_loci_combined_non_diff.csv"))
# combined_df_non_diff

In [None]:
# # create_overall_binding_heatmap(TF_binding_weaker_in_derived_non_diff, "TF_binding_weakwer_in_derived_filtered_non_diff", output_dir=OVERALL_TF_BINDING_DIR )
# # create_overall_binding_heatmap(TF_binding_stronger_in_derived_non_diff, "TF_binding_stronger_in_derived_filtered_non_diff", output_dir=OVERALL_TF_BINDING_DIR )
# create_overall_binding_heatmap(combined_df_non_diff, "TF_binding_combined_filtered_non_diff", output_dir=OVERALL_TF_BINDING_DIR)