# Checking the affinity of TF to Loci in Regolatory Regions 
In this file, I do the process where I take loci from the MPRA data and check the affinity of the TFs to the ancestral and derived DNA sequences. 

Written by Matanya Wiener

In [70]:
# First let's import the libraries we will use:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import os
import json
import seaborn as sns
import pandas as pd
from functools import reduce
import utils_matanya as um; 
import subprocess

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [71]:
# Useful constants
CURRENT_RUN_NAME = "final_run_2025_04_09"
# CURRENT_RUN_NAME = "run_non_DA_2025_04_09"


os.chdir("/home/labs/davidgo/matanyaw")
MY_DATA_DIR = "/home/labs/davidgo/matanyaw/data"
RESULTS_DIR = "/home/labs/davidgo/matanyaw/results"
FULL_MPRA_FILE = "/home/labs/davidgo/Collaboration/humanMPRA/chondrocytes/comparative_analysis_combined/humanMPRA_with_seq_final2.csv"
PBM_FILE = "/home/labs/davidgo/matanyaw/data/pbm_8mer_aggregated_data.csv"
JOBS_OUT_DIR = "/home/labs/davidgo/matanyaw/jobs_outputs"
JOBS_ERR_DIR = "/home/labs/davidgo/matanyaw/jobs_errors"

CURRENT_RUN_RESULTS_DIR = os.path.join(RESULTS_DIR, CURRENT_RUN_NAME)


os.makedirs(RESULTS_DIR, exist_ok=True)
os.makedirs(CURRENT_RUN_RESULTS_DIR, exist_ok=True)


## Loading MPRA Data

You can change this block to see difference in affinity in whatever MPRA lines you like! 
Just make sure you change the CURRENT_RUN_NAME variable to get it in a different folder. 

The default MPRA lines I take are the Differential Active ones (DA). 
Nadav's idea was to check also the Active but not DA, to see the difference in results. This is a great idea! 
Just change the MPRA that you read here. 

**Comment this block to run Nadav's Null Hypothesis**


In [72]:
columns_needed = ['oligo', 'sequence_ancestral', 'sequence_derived', 'logFC_derived_vs_ancestral', 'differential_activity']
full_mpra_df = pd.read_csv(FULL_MPRA_FILE, index_col=0, usecols=columns_needed)

mpra_df = full_mpra_df[full_mpra_df['differential_activity'] == True]
MPRA_LINES_TO_PROCESS_FILE = os.path.join(MY_DATA_DIR, "humanMPRA_with_seq_final2_differential_active.csv")

mpra_df.to_csv(MPRA_LINES_TO_PROCESS_FILE, index=True)
mpra_df

Unnamed: 0_level_0,logFC_derived_vs_ancestral,differential_activity,sequence_ancestral,sequence_derived
oligo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
seq_100038_chr6:4358790-4359059_SCREEN_a3_L1,0.299749,True,cagggatgggcacacctggccggctgaggggagcctacagccaggc...,cagggatgggcacacctggccggctgaggggagcctacagccaggc...
seq_100065_chr7:138979123-138979392_SCREEN_a3_L1,0.316918,True,tccattttaccaatcacgaaactgaatcagagagctaagcaacttg...,tccattttaccaatcacgaaactgaatcagagagctaagcaacttg...
seq_100070_chr7:79861027-79861296_SCREEN_a3_L1,-0.275615,True,ctgtttattgttatactcaagagcacttggtattgttaaagtcgta...,ctgtttattgttatactcaagagcacttggtattgttaaagtcgta...
seq_100075_chr16:54376420-54376689_SCREEN_a3_L1,0.318055,True,aggaaggaggatctctgtgtgtgtgggtcgcccagcctcctgccag...,aggaaggaggatctctgtgtgtgtgggtcgcccagcctcctgccag...
seq_100090_chr20:31380149-31380418_SCREEN_a3_L1,-0.281553,True,atgtctgccttttgagtgggaggtgaccactgacattccctagggg...,atgtctgccttttgagtgggaggtgaccactgacattccctagggg...
...,...,...,...,...
seq_99921_chr11:34262393-34262662_SCREEN_a3_L1,0.286606,True,tggcctcccaaagtgctgggattacaggtgtgagccaccgtacctg...,tggcctcccaaagtgctgggattacaggtgtgagccaccgtacctg...
seq_99930_chr10:128329049-128329318_SCREEN_a3_L1,-0.770237,True,ccctgagcttggaaacatagagatgagaaaataagttccctacact...,ccctgagcttggaaacatagagatgagaaaataagttccctacact...
seq_99966_chr21:35967796-35968065_SCREEN_a3_L1,-0.230129,True,gtgtacagacccatctattcatcatagacttcactgtctgggaatg...,gtgtacagacccatctattcatcatagacttcactgtctgggaatg...
seq_99973_chr14:22846595-22846864_SCREEN_a3_L1,-0.305473,True,gtcaagttggaacttagattctttttcttgcggaagtgggaggcat...,gtcaagttggaacttagattctttttcttgcggaagtgggaggcat...


# Nadav's Idea: Create a Null Hypothesis
**Leave this in comment unless you want to run it!**

Nadav suggested to compare the heatmaps we get with the heatmaps when the loci is active, yet not "Differencially Active" to create a null hypothesys. 
We can see if there is a difference in the binding differences when the suspected enhancers are truely differentially activating the translation and when they don't. 

In [73]:
# # Creating the csv file of the active, yet not differential active loci
# columns_needed = ['oligo', 'sequence_ancestral', 'sequence_derived', 'logFC_derived_vs_ancestral', 'differential_activity']
# full_mpra_df = pd.read_csv(FULL_MPRA_FILE, index_col=0, usecols=columns_needed)

# active_loci = full_mpra_df[full_mpra_df["differential_activity"].notna()]
# mpra_df = active_loci[active_loci["differential_activity"] == False].sample(n=15000, random_state=42)
# MPRA_LINES_TO_PROCESS_FILE = os.path.join(MY_DATA_DIR, "humanMPRA_with_seq_final2_non_DA.csv")

# mpra_df.to_csv(MPRA_LINES_TO_PROCESS_FILE, index=True)
# mpra_df


## Loading PBM data
I will use the z-score of the TFs to decide the affinity, and the e-score to decide how certain I am in this result. I will use e-score limit of 0.35 to decide whether I use this z-score value or not (per TF per 8-mer).

In [74]:
all_8mer_pbm_df = pd.read_csv(PBM_FILE, index_col=0, header=[0,1])
all_8mer_pbm_df.columns = pd.MultiIndex.from_tuples([(lvl1, tf.split('_')[0]) for lvl1, tf in all_8mer_pbm_df.columns])
escore_df, median_df, zscore_df = all_8mer_pbm_df['E-score'], all_8mer_pbm_df['Median'], all_8mer_pbm_df['Z-score']
escore_df = escore_df.loc[:, ~escore_df.columns.duplicated()]
zscore_df = zscore_df.loc[:, ~zscore_df.columns.duplicated()]

escore_df.columns

Index(['ARX', 'Ahctf1', 'Alx3', 'Alx4', 'Ar', 'Arid3a', 'Arid5a', 'Ascl2',
       'Atf3', 'BCL11A',
       ...
       'TFAP2A', 'Tbx2', 'Tef', 'VAX2', 'VENTX', 'VSX1', 'VSX2', 'WT1',
       'ZNF200', 'ZNF655'],
      dtype='object', length=119)

In [75]:
def run_single_job(mpra_csv_file, pbm_csv_file, output_dir, window_size, start_index, end_index,
                   job_name="process_TF_to_loci_files", 
                   jobs_outputs_dir=JOBS_OUT_DIR, 
                   jobs_errors_dir=JOBS_ERR_DIR, 
                   print_info=True):
    """
    Run a single job to create the TF to loci files.
    args:
    mpra_csv_file: str, path to the MPRA CSV file
    pbm_csv_file: str, path to the PBM CSV file
    output_dir: str, path to the output directory
    window_size: int, the window size to use (for finding sequences in the PBM file)
    start_index: int, the start index to use (for the MPRA file, which can be split into multiple jobs)
    end_index: int, the end index to use (for the MPRA file, which can be split into multiple jobs)
    job_name: str, the name of the job
    jobs_outputs_dir: str, path to the jobs outputs directory
    jobs_errors_dir: str, path to the jobs errors directory
    """
    
    os.makedirs(jobs_outputs_dir, exist_ok=True)
    os.makedirs(jobs_errors_dir, exist_ok=True)

    TF_2_LOCI_SCRIPT_FILE = "/home/labs/davidgo/matanyaw/backup/create_TF_to_loci_script.py"
    
    command = [
        "bsub",  # Executable
        "-q", "short", 
        "-R", "rusage[mem=2000]",   # Resource requirements in Mb
        "-J", job_name,
        "-o", jobs_outputs_dir,
        "-e", jobs_errors_dir,
        "python", TF_2_LOCI_SCRIPT_FILE,  # The Python script to run
        "--mpra_file", mpra_csv_file,
        "--pbm_file", pbm_csv_file,
        "--output_dir", output_dir,
        "--window_size", str(window_size),
        "--start_index", str(start_index),
        "--end_index", str(end_index),
    ]

    try:
        subprocess.run(command, capture_output=True, text=True, check=True)
        if print_info:
            print(f"Job {job_name} submitted.")
    except subprocess.CalledProcessError as e:
        print(f"Error running the job: {e.stderr}")
        return None

Optional: Test run:

In [76]:
# Optional Test Run
from backup.create_TF_to_loci_script import create_tf_to_loci_files
start_index = 35
end_index = 40
job_name = f"daily_test_{CURRENT_RUN_NAME}"

# You can Run it as a Job:
# run_single_job(mpra_csv_file=MPRA_LINES_TO_PROCESS_FILE,
#         pbm_csv_file=PBM_FILE,
#         output_dir=os.path.join(CURRENT_RUN_RESULTS_DIR, f"{job_name}_job"),
#         window_size=8,
#         start_index=start_index,
#         end_index=end_index, 
#         job_name=f"{job_name}_job")


# You can run it as an imported function:
create_tf_to_loci_files(MPRA_LINES_TO_PROCESS_FILE, 
                        PBM_FILE, 
                        os.path.join(CURRENT_RUN_RESULTS_DIR, job_name), 
                        8, 
                        start_index, 
                        end_index)

Done with 36/15077
Done with 37/15077
Done with 38/15077
Done with 39/15077
Done with 40/15077


Here, we send the for all 15K sequences, the 

In [77]:
# In case I want to run it multiple times
session_id = 0

Full Run - Split the 15K loci into jobs (as many as I like, I do 1500). Takes a while to run. 

In [78]:
PER_LOCUS_DIR = os.path.join(CURRENT_RUN_RESULTS_DIR, f"per_locus_dir") # ? Change this to the desired output directory


In [79]:
num_jobs = 1500
chunk_size = mpra_df.shape[0] // num_jobs
for i in range(num_jobs):
        start_index = i * chunk_size
        end_index = (i + 1) * chunk_size
        if i == num_jobs - 1:
            end_index = mpra_df.shape[0]
        job_name = f"job_{i}_sessions_{session_id}"
        run_single_job(mpra_csv_file=MPRA_LINES_TO_PROCESS_FILE,
                pbm_csv_file=PBM_FILE,
                output_dir=PER_LOCUS_DIR,
                window_size=8,
                start_index=start_index,
                end_index=end_index, 
                job_name=job_name,
                jobs_outputs_dir=os.path.join(JOBS_OUT_DIR, f"session_{session_id}"),           # I want each 1500 jobs to have its own output directory
                jobs_errors_dir=os.path.join(JOBS_ERR_DIR, f"session_{session_id}"),
                print_info=False)
        print(f"Sent job {i}/{num_jobs} for lines: {start_index} to {end_index}, session {session_id}")

print(f"All {num_jobs} Jobs are Sent!")
session_id += 1

Sent job 0/1500 for lines: 0 to 10, session 0
Sent job 1/1500 for lines: 10 to 20, session 0
Sent job 2/1500 for lines: 20 to 30, session 0
Sent job 3/1500 for lines: 30 to 40, session 0
Sent job 4/1500 for lines: 40 to 50, session 0
Sent job 5/1500 for lines: 50 to 60, session 0
Sent job 6/1500 for lines: 60 to 70, session 0
Sent job 7/1500 for lines: 70 to 80, session 0
Sent job 8/1500 for lines: 80 to 90, session 0
Sent job 9/1500 for lines: 90 to 100, session 0
Sent job 10/1500 for lines: 100 to 110, session 0
Sent job 11/1500 for lines: 110 to 120, session 0
Sent job 12/1500 for lines: 120 to 130, session 0
Sent job 13/1500 for lines: 130 to 140, session 0
Sent job 14/1500 for lines: 140 to 150, session 0
Sent job 15/1500 for lines: 150 to 160, session 0
Sent job 16/1500 for lines: 160 to 170, session 0
Sent job 17/1500 for lines: 170 to 180, session 0
Sent job 18/1500 for lines: 180 to 190, session 0
Sent job 19/1500 for lines: 190 to 200, session 0
Sent job 20/1500 for lines: 20

**Stop! Make sure all relevant running jobs are done before you continue down here**

## Looking at the Overall Differences and Tendancies

Now we will run over all the loci and see things about the differences in the affinity of TF. 
Here I will create a figure that for every loci will show the biggest differences of the TF binding between Ancestral and Derived variants. 

In [80]:
# Collecting the relevant files

OVERALL_TF_BINDING_DIR = os.path.join(CURRENT_RUN_RESULTS_DIR, "overall_tf_binding_differences")
os.makedirs(OVERALL_TF_BINDING_DIR, exist_ok=True)


Collecting all relevant files from the 15K folders

In [81]:
def collect_difference_files(per_locus_dir):
    """ Returns a list of tuples with the paths to the difference PBM files and the metadata files. 
        Will be used to draw overall TF binding affinity conclusions.
    """
    diff_pbm_and_metadata_file_paths = []
    # tf_affinity_dir = os.path.join(MY_DATA_DIR, f"tf_to_locus_zscores_v{session_id}")
    print("Collecting files from directory: ", per_locus_dir)
    for locus_dir in os.listdir(per_locus_dir):
        locus_dir_path = os.path.join(per_locus_dir, locus_dir)
        if not os.path.isdir(locus_dir_path):
            continue

        metadata_path = os.path.join(locus_dir_path, "metadata.json")
        diff_csv_path = os.path.join(locus_dir_path, "Difference_PBM_filtered.csv")
        if not os.path.exists(metadata_path) or not os.path.exists(diff_csv_path):
            print(f"Missing files in {locus_dir_path}")
            continue
        diff_pbm_and_metadata_file_paths.append((diff_csv_path, metadata_path))

    len(diff_pbm_and_metadata_file_paths)
    print("Done.")
    return diff_pbm_and_metadata_file_paths


In [82]:
session_id -= 1
diff_pbm_and_metadata_file_paths = collect_difference_files(PER_LOCUS_DIR)
diff_pbm_and_metadata_file_paths

Collecting files from directory:  /home/labs/davidgo/matanyaw/results/final_run_2025_04_09/per_locus_dir
Done.


[('/home/labs/davidgo/matanyaw/results/final_run_2025_04_09/per_locus_dir/seq_295193_chr12_54782669-54782938_SCREEN_a2_L3/Difference_PBM_filtered.csv',
  '/home/labs/davidgo/matanyaw/results/final_run_2025_04_09/per_locus_dir/seq_295193_chr12_54782669-54782938_SCREEN_a2_L3/metadata.json'),
 ('/home/labs/davidgo/matanyaw/results/final_run_2025_04_09/per_locus_dir/seq_71981_chr12_67085937-67086206_SCREEN_a2_L1/Difference_PBM_filtered.csv',
  '/home/labs/davidgo/matanyaw/results/final_run_2025_04_09/per_locus_dir/seq_71981_chr12_67085937-67086206_SCREEN_a2_L1/metadata.json'),
 ('/home/labs/davidgo/matanyaw/results/final_run_2025_04_09/per_locus_dir/seq_33133_chr11_119894841-119895110_SCREEN_a1_L1/Difference_PBM_filtered.csv',
  '/home/labs/davidgo/matanyaw/results/final_run_2025_04_09/per_locus_dir/seq_33133_chr11_119894841-119895110_SCREEN_a1_L1/metadata.json'),
 ('/home/labs/davidgo/matanyaw/results/final_run_2025_04_09/per_locus_dir/seq_248781_chr18_75279211-75279480_SCREEN_a3_L2/Diffe

Creating CSVs that store the Min Max Difference in binding values for every TF in every Locus. 
This takes like 9 minutes to run.

In [83]:
# Creating the DataFrames that contain the most significant TF binding difference per locus
TF_binding_stronger_in_derived = pd.DataFrame()
TF_binding_weaker_in_derived = pd.DataFrame()

for diff_csv_path, metadata_path in diff_pbm_and_metadata_file_paths:
    df = pd.read_csv(diff_csv_path, index_col=0)
    
    with open(metadata_path, "r") as f:
        metadata = json.load(f)
    
    locus = metadata["original_locus"]
    max_pos_diff_per_tf = df.max(axis=1)
    max_neg_diff_per_tf = df.min(axis=1)
    max_pos_diff_per_tf.name = locus
    max_neg_diff_per_tf.name = locus
    TF_binding_stronger_in_derived = pd.concat([TF_binding_stronger_in_derived, max_pos_diff_per_tf], axis=1)
    TF_binding_weaker_in_derived = pd.concat([TF_binding_weaker_in_derived, max_neg_diff_per_tf], axis=1)

# TF_binding_stronger_in_derived.to_csv(os.path.join(OVERALL_TF_BINDING_DIR, f"TF_binding_stronger_in_derived_all_loci_v{session_id}.csv"))
# TF_binding_weaker_in_derived.to_csv(os.path.join(OVERALL_TF_BINDING_DIR, f"TF_binding_weaker_in_derived_all_loci_v{session_id}.csv"))

TF_binding_stronger_in_derived

Unnamed: 0,seq_295193_chr12:54782669-54782938_SCREEN_a2_L3,seq_71981_chr12:67085937-67086206_SCREEN_a2_L1,seq_33133_chr11:119894841-119895110_SCREEN_a1_L1,seq_248781_chr18:75279211-75279480_SCREEN_a3_L2,seq_147743_chr2:135099118-135099387_SCREEN_a1_L2,seq_334414_chr15:67425482-67425751_SCREEN_a3_L3,seq_45763_chr19:35476338-35476607_SCREEN_a1_L1,seq_208942_chr20:32400013-32400282_SCREEN_a2_L2,seq_148431_chr5:149894095-149894364_SCREEN_a1_L2,seq_319422_chr14:74035636-74035905_SCREEN_a2_L3,...,seq_56641_chr18:47836471-47836740_SCREEN_a2_L1,seq_327468_chr2:54281210-54281479_SCREEN_a2_L3,seq_134370_chr17:54676015-54676284_SCREEN_a1_L2,seq_301884_chr1:93428644-93428913_SCREEN_a2_L3,seq_347825_chr12:48147478-48147747_SCREEN_a3_L3,seq_292377_chr2:134103245-134103514_SCREEN_a2_L3,seq_34252_chr7:43331359-43331628_SCREEN_a1_L1,seq_185850_chr21:46469828-46470097_SCREEN_a2_L2,seq_135927_chr12:43281493-43281762_SCREEN_a1_L2,seq_319091_chr18:22724760-22725029_SCREEN_a2_L3
ARX,0.0000,2.7807,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,...,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0000,4.2163,0.7648
Ahctf1,0.0000,5.3654,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,...,0.0000,2.5386,0.0000,0.0,0.0,0.0,0.0,2.5201,6.8531,0.0000
Alx3,0.0000,2.7213,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,...,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0000,4.5466,0.0000
Alx4,0.0000,6.9911,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,...,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0000,9.8966,0.0000
Ar,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,...,0.0000,0.0000,3.5152,0.0,0.0,0.0,0.0,3.8692,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VSX1,0.0000,3.1938,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,...,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0000,5.2351,0.0000
VSX2,0.0000,2.5473,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,...,0.0000,0.6526,0.0000,0.0,0.0,0.0,0.0,0.0000,4.2920,0.0000
WT1,1.5044,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,...,4.3547,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0000
ZNF200,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,...,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0000


Creating a csv file with the main difference in binding of every TF in every locus. This means I take for every TF the value that is most extreme in binding difference (per locus).

In [84]:
overall_binding_differences_df = np.where(TF_binding_stronger_in_derived.abs() > TF_binding_weaker_in_derived.abs(), TF_binding_stronger_in_derived, TF_binding_weaker_in_derived)

overall_binding_differences_df = pd.DataFrame(overall_binding_differences_df, index=TF_binding_weaker_in_derived.index, columns=TF_binding_weaker_in_derived.columns).astype(float).clip(lower=-10, upper=10)
TF_OVERALL_BINDING_DIFFERENCES_FILE = os.path.join(OVERALL_TF_BINDING_DIR, f"TF_overall_binding_differences.csv")
overall_binding_differences_df.to_csv(TF_OVERALL_BINDING_DIFFERENCES_FILE)
overall_binding_differences_df

Unnamed: 0,seq_295193_chr12:54782669-54782938_SCREEN_a2_L3,seq_71981_chr12:67085937-67086206_SCREEN_a2_L1,seq_33133_chr11:119894841-119895110_SCREEN_a1_L1,seq_248781_chr18:75279211-75279480_SCREEN_a3_L2,seq_147743_chr2:135099118-135099387_SCREEN_a1_L2,seq_334414_chr15:67425482-67425751_SCREEN_a3_L3,seq_45763_chr19:35476338-35476607_SCREEN_a1_L1,seq_208942_chr20:32400013-32400282_SCREEN_a2_L2,seq_148431_chr5:149894095-149894364_SCREEN_a1_L2,seq_319422_chr14:74035636-74035905_SCREEN_a2_L3,...,seq_56641_chr18:47836471-47836740_SCREEN_a2_L1,seq_327468_chr2:54281210-54281479_SCREEN_a2_L3,seq_134370_chr17:54676015-54676284_SCREEN_a1_L2,seq_301884_chr1:93428644-93428913_SCREEN_a2_L3,seq_347825_chr12:48147478-48147747_SCREEN_a3_L3,seq_292377_chr2:134103245-134103514_SCREEN_a2_L3,seq_34252_chr7:43331359-43331628_SCREEN_a1_L1,seq_185850_chr21:46469828-46470097_SCREEN_a2_L2,seq_135927_chr12:43281493-43281762_SCREEN_a1_L2,seq_319091_chr18:22724760-22725029_SCREEN_a2_L3
ARX,0.0000,2.7807,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,...,0.0000,0.0000,0.0000,0.000,0.0,0.0000,0.0,0.0000,4.2163,0.7648
Ahctf1,0.0000,5.3654,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,...,0.0000,2.5386,0.0000,-4.399,0.0,0.0000,0.0,2.5201,6.8531,-0.1639
Alx3,0.0000,2.7213,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,...,0.0000,0.0000,0.0000,0.000,0.0,0.0000,0.0,0.0000,4.5466,0.0000
Alx4,0.0000,6.9911,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,...,0.0000,0.0000,0.0000,0.000,0.0,0.0000,0.0,0.0000,9.8966,0.0000
Ar,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,...,0.0000,0.0000,3.5152,0.000,0.0,0.0000,0.0,3.8692,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VSX1,0.0000,3.1938,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,...,0.0000,0.0000,0.0000,0.000,0.0,0.0000,0.0,0.0000,5.2351,0.0000
VSX2,0.0000,2.5473,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,...,0.0000,0.6526,0.0000,0.000,0.0,0.0000,0.0,0.0000,4.2920,0.0000
WT1,1.5044,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,...,4.3547,0.0000,0.0000,0.000,0.0,0.0000,0.0,0.0000,0.0000,-0.8160
ZNF200,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,...,0.0000,0.0000,0.0000,0.000,0.0,0.0000,0.0,0.0000,0.0000,0.0000


In [None]:
def create_overall_binding_heatmap(df, title, output_dir):
    sns.set(style="whitegrid")

    df_clipped = np.clip(df, -10, 10)

    # --- Plot & Save Derived Heatmap ---
    plt.figure(figsize=(16, 10))
    sns.heatmap(
        df_clipped,
        cmap="coolwarm",
        center=0,
        xticklabels=False,
        # linewidths=0.5,
        # linecolor='gray'
    )
    plt.title(title.replace('_', ' '), fontsize=14)
    plt.xlabel("Locus")
    plt.ylabel("Transcription Factor")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f"{title.replace(' ' , '_')}.png"), dpi=600)
    plt.close()



In [86]:
# Painting the Heatmaps
# create_overall_binding_heatmap(TF_binding_weaker_in_derived, "TF_binding_weakwer_in_derived_filtered", output_dir=OVERALL_TF_BINDING_CONCLUSION_DIR )
# create_overall_binding_heatmap(TF_binding_stronger_in_derived, "TF_binding_stronger_in_derived_filtered", output_dir=OVERALL_TF_BINDING_CONCLUSION_DIR )
create_overall_binding_heatmap(overall_binding_differences_df, "overall_binding_differences_heatmap", output_dir=OVERALL_TF_BINDING_DIR )

##### Create Clustermap 
Note: This takse circa ~15 minutes.

In [99]:
sns.set(style="white")

# Create clustered heatmap
clustermap = sns.clustermap(
    overall_binding_differences_df,              # TFs = rows, loci = columns
    cmap="coolwarm",
    center=0,
    figsize=(16, 10),
    xticklabels=False,        # Hide x labels if too many loci
    yticklabels=True,         # Show TF names
    cbar_kws={'label': 'Binding Strength'},
    method='average',         # Clustering method (can be 'ward', 'single', etc.)
    metric='euclidean'        # Distance metric
)
clustermap.ax_heatmap.set_yticklabels(
    clustermap.ax_heatmap.get_yticklabels(),
    fontsize=5  # or 5 if needed
)
clustermap.fig.suptitle("overall_binding_differences_clustermap".replace("_", " "), fontsize=16)

# Save the figure,,
clustermap.savefig(
    os.path.join(OVERALL_TF_BINDING_DIR, "clustermap_job_highres.png"),
    dpi=1200  # or 1200 for print-quality zooming
)
plt.tight_layout()

plt.close()
