# Find Correlation of TF Binding Difference to logFC
In this file, I will try to create scatter plots of the TF binding differences together with the log Fold Change in the MPRA data. 

In [1]:
# First let's import the libraries we will use:
%load_ext autoreload
%autoreload 2
%matplotlib inline


import numpy as np
import matplotlib.pyplot as plt
import os
import sys

import seaborn as sns
import pandas as pd
from functools import reduce
import utils_matanya as um
import subprocess

In [2]:
# Useful constants
MY_DATA_DIR = "/home/labs/davidgo/matanyaw/data"

MPRA_FILE = "/home/labs/davidgo/Collaboration/humanMPRA/chondrocytes/comparative_analysis_combined/humanMPRA_with_seq_final2.csv"
PBM_FILE = "/home/labs/davidgo/matanyaw/data/pbm_8mer_aggregated_data.csv"
JOBS_OUT_DIR = "/home/labs/davidgo/matanyaw/jobs_outputs"
JOBS_ERR_DIR = "/home/labs/davidgo/matanyaw/jobs_errors"
OVERALL_TF_BINDING_DIR = "/home/labs/davidgo/matanyaw/data/overall_tf_binding_conclusion"
# TF_2_LOCUS_ZSCORES_DIR = "/home/labs/davidgo/matanyaw/data/tf_2_locus_zscores/"



In [3]:
used_columns = ['oligo', 'logFC_derived_vs_ancestral', 'differential_activity']
full_mpra_df = pd.read_csv(MPRA_FILE, usecols=used_columns, )
# We will have a look only at the differencially expressing oligos
mpra_df = full_mpra_df[full_mpra_df['differential_activity'] == True]
mpra_df.reset_index(drop=True, inplace=True)
DIFFERENTIAL_ACTIVE_MPRA_FILE = os.path.join(MY_DATA_DIR, "humanMPRA_with_seq_final2_differential_active.csv")
mpra_df.to_csv(DIFFERENTIAL_ACTIVE_MPRA_FILE, index=False)
mpra_df

Unnamed: 0,oligo,logFC_derived_vs_ancestral,differential_activity
0,seq_100038_chr6:4358790-4359059_SCREEN_a3_L1,0.299749,True
1,seq_100065_chr7:138979123-138979392_SCREEN_a3_L1,0.316918,True
2,seq_100070_chr7:79861027-79861296_SCREEN_a3_L1,-0.275615,True
3,seq_100075_chr16:54376420-54376689_SCREEN_a3_L1,0.318055,True
4,seq_100090_chr20:31380149-31380418_SCREEN_a3_L1,-0.281553,True
...,...,...,...
15072,seq_99921_chr11:34262393-34262662_SCREEN_a3_L1,0.286606,True
15073,seq_99930_chr10:128329049-128329318_SCREEN_a3_L1,-0.770237,True
15074,seq_99966_chr21:35967796-35968065_SCREEN_a3_L1,-0.230129,True
15075,seq_99973_chr14:22846595-22846864_SCREEN_a3_L1,-0.305473,True


In [4]:
def read_overall_zscore_differences_df(file_name):
    # Load and transpose
    df = pd.read_csv(file_name).T

    # Set first row as column headers
    df.columns = df.iloc[0]
    df = df[1:]

    # Rename and set index
    df.rename(columns={"Unnamed: 0": "oligo"}, inplace=True)
    df = df.reset_index()
    df = df.rename(columns={"index": "oligo"})
    df.set_index("oligo", inplace=True)

    return df


In [5]:

stronger_in_ancestral_df = read_overall_zscore_differences_df(os.path.join(OVERALL_TF_BINDING_DIR, "TF_binding_stronger_in_ancestral_all_loci_v10.csv"))
stronger_in_derived_df = read_overall_zscore_differences_df(os.path.join(OVERALL_TF_BINDING_DIR, "TF_binding_stronger_in_derived_all_loci_v10.csv"))

In [6]:
stronger_in_ancestral_df

Unnamed: 0,ARX,Ahctf1_mus_musculus,Alx3_mus_musculus,Alx4_mus_musculus,Ar_mus_musculus,Arid3a_mus_musculus,Arid5a_mus_musculus,Ascl2_mus_musculus,Atf3_mus_musculus,BCL11A,...,TFAP2A_mus_musculus,Tbx2_mus_musculus,Tef_mus_musculus,VAX2,VENTX,VSX1,VSX2,WT1,ZNF200,ZNF655
oligo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
seq_295193_chr12_54782669-54782938_SCREEN_a2_L3,0.0,0.0,0.0,0.0,0.0,-0.638874,-0.863455,-3.063394,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.2547,0.0,0.0
seq_71981_chr12_67085937-67086206_SCREEN_a2_L1,0.0,0.0,0.0,0.0,0.0,-0.819694,-0.435851,-2.425517,0.0,0.0,...,0.0,0.0,-0.7,0.0,-1.7899,0.0,0.0,0.0,0.0,0.0
seq_33133_chr11_119894841-119895110_SCREEN_a1_L1,0.0,0.0,0.0,0.0,0.0,-0.99274,0.0,-2.118522,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
seq_248781_chr18_75279211-75279480_SCREEN_a3_L2,0.0,0.0,0.0,0.0,0.0,-0.32185,-2.67606,-1.973593,0.0,-1.222,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
seq_147743_chr2_135099118-135099387_SCREEN_a1_L2,0.0,0.0,0.0,0.0,0.0,-1.05263,-0.275956,-1.486843,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
seq_292377_chr2_134103245-134103514_SCREEN_a2_L3,0.0,0.0,0.0,0.0,0.0,-2.81763,-1.58452,-1.520262,0.0,0.0,...,0.0,-7.5715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.3746
seq_34252_chr7_43331359-43331628_SCREEN_a1_L1,0.0,0.0,0.0,0.0,0.0,-1.302816,-1.398006,-1.731576,0.0,-3.5202,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
seq_185850_chr21_46469828-46470097_SCREEN_a2_L2,0.0,0.0,0.0,0.0,0.0,-1.542075,-0.880984,-2.85397,0.0,0.0,...,0.0,-2.4109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
seq_135927_chr12_43281493-43281762_SCREEN_a1_L2,0.0,0.0,0.0,0.0,0.0,-1.896441,-1.54978,-1.772401,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
stronger_in_derived_df

Unnamed: 0,ARX,Ahctf1_mus_musculus,Alx3_mus_musculus,Alx4_mus_musculus,Ar_mus_musculus,Arid3a_mus_musculus,Arid5a_mus_musculus,Ascl2_mus_musculus,Atf3_mus_musculus,BCL11A,...,TFAP2A_mus_musculus,Tbx2_mus_musculus,Tef_mus_musculus,VAX2,VENTX,VSX1,VSX2,WT1,ZNF200,ZNF655
oligo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
seq_295193_chr12_54782669-54782938_SCREEN_a2_L3,0.0,0.0,0.0,0.0,0.0,1.16273,1.57553,4.032911,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5044,0.0,0.0
seq_71981_chr12_67085937-67086206_SCREEN_a2_L1,2.7807,5.3654,2.7213,6.9911,0.0,3.266925,2.802036,2.529784,0.0,0.0,...,0.0,0.0,0.0,4.4412,0.0,3.1938,2.5473,0.0,0.0,0.0
seq_33133_chr11_119894841-119895110_SCREEN_a1_L1,0.0,0.0,0.0,0.0,0.0,0.88095,3.257774,0.09462,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
seq_248781_chr18_75279211-75279480_SCREEN_a3_L2,0.0,0.0,0.0,0.0,0.0,1.919853,1.271352,1.800392,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
seq_147743_chr2_135099118-135099387_SCREEN_a1_L2,0.0,0.0,0.0,0.0,0.0,1.688172,1.741709,1.57954,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
seq_292377_chr2_134103245-134103514_SCREEN_a2_L3,0.0,0.0,0.0,0.0,0.0,3.398744,1.583084,1.736507,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
seq_34252_chr7_43331359-43331628_SCREEN_a1_L1,0.0,0.0,0.0,0.0,0.0,2.932179,0.907727,1.877061,0.0,0.0,...,3.8378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
seq_185850_chr21_46469828-46470097_SCREEN_a2_L2,0.0,2.5201,0.0,0.0,3.8692,4.098923,2.748593,3.42668,0.0,0.0,...,0.0,1.2602,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.032
seq_135927_chr12_43281493-43281762_SCREEN_a1_L2,4.2163,6.8531,4.5466,9.8966,0.0,3.192678,3.571664,2.24946,1.431,0.0,...,0.0,0.0,0.0,10.3973,0.0,5.2351,4.292,0.0,0.0,0.0


In [8]:
combined_df = np.where(stronger_in_ancestral_df.abs() > stronger_in_derived_df.abs(), stronger_in_ancestral_df, stronger_in_derived_df)
combined_df = pd.DataFrame(combined_df, index=stronger_in_ancestral_df.index, columns=stronger_in_ancestral_df.columns).astype(float).clip(lower=-10, upper=10)
combined_df

Unnamed: 0,ARX,Ahctf1_mus_musculus,Alx3_mus_musculus,Alx4_mus_musculus,Ar_mus_musculus,Arid3a_mus_musculus,Arid5a_mus_musculus,Ascl2_mus_musculus,Atf3_mus_musculus,BCL11A,...,TFAP2A_mus_musculus,Tbx2_mus_musculus,Tef_mus_musculus,VAX2,VENTX,VSX1,VSX2,WT1,ZNF200,ZNF655
oligo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
seq_295193_chr12_54782669-54782938_SCREEN_a2_L3,0.0000,0.0000,0.0000,0.0000,0.0000,1.162730,1.575530,4.032911,0.000,0.0000,...,0.0000,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,1.5044,0.0,0.0000
seq_71981_chr12_67085937-67086206_SCREEN_a2_L1,2.7807,5.3654,2.7213,6.9911,0.0000,3.266925,2.802036,2.529784,0.000,0.0000,...,0.0000,0.0000,-0.7,4.4412,-1.7899,3.1938,2.5473,0.0000,0.0,0.0000
seq_33133_chr11_119894841-119895110_SCREEN_a1_L1,0.0000,0.0000,0.0000,0.0000,0.0000,-0.992740,3.257774,-2.118522,0.000,0.0000,...,0.0000,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,0.0000
seq_248781_chr18_75279211-75279480_SCREEN_a3_L2,0.0000,0.0000,0.0000,0.0000,0.0000,1.919853,-2.676060,-1.973593,0.000,-1.2220,...,0.0000,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,0.0000
seq_147743_chr2_135099118-135099387_SCREEN_a1_L2,0.0000,0.0000,0.0000,0.0000,0.0000,1.688172,1.741709,1.579540,0.000,0.0000,...,0.0000,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
seq_292377_chr2_134103245-134103514_SCREEN_a2_L3,0.0000,0.0000,0.0000,0.0000,0.0000,3.398744,-1.584520,1.736507,0.000,0.0000,...,0.0000,-7.5715,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,-3.3746
seq_34252_chr7_43331359-43331628_SCREEN_a1_L1,0.0000,0.0000,0.0000,0.0000,0.0000,2.932179,-1.398006,1.877061,0.000,-3.5202,...,3.8378,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,0.0000
seq_185850_chr21_46469828-46470097_SCREEN_a2_L2,0.0000,2.5201,0.0000,0.0000,3.8692,4.098923,2.748593,3.426680,0.000,0.0000,...,0.0000,-2.4109,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,1.0320
seq_135927_chr12_43281493-43281762_SCREEN_a1_L2,4.2163,6.8531,4.5466,9.8966,0.0000,3.192678,3.571664,2.249460,1.431,0.0000,...,0.0000,0.0000,0.0,10.0000,0.0000,5.2351,4.2920,0.0000,0.0,0.0000


In [9]:
combined_df.to_csv(os.path.join(OVERALL_TF_BINDING_DIR, "TF_binding_all_loci_combined.csv"))


Create Heatmap


In [10]:
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Set Seaborn style
sns.set(style="whitegrid")

# --- Plot & Save Derived Heatmap ---
plt.figure(figsize=(16, 10))

sns.heatmap(
    combined_df.T,            # Transpose so TFs are on y-axis
    cmap="coolwarm",
    center=0,
    # linewidths=0.5,
    # linecolor='gray'
)

plt.title("TF Binding Differences", fontsize=14)
plt.xlabel("Locus")
plt.ylabel("Transcription Factor")
plt.tight_layout()

# Save the figure
plt.savefig(os.path.join(OVERALL_TF_BINDING_DIR, "overall_tf_binding_heatmap.png"))
plt.close()


##### Create Clustermap 
Note: This takse circa ~15 minutes.

In [13]:
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Set seaborn style
sns.set(style="white")

# Create clustered heatmap
clustermap = sns.clustermap(
    combined_df,              # TFs = rows, loci = columns
    cmap="coolwarm",
    center=0,
    figsize=(16, 10),
    xticklabels=False,        # Hide x labels if too many loci
    yticklabels=True,         # Show TF names
    cbar_kws={'label': 'Binding Strength'},
    method='average',         # Clustering method (can be 'ward', 'single', etc.)
    metric='euclidean'        # Distance metric
)

clustermap.fig.suptitle("Clustermap of Max TF Binding (All Loci)", fontsize=16)

# Save the figure,,
clustermap.savefig(os.path.join(OVERALL_TF_BINDING_DIR, "overall_tf_binding_clustermap.png"))
plt.close()


In [11]:
logFC_series = pd.Series(mpra_df['logFC_derived_vs_ancestral'].values, index=combined_df.index, name="logFC")
combined_df_logFC = pd.concat([logFC_series, combined_df], axis=1)
combined_df_logFC

Unnamed: 0_level_0,logFC,ARX,Ahctf1_mus_musculus,Alx3_mus_musculus,Alx4_mus_musculus,Ar_mus_musculus,Arid3a_mus_musculus,Arid5a_mus_musculus,Ascl2_mus_musculus,Atf3_mus_musculus,...,TFAP2A_mus_musculus,Tbx2_mus_musculus,Tef_mus_musculus,VAX2,VENTX,VSX1,VSX2,WT1,ZNF200,ZNF655
oligo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
seq_295193_chr12_54782669-54782938_SCREEN_a2_L3,0.299749,0.0000,0.0000,0.0000,0.0000,0.0000,1.162730,1.575530,4.032911,0.000,...,0.0000,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,1.5044,0.0,0.0000
seq_71981_chr12_67085937-67086206_SCREEN_a2_L1,0.316918,2.7807,5.3654,2.7213,6.9911,0.0000,3.266925,2.802036,2.529784,0.000,...,0.0000,0.0000,-0.7,4.4412,-1.7899,3.1938,2.5473,0.0000,0.0,0.0000
seq_33133_chr11_119894841-119895110_SCREEN_a1_L1,-0.275615,0.0000,0.0000,0.0000,0.0000,0.0000,-0.992740,3.257774,-2.118522,0.000,...,0.0000,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,0.0000
seq_248781_chr18_75279211-75279480_SCREEN_a3_L2,0.318055,0.0000,0.0000,0.0000,0.0000,0.0000,1.919853,-2.676060,-1.973593,0.000,...,0.0000,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,0.0000
seq_147743_chr2_135099118-135099387_SCREEN_a1_L2,-0.281553,0.0000,0.0000,0.0000,0.0000,0.0000,1.688172,1.741709,1.579540,0.000,...,0.0000,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
seq_292377_chr2_134103245-134103514_SCREEN_a2_L3,0.286606,0.0000,0.0000,0.0000,0.0000,0.0000,3.398744,-1.584520,1.736507,0.000,...,0.0000,-7.5715,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,-3.3746
seq_34252_chr7_43331359-43331628_SCREEN_a1_L1,-0.770237,0.0000,0.0000,0.0000,0.0000,0.0000,2.932179,-1.398006,1.877061,0.000,...,3.8378,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,0.0000
seq_185850_chr21_46469828-46470097_SCREEN_a2_L2,-0.230129,0.0000,2.5201,0.0000,0.0000,3.8692,4.098923,2.748593,3.426680,0.000,...,0.0000,-2.4109,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,1.0320
seq_135927_chr12_43281493-43281762_SCREEN_a1_L2,-0.305473,4.2163,6.8531,4.5466,9.8966,0.0000,3.192678,3.571664,2.249460,1.431,...,0.0000,0.0000,0.0,10.0000,0.0000,5.2351,4.2920,0.0000,0.0,0.0000


In [None]:
tf_name = "ARX"

sns.scatterplot(
    data=combined_df_logFC.loc[:, ["logFC", tf_name]],
    x="logFC",
    y=tf_name)
plt.title(f"Correlation between TF binding and logFC for {tf_name}")
plt.xlabel("logFC (derived vs ancestral)")
plt.ylabel(f"TF Binding Difference: {tf_name}")
plt.grid(True)
plt.tight_layout()
plt.savefig(os.path.join(OVERALL_TF_BINDING_DIR, f"TF_binding_vs_logFC_{tf_name}.png"))
plt.show()
plt.close()


In [12]:
TF_BINDING_DIFF_TO_LOGFC_DIR = os.path.join(MY_DATA_DIR, "TF_binding_diff_to_logFC")
os.makedirs(TF_BINDING_DIFF_TO_LOGFC_DIR, exist_ok=True)

for i, tf_name in enumerate(combined_df_logFC.columns[1:]):
    sns.scatterplot(
        data=combined_df_logFC.loc[:, ["logFC", tf_name]],
        x="logFC",
        y=tf_name)
    plt.title(f"Correlation Between TF Binding-Difference and logFC for {tf_name}")
    plt.xlabel("logFC (Derived vs Ancestral) of Enhancer")
    plt.ylabel(f"TF Binding Difference: {tf_name} to Enhancer")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(os.path.join(TF_BINDING_DIFF_TO_LOGFC_DIR, f"TF_binding_vs_logFC_{tf_name}.png"))
    if i % 10 == 0:
        plt.show()
    plt.close()