# Find Correlation of TF Binding Difference to logFC
In this file, I will try to create scatter plots of the TF binding differences together with the log Fold Change in the MPRA data. 
The goal is to look for a correlation between the binding difference of TFs that bind significantly differently to the (suspected) regulatory regions and the log Fold Change (logFC) of that enhancer between the Ancestral (Chimp) and the Derived (Human) sequence.

## Todo List
- [ ] Filter scatter plots to only the top 10%
- [ ] Compute correlation
- [ ] Compute p-values

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline


import numpy as np
import matplotlib.pyplot as plt
import os
import sys

import seaborn as sns
import pandas as pd
from functools import reduce
import utils_matanya as um
import subprocess

In [2]:
# Useful constants
# Root directory under which this notebook reads and writes its own data.
MY_DATA_DIR = "/home/labs/davidgo/matanyaw/data"

# CSV with the MPRA table loaded below (oligo, logFC and differential-activity columns).
MPRA_FILE = (
    "/home/labs/davidgo/Collaboration/humanMPRA/chondrocytes/"
    "comparative_analysis_combined/humanMPRA_with_seq_final2.csv"
)
# Directory holding the combined TF binding-difference table.
OVERALL_TF_BINDING_CONCLUSION_DIR = f"{MY_DATA_DIR}/overall_tf_binding_conclusion"



In [3]:
# Only these columns are needed: the oligo identifier (used as the index),
# the logFC between derived and ancestral, and the differential-activity flag.
use_columns = ['oligo', 'logFC_derived_vs_ancestral', 'differential_activity']

# Select the index column by name rather than by position: with `usecols`,
# a positional `index_col=0` depends on the column order inside the CSV and
# would silently pick another column if that order ever changed.
full_mpra_df = pd.read_csv(MPRA_FILE, usecols=use_columns, index_col='oligo')

# We will have a look only at the differentially active oligos.
# `.eq(True)` keeps the original `== True` semantics (missing values are
# treated as not-True); `.copy()` makes the subset an independent frame.
is_differential = full_mpra_df['differential_activity'].eq(True)
mpra_df = full_mpra_df.loc[is_differential].copy()
# DIFFERENTIAL_ACTIVE_MPRA_FILE = os.path.join(MY_DATA_DIR, "humanMPRA_with_seq_final2_differential_active.csv")
# mpra_df.to_csv(DIFFERENTIAL_ACTIVE_MPRA_FILE, index=True)
mpra_df

Unnamed: 0_level_0,logFC_derived_vs_ancestral,differential_activity
oligo,Unnamed: 1_level_1,Unnamed: 2_level_1
seq_100038_chr6:4358790-4359059_SCREEN_a3_L1,0.299749,True
seq_100065_chr7:138979123-138979392_SCREEN_a3_L1,0.316918,True
seq_100070_chr7:79861027-79861296_SCREEN_a3_L1,-0.275615,True
seq_100075_chr16:54376420-54376689_SCREEN_a3_L1,0.318055,True
seq_100090_chr20:31380149-31380418_SCREEN_a3_L1,-0.281553,True
...,...,...
seq_99921_chr11:34262393-34262662_SCREEN_a3_L1,0.286606,True
seq_99930_chr10:128329049-128329318_SCREEN_a3_L1,-0.770237,True
seq_99966_chr21:35967796-35968065_SCREEN_a3_L1,-0.230129,True
seq_99973_chr14:22846595-22846864_SCREEN_a3_L1,-0.305473,True


In [4]:
def read_overall_zscore_differences_df(file_name):
    """
    Read the overall z-score differences table in a convenient layout.

    The CSV is expected to have its row labels in the first column and one
    column per oligo. The table is transposed, so the result has one row per
    oligo (CSV header labels) and one column per CSV row label.

    Parameters
    ----------
    file_name : str or path-like
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Transposed table whose index is named "oligo". Values keep the numeric
        dtype inferred by ``pd.read_csv`` (they are not cast to ``object``).
    """
    # Consume the label column as the index *while parsing*. Transposing a
    # frame that still contains the string label column forces every value to
    # ``object`` dtype, which breaks or slows down later numeric work
    # (quantiles, correlations).
    df = pd.read_csv(file_name, index_col=0).T

    # After the transpose the row labels are the CSV header entries.
    df.index.name = "oligo"

    return df


In [42]:
# Location of the combined TF binding-difference table for all loci.
tf_binding_csv_path = os.path.join(
    OVERALL_TF_BINDING_CONCLUSION_DIR,
    "TF_binding_all_loci_combined.csv",
)

# The reader returns a frame indexed by oligo.
loci_TF_binding_diff_df = read_overall_zscore_differences_df(tf_binding_csv_path)

# The column labels are what the plotting loop later iterates over.
TFs = loci_TF_binding_diff_df.columns
loci_TF_binding_diff_df


Unnamed: 0,ARX,Ahctf1_mus_musculus,Alx3_mus_musculus,Alx4_mus_musculus,Ar_mus_musculus,Arid3a_mus_musculus,Arid5a_mus_musculus,Ascl2_mus_musculus,Atf3_mus_musculus,BCL11A,...,TFAP2A_mus_musculus,Tbx2_mus_musculus,Tef_mus_musculus,VAX2,VENTX,VSX1,VSX2,WT1,ZNF200,ZNF655
oligo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
seq_295193_chr12:54782669-54782938_SCREEN_a2_L3,0.0,0.0,0.0,0.0,0.0,1.16273,1.57553,4.032911,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5044,0.0,0.0
seq_71981_chr12:67085937-67086206_SCREEN_a2_L1,2.7807,5.3654,2.7213,6.9911,0.0,3.266925,2.802036,2.529784,0.0,0.0,...,0.0,0.0,-0.7,4.4412,-1.7899,3.1938,2.5473,0.0,0.0,0.0
seq_33133_chr11:119894841-119895110_SCREEN_a1_L1,0.0,0.0,0.0,0.0,0.0,-0.99274,3.257774,-2.118522,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
seq_248781_chr18:75279211-75279480_SCREEN_a3_L2,0.0,0.0,0.0,0.0,0.0,1.919853,-2.67606,-1.973593,0.0,-1.222,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
seq_147743_chr2:135099118-135099387_SCREEN_a1_L2,0.0,0.0,0.0,0.0,0.0,1.688172,1.741709,1.57954,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
seq_292377_chr2:134103245-134103514_SCREEN_a2_L3,0.0,0.0,0.0,0.0,0.0,3.398744,-1.58452,1.736507,0.0,0.0,...,0.0,-7.5715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.3746
seq_34252_chr7:43331359-43331628_SCREEN_a1_L1,0.0,0.0,0.0,0.0,0.0,2.932179,-1.398006,1.877061,0.0,-3.5202,...,3.8378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
seq_185850_chr21:46469828-46470097_SCREEN_a2_L2,0.0,2.5201,0.0,0.0,3.8692,4.098923,2.748593,3.42668,0.0,0.0,...,0.0,-2.4109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.032
seq_135927_chr12:43281493-43281762_SCREEN_a1_L2,4.2163,6.8531,4.5466,9.8966,0.0,3.192678,3.571664,2.24946,1.431,0.0,...,0.0,0.0,0.0,10.0,0.0,5.2351,4.292,0.0,0.0,0.0


##### Create Clustermap 
Note: This takes about 15 minutes.

In [None]:
# import os
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Set seaborn style
# sns.set(style="white")

# # Create clustered heatmap
# clustermap = sns.clustermap(
#     TF_affinity_diffs_df,              # TFs = rows, loci = columns
#     cmap="coolwarm",
#     center=0,
#     figsize=(16, 10),
#     xticklabels=False,        # Hide x labels if too many loci
#     yticklabels=True,         # Show TF names
#     cbar_kws={'label': 'Binding Strength'},
#     method='average',         # Clustering method (can be 'ward', 'single', etc.)
#     metric='euclidean'        # Distance metric
# )

# clustermap.fig.suptitle("Clustermap of Max TF Binding (All Loci)", fontsize=16)

# # Save the figure,,
# clustermap.savefig(os.path.join(OVERALL_TF_BINDING_CONCLUSION_DIR, "overall_tf_binding_clustermap.png"))
# plt.close()


## Now Continue Looking for Correlations

In [19]:
# Collect the oligo names of both tables as sets so they can be compared.
mpra_index = set(mpra_df.index)
tf_index = set(loci_TF_binding_diff_df.index)

print("Shared loci:", len(mpra_index.intersection(tf_index)))
print("Only in MPRA:", len(mpra_index.difference(tf_index)))
print("Only in TF affinity:", len(tf_index.difference(mpra_index)))

# sorted() returns a new list (list.sort() would return None).
sorted_mpra = sorted(mpra_index)
sorted_tf = sorted(tf_index)

# Walk both sorted lists in lockstep and report only the first mismatch.
first_mismatch = next(
    ((name_a, name_b) for name_a, name_b in zip(sorted_mpra, sorted_tf) if name_a != name_b),
    None,
)
if first_mismatch is not None:
    print("Different oligo names:", *first_mismatch)


Shared loci: 15077
Only in MPRA: 0
Only in TF affinity: 0


In [None]:
# Attach the logFC of each oligo to its TF binding differences.
# Left join on the index: every row of the TF table is kept.
logfc_column = mpra_df.loc[:, ["logFC_derived_vs_ancestral"]]
combined_df_logFC = loci_TF_binding_diff_df.join(logfc_column, how="left")


oligo
seq_295193_chr12:54782669-54782938_SCREEN_a2_L3    -0.170510
seq_71981_chr12:67085937-67086206_SCREEN_a2_L1      0.241081
seq_33133_chr11:119894841-119895110_SCREEN_a1_L1   -0.368757
seq_248781_chr18:75279211-75279480_SCREEN_a3_L2    -0.207360
seq_147743_chr2:135099118-135099387_SCREEN_a1_L2    0.562105
                                                      ...   
seq_292377_chr2:134103245-134103514_SCREEN_a2_L3    0.275310
seq_34252_chr7:43331359-43331628_SCREEN_a1_L1       0.273129
seq_185850_chr21:46469828-46470097_SCREEN_a2_L2     0.084888
seq_135927_chr12:43281493-43281762_SCREEN_a1_L2    -0.555110
seq_319091_chr18:22724760-22725029_SCREEN_a2_L3     0.202324
Name: logFC_derived_vs_ancestral, Length: 15077, dtype: float64

A quick sanity check of the joined data:

In [31]:
# Spot-check one oligo: its logFC in the joined frame should match the value in the MPRA table.
spot_check_oligo = "seq_100065_chr7:138979123-138979392_SCREEN_a3_L1"
combined_df_logFC.at[spot_check_oligo, "logFC_derived_vs_ancestral"]

0.316918089505277

In [45]:
# Output directory for the "TF binding difference vs. logFC" scatter plots saved below.
TF_BINDING_DIFF_TO_LOGFC_DIR = os.path.join(MY_DATA_DIR, "binding_logFC_correlation_v2")
# exist_ok=True makes this cell safe to re-run when the directory already exists.
os.makedirs(TF_BINDING_DIFF_TO_LOGFC_DIR, exist_ok=True)

In [37]:
# Example plot for a single TF, using all oligos (no filtering).
tf_name = "ARX"

fig, ax = plt.subplots()
sns.scatterplot(
    data=combined_df_logFC[["logFC_derived_vs_ancestral", tf_name]],
    x="logFC_derived_vs_ancestral",
    y=tf_name,
    ax=ax,
)
ax.set_title(f"Correlation between TF binding and logFC for {tf_name}")
ax.set_xlabel("logFC (derived vs ancestral)")
ax.set_ylabel(f"TF Binding Difference: {tf_name}")
ax.grid(True)
fig.tight_layout()
# Same file name pattern as the per-TF loop, so that loop overwrites this file.
fig.savefig(os.path.join(TF_BINDING_DIFF_TO_LOGFC_DIR, f"TF_binding_vs_logFC_{tf_name}.png"))
plt.show()
plt.close(fig)


In [46]:
for i, tf_name in enumerate(TFs):
    # Pair the enhancer logFC with this TF's binding difference.
    tf_vs_logfc = combined_df_logFC[["logFC_derived_vs_ancestral", tf_name]].copy()

    # Keep only the oligos whose absolute binding difference reaches the
    # 90th percentile for this TF.
    # NOTE(review): when at least 90% of a TF's values are 0 the cutoff is 0
    # and `>=` keeps every row, so nothing is filtered - confirm this is intended.
    abs_binding = tf_vs_logfc[tf_name].abs()
    cutoff = abs_binding.quantile(0.9)
    strongest = tf_vs_logfc[abs_binding >= cutoff]

    fig, ax = plt.subplots()
    sns.scatterplot(
        data=strongest,
        x="logFC_derived_vs_ancestral",
        y=tf_name,
        ax=ax,
    )
    ax.set_title(f"Correlation Between TF Binding-Difference and logFC for {tf_name}")
    ax.set_xlabel("logFC (Derived vs Ancestral) of Enhancer")
    ax.set_ylabel(f"TF Binding Difference: {tf_name} to Enhancer")
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(os.path.join(TF_BINDING_DIFF_TO_LOGFC_DIR, f"TF_binding_vs_logFC_{tf_name}.png"))
    # if i % 10 == 0:
    #     plt.show()
    # Close each figure so they do not accumulate in memory across the loop.
    plt.close(fig)

In [50]:
# Look at the raw logFC / binding-difference pairs for one TF.
df = combined_df_logFC[["logFC_derived_vs_ancestral", "Bhlhb2_mus_musculus"]].copy()
df


Unnamed: 0_level_0,logFC_derived_vs_ancestral,Bhlhb2_mus_musculus
oligo,Unnamed: 1_level_1,Unnamed: 2_level_1
seq_295193_chr12:54782669-54782938_SCREEN_a2_L3,-0.170510,-1.514667
seq_71981_chr12:67085937-67086206_SCREEN_a2_L1,0.241081,-10.0
seq_33133_chr11:119894841-119895110_SCREEN_a1_L1,-0.368757,-1.667543
seq_248781_chr18:75279211-75279480_SCREEN_a3_L2,-0.207360,1.251978
seq_147743_chr2:135099118-135099387_SCREEN_a1_L2,0.562105,2.314359
...,...,...
seq_292377_chr2:134103245-134103514_SCREEN_a2_L3,0.275310,-4.850473
seq_34252_chr7:43331359-43331628_SCREEN_a1_L1,0.273129,-2.107476
seq_185850_chr21:46469828-46470097_SCREEN_a2_L2,0.084888,-7.725172
seq_135927_chr12:43281493-43281762_SCREEN_a1_L2,-0.555110,-1.521132
