# Find Correlation of TF Binding Difference to logFC
In this notebook, I create scatter plots of the TF binding differences against the log fold change (logFC) in the MPRA data.
The goal is to look for a correlation between the binding differences of TFs that bind significantly differently to the (suspected) regulatory regions and the logFC of the corresponding enhancer between the Ancestral (Chimp) and the Derived (Human) versions.

## Todo List
- [ ] Filter scatter plots to only the top 10%
- [ ] Compute correlation
- [ ] Compute p-values

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline


import numpy as np
import matplotlib.pyplot as plt
import os
import sys

import seaborn as sns
import pandas as pd
from functools import reduce
import utils_matanya as um
import subprocess

In [2]:
# Paths used throughout the notebook.
# MY_DATA_DIR is the root under which this notebook's derived-data directories
# are built (see the os.path.join calls below).
MY_DATA_DIR = "/home/labs/davidgo/matanyaw/data"

# CSV with the MPRA results; the oligo name, logFC and differential-activity
# columns are read from it further down.
MPRA_FILE = "/home/labs/davidgo/Collaboration/humanMPRA/chondrocytes/comparative_analysis_combined/humanMPRA_with_seq_final2.csv"
# Directory holding the TF binding-difference table that is loaded later.
OVERALL_TF_BINDING_DIR = os.path.join(MY_DATA_DIR, "overall_tf_binding")
# Created here if missing (exist_ok=True makes re-running the cell harmless).
# NOTE(review): nothing in the visible part of the notebook writes to it yet.
TF_MPRA_CORRELATION_DIR = os.path.join(MY_DATA_DIR, "tf_mpra_correlation")
os.makedirs(TF_MPRA_CORRELATION_DIR, exist_ok=True)



In [3]:
# Columns needed from the MPRA results table; 'oligo' becomes the row index.
mpra_use_columns = ['oligo', 'logFC_derived_vs_ancestral', 'differential_activity']

# Pick the index column by name instead of by position, so the result does not
# depend on where 'oligo' happens to sit among the selected columns in the file.
full_mpra_df = pd.read_csv(MPRA_FILE, usecols=mpra_use_columns, index_col='oligo')
# Keep only the oligos flagged as differentially active. The explicit copy makes
# mpra_df an independent frame, so later edits to it cannot raise
# SettingWithCopyWarning or leak back into full_mpra_df.
mpra_df = full_mpra_df.loc[full_mpra_df['differential_activity'].eq(True)].copy()
mpra_df

Unnamed: 0_level_0,logFC_derived_vs_ancestral,differential_activity
oligo,Unnamed: 1_level_1,Unnamed: 2_level_1
seq_100038_chr6:4358790-4359059_SCREEN_a3_L1,0.299749,True
seq_100065_chr7:138979123-138979392_SCREEN_a3_L1,0.316918,True
seq_100070_chr7:79861027-79861296_SCREEN_a3_L1,-0.275615,True
seq_100075_chr16:54376420-54376689_SCREEN_a3_L1,0.318055,True
seq_100090_chr20:31380149-31380418_SCREEN_a3_L1,-0.281553,True
...,...,...
seq_99921_chr11:34262393-34262662_SCREEN_a3_L1,0.286606,True
seq_99930_chr10:128329049-128329318_SCREEN_a3_L1,-0.770237,True
seq_99966_chr21:35967796-35968065_SCREEN_a3_L1,-0.230129,True
seq_99973_chr14:22846595-22846864_SCREEN_a3_L1,-0.305473,True


In [11]:
# Load the TF binding-difference table. The first CSV column is used as the index
# and the frame is then transposed, which leaves one row per oligo and one column
# per TF (as the displayed output shows).
TF_BINDING_DIFFERENCES_FILE = os.path.join(
    OVERALL_TF_BINDING_DIR, "TF_overall_binding_differences.csv"
)
tf_binding_differences_df = pd.read_csv(
    TF_BINDING_DIFFERENCES_FILE, index_col=0
).transpose()
tf_binding_differences_df


Unnamed: 0,ARX,Ahctf1_mus_musculus,Alx3_mus_musculus,Alx4_mus_musculus,Ar_mus_musculus,Arid3a_mus_musculus,Arid5a_mus_musculus,Ascl2_mus_musculus,Atf3_mus_musculus,BCL11A,...,TFAP2A_mus_musculus,Tbx2_mus_musculus,Tef_mus_musculus,VAX2,VENTX,VSX1,VSX2,WT1,ZNF200,ZNF655
seq_295193_chr12:54782669-54782938_SCREEN_a2_L3,0.0000,0.0000,0.0000,0.0000,0.0000,1.162730,1.575530,4.032911,0.000,0.0000,...,0.0000,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,1.5044,0.0,0.0000
seq_71981_chr12:67085937-67086206_SCREEN_a2_L1,2.7807,5.3654,2.7213,6.9911,0.0000,3.266925,2.802036,2.529784,0.000,0.0000,...,0.0000,0.0000,-0.7,4.4412,-1.7899,3.1938,2.5473,0.0000,0.0,0.0000
seq_33133_chr11:119894841-119895110_SCREEN_a1_L1,0.0000,0.0000,0.0000,0.0000,0.0000,-0.992740,3.257774,-2.118522,0.000,0.0000,...,0.0000,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,0.0000
seq_248781_chr18:75279211-75279480_SCREEN_a3_L2,0.0000,0.0000,0.0000,0.0000,0.0000,1.919853,-2.676060,-1.973593,0.000,-1.2220,...,0.0000,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,0.0000
seq_147743_chr2:135099118-135099387_SCREEN_a1_L2,0.0000,0.0000,0.0000,0.0000,0.0000,1.688172,1.741709,1.579540,0.000,0.0000,...,0.0000,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
seq_292377_chr2:134103245-134103514_SCREEN_a2_L3,0.0000,0.0000,0.0000,0.0000,0.0000,3.398744,-1.584520,1.736507,0.000,0.0000,...,0.0000,-7.5715,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,-3.3746
seq_34252_chr7:43331359-43331628_SCREEN_a1_L1,0.0000,0.0000,0.0000,0.0000,0.0000,2.932179,-1.398006,1.877061,0.000,-3.5202,...,3.8378,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,0.0000
seq_185850_chr21:46469828-46470097_SCREEN_a2_L2,0.0000,2.5201,0.0000,0.0000,3.8692,4.098923,2.748593,3.426680,0.000,0.0000,...,0.0000,-2.4109,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,1.0320
seq_135927_chr12:43281493-43281762_SCREEN_a1_L2,4.2163,6.8531,4.5466,9.8966,0.0000,3.192678,3.571664,2.249460,1.431,0.0000,...,0.0000,0.0000,0.0,10.0000,0.0000,5.2351,4.2920,0.0000,0.0,0.0000


## Looking for Correlations

In [12]:
# Sanity check before joining: both tables must describe exactly the same oligos.
mpra_index = set(mpra_df.index)
tf_index = set(tf_binding_differences_df.index)

print("Shared loci:", len(mpra_index & tf_index))
print("Only in MPRA:", len(mpra_index - tf_index))
print("Only in TF affinity:", len(tf_index - mpra_index))

# Sets collapse repeated labels, so the counts above cannot reveal duplicated
# oligo names. Duplicates matter because an index-aligned join would multiply
# the affected rows, so report them explicitly (silent when there are none).
for table_name, table in (("MPRA", mpra_df), ("TF affinity", tf_binding_differences_df)):
    n_duplicated = int(table.index.duplicated().sum())
    if n_duplicated:
        print(f"Duplicated oligo names in {table_name}:", n_duplicated)

sorted_mpra = sorted(mpra_index)
sorted_tf = sorted(tf_index)

# Walk both sorted name lists in parallel and report the first disagreement.
for index1, index2 in zip(sorted_mpra, sorted_tf):
    if index1 != index2:
        print("Different oligo names:", index1, index2)
        break
else:
    # zip stops at the shorter list, so a pure length difference would otherwise
    # go unreported.
    if len(sorted_mpra) != len(sorted_tf):
        print("Oligo name lists differ in length:", len(sorted_mpra), len(sorted_tf))


Shared loci: 15077
Only in MPRA: 0
Only in TF affinity: 0


In [15]:
# Attach each oligo's MPRA logFC to its row of TF binding differences.
# DataFrame.join aligns on the index (the oligo name); a left join keeps every
# row of the TF table.
combined_df_logFC = tf_binding_differences_df.join(
    mpra_df.loc[:, "logFC_derived_vs_ancestral"], how="left"
)
combined_df_logFC["logFC_derived_vs_ancestral"]


seq_295193_chr12:54782669-54782938_SCREEN_a2_L3    -0.170510
seq_71981_chr12:67085937-67086206_SCREEN_a2_L1      0.241081
seq_33133_chr11:119894841-119895110_SCREEN_a1_L1   -0.368757
seq_248781_chr18:75279211-75279480_SCREEN_a3_L2    -0.207360
seq_147743_chr2:135099118-135099387_SCREEN_a1_L2    0.562105
                                                      ...   
seq_292377_chr2:134103245-134103514_SCREEN_a2_L3    0.275310
seq_34252_chr7:43331359-43331628_SCREEN_a1_L1       0.273129
seq_185850_chr21:46469828-46470097_SCREEN_a2_L2     0.084888
seq_135927_chr12:43281493-43281762_SCREEN_a1_L2    -0.555110
seq_319091_chr18:22724760-22725029_SCREEN_a2_L3     0.202324
Name: logFC_derived_vs_ancestral, Length: 15077, dtype: float64

A quick sanity check of the joined data:

In [19]:
# Spot check: look up the joined logFC value of one specific oligo.
combined_df_logFC.at[
    "seq_339948_chr18:45748707-45748976_SCREEN_a3_L3",
    "logFC_derived_vs_ancestral",
]

-0.728836412952397

In [25]:
# Output directory for the scatter plots saved by the plotting cells below.
# It is created up front so that savefig does not fail on a missing directory;
# exist_ok=True makes re-running the cell harmless.
TF_BINDING_DIFF_TO_LOGFC_DIR = os.path.join(MY_DATA_DIR, "binding_logFC_correlation")
os.makedirs(TF_BINDING_DIFF_TO_LOGFC_DIR, exist_ok=True)

In [37]:
# Single-TF preview: binding difference of one TF against logFC, all oligos.
# NOTE: the PNG name is the same one the per-TF loop cell uses for this TF, so
# whichever of the two cells runs last determines the file's content.
tf_name = "ARX"

ax = sns.scatterplot(
    data=combined_df_logFC[["logFC_derived_vs_ancestral", tf_name]],
    x="logFC_derived_vs_ancestral",
    y=tf_name,
)
fig = ax.figure
ax.set_title(f"Correlation between TF binding and logFC for {tf_name}")
ax.set_xlabel("logFC (derived vs ancestral)")
ax.set_ylabel(f"TF Binding Difference: {tf_name}")
ax.grid(True)
fig.tight_layout()
fig.savefig(os.path.join(TF_BINDING_DIFF_TO_LOGFC_DIR, f"TF_binding_vs_logFC_{tf_name}.png"))
plt.show()
plt.close(fig)


In [26]:
# Quantile of |binding difference| used as the cut-off: 0.9 keeps the top 10%.
TOP_BINDING_QUANTILE = 0.9

# One scatter plot per TF: binding difference vs. logFC, restricted to the oligos
# with the largest absolute binding difference for that TF.
# NOTE: the PNG name is the same one the single-TF cell above uses, so for that
# TF whichever cell runs last determines the file's content.
TFs = tf_binding_differences_df.columns
for tf_name in TFs:
    tf_vs_logfc = combined_df_logFC.loc[:, ["logFC_derived_vs_ancestral", tf_name]]
    abs_binding = tf_vs_logfc[tf_name].abs()
    threshold = abs_binding.quantile(TOP_BINDING_QUANTILE)

    # The table contains many exact zeros. When at least 90% of a TF's values are
    # zero the quantile itself is 0, and `>= threshold` alone would keep every
    # row. Requiring a non-zero difference keeps the filter meaningful and does
    # not change the selection whenever the threshold is positive.
    top_binding = tf_vs_logfc[(abs_binding >= threshold) & (abs_binding > 0)]
    if top_binding.empty:
        # No oligo has a non-zero binding difference for this TF: nothing to
        # plot, so no file is written.
        continue

    # Explicit figure/axes instead of pyplot's implicit "current figure", so each
    # iteration owns, saves and closes exactly the figure it created.
    fig, ax = plt.subplots()
    sns.scatterplot(
        data=top_binding,
        x="logFC_derived_vs_ancestral",
        y=tf_name,
        ax=ax,
    )
    ax.set_title(f"Correlation Between TF Binding-Difference and logFC for {tf_name}")
    ax.set_xlabel("logFC (Derived vs Ancestral) of Enhancer")
    ax.set_ylabel(f"TF Binding Difference: {tf_name} to Enhancer")
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(os.path.join(TF_BINDING_DIFF_TO_LOGFC_DIR, f"TF_binding_vs_logFC_{tf_name}.png"))
    # Close without showing: one figure per TF would flood the notebook and
    # keep every figure alive in memory.
    plt.close(fig)