# TF reporter activity analysis
### Aim
~36,000 reporters for 86 TFs were transfected into 9 different cell types & tested in 97 different perturbation conditions. In this script I will analyze TF reporter activities in detail and review how individual reporters respond to TF concentration variations.

---
Load libraries

In [8]:
# Load libraries:
import matplotlib.pyplot as plt  # Equivalent to RColorBrewer and ggplot2
import pandas as pd  # Equivalent to dplyr, tibble, and readr
import seaborn as sns  # Equivalent to pheatmap and ggpubr
import plotly  # Equivalent to plotly
import numpy as np  # Equivalent to maditr
from sklearn.ensemble import RandomForestClassifier  # Equivalent to randomForest
from sklearn.metrics import roc_curve, auc  # Equivalent to pROC
import string  # Equivalent to stringr
from scipy.stats import iqr  # Equivalent to IHW
import re  # Equivalent to stringr

---
**Load data frames**

In [9]:
# Import processed bc counts from the preprocessing step
cDNA_df = pd.read_csv("/DATA/usr/m.trauernicht/projects/SuRE-TF/data/gcf7124_stimulations/results/mt20240124_reporter_activity_filt_combined.csv")

# We are not going to use NIH3T3 data, so remove it for now
cDNA_df = cDNA_df[cDNA_df['cell'] != "NIH3T3"]

# Rename stimulation status of control conditions
cDNA_df['stimulation'].fillna("no", inplace=True)

# Load RNA-seq data
tf_rna = pd.read_csv("/DATA/usr/m.trauernicht/data/RNA_seq/rna_tpm_all_tfs.tsv", sep='\t')

# Prepare data frame for following analyses
cDNA_df2 = cDNA_df.copy()
cDNA_df2['tf'] = cDNA_df2['tf'].apply(lambda x: re.sub("_.*", "", x))
cDNA_df2 = cDNA_df2[(cDNA_df2['neg_ctrls'] == "No") &
                    (cDNA_df2['hPGK'] == "No") &
                    (~cDNA_df2['tf'].str.contains('RANDOM', case=False, regex=False, na=False, flags=0, regex=True, default=True)) &
                    (cDNA_df2['native_enhancer'] == "No")]
cDNA_df2['reporter_activity_minP'] = cDNA_df2['reporter_activity_minP'].apply(lambda x: np.log2(x))
cDNA_df2 = cDNA_df2.drop_duplicates(subset=['tf', 'condition', 'stimulation', 'reference_condition', 'tf_target', 'effect_size', 'off_target',
                                             'cell', 'reporter_id', 'commercial_reporter', 'reporter_activity_minP', 'gcf', 'reporter_dif_minP'])
cDNA_df2 = cDNA_df2.drop_duplicates()

# Define optimal candidate conditions for each TF: either the highest expressing cell line, or the stimulated condition
ref_conditions1 = cDNA_df2[cDNA_df2['tf_target'] == 1].drop_duplicates(subset=['tf', 'condition', 'cell', 'reporter_id', 'commercial_reporter', 'reporter_activity_minP'])
ref_conditions1 = ref_conditions1.groupby(['tf', 'condition']).apply(lambda x: x.sort_values('reporter_activity_minP', ascending=False).head(10))
ref_conditions1 = ref_conditions1.groupby('tf').apply(lambda x: x.sort_values('median_reporter_activity_minP', ascending=False).head(1))
ref_conditions1 = ref_conditions1[['tf', 'condition']].drop_duplicates()

ref_conditions2 = pd.merge(cDNA_df2[cDNA_df2['stimulation'] == "no"], tf_rna.rename(columns={'tf': 'tf'}), on=['cell', 'tf'])
ref_conditions2['mean_reporter_activity_minP'] = ref_conditions2.groupby(['tf', 'condition'])['reporter_activity_minP'].transform('median')
ref_conditions2 = ref_conditions2.drop_duplicates(subset=['mean_reporter_activity_minP', 'tf', 'condition', 'nTPM'])
ref_conditions2 = ref_conditions2.groupby('tf').apply(lambda x: x.sort_values('nTPM', ascending=False).head(3))
ref_conditions2 = ref_conditions2.groupby('tf').apply(lambda x: x.sort_values('mean_reporter_activity_minP', ascending=False).head(2))
ref_conditions2 = ref_conditions2[['tf', 'condition']].drop_duplicates()

ref_conditions = pd.concat([ref_conditions1, ref_conditions2])
ref_conditions = ref_conditions[['tf', 'condition']].drop_duplicates()

# Data frame with off-target activities
off_target_activities = cDNA_df2[cDNA_df2['tf_target'] == 2]
off_target_activities = off_target_activities[~((off_target_activities['tf'] == "SOX2") & (off_target_activities['condition'] == "mES_POU2F1"))]
off_target_activities['effect_size'] = pd.to_numeric(off_target_activities['effect_size'], errors='coerce')
off_target_activities['effect_size'] = off_target_activities['effect_size'].fillna(0)
off_target_activities['reporter_dif_mean'] = off_target_activities.groupby(['tf', 'condition'])['reporter_dif_minP'].transform(lambda x: np.median(x.dropna()))
off_target_activities['reporter_dif_mean'] = np.where(off_target_activities['effect_size'] == 0, -off_target_activities['reporter_dif_mean'], off_target_activities['reporter_dif_mean'])
off_target_activities = off_target_activities.groupby('tf').apply(lambda x: x.sort_values('reporter_dif_mean', ascending=False).head(1))
off_target_activities = off_target_activities[['reporter_id', 'tf', 'commercial_reporter', 'reporter_dif_minP', 'condition', 'effect_size']]
off_target_activities = off_target_activities.drop_duplicates()

on_target_activities = cDNA_df2[cDNA_df2['tf_target'] == 1]
on_target_activities['effect_size'] = pd.to_numeric(on_target_activities['effect_size'], errors='coerce')
on_target_activities['reporter_dif_mean'] = on_target_activities.groupby(['tf', 'condition'])['reporter_dif_minP'].transform(lambda x: np.median(x.dropna()))
on_target_activities['reporter_dif_mean'] = np.where(on_target_activities['effect_size'] == 0, -on_target_activities['reporter_dif_mean'], on_target_activities['reporter_dif_mean'])
on_target_activities = on_target_activities.groupby('tf').apply(lambda x: x.sort_values('reporter_dif_mean', ascending=False).head(1))
on_target_activities = on_target_activities[['reporter_id', 'tf', 'commercial_reporter', 'reporter_dif_minP', 'condition', 'effect_size']]
on_target_activities = on_target_activities.drop_duplicates()


SyntaxError: keyword argument repeated (<string>, line 18)

### Plot activities per cell type

Aim: Show that there are differences in activities between cell types

# Filter and transform cDNA_df
cDNA_df2 = cDNA_df.loc[
    (cDNA_df['stimulation'] == 'no') &
    (cDNA_df['commercial_reporter'] == 'No') &
    (cDNA_df['hPGK'] == 'No') &
    (~cDNA_df['tf'].str.contains('RANDOM', case=False, regex=False, na=False, flags=0, regex=True, default=True)) &
    (cDNA_df['native_enhancer'] == 'No')
].drop_duplicates(subset=['tf', 'reporter_activity_minP', 'cell', 'reporter_id', 'neg_ctrls'])
cDNA_df2['reporter_activity_minP'] = cDNA_df2.groupby(['cell', 'reporter_id'])['reporter_activity_minP'].transform('mean')
cDNA_df2['reporter_activity_minP'] = np.log2(cDNA_df2['reporter_activity_minP'])
cDNA_df2 = cDNA_df2.drop_duplicates()

# Plot activities per cell type
cell_order = ["A549", "K562", "HCT116", "MCF7", "HepG2", "U2OS", "HEK293", "mES", "NPC", "NIH3T3"]
plt.figure(figsize=(10, 8))
sns.kdeplot(data=cDNA_df2, x='reporter_activity_minP', fill='neg_ctrls', alpha=0.4, common_norm=False)
sns.FacetGrid(data=cDNA_df2, col='cell', col_wrap=2, col_order=cell_order).map(sns.kdeplot, 'reporter_activity_minP', fill='neg_ctrls', alpha=0.4)
plt.fill_between([-10, 10], 0, 1, color='grey70', label='Yes')
plt.fill_between([-10, 10], 0, 1, color='#DD6B48', label='No')
plt.legend()
plt.show()

# Figure S1G: Compare TF reporters to native enhancer controls
native_activities = cDNA_df.loc[
    (cDNA_df['condition'] == 'mES_2i_LIF') &
    (cDNA_df['neg_ctrls'] == 'No')
].drop_duplicates(subset=['reporter_id', 'tf', 'reporter_activity_minP', 'condition'])
native_activities['tf'] = native_activities['tf'].str.replace('_.*', '', regex=True)
native_activities['native_enhancer'] = np.where(native_activities['tf'].isin(['e97', 'e6', 'e19', 'e11', 'e93']), native_activities['tf'], 'tf_reporter')
native_activities['native_enhancer2'] = np.where(native_activities['native_enhancer'] == 'tf_reporter', native_activities['native_enhancer'], 'native_enhancer')

plt.figure(figsize=(10, 8))
sns.stripplot(data=native_activities, x='native_enhancer2', y=np.log2(native_activities['reporter_activity_minP']), rasterized=True, color='black', size=0.5)
sns.boxplot(data=native_activities, x='native_enhancer2', y=np.log2(native_activities['reporter_activity_minP']), color='red', width=0.25, linewidth=0.4)
plt.show()

# Figure S2A: Median TF activities in all 9 cell types
tf_activities_median = cDNA_df.loc[
    (cDNA_df['stimulation'] == 'no') &
    (cDNA_df['neg_ctrls'] == 'No') &
    (cDNA_df['commercial_reporter'] == 'No') &
    (cDNA_df['hPGK'] == 'No') &
    (~cDNA_df['tf'].str.contains('RANDOM', case=False, regex=False, na=False, flags=0, regex=True, default=True)) &
    (cDNA_df['native_enhancer'] == 'No')
].drop_duplicates(subset=['tf', 'reporter_activity_minP', 'cell', 'neg_ctrls'])
tf_activities_median['reporter_activity_minP'] = tf_activities_median.groupby(['cell', 'tf'])['reporter_activity_minP'].transform('median')
tf_activities_median['reporter_activity_minP'] = np.log2(tf_activities_median['reporter_activity_minP'])
tf_activities_median = tf_activities_median.drop_duplicates(subset=['reporter_activity_minP', 'tf', 'cell'])

plt.figure(figsize=(10, 8))
sns.barplot(data=tf_activities_median, x='cell', y='reporter_activity_minP', hue='reporter_activity_minP', dodge=False)
plt.yscale('log')
plt.xticks(rotation=90)
plt.show()
