In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import scipy.optimize as opt
import itertools
import random
import pickle
import os
from library2_utils.color_scheme import cell_line_colors, cell_line_symbols
from library2_utils.additive_model import add_mirna_expression, max_mirna_expression
from typing import Union
import ast

# Global figure styling: 7 pt Helvetica text for every plot in this notebook.
plt.rcParams.update({'font.size': 7, 'font.family': 'Helvetica'})

# Cell lines analysed below; the names are used as column labels when
# selecting from the loaded tables.
cell_lines_measured = ["HEK293T", "HeLa", "SKNSH", "MCF7", "HUH7", "A549", "HaCaT", "JEG3", "Tera1", "PC3"]

# Output folder for the figures. `exist_ok=True` makes the call safe to re-run
# and avoids the check-then-create race of `os.path.exists` + `os.makedirs`.
plot_folder = "../plots/8_shuffled_target_sites"
os.makedirs(plot_folder, exist_ok=True)

## Load the measurement data

In [2]:
# Input directory with the measurement data (CSV files).
data_dir_input = "../measured_data/2_normalized_log10"

# Names of all entries found in the input directory.
reference_files = os.listdir(data_dir_input)

# Load every CSV (first column as index) into a dictionary keyed by the part
# of the file name that precedes its first dot.
reference_dict = {
    file_name.split('.')[0]: pd.read_csv(os.path.join(data_dir_input, file_name), index_col=0)
    for file_name in reference_files
    if file_name.endswith(".csv")
}

In [3]:
# individual miRNA sites: work on a copy so the loaded table stays untouched
single_sites_key = "1_mirna_full_single_high_conf"
single_df_full = reference_dict[single_sites_key].copy()

In [4]:
# Shuffled designs: take the loaded table and drop every "_3UTR" occurrence
# from its column names. `rename` returns a new frame, so the entry in
# reference_dict is not modified.
df_shuffle_full = reference_dict["22_miRNA_full_combination_shuffle_x5"].rename(
    columns=lambda column_name: column_name.replace("_3UTR", "")
)

## Calculate miRNA expression by using the inverted transfer function

In [5]:
from library2_utils.transfer_functions import transfer_function, inverse_transfer
from library2_utils.mirna_combinations import get_combinations
from library2_utils.additive_model import add_mirna_expression

In [None]:
# Rebuild the single-site table from the loaded data: index it by the "miRNA1"
# column and remove "_3UTR" from the column names. Both steps return new
# frames, so the entry in reference_dict is left unchanged.
single_df_full = (
    reference_dict["1_mirna_full_single_high_conf"]
    .set_index("miRNA1")
    .rename(columns=lambda column_name: column_name.replace("_3UTR", ""))
)

# Get the actual stability values for the measured cell lines by raising 10 to
# the stored values (presumably log10-scaled, per the input folder name).
single_knockdown = 10 ** single_df_full[cell_lines_measured]
single_knockdown

In [7]:
# Load the parameters (`popt`) and the scale dictionary written by the fitting step.
# NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
used_mirna_name = "combined_dataset"
popt_path = f"../outputs/3_fitting/{used_mirna_name}/{used_mirna_name}_popt_wo_crosstalk.pkl"
scale_dict_path = f"../outputs/3_fitting/{used_mirna_name}/{used_mirna_name}_scale_dict_wo_crosstalk.pkl"
with open(popt_path, "rb") as popt_file:
    popt = pickle.load(popt_file)
with open(scale_dict_path, "rb") as scale_dict_file:
    scale_dict = pickle.load(scale_dict_file)

# Replace every value strictly above 1 with 0.999
# (values between 0.999 and 1 inclusive are left as they are).
above_one = single_knockdown > 1
single_knockdown[above_one] = 0.999

# Invert the transfer function: knockdown values -> miRNA expression.
mirna_expr_fr_knockdown = inverse_transfer(single_knockdown, *popt)

In [8]:
# Keep only miRNAs for which an expression value was derived: every entry in a
# column whose name starts with "miRNA" that is not in the index of
# mirna_expr_fr_knockdown is replaced by NaN.
df = df_shuffle_full.copy()
miRNA_columns = [name for name in df.columns if name.startswith("miRNA")]
has_expression = df[miRNA_columns].isin(mirna_expr_fr_knockdown.index)
df[miRNA_columns] = df[miRNA_columns].where(has_expression)
df_shuffle_full = df.copy()

In [9]:
# Combine the per-miRNA expression for every design with the two project helpers
# (presumably summed vs. strongest-only expression, per the helper names).
added_df = add_mirna_expression(mirna_expr_fr_knockdown, df_shuffle_full)
strongest_df = max_mirna_expression(mirna_expr_fr_knockdown, df_shuffle_full)

# Apply the transfer function to each combined expression and take log10 of the result.
knockdown_from_added, knockdown_from_strongest = (
    np.log10(transfer_function(combined_expression, *popt))
    for combined_expression in (added_df, strongest_df)
)

# mean versus individual stability

In [10]:
# Unroll the wide table into long format: one row per (design, cell line) with
# the measured stability and the two model predictions.
#
# The rows are collected in plain lists and turned into a DataFrame once.
# Growing a frame with `unrolled_df.loc[new_index] = row` inside the loop copies
# the data on every insert (quadratic in the number of rows).
unrolled_columns = ["cell_line", "sorted_mirnas", "stability", "additive", "antagonistic"]
unrolled_index = []
unrolled_records = []

for i, row in df_shuffle_full.iterrows():
    # miRNA combination of this design; used as grouping key below
    sorted_mirnas = row["sorted_mirnas"]
    for cell_line in cell_lines_measured:
        unrolled_index.append(f"{i}_{cell_line}")
        unrolled_records.append({
            "cell_line": cell_line,
            "sorted_mirnas": sorted_mirnas,
            "stability": row[cell_line],
            "additive": knockdown_from_added.loc[i, cell_line],
            "antagonistic": knockdown_from_strongest.loc[i, cell_line],
        })

unrolled_df = pd.DataFrame(unrolled_records, index=unrolled_index, columns=unrolled_columns)

# for each group of cell lines and sorted_mirnas, calculate the mean
mean_df = unrolled_df.groupby(["sorted_mirnas", "cell_line"])["stability"].mean().reset_index()

# add the mean to the unrolled dataframe as "stability_mean"
# (the merge discards the "<i>_<cell_line>" index and returns a fresh default index)
unrolled_df = unrolled_df.merge(mean_df, on=["sorted_mirnas", "cell_line"], suffixes=("", "_mean"))

In [None]:
# Scatter every individual measurement against the mean stability of its
# (sorted_mirnas, cell_line) group.
plt.figure(figsize=(1.5, 1.2))

# Squared Pearson correlation and root-mean-square deviation between the
# individual values and their group mean; only r^2 is annotated on this panel.
r2 = stats.pearsonr(unrolled_df["stability"], unrolled_df["stability_mean"])[0]**2
rmsd = np.sqrt(np.mean((unrolled_df["stability"] - unrolled_df["stability_mean"])**2))
plt.scatter(unrolled_df["stability_mean"], unrolled_df["stability"], color="tab:blue", s=2, alpha=0.5, rasterized=True, edgecolors="none")

# identity line as visual reference
plt.plot([-2, 0.25], [-2, 0.25], color="black", linestyle="--", linewidth=1)
plt.xlabel(r"log$_{10}$(mean stability)")
plt.ylabel(r"log$_{10}$(stability)")
plt.text(-2, 0, r"$r^2$"+f": {r2:.2f}", fontsize=7)
plt.tight_layout()

# The loop variable is `ext`, not `format`: in a notebook the name is global and
# `format` would shadow the builtin format() for every later cell.
for ext in [".png", ".svg"]:
    plt.savefig(os.path.join(plot_folder, f"8.1_mean_vs_stability_full_rmsd{ext}"), dpi=600)

In [None]:
# Scatter the measured stability against the additive-model prediction.
plt.figure(figsize=(2, 1.6))

# Squared Pearson correlation and root-mean-square deviation between the
# measurement and the additive model; only the rmsd is annotated on this panel.
r2 = stats.pearsonr(unrolled_df["stability"], unrolled_df["additive"])[0]**2
rmsd = np.sqrt(np.mean((unrolled_df["stability"] - unrolled_df["additive"])**2))
plt.scatter(unrolled_df["additive"], unrolled_df["stability"], color="tab:blue", s=2, alpha=0.5, rasterized=True, edgecolors="none")

# identity line as visual reference
plt.plot([-2, 0.25], [-2, 0.25], color="black", linestyle="--", linewidth=1)
plt.xlabel(r"log$_{10}$(additive model stability)")
plt.ylabel(r"log$_{10}$(stability)")
plt.text(-2, 0, f"rmsd: {rmsd:.2f}", fontsize=7)
plt.tight_layout()

# The loop variable is `ext`, not `format`: in a notebook the name is global and
# `format` would shadow the builtin format() for every later cell.
for ext in [".png", ".svg"]:
    plt.savefig(os.path.join(plot_folder, f"8.1_additive_model_vs_stability_full{ext}"), dpi=600)

In [None]:
# Scatter the measured stability against the antagonistic-model prediction.
plt.figure(figsize=(2, 1.6))

# Squared Pearson correlation and root-mean-square deviation between the
# measurement and the antagonistic model; only the rmsd is annotated on this panel.
r2 = stats.pearsonr(unrolled_df["stability"], unrolled_df["antagonistic"])[0]**2
rmsd = np.sqrt(np.mean((unrolled_df["stability"] - unrolled_df["antagonistic"])**2))
plt.scatter(unrolled_df["antagonistic"], unrolled_df["stability"], color="tab:blue", s=2, alpha=0.5, rasterized=True, edgecolors="none")

# identity line as visual reference
plt.plot([-2, 0.25], [-2, 0.25], color="black", linestyle="--", linewidth=1)
plt.xlabel(r"log$_{10}$(antagonistic model stability)")
plt.ylabel(r"log$_{10}$(stability)")
plt.text(-2, 0, f"rmsd: {rmsd:.2f}", fontsize=7)
plt.tight_layout()

# The loop variable is `ext`, not `format`: in a notebook the name is global and
# `format` would shadow the builtin format() for every later cell.
for ext in [".png", ".svg"]:
    plt.savefig(os.path.join(plot_folder, f"8.1_antagonistic_model_vs_stability_full{ext}"), dpi=600)