In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import scipy.stats as stats

# Global matplotlib text defaults for the notebook: 7 pt, Helvetica.
plt.rcParams.update({'font.size': 7, 'font.family': 'Helvetica'})

# Cell lines that are plotted and scored; used below as column labels.
cell_lines_measured = ["HEK293T", "HeLa", "SKNSH", "MCF7"]

# Root output folder for everything this notebook writes.
plot_folder = "../plots/3_additive_model"
# exist_ok=True creates the folder only when it is missing, so no separate
# existence check (and no check-then-create race) is needed.
os.makedirs(plot_folder, exist_ok=True)

In [4]:
# Load the filtered miRNA expression table; the first CSV column becomes the index.
mirna_expression_filter = pd.read_csv(
    "../input_data/miRNA_expression_data/1_output/1.10_alles_quantile_crosstalk_filter.csv",
    index_col=0,
)
# Second name for the same object; the cells below refer to `mirna_expression`.
mirna_expression = mirna_expression_filter

In [20]:
data_dir_input = "../input_data/measurements_lib1/"

# os.listdir returns directory entries in arbitrary order. Sorting the names
# keeps the dict order (and everything derived from it further down, such as
# the row order of the saved r2 tables) identical across machines and runs.
reference_files = sorted(os.listdir(data_dir_input))

# Read every CSV into a dict keyed by the file name up to its first dot;
# the first CSV column becomes the index of each frame.
reference_dict = {
    reference_file.split('.')[0]: pd.read_csv(os.path.join(data_dir_input, reference_file), index_col=0)
    for reference_file in reference_files
    if reference_file.endswith(".csv")
}

# Independent copies of the frames, grouped by what their key contains:
# "single" anywhere in the key, or both "AND" and "full".
single_dfs = {key: df.copy() for key, df in reference_dict.items() if "single" in key}
AND_dfs = {key: df.copy() for key, df in reference_dict.items() if "AND" in key and "full" in key}

In [22]:
# Start from a shallow copy of the AND tables and add the four
# singleX2 / singleX3 tables (both contexts). The frames themselves are not
# copied here; they are the same objects held by AND_dfs / single_dfs.
single_and_AND_dfs = dict(AND_dfs)
for single_key in (
    "2_full_singleX2_context1",
    "2_full_singleX2_context2",
    "3_full_singleX3_context1",
    "3_full_singleX3_context2",
):
    single_and_AND_dfs[single_key] = single_dfs[single_key]

In [23]:
# Keep only the rows whose miRNAs are all present in the miRNA expression data
# (and that have no missing value in any column).
for key, df in list(single_and_AND_dfs.items()):
    miRNA_columns = [col for col in df.columns if col.startswith("miRNA")]

    # True for rows in which every miRNA column holds a label found in
    # mirna_expression.index.
    all_mirnas_known = df[miRNA_columns].isin(mirna_expression.index).all(axis=1)

    # Select into a new frame instead of overwriting the miRNA columns of `df`:
    # `df` is the same object stored in AND_dfs / single_dfs, so writing NaN
    # into it would silently alter those dicts as well. dropna() then removes
    # rows with a missing value in any remaining column.
    single_and_AND_dfs[key] = df.loc[all_mirnas_known].dropna()

In [60]:
# Separate every table into its miRNA columns (labels starting with "miRNA")
# and its measurement columns (labels ending in "_3UTR_log10").
expression_dfs = {}
knockdown_dfs = {}
for key, df in single_and_AND_dfs.items():
    mirna_cols = df.columns[df.columns.str.startswith("miRNA")]
    utr_cols = df.columns[df.columns.str.endswith("_3UTR_log10")]

    expression_dfs[key] = df[mirna_cols]

    # remove the "_3UTR_log10" marker from the measurement column labels
    knockdown = df[utr_cols]
    knockdown.columns = knockdown.columns.str.replace("_3UTR_log10", "")
    knockdown_dfs[key] = knockdown

## 3.1 - Predict stability based on microRNA expression

In [25]:
from lib.transfer_functions import transfer_function, inverse_transfer
from lib.mirna_combinations import get_combinations
from lib.additive_model import add_mirna_expression

In [44]:
# Positional parameters passed to transfer_function / inverse_transfer below.
# NOTE(review): presumably fitted elsewhere; their meaning is defined in lib.transfer_functions.
popt = [10**3.5924, 10**10]

# Raise 10 to the stored values, i.e. treat the table as log10 and move it to linear scale.
mirna_expression_lin = 10**mirna_expression

In [33]:
# Run add_mirna_expression (from lib.additive_model) on every table of miRNA
# columns, using the linear-scale expression data.
added_dfs = {
    key: add_mirna_expression(mirna_expression_lin, expr_df)
    for key, expr_df in expression_dfs.items()
}

In [54]:
# Push each added-expression table through the transfer function, cast the
# result to float and take log10.
knockdown_from_added = {
    key: np.log10(transfer_function(added_df, *popt).astype(float))
    for key, added_df in added_dfs.items()
}

### plot the data as a single plot

In [61]:
%%capture output
cell_line_colors = {"HEK293T": "blue", "SKNSH": "green", "HeLa": "red", "MCF7": "orange"}
cell_line_symbols = {"HEK293T": "o", "SKNSH": "s", "HeLa": "D", "MCF7": "v"}
curr_folder = "3.1.2_add_mirna_expression_filter"

# exist_ok=True creates the output folder only when it is missing
os.makedirs(f"{plot_folder}/{curr_folder}", exist_ok=True)

# The font size is the same for every figure, so set it once instead of per iteration.
plt.rcParams.update({'font.size': 8})

# Squared Pearson correlation for every table (rows) and cell line (columns).
r2_vals = pd.DataFrame(index=list(knockdown_dfs.keys()), columns=cell_lines_measured)
for key in knockdown_dfs.keys():
    curr_knock_df = knockdown_dfs[key]
    curr_added_df = knockdown_from_added[key]

    fig, ax = plt.subplots(figsize=(3.5, 2.5))

    # One scatter series per cell line: x = prediction, y = measurement.
    for cell_line in cell_lines_measured:
        r2 = stats.pearsonr(curr_added_df[cell_line],
                            curr_knock_df[cell_line])[0]**2
        ax.scatter(curr_added_df[cell_line], curr_knock_df[cell_line], color=cell_line_colors[cell_line],
                   s=3, marker=cell_line_symbols[cell_line], label=f"{cell_line}, $r^2$={round(r2, 2)}")

        r2_vals.loc[key, cell_line] = r2

    # Dashed identity line, drawn once per figure after the scatter series
    # (previously tied to the loop index reaching 3).
    ax.plot([-2.1, 0.1], [-2.1, 0.1], color="black", linewidth=1.5, ls="--")

    ax.set_xlabel("predicted")
    ax.set_ylabel("measured")
    ax.set_title(f"{key} added miRNA expression")

    ax.set_xlim(-2, 0.15)
    ax.set_ylim(-2, 0.15)

    ax.set_xticks([-2, -1.5, -1, -0.5, 0])
    ax.set_yticks([-2, -1.5, -1, -0.5, 0])

    ax.legend(loc="upper left", frameon=False, fontsize=7)
    # `fmt` rather than `format`, which would shadow the builtin for the rest of the session
    for fmt in ["png", "svg"]:
        fig.savefig(f"{plot_folder}/{curr_folder}/added_expression_{key}.{fmt}", dpi=300, bbox_inches='tight')

    # The figure is on disk; close it so open figures do not pile up in memory.
    plt.close(fig)

r2_vals.to_csv(f"{plot_folder}/{curr_folder}/add_r2_values.csv")

### plot sublibraries into a single plot

In [62]:
# Collect copies of the context1 tables, grouped into "AND" (without the AND5
# tables) and "single". A key matching both tests lands in both lists.
comb_AND_knock = []
comb_single_knock = []
for key, knock_df in knockdown_dfs.items():
    if "context1" not in key:
        continue
    if "AND" in key and "AND5" not in key:
        comb_AND_knock.append(knock_df.copy())
    if "single" in key:
        comb_single_knock.append(knock_df.copy())

# Same grouping for the predictions; keys are taken from expression_dfs and
# looked up in knockdown_from_added.
comb_AND_expr = []
comb_single_expr = []
for key in expression_dfs:
    if "context1" not in key:
        continue
    if "AND" in key and "AND5" not in key:
        comb_AND_expr.append(knockdown_from_added[key].copy())
    if "single" in key:
        comb_single_expr.append(knockdown_from_added[key].copy())

In [63]:
# Stack each list of frames row-wise into one frame (the frames share their
# columns). Each name is rebound from a list of frames to the stacked frame.
comb_AND_knock, comb_single_knock, comb_AND_expr, comb_single_expr = (
    pd.concat(frames, axis=0)
    for frames in (comb_AND_knock, comb_single_knock, comb_AND_expr, comb_single_expr)
)

In [64]:
# Group the stacked frames under the labels used for plot titles and file names.
comb_knock = dict(single=comb_single_knock, AND=comb_AND_knock)
comb_expr = dict(single=comb_single_expr, AND=comb_AND_expr)

In [65]:
%%capture output
cell_line_colors = {"HEK293T": "blue", "SKNSH": "green", "HeLa": "red", "MCF7": "orange"}
cell_line_symbols = {"HEK293T": "o", "SKNSH": "s", "HeLa": "D", "MCF7": "v"}

curr_folder = "3.2.2_combined_mirna_exp_added_filter"

# exist_ok=True creates the output folder only when it is missing
os.makedirs(f"{plot_folder}/{curr_folder}", exist_ok=True)

# The font size is the same for every figure, so set it once instead of per iteration.
plt.rcParams.update({'font.size': 8})

# Squared Pearson correlation per combined group (rows) and cell line (columns).
# The index must be the groups iterated below: indexing by knockdown_dfs.keys()
# left one all-NaN row per individual table in the saved CSV, with the real
# "single" / "AND" rows appended after them.
r2_vals = pd.DataFrame(index=list(comb_knock.keys()), columns=cell_lines_measured)
for key in comb_knock.keys():
    curr_knock_df = comb_knock[key]
    curr_added_df = comb_expr[key]

    fig, ax = plt.subplots(figsize=(3.5, 2.5))

    # One scatter series per cell line: x = prediction, y = measurement.
    for cell_line in cell_lines_measured:
        r2 = stats.pearsonr(curr_added_df[cell_line],
                            curr_knock_df[cell_line])[0]**2
        ax.scatter(curr_added_df[cell_line], curr_knock_df[cell_line], color=cell_line_colors[cell_line],
                   s=3, marker=cell_line_symbols[cell_line], label=f"{cell_line}, $r^2$={round(r2, 2)}")

        r2_vals.loc[key, cell_line] = r2

    # Dashed identity line, drawn once per figure after the scatter series
    # (previously tied to the loop index reaching 3).
    ax.plot([-2.1, 0.1], [-2.1, 0.1], color="black", linewidth=1.5, ls="--")

    ax.set_xlabel("predicted")
    ax.set_ylabel("measured")
    ax.set_title(f"{key} added miRNA expression")

    ax.set_xlim(-2, 0.15)
    ax.set_ylim(-2, 0.15)

    ax.set_xticks([-2, -1.5, -1, -0.5, 0])
    ax.set_yticks([-2, -1.5, -1, -0.5, 0])

    ax.legend(loc="upper left", frameon=False, fontsize=7)
    # `fmt` rather than `format`, which would shadow the builtin for the rest of the session
    for fmt in ["png", "svg"]:
        fig.savefig(f"{plot_folder}/{curr_folder}/3.2.2_added_expression_{key}.{fmt}", dpi=300, bbox_inches='tight')

    # The figure is on disk; close it so open figures do not pile up in memory.
    plt.close(fig)

r2_vals.to_csv(f"{plot_folder}/{curr_folder}/3.2.2_add_r2_values.csv")

## 3.2 - Use the inverted transfer function

In [71]:
# Build the input for the inverse transfer function from the
# "1_full_single_context1" table:
#   - index the rows by the "miRNA1" column
#   - keep only the columns whose label contains "_3UTR_log10" and strip that marker
#   - raise 10 to the values (undo the log10)
#   - cap everything above 1 at 1
single_knockdown = (
    single_dfs["1_full_single_context1"]
    .set_index("miRNA1")
    .filter(regex="_3UTR_log10")
    .rename(columns=lambda col: col.replace("_3UTR_log10", ""))
    .rpow(10)
    .clip(upper=1)
)

# Map the capped values back through the inverse transfer function.
mirna_expr_fr_knockdown = inverse_transfer(single_knockdown, *popt)

In [72]:
# Re-run add_mirna_expression, this time with the values obtained from the
# inverse transfer function. This rebinds added_dfs and knockdown_from_added
# from the earlier section.
added_dfs = {
    key: add_mirna_expression(mirna_expr_fr_knockdown, expr_df)
    for key, expr_df in expression_dfs.items()
}

# Transfer function on every added table, cast to float, then log10.
knockdown_from_added = {
    key: np.log10(transfer_function(added_df, *popt).astype(float))
    for key, added_df in added_dfs.items()
}

In [73]:
%%capture output
cell_line_colors = {"HEK293T": "blue", "SKNSH": "green", "HeLa": "red", "MCF7": "orange"}
cell_line_symbols = {"HEK293T": "o", "SKNSH": "s", "HeLa": "D", "MCF7": "v"}

curr_folder = "3.2.1_add_knockdown"

# exist_ok=True creates the output folder only when it is missing
os.makedirs(f"{plot_folder}/{curr_folder}", exist_ok=True)

# The font size is the same for every figure, so set it once instead of per iteration.
plt.rcParams.update({'font.size': 8})

# Squared Pearson correlation for every table (rows) and cell line (columns).
r2_vals = pd.DataFrame(index=list(knockdown_dfs.keys()), columns=cell_lines_measured)

for key in knockdown_dfs.keys():
    curr_knock_df = knockdown_dfs[key]
    curr_added_df = knockdown_from_added[key]

    fig, ax = plt.subplots(figsize=(3.5, 2.5))

    # One scatter series per cell line: x = prediction, y = measurement.
    for cell_line in cell_lines_measured:
        r2 = stats.pearsonr(curr_added_df[cell_line],
                            curr_knock_df[cell_line])[0]**2
        ax.scatter(curr_added_df[cell_line], curr_knock_df[cell_line], color=cell_line_colors[cell_line],
                   s=3, marker=cell_line_symbols[cell_line], label=f"{cell_line}, $r^2$={round(r2, 2)}")
        r2_vals.loc[key, cell_line] = r2

    # Dashed identity line, drawn once per figure after the scatter series
    # (previously tied to the loop index reaching 3).
    ax.plot([-2.1, 0.1], [-2.1, 0.1], color="black", linewidth=1.5, ls="--")

    ax.set_xlabel("predicted")
    ax.set_ylabel("measured")
    ax.set_title(f"{key} added miRNA expression")

    ax.set_xlim(-2, 0.15)
    ax.set_ylim(-2, 0.15)

    ax.set_xticks([-2, -1.5, -1, -0.5, 0])
    ax.set_yticks([-2, -1.5, -1, -0.5, 0])

    ax.legend(loc="upper left", frameon=False, fontsize=7)
    # `fmt` rather than `format`, which would shadow the builtin for the rest of the session
    for fmt in ["png", "svg"]:
        fig.savefig(f"{plot_folder}/{curr_folder}/3.2.1_added_knockdown_{key}.{fmt}", dpi=300, bbox_inches='tight')

    # The figure is on disk; close it so open figures do not pile up in memory.
    plt.close(fig)

r2_vals.to_csv(f"{plot_folder}/{curr_folder}/3.2.1_knock_r2_values.csv")