In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle
import scipy.stats as stats
import seaborn as sns
from library2_utils.color_scheme import cell_line_colors, cell_line_symbols
from scipy.stats import mannwhitneyu
from library2_utils.transfer_functions import transfer_function, inverse_transfer
from library2_utils.mirna_combinations import get_combinations
from library2_utils.additive_model import add_mirna_expression, max_mirna_expression
from library2_utils.plotting_utilities import HandlerSize, return_pvalue_text

# Figure typography: 7 pt Helvetica for every plot in this notebook
plt.rcParams.update({'font.size': 7, 'font.family': 'Helvetica'})

# Cell lines with measurements; the "subset" lines come first, the remaining ones after
cell_lines_subset = ["HEK293T", "HeLa", "SKNSH", "MCF7", "HUH7", "A549"]
cell_lines_rest = ["HaCaT", "JEG3", "Tera1", "PC3"]
cell_lines_measured = [*cell_lines_subset, *cell_lines_rest]

# Destination folders for figures and derived outputs; created on first use
plot_folder = "../plots/7a_additive_full"
output_folder = "../outputs/7a_additive_full"
for folder in (plot_folder, output_folder):
    if not os.path.exists(folder):
        os.makedirs(folder)

# get mirbase
mirbase = pd.read_csv("../microrna_data/mirbase_extended.csv", index_col=0)

### Load data

In [2]:
# Combined expression table (crosstalk-removed, scaled file); rows with any missing value are discarded
df_combined = pd.read_csv(
    "../microrna_data/3_output/Alles_Keller_combined_expression_wo_crosstalk_scaled.csv",
    index_col=0,
).dropna()

# Companion table read from the "with_crosstalk" file; kept as loaded (no NaN filtering)
df_combined_with_crosstalk = pd.read_csv(
    "../microrna_data/3_output/Alles_Keller_combined_expression_with_crosstalk.csv",
    index_col=0,
)

# Name of the dataset (used to locate the fitted parameters) and the table used as miRNA expression
used_mirna_name = "combined_dataset"
mirna_expression = df_combined

In [3]:
data_dir_input = "../measured_data/2_normalized_log10/"

# every directory entry, CSV or not
reference_files = os.listdir(data_dir_input)

# text before the first "." of each CSV file name -> its table (first column used as index)
reference_dict = {
    reference_file.split('.')[0]: pd.read_csv(os.path.join(data_dir_input, reference_file), index_col=0)
    for reference_file in reference_files
    if reference_file.endswith(".csv")
}

In [4]:
# Design tables: one CSV per sublibrary in the inserted-designs folder
design_input_dir = "../design_files/inserted_designs/"
design_files = os.listdir(design_input_dir)

# text before the first "." of each CSV file name -> its table (first column used as index)
design_dict = {
    design_file.split('.')[0]: pd.read_csv(os.path.join(design_input_dir, design_file), index_col=0)
    for design_file in design_files
    if design_file.endswith(".csv")
}

# all design tables stacked row-wise into a single frame
design_df_flat = pd.concat(design_dict.values(), axis=0)

In [5]:
# Sort independent copies of the measured tables into three groups by substrings of their key
single_dfs, AND_dfs, design_dfs = {}, {}, {}
for key, measured_table in reference_dict.items():
    if "1_mirna_full_single_high_conf" in key or "full_repeat" in key:
        single_dfs[key] = measured_table.copy()
    # a key containing "full_combination_probe" necessarily contains "full" as well
    if "full_combination_probe" in key:
        AND_dfs[key] = measured_table.copy()
    if "quality" in key or "mse_designs" in key:
        design_dfs[key] = measured_table.copy()

# Union of the three groups (single first, then AND, then design); values are shared, not re-copied
single_and_AND_dfs = {**single_dfs, **AND_dfs, **design_dfs}

### Load fitted parameters

In [6]:
# Fitted transfer-function parameters; later unpacked as *popt into transfer_function / inverse_transfer
popt_path = f"../outputs/3_fitting/{used_mirna_name}/{used_mirna_name}_popt_wo_crosstalk.pkl"
with open(popt_path, "rb") as popt_file:
    popt = pickle.load(popt_file)

### Split into knockdown and expression dfs

In [None]:
# Keep only constructs whose miRNAs are all present in the miRNA expression data.
# Row counts are printed per sublibrary before and after filtering.
for key in single_and_AND_dfs.keys():
    df = single_and_AND_dfs[key]
    miRNA_columns = [col for col in df.columns if col.startswith("miRNA")]

    print(key, len(df))

    # Collect the labels to remove first and drop them once after the scan.
    rows_to_drop = []
    for index, row in df.iterrows():
        mirnas = row[miRNA_columns]
        # every miRNA of the construct must be a known row of mirna_expression
        if not all(mirnas.isin(mirna_expression.index)):
            rows_to_drop.append(index)
            # Skip the NaN check: there are no expression values for this row, and
            # re-using the values of a previous row (or an undefined name on the
            # very first row) is exactly what must not happen.
            continue

        # all miRNAs are known: make sure none of their expression values is NaN
        expression_vals = mirna_expression.loc[mirnas, cell_lines_measured]
        if expression_vals.isnull().values.any():
            rows_to_drop.append(index)

    # de-duplicate while keeping order, then remove the rejected rows
    df = df.drop(index=list(dict.fromkeys(rows_to_drop)))

    # finally discard rows that still contain a missing value in any column
    df = df.dropna()

    print(key, len(df))
    single_and_AND_dfs[key] = df

In [8]:
# For every sublibrary: "miRNA*" columns go to expression_dfs, "*_3UTR" columns to knockdown_dfs
expression_dfs = {}
knockdown_dfs = {}
for key, frame in single_and_AND_dfs.items():
    mirna_columns = frame.columns[frame.columns.str.startswith("miRNA")]
    utr_columns = frame.columns[frame.columns.str.endswith("_3UTR")]

    expression_dfs[key] = frame[mirna_columns]

    # strip "_3UTR" from the column names of the knockdown table
    utr_frame = frame[utr_columns]
    utr_frame.columns = utr_frame.columns.str.replace("_3UTR", "")
    knockdown_dfs[key] = utr_frame

In [9]:
# let-7 subset of the single/repeat sublibraries: expression comes from the with-crosstalk table,
# knockdown from the measured "*_3UTR" columns
expression_dfs_let7 = {}
knockdown_dfs_let7 = {}
for key, single_frame in single_dfs.items():
    # index the copy by miRNA1 while keeping the column itself
    by_mirna = single_frame.copy().set_index("miRNA1", drop=False)

    # rows whose miRNA1 name contains "let-7"
    let7_rows = by_mirna[by_mirna.index.str.contains("let-7")]

    expression_dfs_let7[key] = df_combined_with_crosstalk.loc[let7_rows.index, cell_lines_measured]

    # measured columns with "_3UTR" stripped from their names
    let7_knockdown = let7_rows[let7_rows.columns[let7_rows.columns.str.endswith("_3UTR")]]
    let7_knockdown.columns = let7_knockdown.columns.str.replace("_3UTR", "")
    knockdown_dfs_let7[key] = let7_knockdown

## potentially filter certain constructs

In [10]:
# Switch for the (currently commented-out) construct exclusion below.
# exclusion_string is appended to the file names of saved figures.
exclusion_FLAG = False
if exclusion_FLAG:
    exclusion_string = "exclusion"
else:
    exclusion_string = ""

# with open(f"../outputs/14_high_stability/suspiciously_stable_mirnas.pkl", "rb") as f:
#     suspiciously_stable_mirnas = pickle.load(f)

# with open(os.path.join(output_folder, "ddG_exclusion_dfs_by_design.pkl"), "rb") as f:
#     ddG_exlusion_by_design = pickle.load(f)
    
# if exclusion_FLAG:
#     for key in expression_dfs.keys():
#         # exclude those with high ddG values
#         df = expression_dfs[key].copy()
#         print(key, ",", len(df))
#         # df_original = df.copy()
#         # df = df[~df.index.isin(ddG_exlusion_by_design.index)]
#         # print(f"Percent discarded due to ddG: {100-(len(df)/len(df_original))*100:.2f}%")
        
#         # are any of the mirnas in suspiciously_stable_mirnas?
#         # df_original = df.copy()
#         # mirna_columns = [column for column in df.columns if column.startswith("miRNA") and column != "miRNA"]
#         # df.loc[:, "suspiciously_stable"] = df[mirna_columns].apply(lambda x: any([mirna in suspiciously_stable_mirnas for mirna in x]), axis=1)
#         # df = df[~df["suspiciously_stable"]]
#         # df = df.drop(columns=["suspiciously_stable"])
#         # print(f"Percent discarded due to suspiciously stable miRNAs: {100-(len(df)/len(df_original))*100:.2f}%")
        
#         # remove those with very high stabilities
#         df_original = df.copy()
#         df_knockdown = knockdown_dfs[key].loc[df.index].copy()
#         curr_max = (10**df_knockdown[cell_lines_measured]).max(axis=1)
#         # find all those that have a max value of more than 1.5
#         mask = curr_max > 1.5
#         df= df[~mask]
#         print(f"Percent discarded due to high stability: {100-(len(df)/len(df_original))*100:.2f}%")
#         print(key, ",", len(df))
        
#         expression_dfs[key] = df
#         knockdown_dfs[key] = knockdown_dfs[key].loc[df.index]

## 7.1 - Predict knockdown based on microRNA expression

In [11]:
# Undo the log10: element-wise 10 ** value
mirna_expression_lin = mirna_expression.rpow(10)

In [12]:
# Per sublibrary: expression combined by add_mirna_expression ...
added_dfs = {}
for key in expression_dfs.keys():
    added_dfs[key] = add_mirna_expression(mirna_expression_lin, expression_dfs[key])

# ... and by max_mirna_expression
strongest_dfs = {}
for key in expression_dfs.keys():
    strongest_dfs[key] = max_mirna_expression(mirna_expression_lin, expression_dfs[key])

In [13]:
# log10 of the transfer function evaluated on the added expression, per sublibrary
knockdown_from_added = {
    key: np.log10(transfer_function(added_expression, *popt))
    for key, added_expression in added_dfs.items()
}

# same prediction, based on the strongest-site expression
knockdown_from_strongest = {
    key: np.log10(transfer_function(strongest_expression, *popt))
    for key, strongest_expression in strongest_dfs.items()
}

In [14]:
# Measured minus predicted (added-expression model), per sublibrary
deviation_dfs = {}
for key, measured in knockdown_dfs.items():
    deviation_dfs[key] = measured - knockdown_from_added[key]

## 7.1.1 - Plot the data as individual plots

In [15]:
# %%capture output
# curr_folder = "7.1.1_add_mirna_expression_unfiltered"

# # create folder if it does not exist
# if not os.path.exists(f"{plot_folder}/{curr_folder}"):
#     os.makedirs(f"{plot_folder}/{curr_folder}")

# r2_vals = pd.DataFrame(index=knockdown_dfs.keys(), columns=cell_lines_measured)
# for key in knockdown_dfs.keys():
#     curr_knock_df = knockdown_dfs[key]
#     curr_added_df = knockdown_from_added[key]
#     print_key = "_".join(key.split("_")[2:])

#     fig = plt.figure(figsize=(2.2, 1.8))

#     for i, cell_line in enumerate(cell_lines_measured):
#         r2 = stats.pearsonr(curr_added_df[cell_line],
#                             curr_knock_df[cell_line])[0]**2
#         rmsd = np.sqrt(np.mean((curr_added_df[cell_line] - curr_knock_df[cell_line])**2))
#         plt.scatter(curr_added_df[cell_line], curr_knock_df[cell_line], color=cell_line_colors[cell_line],
#             s=3, marker=cell_line_symbols[cell_line], label=f"{cell_line}, " + r"$r^2$=" + f"{round(r2, 2)}")# + ", RMSD=" + f"{round(rmsd, 2)}")
        
#         r2_vals.loc[key, cell_line] = r2
#         if i == 3:
#             plt.plot([-2.1, 0.1], [-2.1, 0.1], color="black", linewidth=1.5, ls="--")
        
#     plt.xlabel(r"log$_{10}$(predicted stability)")
#     plt.ylabel(r"log$_{10}$(measured stability)")
#     plt.title(f"{print_key} added expression", fontsize=7.5)
    
#     plt.xlim(-2, 0.15)
#     plt.ylim(-2, 0.15)
    
#     plt.xticks([-2, -1.5, -1, -0.5, 0])
#     plt.yticks([-2, -1.5, -1, -0.5, 0])

#     plt.tight_layout()
#     plt.legend(loc=[1, -0.2], frameon=False, fontsize=6)
#     for format in ["png", "svg"]:
#         plt.savefig(f"{plot_folder}/{curr_folder}/7.1.1_added_{print_key}_{exclusion_string}.{format}", dpi=300, bbox_inches='tight')
        
# r2_vals.to_csv(f"{plot_folder}/{curr_folder}/add_r2_values.csv")

## 7.1.2 - Plot repeat data individually

In [16]:
# Toggle for the repeat plots below: when True, the x-values are looked up in
# `mirna_expression` via each design's "miRNA1" instead of using the log10 of `added_dfs`;
# it also selects the file name under which those figures are saved.
use_single_data = False

## Overview Figure

In [None]:
curr_folder = "7.1.2_individual_repeat_data"

# make sure the target directory is there
if not os.path.exists(f"{plot_folder}/{curr_folder}"):
    os.makedirs(f"{plot_folder}/{curr_folder}")

# expression axis in log10 units and its linear counterpart
x_range_log = np.arange(1, 6, 0.1)
x_range_lin = 10**x_range_log

# one colour per repeat count
colors = sns.color_palette("inferno", n_colors=6)

fig, ax = plt.subplots(figsize=(1.8, 1.4))
# transfer curve for 1 to 6 repeats: the linear expression is multiplied by the repeat count
for n_repeats, curve_color in zip(range(1, 7), colors):
    y_range_log = np.log10(transfer_function(n_repeats*x_range_lin, *popt))
    ax.plot(x_range_log, y_range_log, ls="--", lw=1, color=curve_color, label=f"{n_repeats} repeats")

ax.set_xlim(1, 6)
ax.set_ylim(-2.2, 0.3)
ax.set_xticks([1, 2, 3, 4, 5, 6])
ax.set_xlabel(r"log$_{10}$(miRNA expr.)")
ax.set_ylabel(r"log$_{10}$(stability)")

# ax.legend(loc=[0.01, 0.1], frameon=False, fontsize=6)
plt.tight_layout()
for format in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/{curr_folder}/7.1.2_overview.{format}", dpi=600, bbox_inches='tight')
# plt.close()

### Selected cell lines

In [18]:
curr_folder = "7.1.2_individual_repeat_data"

# make sure the target directory is there
if not os.path.exists(f"{plot_folder}/{curr_folder}"):
    os.makedirs(f"{plot_folder}/{curr_folder}")

# reference transfer curve for a single site
x_range_log = np.arange(1, 7, 0.1)
x_range_lin = 10**x_range_log
y_range_log = np.log10(transfer_function(x_range_lin, *popt))

# one column per sublibrary; keys whose last character is "f" carry no repeat digit and are left out
repeat_keys = [
    candidate
    for candidate in ("6_miRNA_full_repeat_x2",
                      '8_miRNA_full_repeat_x4',
                      '10_miRNA_full_repeat_x6')
    if candidate.split("_")[-1][-1] != "f"
]

fig, axs = plt.subplots(2, 3, figsize=(4.5, 2.0), sharex=True, sharey=True)
for cell_line_index, cell_line in enumerate(["HeLa", "A549"]):
    for i, key in enumerate(repeat_keys):
        ax = axs[cell_line_index, i]

        # the repeat count is the last character of the key ("..._x2" -> 2)
        repeat_number = int(key.split("_")[-1][-1])
        knock_df = knockdown_dfs[key]
        ex_df = np.log10(added_dfs[key].astype(float))

        if use_single_data:
            ex_df = mirna_expression.loc[expression_dfs[key].loc[ex_df.index, "miRNA1"], cell_lines_measured]

        # squared Pearson correlation between predicted and measured values
        predicted = np.log10(transfer_function(10**ex_df[cell_line], *popt))
        r2 = stats.pearsonr(predicted, knock_df[cell_line])[0]**2

        # measurements, with the x-values shifted down by log10(repeat count)
        ax.scatter(ex_df[cell_line]-np.log10(repeat_number), knock_df[cell_line], color=cell_line_colors[cell_line], s=3, marker = cell_line_symbols[cell_line],
                   rasterized=True)

        # black: curve for repeat_number-fold expression; grey: single-site curve
        y_range_log_multiple = np.log10(transfer_function(x_range_lin*repeat_number, *popt))
        ax.plot(x_range_log, y_range_log_multiple, color="black", ls="--", lw=1)
        ax.plot(x_range_log, y_range_log, color="darkgrey", ls="--", lw=1, alpha=1)

        # insert r2 as text
        ax.text(1.2, -1.9, r"$r^2$=" + f"{round(r2, 2)}", fontsize=7)

        ax.set_xlim(1, 6)
        ax.set_ylim(-2.2, 0.3)
        ax.set_xticks([1, 2, 3, 4, 5, 6])
        if i == 0:
            ax.set_ylabel(r"log$_{10}$(stability)")
        # bottom row only; the label is the same with and without use_single_data
        if cell_line_index == 1:
            ax.set_xlabel(r"log$_{10}$"+f"(miRNA expr.)")


plt.tight_layout()
for format in ["png", "svg"]:
    if use_single_data:
        plt.savefig(f"{plot_folder}/{curr_folder}/7.1.2-expression_knockdown_repeats_single_data_{exclusion_string}.{format}", dpi=600, bbox_inches='tight')
    else:
        plt.savefig(f"{plot_folder}/{curr_folder}/7.1.2-expression_knockdown_repeats_add_data_{exclusion_string}.{format}", dpi=600, bbox_inches='tight')
plt.close()

### All cell lines

In [19]:
curr_folder = "7.1.2_individual_repeat_data"

# make sure the target directory is there
if not os.path.exists(f"{plot_folder}/{curr_folder}"):
    os.makedirs(f"{plot_folder}/{curr_folder}")

# reference transfer curve for a single site
x_range_log = np.arange(1, 6, 0.1)
x_range_lin = 10**x_range_log
y_range_log = np.log10(transfer_function(x_range_lin, *popt))

# one column per sublibrary; keys whose last character is "f" carry no repeat digit and are left out
repeat_keys = [
    candidate
    for candidate in ("6_miRNA_full_repeat_x2", '7_miRNA_full_repeat_x3',
                      '8_miRNA_full_repeat_x4', '9_miRNA_full_repeat_x5',
                      '10_miRNA_full_repeat_x6')
    if candidate.split("_")[-1][-1] != "f"
]

# one row per measured cell line
fig, axs = plt.subplots(10, 5, figsize=(7.2, 8.0), sharex=True, sharey=True)
for cell_line_index, cell_line in enumerate(cell_lines_measured):
    for i, key in enumerate(repeat_keys):
        ax = axs[cell_line_index, i]

        # the repeat count is the last character of the key ("..._x2" -> 2)
        repeat_number = int(key.split("_")[-1][-1])
        knock_df = knockdown_dfs[key]
        ex_df = np.log10(added_dfs[key].astype(float))
        if use_single_data:
            ex_df = mirna_expression.loc[expression_dfs[key].loc[ex_df.index, "miRNA1"], cell_lines_measured]

        # prediction for this panel; RMSD is taken on raw arrays, i.e. position-wise
        predicted = np.log10(transfer_function(10**ex_df[cell_line], *popt))
        rmsd = np.sqrt(np.mean((knock_df[cell_line].values - predicted.values)**2))
        r2 = stats.pearsonr(predicted, knock_df[cell_line])[0]**2

        # measurements, with the x-values shifted down by log10(repeat count)
        ax.scatter(ex_df[cell_line]-np.log10(repeat_number), knock_df[cell_line], color=cell_line_colors[cell_line], s=3, marker = cell_line_symbols[cell_line],
                   rasterized=True)

        # black: curve for repeat_number-fold expression; grey: single-site curve
        y_range_log_multiple = np.log10(transfer_function(x_range_lin*repeat_number, *popt))
        ax.plot(x_range_log, y_range_log_multiple, color="black", ls="--", lw=1)
        ax.plot(x_range_log, y_range_log, color="darkgrey", ls="--", lw=1, alpha=1)

        # insert r2 as text
        ax.text(1.2, -1.7, r"$r^2$=" + f"{round(r2, 2)}\nrmsd={round(rmsd, 2)}", fontsize=7)

        ax.set_xlim(1, 6)
        ax.set_ylim(-2.2, 0.3)
        ax.set_xticks([1, 2, 3, 4, 5, 6])
        if i == 0:
            ax.set_ylabel(r"log$_{10}$(stability)")
        # bottom row only; the label is the same with and without use_single_data
        if cell_line_index == len(cell_lines_measured) - 1:
            ax.set_xlabel(r"log$_{10}$"+f"(miRNA expr.)")

plt.tight_layout()
for format in ["png", "svg"]:
    if use_single_data:
        plt.savefig(f"{plot_folder}/{curr_folder}/expression_knockdown_repeats_all_single_data_{exclusion_string}.{format}", dpi=600, bbox_inches='tight')
    else:
        plt.savefig(f"{plot_folder}/{curr_folder}/expression_knockdown_repeats_all_added_{exclusion_string}.{format}", dpi=600, bbox_inches='tight')
plt.close()

In [None]:
# Display one measured column. NOTE: `knock_df` and `cell_line` are whatever the loops of the
# previous plotting cell left behind, so this cell only works after that cell has run.
knock_df[cell_line]

In [20]:
# Table of measured let-7 knockdown per (cell_line, miRNA) row and repeat count column ("1".."6").
# Values are gathered in a plain dict first and turned into a DataFrame in one step, instead of
# appending one empty row at a time with pd.concat.
repeat_columns = ["1", "2", "3", "4", "5", "6"]

# (cell_line, miRNA) -> {repeat count: measured value}; dict order is the first-seen order of the pairs
let7_records = {}
for key in ["1_mirna_full_single_high_conf", "6_miRNA_full_repeat_x2", '7_miRNA_full_repeat_x3',
            '8_miRNA_full_repeat_x4', '9_miRNA_full_repeat_x5', '10_miRNA_full_repeat_x6']:
    # the single-site sublibrary counts as one repeat; otherwise the count is the key's last character
    if key == "1_mirna_full_single_high_conf":
        repeat_number = "1"
    else:
        repeat_number = key.split("_")[-1][-1]

    let7_knockdown = knockdown_dfs_let7[key]
    for cell_line in cell_lines_measured:
        for miRNA in let7_knockdown.index:
            let7_records.setdefault((cell_line, miRNA), {})[repeat_number] = let7_knockdown.at[miRNA, cell_line]

# from_arrays (unlike from_tuples) also accepts the empty case
index = pd.MultiIndex.from_arrays(
    [[pair[0] for pair in let7_records], [pair[1] for pair in let7_records]],
    names=["cell_line", "miRNA"],
)
# repeat counts missing for a pair become NaN; object dtype as before
repeats_by_cell_line = pd.DataFrame(list(let7_records.values()), index=index, columns=repeat_columns, dtype=object)

# keep only pairs that were measured for every repeat count
repeats_by_cell_line = repeats_by_cell_line.dropna()

In [21]:
curr_folder = "7.1.2_individual_repeat_data_let7"

# make sure the target directory is there
if not os.path.exists(f"{plot_folder}/{curr_folder}"):
    os.makedirs(f"{plot_folder}/{curr_folder}")

x_range_log = np.arange(1, 6, 0.1)
x_range_lin = 10**x_range_log
y_range_log = np.log10(transfer_function(x_range_lin, *popt))

# 5 x 2 grid, addressed as a flat sequence with one panel per cell line
fig, axs = plt.subplots(5, 2, figsize=(5, 6), sharex=True, sharey=True)
axs = axs.flatten()
repeat_numbers = np.array([1, 2, 3, 4, 5, 6])
# as many colours as there are let-7 rows in the x2 sublibrary
colors = sns.color_palette("tab10", n_colors=len(knockdown_dfs_let7["6_miRNA_full_repeat_x2"]))

for cell_line_index, cell_line in enumerate(cell_lines_measured):
    ax = axs[cell_line_index]
    # rows of this cell line, indexed by miRNA
    curr_df = repeats_by_cell_line.loc[cell_line]
    for mirna_index, miRNA in enumerate(curr_df.index):
        knock_df = curr_df.loc[miRNA]
        # x: log10 of (repeat count x linear expression from the with-crosstalk table)
        single_expression = 10**df_combined_with_crosstalk.loc[miRNA, cell_line]
        ex_df = np.log10(repeat_numbers * single_expression)
        # legend label: the miRNA name without its first two "-"-separated parts
        short_name = "-".join(miRNA.split("-")[2:])
        ax.plot(ex_df, knock_df, lw=1, ls="-", label=short_name, color=colors[mirna_index])
    ax.legend(loc=[0, 0.1], frameon=False, fontsize=6, ncols=2)
    ax.set_xlim(1, 6)
    ax.set_ylim(-2.2, 0.3)

    # y label on the left column, x label on the last row (panels 8 and 9)
    if cell_line_index % 2 == 0:
        ax.set_ylabel(r"log$_{10}$(stability)")
    if cell_line_index >= 8:
        ax.set_xlabel(r"log$_{10}$(miRNA expr.)")

plt.tight_layout()
for format in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/{curr_folder}/repeats_all_added_let7.{format}", dpi=600, bbox_inches='tight')
plt.close()

## 7.1.3 - Combine sublibraries into a single plot

In [None]:
# Display the sublibrary keys available in knockdown_dfs (the grouping below matches on substrings of these)
knockdown_dfs.keys()

In [23]:
# Collect copies of the per-sublibrary tables into three groups, selected by substrings of the key:
#   AND    -> "full_combination_probe"
#   single -> "full_repeat"
#   design -> "quality" or "mse_designs"

# measured knockdown
comb_AND_knock, comb_single_knock, comb_design_knock = [], [], []
for key in knockdown_dfs:
    if "full_combination_probe" in key:
        comb_AND_knock.append(knockdown_dfs[key].copy())
    if "full_repeat" in key:
        comb_single_knock.append(knockdown_dfs[key].copy())
    if "quality" in key or "mse_designs" in key:
        comb_design_knock.append(knockdown_dfs[key].copy())

# predictions from added / strongest expression (keys taken from expression_dfs)
comb_AND_expr, comb_single_expr, comb_design_expr = [], [], []
comb_AND_strongest, comb_single_strongest, comb_design_strongest = [], [], []
for key in expression_dfs:
    if "full_combination_probe" in key:
        comb_AND_expr.append(knockdown_from_added[key].copy())
        comb_AND_strongest.append(knockdown_from_strongest[key].copy())
    if "full_repeat" in key:
        comb_single_expr.append(knockdown_from_added[key].copy())
        comb_single_strongest.append(knockdown_from_strongest[key].copy())
    if "quality" in key or "mse_designs" in key:
        comb_design_expr.append(knockdown_from_added[key].copy())
        comb_design_strongest.append(knockdown_from_strongest[key].copy())

In [24]:
# Stack each group's list of frames row-wise; the frames share their columns.
# Each name is rebound from a list of frames to the single stacked frame.
comb_AND_knock, comb_single_knock, comb_design_knock = [
    pd.concat(frames, axis=0)
    for frames in (comb_AND_knock, comb_single_knock, comb_design_knock)
]

comb_AND_expr, comb_single_expr, comb_design_expr = [
    pd.concat(frames, axis=0)
    for frames in (comb_AND_expr, comb_single_expr, comb_design_expr)
]

comb_AND_strongest, comb_single_strongest, comb_design_strongest = [
    pd.concat(frames, axis=0)
    for frames in (comb_AND_strongest, comb_single_strongest, comb_design_strongest)
]

In [25]:
# Group name -> stacked frame, in the order single, AND, design
comb_knock = dict(single=comb_single_knock, AND=comb_AND_knock, design=comb_design_knock)
comb_expr = dict(single=comb_single_expr, AND=comb_AND_expr, design=comb_design_expr)
comb_strongest = dict(single=comb_single_strongest, AND=comb_AND_strongest, design=comb_design_strongest)

In [26]:
%%capture output
curr_folder = "7.1.3_add_mirna_expression_combined"
# make sure the target directory is there
if not os.path.exists(f"{plot_folder}/{curr_folder}"):
    os.makedirs(f"{plot_folder}/{curr_folder}")

# per-group, per-cell-line squared Pearson r and RMSD (added-expression prediction vs. measurement)
r2_vals_713 = pd.DataFrame(index=comb_knock.keys(), columns=cell_lines_measured)
rmsd_vals_713 = pd.DataFrame(index=comb_knock.keys(), columns=cell_lines_measured)
for key, curr_knock_df in comb_knock.items():
    curr_added_df = comb_expr[key]

    # the design group is drawn with smaller, more transparent points
    point_size = 1 if key == "design" else 2
    point_alpha = 0.1 if key == "design" else 0.5

    fig = plt.figure(figsize=(1.7, 1.2))

    # predicted / measured columns of all cell lines, pooled for the overall r^2
    x_vals = []
    y_vals = []
    for i, cell_line in enumerate(cell_lines_measured):
        predicted = curr_added_df[cell_line]
        measured = curr_knock_df[cell_line]

        r2 = stats.pearsonr(predicted, measured)[0]**2
        rmsd = np.sqrt(np.mean((predicted - measured)**2))
        plt.scatter(predicted, measured, color="tab:blue", s=point_size,
                    alpha=point_alpha, rasterized=True, edgecolors="none")

        x_vals.append(predicted)
        y_vals.append(measured)

        r2_vals_713.loc[key, cell_line] = r2
        rmsd_vals_713.loc[key, cell_line] = rmsd
        # the identity line is added once, after the fourth cell line has been drawn
        if i == 3:
            plt.plot([-2.3, 0.15], [-2.3, 0.15], color="black", linewidth=1, ls="--")

    plt.xlabel(r"log$_{10}$(pred. stability)")
    plt.ylabel(r"log$_{10}$(meas. stability)")
    #plt.title(f"{key} added miRNA expression")

    # r^2 over the pooled points of all cell lines, written into the panel
    x_vals_flatten = np.concatenate(x_vals)
    y_vals_flatten = np.concatenate(y_vals)
    r2 = stats.pearsonr(x_vals_flatten, y_vals_flatten)[0]**2
    plt.text(-2, -0.1, r"$r^2$=" + f"{round(r2, 2)}", fontsize=7)

    plt.xlim(-2.1, 0.15)
    plt.ylim(-2.1, 0.15)

    axis_ticks = [-2, -1.5, -1, -0.5, 0]
    plt.xticks(axis_ticks)
    plt.yticks(axis_ticks)

    #plt.legend(loc="upper left", frameon=False, fontsize=7)
    for format in ["png", "svg"]:
        plt.savefig(f"{plot_folder}/{curr_folder}/7.1.3_added_expression_{key}_{exclusion_string}.{format}", dpi=600, bbox_inches='tight')

# only the r^2 table is written to disk
r2_vals_713.to_csv(f"{plot_folder}/{curr_folder}/7.1.3_add_r2_values.csv")

In [27]:
# Remove the "design" row from the r^2 table. errors="ignore" keeps this cell re-runnable:
# once the row is gone, a second execution is a no-op instead of a KeyError.
r2_vals_713 = r2_vals_713.drop(index="design", errors="ignore")

In [None]:
# Display the r^2 table of the combined 7.1.3 plots (rows: groups, columns: cell lines)
r2_vals_713

In [None]:
# Display the group names of the combined tables
comb_knock.keys()

#### use strongest site only

In [30]:
%%capture output
curr_folder = "7.1.3_add_mirna_expression_combined"
# make sure the target directory is there
if not os.path.exists(f"{plot_folder}/{curr_folder}"):
    os.makedirs(f"{plot_folder}/{curr_folder}")

# per-group, per-cell-line squared Pearson r and RMSD (strongest-site prediction vs. measurement)
r2_vals_strongest_713 = pd.DataFrame(index=comb_knock.keys(), columns=cell_lines_measured)
rmsd_vals_strongest_713 = pd.DataFrame(index=comb_knock.keys(), columns=cell_lines_measured)
for key, curr_knock_df in comb_knock.items():
    # prediction based on the strongest site only
    curr_added_df = comb_strongest[key]

    fig = plt.figure(figsize=(1.7, 1.2))

    # predicted / measured columns of all cell lines, pooled for the overall r^2
    x_vals = []
    y_vals = []

    for i, cell_line in enumerate(cell_lines_measured):
        predicted = curr_added_df[cell_line]
        measured = curr_knock_df[cell_line]

        r2 = stats.pearsonr(predicted, measured)[0]**2
        rmsd = np.sqrt(np.mean((predicted - measured)**2))
        plt.scatter(predicted, measured, color="tab:blue", s=2, alpha=0.5, rasterized=True, edgecolors="none")

        x_vals.append(predicted)
        y_vals.append(measured)

        # the identity line is added once, after the fourth cell line has been drawn
        if i == 3:
            plt.plot([-2.3, 0.15], [-2.3, 0.15], color="black", linewidth=1, ls="--")
        r2_vals_strongest_713.loc[key, cell_line] = r2
        rmsd_vals_strongest_713.loc[key, cell_line] = rmsd

    plt.xlabel(r"log$_{10}$(pred. stability)")
    plt.ylabel(r"log$_{10}$(meas. stability)")
    #plt.title(f"{key} added miRNA expression")

    # r^2 over the pooled points of all cell lines, written into the panel
    x_vals_flatten = np.concatenate(x_vals)
    y_vals_flatten = np.concatenate(y_vals)
    r2 = stats.pearsonr(x_vals_flatten, y_vals_flatten)[0]**2
    plt.text(-2, -0.1, r"$r^2$=" + f"{round(r2, 2)}", fontsize=7)

    plt.xlim(-2.1, 0.15)
    plt.ylim(-2.1, 0.15)

    axis_ticks = [-2, -1.5, -1, -0.5, 0]
    plt.xticks(axis_ticks)
    plt.yticks(axis_ticks)

    #plt.legend(loc="upper left", frameon=False, fontsize=7)
    for format in ["png", "svg"]:
        plt.savefig(f"{plot_folder}/{curr_folder}/7.1.3_strongest_expression_{key}_{exclusion_string}.{format}", dpi=600, bbox_inches='tight')

# the design group is left out of the r^2 table
r2_vals_strongest_713 = r2_vals_strongest_713.drop("design")

# 7.2 - Invert the transfer function to calculate miRNA concentrations

In [31]:
# Measured single-site table: index by miRNA1 and keep only the columns whose name matches "_3UTR"
single_knockdown = (
    single_dfs["1_mirna_full_single_high_conf"]
    .set_index("miRNA1")
    .filter(regex="_3UTR")
)
# values are treated as log10 here: convert to linear scale
single_knockdown = 10**single_knockdown
# strip "_3UTR" from the column names
single_knockdown.columns = single_knockdown.columns.str.replace("_3UTR", "")
# every value >= 1 is replaced by 0.999
single_knockdown = single_knockdown.mask(single_knockdown >= 1, 0.999)
# miRNA expression implied by the measured values via the inverse transfer function
mirna_expr_fr_knockdown = inverse_transfer(single_knockdown, *popt)

In [32]:
# Keep only designs whose miRNAs all appear in the index of mirna_expr_fr_knockdown
for key in expression_dfs.keys():
    df = expression_dfs[key].copy()
    miRNA_columns = [col for col in df.columns if col.startswith("miRNA")]
    # blank out (NaN) every miRNA entry that has no row in mirna_expr_fr_knockdown ...
    is_known = df[miRNA_columns].isin(mirna_expr_fr_knockdown.index)
    df[miRNA_columns] = df.where(is_known)[miRNA_columns]
    # ... and discard the designs that now contain a NaN
    expression_dfs[key] = df.dropna()

In [33]:
# Same combination step as in 7.1, now fed with the expression derived from the measured single-site data
added_dfs = {}
for key in expression_dfs.keys():
    added_dfs[key] = add_mirna_expression(mirna_expr_fr_knockdown, expression_dfs[key])

strongest_dfs = {}
for key in expression_dfs.keys():
    strongest_dfs[key] = max_mirna_expression(mirna_expr_fr_knockdown, expression_dfs[key])

# log10 of the transfer function on the added expression ...
knockdown_from_added = {
    key: np.log10(transfer_function(added_expression, *popt))
    for key, added_expression in added_dfs.items()
}

# ... and on the strongest-site expression
knockdown_from_strongest = {
    key: np.log10(transfer_function(strongest_expression, *popt))
    for key, strongest_expression in strongest_dfs.items()
}

In [34]:
%%capture output
curr_folder = "7.2.1_add_knockdown_individual"

# make sure the target directory is there
if not os.path.exists(f"{plot_folder}/{curr_folder}"):
    os.makedirs(f"{plot_folder}/{curr_folder}")

# squared Pearson r per sublibrary (rows) and cell line (columns)
r2_vals = pd.DataFrame(index=knockdown_dfs.keys(), columns=cell_lines_measured)

for key in knockdown_dfs:
    curr_added_df = knockdown_from_added[key]
    # restrict the measurements to the designs that have a prediction
    curr_knock_df = knockdown_dfs[key].loc[curr_added_df.index]

    # key without its first two "_"-separated parts, used in title and file name
    print_key = "_".join(key.split("_")[2:])

    fig = plt.figure(figsize=(1.8, 1.4))

    for i, cell_line in enumerate(cell_lines_measured):
        # if cell_line != "JEG3":
        #     continue
        predicted = curr_added_df[cell_line]
        measured = curr_knock_df[cell_line]

        r2 = stats.pearsonr(predicted, measured)[0]**2
        plt.scatter(predicted, measured, color=cell_line_colors[cell_line],
            s=3, marker=cell_line_symbols[cell_line], label=f"{cell_line}, " + r"$r^2$=" + f"{round(r2, 2)}")
        r2_vals.loc[key, cell_line] = r2
        # the identity line is added once, after the fourth cell line has been drawn
        if i == 3:
            plt.plot([-2.5, 0.5], [-2.5, 0.5], color="black", linewidth=1.5, ls="--")

    plt.xlabel(r"log$_{10}$(predicted stability)")
    plt.ylabel(r"log$_{10}$(measured stability)")
    plt.title(f"{print_key} inverse transfer", fontsize=7)

    plt.xlim(-2.3, 0.15)
    plt.ylim(-2.3, 0.15)

    axis_ticks = [-2, -1.5, -1, -0.5, 0]
    plt.xticks(axis_ticks)
    plt.yticks(axis_ticks)

    plt.legend(loc=[1, 0], frameon=False, fontsize=7)
    for format in ["png", "svg"]:
        plt.savefig(f"{plot_folder}/{curr_folder}/7.2.1_added_knockdown_{print_key}_{exclusion_string}.{format}", dpi=300, bbox_inches='tight')

r2_vals.to_csv(f"{plot_folder}/{curr_folder}/7.2.1_knock_r2_values.csv")

## 7.2.3 - Combine sublibraries into a single plot

In [35]:
# Collect copies of the per-sublibrary tables into three groups, selected by substrings of the key:
#   AND    -> "full_combination_probe"
#   single -> "full_repeat"
#   design -> "quality" or "mse_designs"

# measured knockdown
comb_AND_knock, comb_single_knock, comb_design_knock = [], [], []
for key in knockdown_dfs:
    if "full_combination_probe" in key:
        comb_AND_knock.append(knockdown_dfs[key].copy())
    if "full_repeat" in key:
        comb_single_knock.append(knockdown_dfs[key].copy())
    if "quality" in key or "mse_designs" in key:
        comb_design_knock.append(knockdown_dfs[key].copy())

# predictions from added / strongest expression (keys taken from expression_dfs)
comb_AND_expr, comb_single_expr, comb_design_expr = [], [], []
comb_AND_strongest, comb_single_strongest, comb_design_strongest = [], [], []
for key in expression_dfs:
    if "full_combination_probe" in key:
        comb_AND_expr.append(knockdown_from_added[key].copy())
        comb_AND_strongest.append(knockdown_from_strongest[key].copy())
    if "full_repeat" in key:
        comb_single_expr.append(knockdown_from_added[key].copy())
        comb_single_strongest.append(knockdown_from_strongest[key].copy())
    if "quality" in key or "mse_designs" in key:
        comb_design_expr.append(knockdown_from_added[key].copy())
        comb_design_strongest.append(knockdown_from_strongest[key].copy())

In [36]:
# Stack each group's list of frames row-wise; the frames share their columns.
# Each name is rebound from a list of frames to the single stacked frame.
comb_AND_knock, comb_single_knock, comb_design_knock = [
    pd.concat(frames, axis=0)
    for frames in (comb_AND_knock, comb_single_knock, comb_design_knock)
]

comb_AND_expr, comb_single_expr, comb_design_expr = [
    pd.concat(frames, axis=0)
    for frames in (comb_AND_expr, comb_single_expr, comb_design_expr)
]

comb_AND_strongest, comb_single_strongest, comb_design_strongest = [
    pd.concat(frames, axis=0)
    for frames in (comb_AND_strongest, comb_single_strongest, comb_design_strongest)
]

In [37]:
# Group name -> stacked frame, in the order single, AND, design
comb_knock = dict(single=comb_single_knock, AND=comb_AND_knock, design=comb_design_knock)
comb_expr = dict(single=comb_single_expr, AND=comb_AND_expr, design=comb_design_expr)
comb_strongest = dict(single=comb_single_strongest, AND=comb_AND_strongest, design=comb_design_strongest)

In [38]:
%%capture output
curr_folder = "7.2.3_add_mirna_expression_combined"
# make sure the target directory is there
if not os.path.exists(f"{plot_folder}/{curr_folder}"):
    os.makedirs(f"{plot_folder}/{curr_folder}")

# per-group, per-cell-line squared Pearson r and RMSD (added-expression prediction vs. measurement)
r2_vals_723 = pd.DataFrame(index=comb_knock.keys(), columns=cell_lines_measured)
rmsd_vals_723 = pd.DataFrame(index=comb_knock.keys(), columns=cell_lines_measured)
for key in comb_knock:
    curr_added_df = comb_expr[key]

    # restrict the measurements to the designs that have a prediction
    # NOTE(review): if index labels repeat across the stacked sublibraries, this lookup would
    # return extra rows — confirm the labels are unique.
    curr_knock_df = comb_knock[key].loc[curr_added_df.index]

    # the design group is drawn with smaller, more transparent points
    point_size = 1 if key == "design" else 2
    point_alpha = 0.1 if key == "design" else 0.5

    fig = plt.figure(figsize=(1.7, 1.2))

    # predicted / measured columns of all cell lines, pooled for the overall r^2
    x_vals = []
    y_vals = []
    for i, cell_line in enumerate(cell_lines_measured):
        predicted = curr_added_df[cell_line]
        measured = curr_knock_df[cell_line]

        r2 = stats.pearsonr(predicted, measured)[0]**2
        rmsd = np.sqrt(np.mean((predicted - measured)**2))
        plt.scatter(predicted, measured, color="tab:blue", s=point_size,
                    alpha=point_alpha, rasterized=True, edgecolors="none")

        x_vals.append(predicted)
        y_vals.append(measured)

        r2_vals_723.loc[key, cell_line] = r2
        rmsd_vals_723.loc[key, cell_line] = rmsd
        # the identity line is added once, after the fourth cell line has been drawn
        if i == 3:
            plt.plot([-2.1, 0.15], [-2.1, 0.15], color="black", linewidth=1, ls="--")

    plt.xlabel(r"log$_{10}$(pred. stability)")
    plt.ylabel(r"log$_{10}$(meas. stability)")
    #plt.title(f"{key} added miRNA expression")

    # r^2 over the pooled points of all cell lines, written into the panel
    x_vals_flatten = np.concatenate(x_vals)
    y_vals_flatten = np.concatenate(y_vals)
    r2 = stats.pearsonr(x_vals_flatten, y_vals_flatten)[0]**2
    plt.text(-2, -0.1, r"$r^2$=" + f"{round(r2, 2)}", fontsize=7)

    plt.xlim(-2.1, 0.15)
    plt.ylim(-2.1, 0.15)

    axis_ticks = [-2, -1.5, -1, -0.5, 0]
    plt.xticks(axis_ticks)
    plt.yticks(axis_ticks)

    #plt.legend(loc="upper left", frameon=False, fontsize=7)
    for format in ["png", "svg"]:
        plt.savefig(f"{plot_folder}/{curr_folder}/7.2.3_added_expression_{key}_{exclusion_string}.{format}", dpi=600, bbox_inches='tight')
        
r2_vals_723.to_csv(f"{plot_folder}/{curr_folder}/7.2.3_add_r2_values.csv")
r2_vals_723.drop("design", inplace=True)

#### use strongest site only

In [39]:
%%capture output
curr_folder = "7.2.3_add_mirna_expression_combined"
# create folder if it does not exist
if not os.path.exists(f"{plot_folder}/{curr_folder}"):
    os.makedirs(f"{plot_folder}/{curr_folder}")

r2_vals_strongest_723 = pd.DataFrame(index=comb_knock.keys(), columns=cell_lines_measured)
rmsd_vals_strongest_723 = pd.DataFrame(index=comb_knock.keys(), columns=cell_lines_measured)
for key in comb_knock.keys():
    curr_knock_df = comb_knock[key]
    curr_added_df = comb_strongest[key]
    
    fig = plt.figure(figsize=(1.7, 1.2))

    x_vals = []
    y_vals = []
    for i, cell_line in enumerate(cell_lines_measured):
        r2 = stats.pearsonr(curr_added_df[cell_line],
                            curr_knock_df[cell_line])[0]**2
        rmsd = np.sqrt(np.mean((curr_added_df[cell_line] - curr_knock_df[cell_line])**2))
        plt.scatter(curr_added_df[cell_line], curr_knock_df[cell_line], color="tab:blue", s=2, alpha=0.5, rasterized=True, edgecolors="none")
        
        x_vals.append(curr_added_df[cell_line])
        y_vals.append(curr_knock_df[cell_line])
                
        if i == 3:
            plt.plot([-2.3, 0.15], [-2.3, 0.15], color="black", linewidth=1, ls="--")
        r2_vals_strongest_723.loc[key, cell_line] = r2
        rmsd_vals_strongest_723.loc[key, cell_line] = rmsd
        
    plt.xlabel(r"log$_{10}$(pred. stability)")
    plt.ylabel(r"log$_{10}$(meas. stability)")
    #plt.title(f"{key} added miRNA expression")
    
    x_vals_flatten = np.concatenate(x_vals)
    y_vals_flatten = np.concatenate(y_vals)
    r2 = stats.pearsonr(x_vals_flatten, y_vals_flatten)[0]**2
    plt.text(-2, -0.1, r"$r^2$=" + f"{round(r2, 2)}", fontsize=7)
    
    plt.xlim(-2.1, 0.15)
    plt.ylim(-2.1, 0.15)
    
    plt.xticks([-2, -1.5, -1, -0.5, 0])
    plt.yticks([-2, -1.5, -1, -0.5, 0])

    #plt.legend(loc="upper left", frameon=False, fontsize=7)
    for format in ["png", "svg"]:
        plt.savefig(f"{plot_folder}/{curr_folder}/7.2.3_strongest_expression_{key}_{exclusion_string}.{format}", dpi=600, bbox_inches='tight')    

r2_vals_strongest_723.drop("design", inplace=True)

# 7.2.4 - Investigate Outliers (ddG)

In [40]:
# Load the pickled objects written to the 6_context_impact output folder.
input_folder = "../outputs/6_context_impact"


def _read_pickle(file_name):
    """Unpickle and return the object stored in `file_name` inside input_folder."""
    # NOTE(review): pickle.load can execute arbitrary code; only use on files this project produced.
    with open(os.path.join(input_folder, file_name), "rb") as handle:
        return pickle.load(handle)


inserted_design_dict = _read_pickle("6_inserted_design_dict_with_ddG.pkl")
# this rebinds reference_dict, which was built from the normalized CSV files at the top of the notebook
reference_dict = _read_pickle("6_reference_dict_with_ddG.pkl")
pairwise_target_ddG = _read_pickle("6_pairwise_target_ddG.pkl")

# reference_df_with_ddG = pd.concat(reference_dict_with_ddG.values(), axis=0)
# exclusion_dict = pd.concat(exclusion_dict.values(), axis=0)

In [None]:
# All AND and design library keys, plus the single-site keys that are not "high_conf".
ref_keys = [
    *AND_dfs.keys(),
    *design_dfs.keys(),
    *(name for name in single_dfs.keys() if "high_conf" not in name),
]
ref_keys

### This takes ~40s to run.

In [98]:
%%capture output
bins = np.arange(0,30,1)
ddG_threshold = 14

exclusion_dict_ddG = {}
exclusion_dict_ddG_ignore_expression = {}
for key in ref_keys:
    df = reference_dict[key]
    mirna_columns = [col for col in df.columns if "miRNA" in col and col != "miRNA"]
    ddG_diff_columns = [col for col in df.columns if "diff" in col]
    
    exclusion_df = pd.DataFrame(index=df.index, columns=cell_lines_measured)
    exclusion_df_ignore_expression = pd.DataFrame(index=df.index, columns=cell_lines_measured)
    for index, row in df.iterrows():
        # get all mirna sites with a high ddG
        high_ddG_sites = row[ddG_diff_columns] > ddG_threshold
        # get the expression for all sites
        mirnas = row[mirna_columns]
        # are all mirnas in the expression df?
        if not mirnas.isin(mirna_expression.index).all():
            continue
        # if any are this high, add it to the exclusion list
        if high_ddG_sites.any():
            exclusion_df_ignore_expression.loc[index, cell_lines_measured] = True
        else:
            exclusion_df_ignore_expression.loc[index, cell_lines_measured] = False
        expression = mirna_expression.loc[mirnas, :]
        for cell_line in cell_lines_measured:
            # get the mirna with the highest expression
            highest_expr_mirna = expression[cell_line].idxmax()
            indices = mirnas[mirnas == highest_expr_mirna].index
            first_index = indices[0] if len(indices) > 0 else None
            first_index = first_index[-1]
            
            # get the number of the associated miRNA site
            number = "ddG_diff_" + first_index
            
            # is this mirna in the high_ddG_sites?
            if high_ddG_sites[number]:
                exclusion_df.at[index, cell_line] = True
            else:
                exclusion_df.at[index, cell_line] = False
    exclusion_dict_ddG[key] = exclusion_df
    exclusion_dict_ddG_ignore_expression[key] = exclusion_df_ignore_expression
    
exclusion_dict_pairwise = {}
exclusion_dict_pairwise_ignore_expression = {}
for key in ref_keys:
    df = reference_dict[key]
    mirna_columns = [col for col in df.columns if "miRNA" in col and col != "miRNA"]
    pairwise_columns = [col for col in df.columns if "pairwise" in col]
    
    exclusion_df = pd.DataFrame(index=df.index, columns=cell_lines_measured)
    exclusion_df_ignore_expression = pd.DataFrame(index=df.index, columns=cell_lines_measured)
    for index, row in df.iterrows():
        # get all mirna sites with a high ddG
        high_ddG_sites = row[pairwise_columns] < -14
        # get the expression for all sites
        mirnas = row[mirna_columns]
        # are all mirnas in the expression df?
        if not mirnas.isin(mirna_expression.index).all():
            continue
        # if any are this high, add it to the exclusion list
        if high_ddG_sites.any():
            exclusion_df_ignore_expression.loc[index, cell_lines_measured] = True
        else:
            exclusion_df_ignore_expression.loc[index, cell_lines_measured] = False
        expression = mirna_expression.loc[mirnas, :]
        for cell_line in cell_lines_measured:
            # get the mirna with the highest expression
            highest_expr_mirna = expression[cell_line].idxmax()
            indices = mirnas[mirnas == highest_expr_mirna].index
            first_index = indices[0] if len(indices) > 0 else None
            first_index = first_index[-1]
            
            # get the number of the associated miRNA site
            number = "ddG_pairwise_target_" + first_index
            
            # is this mirna in the high_ddG_sites?
            if high_ddG_sites[number]:
                exclusion_df.at[index, cell_line] = True
            else:
                exclusion_df.at[index, cell_line] = False
    exclusion_dict_pairwise[key] = exclusion_df.dropna()
    exclusion_dict_pairwise_ignore_expression[key] = exclusion_df_ignore_expression.dropna()
    
# convert to a single dataframe
exclusion_df_ddG = pd.concat(exclusion_dict_ddG.values(), axis=0)
exclusion_df_pairwise = pd.concat(exclusion_dict_pairwise.values(), axis=0)
exclude_df_ddG_ignore_expression = pd.concat(exclusion_dict_ddG_ignore_expression.values(), axis=0)
exclude_df_pairwise_ignore_expression = pd.concat(exclusion_dict_pairwise_ignore_expression.values(), axis=0)

In [99]:
# Expression-aware and expression-independent exclusion tables, keyed by criterion.
exclusion_dfs = dict(ddG=exclusion_df_ddG, pairwise=exclusion_df_pairwise)
exclusion_dfs_ignore_expression = dict(
    ddG=exclude_df_ddG_ignore_expression,
    pairwise=exclude_df_pairwise_ignore_expression,
)

In [100]:
# Reduce the ddG table to one flag per design (True if flagged in any cell line),
# keep only the flagged designs, and pickle the resulting boolean Series.
_flagged_in_any_cell_line = exclusion_dfs["ddG"].copy().any(axis=1)
exclusion_dfs_by_design = _flagged_in_any_cell_line[_flagged_in_any_cell_line]

_by_design_path = os.path.join(output_folder, "ddG_exclusion_dfs_by_design.pkl")
with open(_by_design_path, "wb") as f:
    pickle.dump(exclusion_dfs_by_design, f)

In [101]:
def _stack_selected(frames, keep):
    """Row-wise concatenation of copies of frames[name] for each expression_dfs key accepted by `keep`."""
    return pd.concat([frames[name].copy() for name in expression_dfs if keep(name)], axis=0)


def _is_design_key(name):
    """Quality / mse design libraries."""
    return "quality" in name or "mse_designs" in name


def _is_repeat_or_probe_key(name):
    """Combination-probe and repeat libraries."""
    return "full_combination_probe" in name or "full_repeat" in name


# both sources are selected with the keys of expression_dfs
comb_designs_knock = _stack_selected(knockdown_dfs, _is_design_key)
comb_designs_expr = _stack_selected(knockdown_from_added, _is_design_key)

comb_repeats_and_probe_knock = _stack_selected(knockdown_dfs, _is_repeat_or_probe_key)
comb_repeats_and_probe_expr = _stack_selected(knockdown_from_added, _is_repeat_or_probe_key)

In [102]:
# NOTE: this rebinds comb_knock / comb_expr; from here on they hold the two groups
# "designs" and "repeats_and_probe".
comb_knock = dict(designs=comb_designs_knock, repeats_and_probe=comb_repeats_and_probe_knock)
comb_expr = dict(designs=comb_designs_expr, repeats_and_probe=comb_repeats_and_probe_expr)

In [103]:
# Containers for the measured-minus-predicted differences of non-flagged / flagged points.
diff_low_ddG, diff_high_ddG = {}, {}

In [104]:
%%capture output
curr_folder = "7.2.4b_outliers_ddG"

# create folder if it does not exist
if not os.path.exists(f"{plot_folder}/{curr_folder}"):
    os.makedirs(f"{plot_folder}/{curr_folder}")

for exclusion_key in exclusion_dfs.keys():
    exclusion_df = exclusion_dfs[exclusion_key]
    for key in comb_knock.keys():
        sc_plots = []
        fig = plt.figure(figsize=(2.2, 1.6))
        diff_low_ddG[key+"_"+exclusion_key] = {}
        diff_high_ddG[key+"_"+exclusion_key] = {}
        
        all_x_vals_before = []
        all_y_vals_before = []
        all_x_vals_after = []
        all_y_vals_after = []
        
        for i, cell_line in enumerate(cell_lines_measured):
            current_index = comb_expr[key][cell_line].index
            
            # if key == "repeats_and_probe" and exclusion_key == "pairwise":
            #     current_index = [index for index in current_index if "repeat" in index]
            #     # make it an index again
            #     current_index = pd.Index(current_index)
                
            # filter the index to those in the exclusion dict
            save_index = current_index
            current_index = current_index.intersection(exclusion_df[cell_line].index)
            
            all_x_vals_before.append(comb_expr[key].loc[current_index, cell_line])
            all_y_vals_before.append(comb_knock[key].loc[current_index, cell_line])
            high_ddG_index = exclusion_df.loc[current_index, cell_line]
            
            # this is a boolean mask
            high_ddG_index = high_ddG_index[high_ddG_index].index
            current_index_low = [index for index in current_index if index not in high_ddG_index]
            current_index_high = [index for index in current_index if index in high_ddG_index]
            plt.scatter(comb_expr[key].loc[current_index_low, cell_line], comb_knock[key].loc[current_index_low, cell_line],
                        color="tab:blue", s=1, alpha=0.3, edgecolors="none", rasterized=True, zorder=1)
            if "repeats_and_probe" in key:
                curr_s = 3
            else:
                curr_s = 2
            sc = plt.scatter(comb_expr[key].loc[current_index_high, cell_line], comb_knock[key].loc[current_index_high, cell_line],
                        color="tab:red", s=curr_s, alpha=1, edgecolors="none", rasterized=True, label=r"$\Delta\Delta$G > "+f"{ddG_threshold}" if i == 0 else None,
                        zorder=2)
            if i==0:
                sc_plots.append(sc)
            
            all_x_vals_after.append(comb_expr[key].loc[current_index_low, cell_line])
            all_y_vals_after.append(comb_knock[key].loc[current_index_low, cell_line])
            
            diff_low_ddG[key+"_"+exclusion_key][cell_line] = 10**comb_knock[key].loc[current_index_low, cell_line] - \
                    10**comb_expr[key].loc[current_index_low, cell_line]
            diff_high_ddG[key+"_"+exclusion_key][cell_line] = 10**comb_knock[key].loc[current_index_high, cell_line] - \
                    10**comb_expr[key].loc[current_index_high, cell_line]
        
            if i == 3:
                plt.plot([-2.1, 0.1], [-2.1, 0.1], color="black", linewidth=1.5, ls="--")
       
        # flatten
        all_x_vals_before = np.concatenate(all_x_vals_before)
        all_y_vals_before = np.concatenate(all_y_vals_before)
        all_x_vals_after = np.concatenate(all_x_vals_after)
        all_y_vals_after = np.concatenate(all_y_vals_after)
        
        r2_before = stats.pearsonr(all_x_vals_before, all_y_vals_before)[0]**2
        r2_after = stats.pearsonr(all_x_vals_after, all_y_vals_after)[0]**2
        
        # plt.title(r"$r^2$=" + f"{round(r2_before, 2)} -> {round(r2_after, 2)}")
       
        plt.xlabel(r"log$_{10}$(predicted stability)")
        plt.ylabel(r"log$_{10}$(measured stability)")

        plt.xlim(-2, 0.15)
        plt.ylim(-2, 0.15)

        plt.xticks([-2, -1.5, -1, -0.5, 0])
        plt.yticks([-2, -1.5, -1, -0.5, 0])

        plt.legend(loc="lower left", frameon=False, fontsize=7, handler_map={sc: HandlerSize(12) for sc in sc_plots})
        for format in ["png", "svg"]:
            plt.savefig(f"{plot_folder}/{curr_folder}/7.2.4b_outliers_{exclusion_key}_{key}_{exclusion_string}.{format}", dpi=300, bbox_inches='tight')

In [105]:
# Collapse the per-cell-line dicts into one Series per key.
# The isinstance guard makes the cell idempotent: entries that were already
# concatenated by an earlier run are left untouched instead of raising.
for key in diff_high_ddG.keys():
    if isinstance(diff_high_ddG[key], dict):
        diff_high_ddG[key] = pd.concat(diff_high_ddG[key].values())
    if isinstance(diff_low_ddG[key], dict):
        diff_low_ddG[key] = pd.concat(diff_low_ddG[key].values())

## analyze the difference

In [None]:
for key in diff_high_ddG.keys():
    diff_low = diff_low_ddG[key].copy()
    diff_high = diff_high_ddG[key].copy()
    diff = pd.concat([diff_low, diff_high])
    # make it a dataframe
    diff = diff.to_frame()
    diff.columns = ["diff"]
    # Label rows by POSITION. The index repeats the same design once per cell line and a
    # design can be flagged in one cell line but not in another, so a label-based
    # assignment (diff.loc[labels, "type"] = ...) would relabel rows of the other group.
    diff["type"] = np.repeat(["<14", ">14"], [len(diff_low), len(diff_high)])

    # box plot of the two groups (output file names keep the historical "violin" tag)
    fig = plt.figure(figsize=(1, 1.4))
    #sns.violinplot(data=diff, x="type", y="diff", palette=["tab:blue", "tab:red"], cut=0, linewidth=1, inner="quart")
    # explicit order pins box 0 / box 1 to the positional tick labels and palette colours below
    sns.boxplot(data=diff, x="type", y="diff", order=["<14", ">14"], palette=["tab:blue", "tab:red"], linewidth=1, width=0.7, linecolor="black",
                showfliers=True, flierprops=dict(marker='o', markersize=2, markerfacecolor='black', markeredgecolor='none'))
    if "pairwise" in key:
        plt.xticks([0,1], [f">-{ddG_threshold}", f"<-{ddG_threshold}"])
    else:
        plt.xticks([0,1], [f"<{ddG_threshold}", f">{ddG_threshold}"])
    # raw string: "\D" is not a valid escape sequence in a normal string literal
    plt.xlabel(r"$\Delta\Delta$G (kcal/mole)")
    plt.ylabel("stability\nmeasured - predicted")
    plt.ylim(-1, 1)

    # one-sided Mann-Whitney U test: flagged group shifted towards larger differences
    stat, p_value = mannwhitneyu(diff_high_ddG[key], diff_low_ddG[key], alternative="greater")

    # annotate the plot with the p-value, centred between the two boxes
    x1, x2 = 0, 1
    y, col = 0.9, 'k'  # adjust these values as necessary
    h = 0.2
    plt.text((x1+x2)*0.5, y+h, return_pvalue_text(p_value), ha='center', va='bottom', color=col)

    # `fmt` instead of `format`, so the builtin format() is not shadowed
    for fmt in ['png', 'svg']:
        plt.savefig(f"{plot_folder}/{curr_folder}/7.2.4b_outliers_violin_{key}_{ddG_threshold}_{exclusion_string}.{fmt}", dpi=300, bbox_inches='tight')

## 7.2.5 - Excessively stable and unstable sequences

In [None]:
# Per group: designs with unexpectedly high / low measured stability, plus the three
# most extreme designs of each kind. The dicts are flattened into plain lists at the end.
high_stability, top3_high_stability = {}, {}
low_stability, top3_low_stability = {}, {}

for key in comb_knock.keys():
    # ---------------- high stability ----------------
    # back-transform the log10 values to the linear scale
    df = 10**comb_knock[key].copy()

    # flagged: linear value above 1.5 in at least one cell line
    max_over_cell_lines = df[cell_lines_measured].max(axis=1)
    mask = max_over_cell_lines > 1.5
    # how many are there?
    print(key, mask.sum())
    high_stability[key] = df[mask].index

    # top 3 by the mean over cell lines
    mean_over_cell_lines = df[cell_lines_measured].mean(axis=1)
    top3 = mean_over_cell_lines.nlargest(3)
    top3_high_stability[key] = df[mean_over_cell_lines.isin(top3)].index

    # ---------------- low stability -----------------
    # back to log10; compare against the frames in comb_expr
    df = np.log10(df)
    df_predicted = comb_expr[key].copy()
    diff = df_predicted - df

    # flagged: predicted exceeds measured by more than 0.8 (log10) in at least one cell line
    max_gap = diff.max(axis=1)
    mask = max_gap > 0.8
    # how many are there?
    print(key, mask.sum())
    low_stability[key] = df[mask].index

    # top 3 by the mean gap over cell lines
    mean_gap = diff.mean(axis=1)
    top3 = mean_gap.nlargest(3)
    top3_low_stability[key] = df[mean_gap.isin(top3)].index

# flatten the per-group index objects into one list each (group order preserved)
high_stability = [design for group in high_stability.values() for design in list(group)]
top3_high_stability = [design for group in top3_high_stability.values() for design in list(group)]

low_stability = [design for group in low_stability.values() for design in list(group)]
top3_low_stability = [design for group in top3_low_stability.values() for design in list(group)]

#### Top 3

In [109]:
%%capture output
curr_folder = "7.2.5_strange_stability"

# create folder if it does not exist
if not os.path.exists(f"{plot_folder}/{curr_folder}"):
    os.makedirs(f"{plot_folder}/{curr_folder}")

color_palette = sns.color_palette("tab10", n_colors=7)

diff_normal = {}
diff_high_stability = {}
diff_low_stability = {}

for key in comb_knock.keys():
    sc_plots = []
    fig = plt.figure(figsize=(2.2, 1.6))

    for i, cell_line in enumerate(cell_lines_measured):
        color_index = 0
        current_index = comb_expr[key][cell_line].index
            
        # filter the index
        high_stability_index = list(current_index.intersection(top3_high_stability))
        low_stability_index = list(current_index.intersection(top3_low_stability))
        normal_index = current_index.difference(top3_high_stability + top3_low_stability)
        
        plt.scatter(comb_expr[key].loc[normal_index, cell_line], comb_knock[key].loc[normal_index, cell_line],
                    color=color_palette[color_index], s=1, alpha=0.3, edgecolors="none", rasterized=True, zorder=1)
        color_index += 1
        
        if "repeats_and_probe" in key:
            curr_s = 5
        else:
            curr_s = 5
            
        for index_number, index in enumerate(high_stability_index+low_stability_index):
            sc = plt.scatter(comb_expr[key].loc[index, cell_line], comb_knock[key].loc[index, cell_line],
                        color=color_palette[color_index], s=curr_s, alpha=1, edgecolors="none", rasterized=True,
                        label=f"design {index_number+1}" if i == 0 else None,
                        zorder=2)
            color_index += 1
            if i==0:
                sc_plots.append(sc)

        if i == 3:
            plt.plot([-2.1, 0.1], [-2.1, 0.1], color="black", linewidth=1.5, ls="--")
    
    # flatten
    plt.xlabel(r"log$_{10}$(predicted stability)")
    plt.ylabel(r"log$_{10}$(measured stability)")

    plt.xlim(-2, 0.05)
    plt.ylim(-2.2, 1)

    plt.xticks([-2, -1.5, -1, -0.5, 0])
    plt.yticks([-2, -1.5, -1, -0.5, 0, 0.5, 1])

    plt.legend(loc="upper left", frameon=False, fontsize=7, handler_map={sc: HandlerSize(12) for sc in sc_plots})
    for format in ["png", "svg"]:
        plt.savefig(f"{plot_folder}/{curr_folder}/7.2.5_high_and_low_top3_{key}_{exclusion_string}.{format}", dpi=300, bbox_inches='tight')

#### All highly stable

In [110]:
%%capture output
curr_folder = "7.2.5_strange_stability"

# create folder if it does not exist
if not os.path.exists(f"{plot_folder}/{curr_folder}"):
    os.makedirs(f"{plot_folder}/{curr_folder}")

diff_normal = {}
diff_high_stability = {}
diff_low_stability = {}

for key in comb_knock.keys():
    sc_plots = []
    fig = plt.figure(figsize=(2.2, 1.6))
    diff_normal[key] = {}
    diff_high_stability[key] = {}
    diff_low_stability[key] = {}
    
    all_x_vals_before = []
    all_y_vals_before = []
    all_x_vals_after = []
    all_y_vals_after = []
    
    for i, cell_line in enumerate(cell_lines_measured):
        current_index = comb_expr[key][cell_line].index
        all_x_vals_before.append(comb_expr[key].loc[current_index, cell_line])
        all_y_vals_before.append(comb_knock[key].loc[current_index, cell_line])
            
        # filter the index
        high_stability_index = current_index.intersection(high_stability)
        low_stability_index = current_index.intersection(low_stability)
        normal_index = current_index.difference(high_stability + low_stability)
        
        plt.scatter(comb_expr[key].loc[normal_index, cell_line], comb_knock[key].loc[normal_index, cell_line],
                    color="tab:blue", s=1, alpha=0.3, edgecolors="none", rasterized=True, zorder=1)
        if "repeats_and_probe" in key:
            curr_s = 3
        else:
            curr_s = 3
        sc1 = plt.scatter(comb_expr[key].loc[high_stability_index, cell_line], comb_knock[key].loc[high_stability_index, cell_line],
                    color="tab:red", s=curr_s, alpha=1, edgecolors="none", rasterized=True, label=r"High stability" if i == 0 else None,
                    zorder=2)
        sc2 = plt.scatter(comb_expr[key].loc[low_stability_index, cell_line], comb_knock[key].loc[low_stability_index, cell_line],
                    color="tab:orange", s=curr_s, alpha=1, edgecolors="none", rasterized=True, label=r"Low stability" if i == 0 else None,
                    zorder=2)
        if i==0:
            sc_plots.append(sc1)
            sc_plots.append(sc2)
        
        all_x_vals_after.append(comb_expr[key].loc[normal_index, cell_line])
        all_y_vals_after.append(comb_knock[key].loc[normal_index, cell_line])
        
        diff_normal[key][cell_line] = 10**comb_knock[key].loc[normal_index, cell_line] - \
                10**comb_expr[key].loc[normal_index, cell_line]
        diff_high_stability[key][cell_line] = 10**comb_knock[key].loc[high_stability_index, cell_line] - \
                10**comb_expr[key].loc[high_stability_index, cell_line]
        diff_low_stability[key][cell_line] = 10**comb_knock[key].loc[low_stability_index, cell_line] - \
                10**comb_expr[key].loc[low_stability_index, cell_line]
    
        if i == 3:
            plt.plot([-2.1, 0.1], [-2.1, 0.1], color="black", linewidth=1.5, ls="--")
    
    # flatten
    all_x_vals_before = np.concatenate(all_x_vals_before)
    all_y_vals_before = np.concatenate(all_y_vals_before)
    all_x_vals_after = np.concatenate(all_x_vals_after)
    all_y_vals_after = np.concatenate(all_y_vals_after)
    
    r2_before = stats.pearsonr(all_x_vals_before, all_y_vals_before)[0]**2
    r2_after = stats.pearsonr(all_x_vals_after, all_y_vals_after)[0]**2
    
    #plt.title(r"$r^2$=" + f"{round(r2_before, 2)} -> {round(r2_after, 2)}")
    
    plt.xlabel(r"log$_{10}$(predicted stability)")
    plt.ylabel(r"log$_{10}$(measured stability)")

    plt.xlim(-2, 0.05)
    plt.ylim(-2.2, 1)

    plt.xticks([-2, -1.5, -1, -0.5, 0])
    plt.yticks([-2, -1.5, -1, -0.5, 0, 0.5, 1])

    plt.legend(loc="upper left", frameon=False, fontsize=7, handler_map={sc: HandlerSize(12) for sc in sc_plots})
    for format in ["png", "svg"]:
        plt.savefig(f"{plot_folder}/{curr_folder}/7.2.5_high_and_low_all_{key}_{exclusion_string}.{format}", dpi=300, bbox_inches='tight')

In [111]:
# Collapse the per-cell-line dicts into one Series per group.
# The isinstance guard makes the cell idempotent: entries that were already
# concatenated by an earlier run are left untouched instead of raising.
for key in diff_high_stability.keys():
    if isinstance(diff_high_stability[key], dict):
        diff_high_stability[key] = pd.concat(diff_high_stability[key].values())
    if isinstance(diff_low_stability[key], dict):
        diff_low_stability[key] = pd.concat(diff_low_stability[key].values())
    if isinstance(diff_normal[key], dict):
        diff_normal[key] = pd.concat(diff_normal[key].values())

In [None]:
for key in diff_high_stability.keys():
    diff_ref = diff_normal[key].copy()
    diff_high = diff_high_stability[key].copy()
    diff_low = diff_low_stability[key].copy()
    diff = pd.concat([diff_ref, diff_high, diff_low])
    # make it a dataframe
    diff = diff.to_frame()
    diff.columns = ["diff"]
    # Label rows by POSITION. The index repeats each design once per cell line, and a
    # design may be listed as both high and low; a label-based assignment would then
    # overwrite the category of rows that belong to another group.
    diff["type"] = np.repeat(["normal", "high", "low"], [len(diff_ref), len(diff_high), len(diff_low)])

    # box plot of the three categories (output file names keep the historical "violin" tag)
    fig = plt.figure(figsize=(1, 1.4))
    #sns.violinplot(data=diff, x="type", y="diff", palette=["tab:blue", "tab:red", "tab:orange"], cut=0, linewidth=1, inner="quart")
    # explicit order keeps the category -> colour mapping fixed (normal blue, high red, low orange)
    sns.boxplot(data=diff, x="type", y="diff", order=["normal", "high", "low"], palette=["tab:blue", "tab:red", "tab:orange"], width=0.7, linewidth=1, linecolor="black",
                showfliers=True, flierprops=dict(marker='o', markersize=2, markerfacecolor='black', markeredgecolor='none'))

    plt.xlabel("overall stability category")
    plt.ylabel("stability\nmeasured - predicted")
    plt.ylim(-1, 2)

    # one-sided Mann-Whitney U tests of the normal group against each outlier group
    stat, p_value_stable = mannwhitneyu(diff_ref, diff_high, alternative="less")
    print(p_value_stable)
    stat, p_value_unstable = mannwhitneyu(diff_ref, diff_low, alternative="greater")
    print(p_value_unstable)

    # `fmt` instead of `format`, so the builtin format() is not shadowed
    for fmt in ['png', 'svg']:
        plt.savefig(f"{plot_folder}/{curr_folder}/7.2.5_stability_violin_{key}_{exclusion_string}.{fmt}", dpi=300, bbox_inches='tight')

## 7.3 - Investigate Repeat Data

In [42]:
# Sub-folder (below plot_folder) for the repeat-data figures; created on first use.
curr_folder = "7.3_repeat_outliers"
_repeat_plot_dir = f"{plot_folder}/{curr_folder}"
if not os.path.exists(_repeat_plot_dir):
    os.makedirs(_repeat_plot_dir)

In [None]:
# take a closer look at repeat data
repeats_by_cell_line_full = {}
# miRNA name of every design in the x2 repeat library
full_indices = single_dfs["6_miRNA_full_repeat_x2"]["miRNA1"]

# print the length
print(len(full_indices))

# filter these for presence in the expression data
full_indices = full_indices[full_indices.isin(mirna_expression.index)]

# print the length
print(len(full_indices))

# Everything that depends only on the library (not on the cell line) is prepared once
# here instead of being recomputed for every cell line.
prepared_libraries = []
for key in single_dfs.keys():
    df_info = single_dfs[key]
    # re-index the knockdown values of this library by miRNA name
    df_knockdown = knockdown_dfs[key].copy()
    df_knockdown["miRNA"] = df_info.loc[df_knockdown.index, "miRNA1"]
    df_knockdown = df_knockdown.set_index("miRNA")

    index = df_knockdown.index.intersection(full_indices)
    # the single-site library fills column "x1"; repeat libraries end in "_x<n>"
    if key == "1_mirna_full_single_high_conf":
        repeat_number = "x1"
    else:
        repeat_number = key.split("_")[-1]

    # diagonal entries of pairwise_target_ddG, one per miRNA
    ddG_values = [pairwise_target_ddG.loc[miRNA, miRNA] for miRNA in index]
    prepared_libraries.append((df_knockdown, index, repeat_number, ddG_values))

for cell_line in cell_lines_measured:
    # one row per miRNA, one column per repeat count; "ddG" is appended on first assignment
    repeats_by_cell_line_full[cell_line] = pd.DataFrame(columns=["x1", "x2", "x3", "x4", "x5", "x6"], index=full_indices)
    for df_knockdown, index, repeat_number, ddG_values in prepared_libraries:
        repeats_by_cell_line_full[cell_line].loc[index, repeat_number] = df_knockdown[cell_line]

        # add ddG
        repeats_by_cell_line_full[cell_line].loc[index, "ddG"] = ddG_values

In [None]:
repeat_numbers = ["x1", "x2", "x3", "x4", "x5", "x6"]
# six inferno colours, one per repeat count (x1 -> colors[0], ..., x6 -> colors[5])
colors = sns.color_palette("inferno", n_colors=6)

x_range_log = np.arange(1, 6, 0.1)
x_range_lin = 10**x_range_log

# All cell lines and repeat counts in one panel: stability against miRNA expression.
plt.figure(figsize=(2.4, 1.8))
sc_plots = []
for index, cell_line in enumerate(cell_lines_measured):
    first_cell_line = index == 0
    df = repeats_by_cell_line_full[cell_line]
    mirna_index = df.index
    x_vals = mirna_expression.loc[mirna_index, cell_line]

    for repeat_number in repeat_numbers:
        repeat_color = colors[int(repeat_number[1]) - 1]
        # legend entries come from the first cell line only
        sc = plt.scatter(x_vals, df[repeat_number], s=2, color=repeat_color, edgecolor="none",
                        label=repeat_number if first_cell_line else None, rasterized=True)
        if first_cell_line:
            sc_plots.append(sc)
            # plt.plot(x_range_log, np.log10(transfer_function(int(repeat_number[1:])*x_range_lin, *popt)), color="black", linewidth=1, ls="--")

plt.xlim(0, 5.7)
plt.ylim(-2, 0.25)
plt.xlabel(r"log$_{10}$(miRNA expression)")
plt.ylabel(r"log$_{10}$(stability)")
plt.legend(loc="lower left", frameon=False, fontsize=7, ncols=2, handler_map={handle: HandlerSize(12) for handle in sc_plots})
plt.tight_layout()
for extension in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/{curr_folder}/7.3_repeats_single_plot_{exclusion_string}.{extension}", dpi=300)



In [None]:
repeat_numbers = ["x1", "x2", "x3", "x4", "x5", "x6"]
# six inferno colours, one per repeat count (x1 -> colors[0], ..., x6 -> colors[5])
colors = sns.color_palette("inferno", n_colors=6)

x_range_log = np.arange(0, 5.7, 0.1)
x_range_lin = 10**x_range_log

# One panel per repeat count on a 3x2 grid.
fig, axs = plt.subplots(3, 2, figsize=(3.5, 3.5))
axs = axs.flatten()

for ax, repeat_number in zip(axs, repeat_numbers):
    n_repeats = int(repeat_number[1:])
    repeat_color = colors[int(repeat_number[1]) - 1]

    sc_plots = []
    # transfer_function output for n_repeats * expression (x) vs. measured value (y), pooled over cell lines
    x_flat = []
    y_flat = []
    for index, cell_line in enumerate(cell_lines_measured):
        first_cell_line = index == 0
        # only miRNAs with a value in every column of the table
        df = repeats_by_cell_line_full[cell_line].dropna()
        mirna_index = df.index
        x_vals = mirna_expression.loc[mirna_index, cell_line].astype(float)

        sc = ax.scatter(x_vals, df[repeat_number], s=2, color=repeat_color, edgecolor="none",
                        label=repeat_number if first_cell_line else None, rasterized=True)

        x_flat.append(np.log10(transfer_function(n_repeats * 10**x_vals.values, *popt)))
        y_flat.append(df[repeat_number])
        if first_cell_line:
            sc_plots.append(sc)
            # black: curve with expression multiplied by the repeat count; grey: curve without the multiplier
            ax.plot(x_range_log, np.log10(transfer_function(n_repeats * x_range_lin, *popt)), color="black", linewidth=1, ls="--")
            ax.plot(x_range_log,  np.log10(transfer_function(x_range_lin, *popt)), color="darkgrey", linewidth=1, ls="--")

    x_flat = np.concatenate(x_flat)
    y_flat = np.concatenate(y_flat)
    r2 = stats.pearsonr(x_flat, y_flat)[0]**2

    ax.set_xlim(0, 5.7)
    ax.set_ylim(-2, 0.25)
    ax.set_yticks([-2, -1, 0])
    ax.text(x=0.4, y=-1.85, s=r"$r^2$=" + f"{round(r2, 2)}", fontsize=7)
    #ax.text(x=0.4, y=-1.2, s=f"{repeat_number[1:]} repeats", fontsize=7)

    # x labels only on the bottom row, y labels only on the left column
    on_bottom_row = repeat_number in ["x5", "x6"]
    on_left_column = repeat_number in ["x1", "x3", "x5"]
    if on_bottom_row:
        ax.set_xlabel(r"log$_{10}$(expression)")
        ax.set_xticks([0, 1, 2, 3, 4, 5])
    else:
        ax.set_xticklabels([])
    if on_left_column:
        ax.set_ylabel(r"log$_{10}$(stability)")
    else:
        ax.set_yticklabels([])
    #ax.legend(loc="lower left", frameon=False, fontsize=7, ncols=2, handler_map={sc: HandlerSize(12) for sc in sc_plots})
    ax.set_title(f"{repeat_number[1:]} repeats", fontsize=7)

plt.tight_layout()
for extension in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/{curr_folder}/7.3_repeats_separate_plots_{exclusion_string}.{extension}", dpi=300)
plt.show()



In [None]:
# Inspection cell: echo the plot subfolder name currently held in `curr_folder`.
curr_folder

In [47]:
# Persist the per-cell-line repeat tables as a pickle in the output folder.
repeats_pickle_path = os.path.join(output_folder, "7_repeats_by_cell_line_full.pkl")
with open(repeats_pickle_path, "wb") as pickle_handle:
    pickle.dump(repeats_by_cell_line_full, pickle_handle)

In [48]:
# Tag every per-cell-line table with its cell line.
# NOTE: this writes the new column into the frames stored in `repeats_by_cell_line_full`.
for cell_line, df in repeats_by_cell_line_full.items():
    df["cell_line"] = cell_line

# Stack the tagged tables row-wise (axis=0) and keep only complete rows
combined_df = pd.concat(repeats_by_cell_line_full.values(), axis=0).dropna()
# Keep the miRNA name as a column and index by (miRNA, cell_line);
# drop=False leaves both as regular columns as well.
combined_df["miRNA"] = combined_df.index.copy()
combined_df = combined_df.set_index(["miRNA", "cell_line"], drop=False)

In [49]:
# Flag each step where one additional repeat raises the value by more than 0.1
# (columns "2>1", "3>2", "4>3", "5>4", "6>5").
for fewer in range(1, 6):
    more = fewer + 1
    combined_df[f"{more}>{fewer}"] = combined_df[f"x{more}"] > combined_df[f"x{fewer}"] + 0.1

# get the patterns: increases at the even / odd steps, restricted to rows whose x6 value is below 0.3
x6_below_threshold = combined_df["x6"] < 0.3
combined_df["even_pattern"] = combined_df["2>1"] & combined_df["4>3"] & x6_below_threshold
combined_df["odd_pattern"] = combined_df["3>2"] & combined_df["5>4"] & x6_below_threshold
# (disabled) combined_df["outlier"] = combined_df["2>1"] | combined_df["3>2"] | combined_df["4>3"] | combined_df["5>4"] | combined_df["6>5"]
combined_df["highly_stable"] = combined_df["x6"] > 0.3

# get the mean over all repeat columns
combined_df["mean"] = combined_df[repeat_numbers].mean(axis=1)

# add high ddG: True where ddG is below -10
mask = combined_df["ddG"] < -10
combined_df["high_ddG"] = mask

In [None]:
# Row subsets for each flag; these names are reused by the plotting cells below
even_pattern = combined_df[combined_df["even_pattern"]]
odd_pattern = combined_df[combined_df["odd_pattern"]]
highly_stable = combined_df[combined_df["highly_stable"]]

# For each subset print a header, the unique miRNA names, and how many there are
pattern_reports = [
    ("Number of miRNAs in the even pattern:", even_pattern),
    ("Number of miRNAs in the odd pattern:", odd_pattern),
    ("Number of highly stable miRNAs:", highly_stable),
]
for header, subset in pattern_reports:
    unique_mirnas = subset["miRNA"].unique()
    print(header)
    print(unique_mirnas)
    print(len(unique_mirnas))

In [None]:
# Overlay the x1..x6 trajectory of every (miRNA, cell line) row in a single panel.
plt.figure(figsize=(2.4, 1.8))
for _, row in combined_df.iterrows():
    plt.plot(repeat_numbers, row[repeat_numbers], color="tab:blue", alpha=0.1)
plt.ylim(-2.2, 0.25)
# The x axis holds the repeat labels (x1..x6), not expression values, so it is
# labelled like the other trajectory plots in this section.
plt.xlabel(r"repeat number")
plt.ylabel(r"log$_{10}$(stability)")
plt.tight_layout()
for fmt in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/{curr_folder}/7.3_expression_vs_repeats_{exclusion_string}.{fmt}", dpi=300)

In [None]:
# Trajectories of all rows, with rows flagged as even or odd pattern highlighted in red.
plt.figure(figsize=(2.4, 1.8))
odd_or_even = pd.concat([even_pattern, odd_pattern], axis=0)

# background: every row that is in neither pattern
is_flagged = combined_df.index.isin(odd_or_even.index)
combined_df_other = combined_df[~is_flagged]

# draw the faint background first so the highlighted rows end up on top
layers = [
    (combined_df_other, "tab:blue", 0.05),
    (odd_or_even, "red", 0.3),
]
for frame, line_color, line_alpha in layers:
    for _, row in frame.iterrows():
        plt.plot(repeat_numbers, row[repeat_numbers], color=line_color, alpha=line_alpha)

plt.ylim(-2.2, 1)
plt.xlabel(r"repeat number")
plt.ylabel(r"log$_{10}$(stability)")
plt.tight_layout()
for fmt in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/{curr_folder}/7.3_expression_vs_repeats_odd_and_even_{exclusion_string}.{fmt}", dpi=300)

In [None]:
# Trajectories of all rows, with the rows flagged as highly stable highlighted in red.
plt.figure(figsize=(2.4, 1.8))

# background: every row that is not flagged as highly stable
is_highly_stable = combined_df.index.isin(highly_stable.index)
combined_df_other = combined_df[~is_highly_stable]

# draw the faint background first so the highlighted rows end up on top
layers = [
    (combined_df_other, "tab:blue", 0.05),
    (highly_stable, "red", 0.3),
]
for frame, line_color, line_alpha in layers:
    for _, row in frame.iterrows():
        plt.plot(repeat_numbers, row[repeat_numbers], color=line_color, alpha=line_alpha)

plt.ylim(-2.2, 1)
plt.xlabel(r"repeat number")
plt.ylabel(r"log$_{10}$(stability)")
plt.tight_layout()
for fmt in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/{curr_folder}/7.3_expression_vs_repeats_highly_stable_{exclusion_string}.{fmt}", dpi=300)

In [None]:
%%capture output
for mirna in odd_pattern["miRNA"].unique():
    mirna_rows = combined_df[combined_df["miRNA"] == mirna]
    plt.figure(figsize=(1.8, 1))
    for index, row in mirna_rows.iterrows():
        plt.plot(repeat_numbers, row[repeat_numbers],
                 label=row["cell_line"], color=cell_line_colors[row["cell_line"]])
    
    plt.ylim(-2, 0.25)
    plt.xlabel("repeat number")
    plt.ylabel("stability")
    plt.title(mirna, fontsize=8)
    plt.legend(loc=[1.05, 0], frameon=False, fontsize=6)
    for format in ["png", "svg"]:
        plt.savefig(f"{plot_folder}/{curr_folder}/7.3_odd_pattern_{mirna}.{format}", dpi=300, bbox_inches='tight')
    
for mirna in even_pattern["miRNA"].unique():
    mirna_rows = combined_df[combined_df["miRNA"] == mirna]
    plt.figure(figsize=(1.8, 1))
    for index, row in mirna_rows.iterrows():
        plt.plot(repeat_numbers, row[repeat_numbers],
                 label=row["cell_line"], color=cell_line_colors[row["cell_line"]])
        
    plt.ylim(-2, 0.25)
    plt.xlabel("repeat number")
    plt.ylabel("stability")
    plt.title(mirna, fontsize=8)
    plt.legend(loc=[1.05, 0], frameon=False, fontsize=6)
    for format in ["png", "svg"]:
        plt.savefig(f"{plot_folder}/{curr_folder}/7.3_even_pattern_{mirna}.{format}", dpi=300, bbox_inches='tight')

In [None]:
# Inspection cell: show the diagonal (self vs. self) entry of `pairwise_target_ddG` for hsa-miR-590-5p.
pairwise_target_ddG.loc["hsa-miR-590-5p", "hsa-miR-590-5p"]

In [None]:
# Diagonal of the pairwise ddG table: once for every row of the table, once for the
# miRNAs that appear in `combined_df`.
pairwise_target_ddG_diagonal_all = [
    pairwise_target_ddG.iloc[pos, pos] for pos in range(len(pairwise_target_ddG))
]
pairwise_target_ddG_measured = [
    pairwise_target_ddG.loc[name, name] for name in combined_df["miRNA"].unique()
]
bins = np.arange(-20, 0, 1)

# miRNAs marked with a vertical line:
# (miRNA, leftward shift of the text, text height, text, colour)
marked_mirnas = [
    ("hsa-miR-21-5p", 1.5, 0.185, "21-5p", "black"),
    ("hsa-miR-520a-5p", 2, 0.185, "520a-5p", "black"),
    ("hsa-miR-224-5p", 2, 0.17, "224-5p", "black"),
    ("hsa-miR-186-5p", 2, 0.17, "186-5p", "black"),
    ("hsa-miR-324-5p", 3, 0.17, "324-5p", "grey"),
    ("hsa-miR-548am-3p", 3, 0.185, "548am-3p", "grey"),
]

plt.figure(figsize=(2.4, 1.8))
plt.hist(pairwise_target_ddG_diagonal_all, alpha=0.5, color="tab:blue", density=True, bins=bins, label="all targets")
plt.hist(pairwise_target_ddG_measured, alpha=0.5, color="tab:red", density=True, bins=bins, label="measured repeats")
for marked_name, text_shift, text_height, text_label, marker_color in marked_mirnas:
    marked_ddG = pairwise_target_ddG.loc[marked_name, marked_name]
    plt.axvline(marked_ddG, color=marker_color)
    plt.text(marked_ddG - text_shift, text_height, text_label, color=marker_color, fontsize=7)
plt.xlabel(r"pairwise target $\Delta\Delta$G")
plt.ylabel("frequency")
plt.legend(loc="upper left", frameon=False, fontsize=7)
for fmt in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/{curr_folder}/7.3_pairwise_target_ddG_{exclusion_string}.{fmt}", dpi=300, bbox_inches='tight')

In [None]:
# Inspection cell: show the diagonal (self vs. self) entry of `pairwise_target_ddG` for hsa-miR-186-5p.
pairwise_target_ddG.loc["hsa-miR-186-5p", "hsa-miR-186-5p"]

In [None]:
# Inspection cell: show the `mirbase` table row for hsa-miR-186-5p.
mirbase.loc["hsa-miR-186-5p"]

In [None]:
# HEK293T rows only, ordered by ascending ddG (lowest values first)
is_hek293t = combined_df["cell_line"] == "HEK293T"
combined_df_293T = combined_df[is_hek293t].sort_values(by="ddG")

In [None]:
# For the first five miRNAs of the ddG-sorted HEK293T table, draw the x1..x6
# trajectory in every cell line that has a row for that miRNA.
for mirna in combined_df_293T["miRNA"].head(5):
    mirna_rows = combined_df[combined_df["miRNA"] == mirna]
    plt.figure(figsize=(1.8, 1))
    for _, row in mirna_rows.iterrows():
        current_cell_line = row["cell_line"]
        plt.plot(repeat_numbers, row[repeat_numbers],
                 label=current_cell_line, color=cell_line_colors[current_cell_line])

    plt.ylim(-2, 0.25)
    plt.xlabel("repeat number")
    plt.ylabel("stability")
    # title carries the miRNA's diagonal entry of the pairwise ddG table
    diagonal_ddG = pairwise_target_ddG.loc[mirna, mirna]
    plt.title(f"{mirna}, "+r"$\Delta\Delta$G"+f"={diagonal_ddG:.2f}", fontsize=8)
    # no legend is drawn for these panels
    for fmt in ["png", "svg"]:
        plt.savefig(f"{plot_folder}/{curr_folder}/7.3_highest_ddG_{mirna}_{exclusion_string}.{fmt}", dpi=300, bbox_inches='tight')

# 7.4 - Plot an overview of R2 values

In [56]:
# Subfolder (below `plot_folder`) that receives the figures of section 7.4
curr_folder = "7.4_r2_overview"
# exist_ok=True creates the folder only when it is missing, without a separate
# existence check that could race with another process creating it
os.makedirs(f"{plot_folder}/{curr_folder}", exist_ok=True)

In [57]:
# Build the tables behind the overview heatmaps: transpose the 713 and 723 tables and
# join them side by side, suffixing overlapping column names with "_713" / "_723".
# NOTE: the input names are overwritten with their transposes, so running this cell a
# second time without re-running the cells that create them flips them back.
r2_vals_713, r2_vals_723 = r2_vals_713.T, r2_vals_723.T
r2_vals = r2_vals_713.join(r2_vals_723, lsuffix="_713", rsuffix="_723").astype(float)

rmsd_vals_713, rmsd_vals_723 = rmsd_vals_713.T, rmsd_vals_723.T
rmsd_vals = rmsd_vals_713.join(rmsd_vals_723, lsuffix="_713", rsuffix="_723").astype(float)

In [58]:
# Put the two "single" columns first and the two "AND" columns last
r2_column_order = ["single_713", "single_723", "AND_713", "AND_723"]
r2_vals = r2_vals.loc[:, r2_column_order]

In [None]:
# Two side-by-side heatmaps of the r2 table: the first two columns ("single_*") on the
# left, the last two ("AND_*") on the right, both on the same 0.5-0.9 colour scale.
# No plt.clf() beforehand: plt.subplots already starts a fresh figure, and clearing the
# "current" figure first would either wipe an unrelated figure or create an empty one.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(2, 1.8), gridspec_kw={'width_ratios': [1, 1], 'wspace': 0.5})

# Plot the first two columns in the first subplot
sns.heatmap(r2_vals.iloc[:, :2], cmap="viridis", annot=True, fmt=".2f", vmin=0.5, vmax=0.9, ax=ax1,
            annot_kws={"size": 6.5}, cbar=True)
ax1.set_xticklabels(["e.d.", "i.t."], rotation=0)
ax1.set_title("repeats", fontsize=7)

# Plot the second two columns in the second subplot; only this colour bar is labelled
sns.heatmap(r2_vals.iloc[:, 2:], cmap="viridis", annot=True, fmt=".2f", vmin=0.5, vmax=0.9, ax=ax2,
            annot_kws={"size": 6.5}, cbar_kws={'label': r'$r^2$'})
ax2.set_xticklabels(["e.d.", "i.t."], rotation=0)
ax2.set_title("combinations", fontsize=7)

# Save each format from the figure object itself rather than whatever figure is current
for fmt in ["png", "svg"]:
    fig.savefig(f"{plot_folder}/{curr_folder}/7.4_r2_overview_{exclusion_string}.{fmt}", dpi=300, bbox_inches='tight')

In [None]:
# squared Pearson correlation between the single_713 and single_723 columns
pearson_single = stats.pearsonr(r2_vals["single_713"], r2_vals["single_723"])[0]
r2_single = pearson_single**2
print(r2_single)

# squared Pearson correlation between the AND_713 and AND_723 columns
pearson_AND = stats.pearsonr(r2_vals["AND_713"], r2_vals["AND_723"])[0]
r2_AND = pearson_AND**2
print(r2_AND)

# mean of the two squared correlations
print((r2_single + r2_AND) / 2)

In [None]:
# Heatmap of the joined RMSD table on a fixed 0.1-0.3 colour scale.
# No plt.clf() beforehand: plt.subplots already starts a fresh figure, and clearing the
# "current" figure first would either wipe an unrelated figure or create an empty one.
fig, ax = plt.subplots(figsize=(1.6, 2))
sns.heatmap(rmsd_vals, cmap="cividis_r", annot=True, fmt=".2f", vmin=0.1, vmax=0.3, ax=ax,
            annot_kws={"size": 6.5},
            # the plotted values are RMSDs, so the colour bar is labelled accordingly
            cbar_kws={'label': 'RMSD'})

# Tick labels for the four columns; the padded second line of the 1st and 3rd labels
# is shifted right so that it sits below a pair of columns.
plt.xticks([0.5, 1.5, 2.5, 3.5],
           ["rep.\n             expression", "comb.","rep.\n              inv. transfer", "comb."],
           rotation=0)

for fmt in ["png", "svg"]:
    fig.savefig(f"{plot_folder}/{curr_folder}/7.4_rmsd_overview_{exclusion_string}.{fmt}", dpi=300, bbox_inches='tight')