In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from library2_utils.color_scheme import cell_line_colors, cell_line_symbols
import pickle

# Global matplotlib text styling: 7 pt font, Helvetica family
plt.rcParams.update({'font.size': 7, 'font.family': 'Helvetica'})

# Cell lines used in this notebook and the matching "<cell line>_3UTR" column names
cell_lines_subset = ["HEK293T", "HeLa", "SKNSH", "MCF7", "HUH7", "A549"]
cell_lines_rest = ["HaCaT", "JEG3", "Tera1", "PC3"]
cell_lines_measured = [*cell_lines_subset, *cell_lines_rest]
cell_lines_UTR = [f"{cell_line}_3UTR" for cell_line in cell_lines_measured]

# Folders for figures and derived outputs; both are created when missing
plot_folder = "../plots/14_high_stability"
output_folder = "../outputs/14_high_stability"
for folder in (plot_folder, output_folder):
    os.makedirs(folder, exist_ok=True)

# miRBase table, indexed by its first column
mirbase = pd.read_csv("../microrna_data/mirbase_extended.csv", index_col=0)

In [2]:
data_dir_input = "../measured_data/2_normalized_log10"

# Every entry (files and sub-folders) found directly inside the input folder
reference_files = os.listdir(data_dir_input)

# Load each CSV into a dictionary keyed by the file name up to its first dot;
# the first CSV column becomes the index. Non-CSV entries are skipped.
reference_dict = {
    file_name.split('.')[0]: pd.read_csv(os.path.join(data_dir_input, file_name), index_col=0)
    for file_name in reference_files
    if file_name.endswith(".csv")
}
        
used_mirna_data = pd.read_csv("../microrna_data/3_output/Alles_Keller_combined_expression_with_crosstalk.csv", index_col=0)
# Remove "hsa-miR-3613-3p" if present. The filtered frame must be bound back to
# `used_mirna_data` (binding it to any other name leaves the expression table
# unchanged); errors="ignore" makes the drop a no-op when the row is absent.
# NOTE(review): later cells look miRNAs up in `used_mirna_data` by name - confirm
# that none of the plotted miRNAs is "hsa-miR-3613-3p".
used_mirna_data = used_mirna_data.drop(index="hsa-miR-3613-3p", errors="ignore")
    
# Tables whose key contains "single" or "repeat", each one:
#   * indexed by its "miRNA1" column,
#   * reduced to the columns whose name matches the 3UTR pattern,
#   * stripped of rows that contain any NaN.
# Every step returns a new frame, so the entries of `reference_dict` stay untouched.
single_dfs = {
    key: (
        reference_dict[key]
        .set_index("miRNA1")
        .filter(regex='(3UTR)')
        .dropna()
    )
    for key in reference_dict
    if "single" in key or "repeat" in key
}

# Keys of the repeat libraries plus the single-site ("1_mirna") library
repeat_names = [
    name
    for name in reference_dict
    if "full_repeat" in name or "1_mirna" in name
]
# miRNAs listed in the "miRNA1" column of the x2 repeat library
x2_repeat_library = reference_dict["6_miRNA_full_repeat_x2"]
repeat_mirnas = x2_repeat_library["miRNA1"].values

In [3]:
# Independent copies of every table whose key contains "full_combination" or "AND"
combination_dfs = {
    name: frame.copy()
    for name, frame in reference_dict.items()
    if "full_combination" in name or "AND" in name
}
# All of those tables stacked row-wise into a single frame
combination_dfs_flat = pd.concat(list(combination_dfs.values()), axis=0)

In [5]:
# Table of high-stability designs, indexed by its first column
high_stability = pd.read_csv('../measured_data/2_normalized_log10/high_stability/high_stability.csv', index_col=0)
# Columns whose name contains "miRNA"
is_mirna_column = high_stability.columns.str.contains('miRNA')
mirna_columns = high_stability.columns[is_mirna_column]

## Full repeats

In [6]:
# Sub-folder that receives the figures of this section; created when missing
current_plot_folder = os.path.join(plot_folder, "repeats")
os.makedirs(current_plot_folder, exist_ok=True)

# Load the pickled per-cell-line repeat object written to the 7a output folder.
# NOTE(review): pickle.load can execute arbitrary code - only load files that
# were produced by a trusted run of the pipeline.
input_folder = "../outputs/7a_additive_full"
repeats_path = os.path.join(input_folder, "7_repeats_by_cell_line_full.pkl")
with open(repeats_path, "rb") as handle:
    repeats_by_cell_line = pickle.load(handle)

In [7]:
def _select_by_index(pattern):
    """Return (boolean row mask, matching rows) of `high_stability` for an index pattern."""
    mask = high_stability.index.str.contains(pattern)
    return mask, high_stability[mask]


def _unique_mirnas(frame):
    """Unique non-missing values found in the miRNA columns of `frame`."""
    return frame[mirna_columns].unstack().dropna().unique()


# rows whose index contains "full_repeat_x6" ... "full_repeat_x2"
repeat_x6, high_x6 = _select_by_index('full_repeat_x6')
unique_mirnas_x6 = _unique_mirnas(high_x6)

repeat_x5, high_x5 = _select_by_index('full_repeat_x5')
unique_mirnas_x5 = _unique_mirnas(high_x5)

repeat_x4, high_x4 = _select_by_index('full_repeat_x4')
unique_mirnas_x4 = _unique_mirnas(high_x4)

repeat_x3, high_x3 = _select_by_index('full_repeat_x3')
unique_mirnas_x3 = _unique_mirnas(high_x3)

repeat_x2, high_x2 = _select_by_index('full_repeat_x2')
unique_mirnas_x2 = _unique_mirnas(high_x2)

# rows whose index contains "1_mirna_full_single_high_conf", restricted to
# designs whose "miRNA1" is one of `repeat_mirnas`
repeat_x1, high_x1 = _select_by_index('1_mirna_full_single_high_conf')
high_x1 = high_x1[high_x1['miRNA1'].isin(repeat_mirnas)]
unique_mirnas_x1 = _unique_mirnas(high_x1)

In [None]:
# Number of unique miRNAs found among the high-stability designs, per repeat count
mirnas_per_repeat = (
    unique_mirnas_x1,
    unique_mirnas_x2,
    unique_mirnas_x3,
    unique_mirnas_x4,
    unique_mirnas_x5,
    unique_mirnas_x6,
)
total_stable_mirnas = {
    f"x{n_repeats}": len(mirnas)
    for n_repeats, mirnas in enumerate(mirnas_per_repeat, start=1)
}

# One bar per repeat count, saved as SVG in the section's plot folder
fig, ax = plt.subplots(figsize=(2, 1.4))
ax.bar(list(total_stable_mirnas.keys()), list(total_stable_mirnas.values()), color="skyblue")
ax.set_xlabel("repeats")
ax.set_ylabel("highly stable constructs")
fig.savefig(os.path.join(current_plot_folder, "total_stable_mirnas.svg"), dpi=300, bbox_inches='tight')

In [9]:
def _short_name(mirna):
    """Drop the first two dash-separated fields of a miRNA name."""
    return "-".join(mirna.split("-")[2:])


unique_mirnas_x3_plot = [_short_name(mirna) for mirna in unique_mirnas_x3]
unique_mirnas_x4_plot = [_short_name(mirna) for mirna in unique_mirnas_x4]
unique_mirnas_x5_plot = [_short_name(mirna) for mirna in unique_mirnas_x5]
unique_mirnas_x6_plot = [_short_name(mirna) for mirna in unique_mirnas_x6]

# Keep each name only in the list of the lowest repeat count where it appears:
# x4 loses what is already in x3, x5 what is in x3/x4, x6 what is in x3/x4/x5.
already_listed = set(unique_mirnas_x3_plot)
unique_mirnas_x4_plot = [name for name in unique_mirnas_x4_plot if name not in already_listed]
already_listed.update(unique_mirnas_x4_plot)
unique_mirnas_x5_plot = [name for name in unique_mirnas_x5_plot if name not in already_listed]
already_listed.update(unique_mirnas_x5_plot)
unique_mirnas_x6_plot = [name for name in unique_mirnas_x6_plot if name not in already_listed]

In [None]:
# Text-only figure: one column of miRNA names per repeat count (x3, x4, x5, x6).
# Columns are 0.5 apart in x, consecutive names 0.2 apart in y (data coordinates).
fig, ax = plt.subplots(figsize=(2, 1.4))

name_columns = (
    unique_mirnas_x3_plot,
    unique_mirnas_x4_plot,
    unique_mirnas_x5_plot,
    unique_mirnas_x6_plot,
)
for column_idx, names in enumerate(name_columns):
    for row_idx, name in enumerate(names):
        ax.text(0.5 * column_idx, 0.2 * row_idx, name, fontsize=7)

# hide the axes so that only the text remains
ax.axis('off')
fig.savefig(os.path.join(current_plot_folder, "unique_repeat_mirnas.svg"), bbox_inches='tight', dpi=300)

In [12]:
# Column label -> key of the library in `single_dfs` that supplies that column
repeat_sources = {
    "x1": "1_mirna_full_single_high_conf",
    "x2": "6_miRNA_full_repeat_x2",
    "x3": "7_miRNA_full_repeat_x3",
    "x4": "8_miRNA_full_repeat_x4",
    "x5": "9_miRNA_full_repeat_x5",
    "x6": "10_miRNA_full_repeat_x6",
}

# Per cell line: one row per x6 high-stability miRNA, one column per repeat count.
# Stored values are raised to the power of ten
# (presumably they are log10 values, judging by the input folder name - confirm).
repeat_by_cell_line_high = {}
for cell_line in cell_lines_measured:
    utr_column = cell_line + "_3UTR"
    repeat_df = pd.DataFrame(index=unique_mirnas_x6, columns=list(repeat_sources))
    for label, library_key in repeat_sources.items():
        repeat_df[label] = 10 ** single_dfs[library_key].loc[unique_mirnas_x6, utr_column]
    repeat_by_cell_line_high[cell_line] = repeat_df
    
# Per cell line: log10(n * 10**value) of the miRNA table for n = 1..6 repeats,
# one column ("x1" ... "x6") per repeat count.
repeat_counts = range(1, 7)
expression_df_by_cell_line = {}
for cell_line in cell_lines_measured:
    cell_line_values = used_mirna_data[cell_line]
    scaled_df = pd.DataFrame(
        index=used_mirna_data.index,
        columns=[f"x{n_repeats}" for n_repeats in repeat_counts],
    )
    for n_repeats in repeat_counts:
        scaled_df[f"x{n_repeats}"] = np.log10(n_repeats * 10 ** cell_line_values)
    expression_df_by_cell_line[cell_line] = scaled_df

In [17]:
# One figure per x6 high-stability miRNA: for every cell line, plot the values of
# `repeat_by_cell_line_high` against those of `expression_df_by_cell_line`
# across the repeat counts x1..x6. Each figure is saved as PNG and SVG.
for mirna in unique_mirnas_x6:
    fig, ax = plt.subplots(figsize=(2, 1.1))
    for cell_line in cell_lines_measured:
        ax.plot(
            expression_df_by_cell_line[cell_line].loc[mirna],
            repeat_by_cell_line_high[cell_line].loc[mirna],
            label=cell_line,
            color=cell_line_colors[cell_line],
        )
    ax.set_xlim(1, 5)
    ax.set_ylim(0, 4.2)
    ax.set_xlabel(r"log$_{10}$(repeats x miRNA expression)")
    ax.set_ylabel("stability")
    ax.set_title(mirna, fontsize=8)
    # The layout is fixed before the legend is added, so the legend does not
    # influence the axes size.
    # NOTE(review): the legend is anchored outside the axes and savefig is called
    # without bbox_inches='tight', so it may be cut off in the saved files -
    # confirm this is intended.
    fig.tight_layout()
    ax.legend(loc=[1.05, -0.05], fontsize=6, ncols=1)
    # `file_format` rather than `format`, so the builtin is not shadowed in later cells
    for file_format in ("png", "svg"):
        fig.savefig(f"{current_plot_folder}/{mirna}.{file_format}", dpi=300)
    plt.close(fig)

In [None]:
def _percent_shared(lower, higher):
    """Percentage of the entries of `lower` that also occur in `higher`.

    Returns NaN when `lower` is empty instead of raising ZeroDivisionError.
    """
    if len(lower) == 0:
        return float("nan")
    higher_set = set(higher)
    n_shared = sum(1 for mirna in lower if mirna in higher_set)
    return 100 * n_shared / len(lower)


# which percentage of microRNAs is in the next higher repeat?
for lower_label, lower, higher_label, higher in (
    ("x3", unique_mirnas_x3, "x4", unique_mirnas_x4),
    ("x4", unique_mirnas_x4, "x5", unique_mirnas_x5),
    ("x5", unique_mirnas_x5, "x6", unique_mirnas_x6),
):
    print(f"percentage of {lower_label} in {higher_label}: {_percent_shared(lower, higher)}")

## Target combinations

In [21]:
# High-stability designs whose index contains "full" but neither "repeat" nor "single".
# All three conditions are combined into one mask and the copy is taken last, so
# the result is an independent frame that later cells can add columns to
# without triggering pandas' SettingWithCopyWarning.
design_names = high_stability.index.str
keep_design = (
    ~design_names.contains("repeat")
    & design_names.contains("full")
    & ~design_names.contains("single")
)
high_stability_non_repeat = high_stability[keep_design].copy()

In [22]:
def _with_site_count(frame, n_sites):
    """Rows of `frame` whose index contains "x<n_sites>" or "AND<n_sites>"."""
    names = frame.index.str
    return frame[names.contains(f"x{n_sites}") | names.contains(f"AND{n_sites}")]


# high-stability combination designs, split by the number in their index label
high_stability_non_repeat_4 = _with_site_count(high_stability_non_repeat, 4)
high_stability_non_repeat_5 = _with_site_count(high_stability_non_repeat, 5)
high_stability_non_repeat_6 = _with_site_count(high_stability_non_repeat, 6)

# all combination designs, split the same way
all_4 = _with_site_count(combination_dfs_flat, 4)
all_5 = _with_site_count(combination_dfs_flat, 5)
all_6 = _with_site_count(combination_dfs_flat, 6)

# share (in percent) of high-stability designs among all designs of each group
fraction_4 = 100 * len(high_stability_non_repeat_4) / len(all_4)
fraction_5 = 100 * len(high_stability_non_repeat_5) / len(all_5)
fraction_6 = 100 * len(high_stability_non_repeat_6) / len(all_6)

In [None]:
# Bar chart of the high-stability percentage for 4, 5 and 6 miRNA sites
fractions_comb = dict(zip(("4", "5", "6"), (fraction_4, fraction_5, fraction_6)))

fig, ax = plt.subplots(figsize=(1.4, 1.4))
ax.bar(list(fractions_comb.keys()), list(fractions_comb.values()), color="skyblue")
ax.set_xlabel("miRNA sites")
ax.set_ylabel("constructs with high stability (%)")
fig.tight_layout()
fig.savefig(os.path.join(current_plot_folder, "fractions_combination.svg"), dpi=300)

In [38]:
# miRNAs that appear among the high-stability hits of the x4, x5 and x6 repeats alike
intersection_suspicious = set(unique_mirnas_x6) & set(unique_mirnas_x5) & set(unique_mirnas_x4)

In [39]:
# Flag every design that carries at least one miRNA from `intersection_suspicious`
# in any of its miRNA columns (missing entries never match).
# Done column-wise in one step: this yields a proper boolean column, creates the
# column even for an empty frame, and flags each row on its own values even if
# index labels were repeated.
high_stability_non_repeat["in_x6"] = (
    high_stability_non_repeat[mirna_columns]
    .isin(list(intersection_suspicious))
    .any(axis=1)
)

In [None]:
# Split the high-stability combination designs by their "in_x6" flag
high_flag = high_stability_non_repeat["in_x6"]
high_stability_non_repeat_true = high_stability_non_repeat.loc[high_flag.eq(True)].copy()
high_stability_non_repeat_false = high_stability_non_repeat.loc[high_flag.eq(False)].copy()

# Share of each group in percent
n_high_designs = len(high_stability_non_repeat)
high_fraction_true = 100 * len(high_stability_non_repeat_true) / n_high_designs
high_fraction_false = 100 * len(high_stability_non_repeat_false) / n_high_designs

# report both percentages
print(f"high fraction true: {high_fraction_true}")
print(f"high fraction false: {high_fraction_false}")

In [41]:
# Flag every combination design that carries at least one miRNA from
# `intersection_suspicious` in any of its miRNA columns (missing entries never match).
# Done column-wise in one step: this yields a proper boolean column, creates the
# column even for an empty frame, and flags each row on its own values even if
# index labels were repeated across the concatenated tables.
combination_dfs_flat["in_x6"] = (
    combination_dfs_flat[mirna_columns]
    .isin(list(intersection_suspicious))
    .any(axis=1)
)

In [None]:
# Split all combination designs by their "in_x6" flag
all_flag = combination_dfs_flat["in_x6"]
all_true = combination_dfs_flat.loc[all_flag.eq(True)].copy()
all_false = combination_dfs_flat.loc[all_flag.eq(False)].copy()

# Share of each group in percent
n_all_designs = len(combination_dfs_flat)
all_fraction_true = 100 * len(all_true) / n_all_designs
all_fraction_false = 100 * len(all_false) / n_all_designs

# report both percentages
print(f"all fraction true: {all_fraction_true}")
print(f"all fraction false: {all_fraction_false}")

In [None]:
# Bar chart comparing the flagged share among all combination designs with the
# flagged share among the high-stability ones
bar_labels = ["all", "high stability"]
bar_heights = [all_fraction_true, high_fraction_true]

fig, ax = plt.subplots(figsize=(1.4, 1.6))
ax.bar(bar_labels, bar_heights, color="skyblue")
ax.set_ylabel("fraction containing highly\nstable repeat miRNAs (%)")
ax.set_xlabel("target combination constructs")
fig.tight_layout()
fig.savefig(os.path.join(current_plot_folder, "fractions_all_high.svg"), dpi=300)
