In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import pickle
import os
from library2_utils.mirna_levels import normalize_expr_df_to_rpm_with_index
from library2_utils.color_scheme import cell_line_colors, cell_line_symbols

# global matplotlib text styling: 7 pt Helvetica
plt.rcParams.update({'font.size': 7, 'font.family': 'Helvetica'})

# the full list of cell lines is the subset followed by the remaining ones
cell_lines_subset = ["HEK293T", "HeLa", "SKNSH", "MCF7", "HUH7", "A549"]
cell_lines_rest = ["HaCaT", "JEG3", "Tera1", "PC3"]
cell_lines_measured = [*cell_lines_subset, *cell_lines_rest]

plot_folder = "../plots/2_explore_microRNA_data"
output_folder = '../microrna_data/2_output/'

# make sure both target folders exist before anything is written to them
for folder in (plot_folder, output_folder):
    os.makedirs(folder, exist_ok=True)

### This notebook represents some initial exploration of the data

In [2]:
data_dir_input = "../measured_data/2_normalized_log10"

# list every entry of the input folder
reference_files = os.listdir(data_dir_input)

# load each CSV into a dictionary keyed by the file name up to its first dot;
# the first CSV column becomes the index of each table
reference_dict = {
    file_name.split('.')[0]: pd.read_csv(os.path.join(data_dir_input, file_name), index_col=0)
    for file_name in reference_files
    if file_name.endswith(".csv")
}

## read the microRNA expression data

### conormalize the data
Conormalization means that the normalization factor is calculated from only the high-confidence microRNAs in miRBase, and all other microRNAs are then normalized by that same factor.

In [3]:
# load the miRBase table; the first CSV column becomes the index
# (later cells read its "confidence" and "MIMAT" columns)
mirbase = pd.read_csv('../microrna_data/mirbase_original.csv', index_col=0)

In [4]:
# get likely real mirnas (passed to the RPM normalization below)
# NOTE: unpickling can execute arbitrary code, so this file must come from a trusted source
with open("../microrna_data/likely_real_mirnas.pkl", mode="rb") as pickle_file:
    likely_real_mirnas = pickle.load(pickle_file)

In [5]:
# read the Alles2019 table and normalize it with the project helper,
# passing the likely-real miRNAs as the second argument
df_alles = normalize_expr_df_to_rpm_with_index(
    pd.read_csv("../microrna_data/1_input/Alles2019_all.csv", index_col=0),
    likely_real_mirnas,
)
# store the normalized table in the output folder
df_alles.to_csv(f"{output_folder}/Alles2019_conormalized.csv")

In [7]:
# read the Keller2023 table; the input is a CSV file, so it has to be parsed
# with read_csv (read_excel cannot open a .csv file), exactly like the
# Alles2019 table
df_keller = pd.read_csv('../microrna_data/1_input/Keller2023_all.csv', index_col=0)
# normalize with the project helper, passing the likely-real miRNAs as the second argument
df_keller = normalize_expr_df_to_rpm_with_index(df_keller, likely_real_mirnas)
# store the normalized table in the output folder
df_keller.to_csv(f"{output_folder}/Keller2023_conormalized.csv")

In [8]:
# this one is already normalized
# df_merged_unfiltered = pd.read_csv("../microrna_data/3_output/Alles_Keller_completely_unfiltered_merge.csv", index_col=0)
# df_merged_unfiltered = 10**df_merged_unfiltered

## Add the expression data to the dataframes

In [9]:
# name of the expression dataset, used in plot folders, titles and file names
used_mirna_name = "Alles2019"
# expression table used from here on, converted to log10
used_mirna_data = np.log10(df_alles)

In [10]:
# ADD EXPRESSION DATA TO THE DATAFRAMES
# work on copies of every table whose key contains "single" or "full_repeat"
single_dfs = {
    name: table.copy()
    for name, table in reference_dict.items()
    if "single" in name or "full_repeat" in name
}

for key, df in single_dfs.items():
    # index the rows by the "miRNA1" column so they can be matched against
    # the index of the expression table
    df = df.set_index("miRNA1")

    # keep only the columns whose name contains "3UTR"
    df = df.filter(regex='3UTR')

    # discard columns whose prefix (the text before the first "_") is not a
    # column of the expression table
    for column in df.columns:
        if column.split("_")[0] not in used_mirna_data.columns:
            df = df.drop(column, axis=1)

    # not every row label exists in the expression table: keep the shared ones
    shared_mirnas = df.index.intersection(used_mirna_data.index)
    df = df.loc[shared_mirnas]

    # add one "<prefix>_exp" column per remaining measurement column
    cell_lines = [column.split("_")[0] for column in df.columns]
    for cell_line in cell_lines:
        df[f"{cell_line}_exp"] = used_mirna_data.loc[df.index, cell_line]

    # rows with any missing value are removed before the table is stored
    df = df.dropna()
    single_dfs[key] = df

# 2.1 Plot all designs containing only a single microRNA against the expression

### 2.1.1 - Plot each individually in subplots

In [11]:
%%capture output
# miRNA that is highlighted with a red cross in every panel
label_mirna = "hsa-miR-100-5p"

# create the plot folder if it doesn't exist
individual_grid_dir = f"{plot_folder}/{used_mirna_name}/2.1.1_individual_plots"
os.makedirs(individual_grid_dir, exist_ok=True)

for key, df in single_dfs.items():
    # one panel per measured cell line on a 5x2 grid with shared axes
    fig, axs = plt.subplots(5, 2, figsize=(6, 8), sharex=True, sharey=True)
    for i, cell_line in enumerate(cell_lines_measured):
        ax = axs[i // 2, i % 2]
        exp_col, utr_col = f"{cell_line}_exp", f"{cell_line}_3UTR"

        # columns without expression data were dropped when single_dfs was
        # built; leave such a panel empty instead of failing with a KeyError
        if exp_col not in df.columns or utr_col not in df.columns:
            ax.axis('off')
            continue

        # plot the data
        ax.scatter(df[exp_col], df[utr_col], color="black", s=3)

        if label_mirna in df.index:
            ax.scatter(df.loc[label_mirna, exp_col], df.loc[label_mirna, utr_col],
                       color="red", s=10, marker="x", label=label_mirna)
            # a legend is only needed (and only warning-free) when the
            # highlighted miRNA was actually drawn
            ax.legend(loc='lower left', fontsize=7)

        # only show the axis labels on the outer plots
        if i // 2 == 4:
            ax.set_xlabel(r"log$_{10}$(miRNA expression)")
        if i % 2 == 0:
            ax.set_ylabel(r"log$_{10}$(RNA/DNA)")

        ax.set_xlim(0, 5.5)
        ax.set_ylim(-1.7, 0.25)
        ax.set_title(f"{cell_line}", fontsize=7)

    fig.tight_layout()
    # "file_format" rather than "format", which would shadow the builtin
    for file_format in ["png", "svg"]:
        fig.savefig(f"{individual_grid_dir}/{key}_individual.{file_format}", dpi=300)
    # the figure is saved to disk and the cell output is captured anyway, so
    # release it instead of keeping one open figure per table
    plt.close(fig)

### 2.1.2 - Plot each cell line individually in its own figure

In [12]:
%%capture output
# miRNA that is highlighted with a red cross in every figure
label_mirna = "hsa-miR-100-5p"

# create the plot folder if it doesn't exist
single_panel_dir = f"{plot_folder}/{used_mirna_name}/2.1.2_individual_plots"
os.makedirs(single_panel_dir, exist_ok=True)

for key, df in single_dfs.items():
    for cell_line in cell_lines_measured:
        exp_col, utr_col = f"{cell_line}_exp", f"{cell_line}_3UTR"

        # columns without expression data were dropped when single_dfs was
        # built; skip such cell lines instead of failing with a KeyError
        if exp_col not in df.columns or utr_col not in df.columns:
            continue

        fig, ax = plt.subplots(figsize=(3, 2))

        # plot the data
        ax.scatter(df[exp_col], df[utr_col], color="black", s=3)

        if label_mirna in df.index:
            # mark the highlighted miRNA at its measured coordinates (as in
            # 2.1.1); the former +0.1 x-offset belonged to a text label that
            # is no longer drawn and moved the marker off its data point
            ax.scatter(df.loc[label_mirna, exp_col], df.loc[label_mirna, utr_col],
                       color="red", s=10, marker="x", label=label_mirna)
            # a legend is only needed when the highlighted miRNA was drawn
            ax.legend(loc="lower left", fontsize=8)

        ax.set_xlabel(r"log$_{10}$(miRNA expression)")
        ax.set_ylabel(r"log$_{10}$(RNA/DNA)")

        ax.set_xlim(0, 5.5)
        ax.set_ylim(-1.7, 0.25)

        ax.set_title(f"{cell_line}", fontsize=8)

        fig.tight_layout()
        # "file_format" rather than "format", which would shadow the builtin
        for file_format in ["png", "svg"]:
            fig.savefig(f"{single_panel_dir}/{key}_{cell_line}.{file_format}", dpi=300)
        # one figure per table and cell line adds up quickly: release each
        # figure as soon as it has been written to disk
        plt.close(fig)

### 2.1.3 - Plot them all into the same plot

In [13]:
%%capture output

# create the plot folder if it doesn't exist
combined_plot_dir = f"{plot_folder}/{used_mirna_name}/2.1.3_combined_plots"
os.makedirs(combined_plot_dir, exist_ok=True)

for key, df in single_dfs.items():
    fig, ax = plt.subplots(figsize=(2.5, 1.8))

    # take the plottable cell lines from this table's own columns (not from a
    # variable left over by another cell) and keep the order of
    # cell_lines_measured so that the legend order is reproducible
    available_cell_lines = [
        cell_line for cell_line in cell_lines_measured
        if f"{cell_line}_exp" in df.columns and f"{cell_line}_3UTR" in df.columns
    ]

    for cell_line in available_cell_lines:
        # plot the data
        ax.scatter(df[f"{cell_line}_exp"], df[f"{cell_line}_3UTR"], color=cell_line_colors[cell_line],
                   s=1.5, marker=cell_line_symbols[cell_line], label=f"{cell_line}")

    ax.set_xlabel(r"log$_{10}$(miRNA expression)")
    ax.set_ylabel(r"log$_{10}$(RNA/DNA)")

    ax.set_xlim(0, 5.5)
    ax.set_ylim(-2, 0.5)
    if available_cell_lines:
        ax.legend(loc="lower left", frameon=False, fontsize=7, ncol=1)
    ax.set_title(f"{used_mirna_name}, {key}", fontsize=7)
    # "file_format" rather than "format", which would shadow the builtin
    for file_format in ["png", "svg"]:
        fig.savefig(f"{combined_plot_dir}/{key}_{used_mirna_name}.{file_format}", dpi=300, bbox_inches='tight')
    # the figure is on disk and the cell output is captured; release it
    plt.close(fig)

# 2.3 - Compare the results of library 1 and library 2

In [14]:
# from here on data_dir_input points at the library 1 folder
data_dir_input = "../measured_data/0_library 1"

# list every entry of that folder
reference_files_lib1 = os.listdir(data_dir_input)

# load each CSV into a dictionary keyed by the file name up to its first dot;
# the first CSV column becomes the index of each table
reference_dict_lib1 = {
    file_name.split('.')[0]: pd.read_csv(os.path.join(data_dir_input, file_name), index_col=0)
    for file_name in reference_files_lib1
    if file_name.endswith(".csv")
}

In [15]:
# pick the table that is compared from each library's dictionary
measured_single_lib1, measured_single_lib2 = (
    reference_dict_lib1["1_full_single_context1"],
    reference_dict["1_mirna_full_single_high_conf"],
)

In [16]:
# index both tables by their "miRNA1" column.
# set_index returns a new frame here instead of modifying in place: the
# in-place variant also changed the tables stored in reference_dict_lib1 /
# reference_dict, so re-running the selection cell and this one failed with a
# KeyError because "miRNA1" was no longer a column
measured_single_lib1 = measured_single_lib1.set_index("miRNA1")
measured_single_lib2 = measured_single_lib2.set_index("miRNA1")

In [17]:
# keep only the library 1 columns whose name matches "3UTR" ...
measured_single_lib1 = measured_single_lib1.filter(regex='(3UTR)')
# ... and relabel each one with the second "_"-separated token of its name
# (presumably the cell line -- it is later matched against library 2's labels)
lib1_labels = []
for label in measured_single_lib1.columns:
    lib1_labels.append(label.split('_')[1])
measured_single_lib1.columns = lib1_labels

In [18]:
# keep only the library 2 columns whose name matches "3UTR" ...
measured_single_lib2 = measured_single_lib2.filter(regex='(3UTR)')
# ... and relabel each one with the first "_"-separated token of its name
lib2_labels = []
for label in measured_single_lib2.columns:
    lib2_labels.append(label.split('_')[0])
measured_single_lib2.columns = lib2_labels

In [19]:
# column labels present in both libraries
common_columns = measured_single_lib1.columns.intersection(measured_single_lib2.columns)

# for every shared column: the five row labels with the largest absolute
# difference between library 1 and library 2
outliers = {}
for column in common_columns:
    lib1_values = measured_single_lib1[column]
    lib2_values = measured_single_lib2[column]

    # compare only rows that exist in both libraries
    shared_rows = lib1_values.index.intersection(lib2_values.index)
    abs_diff = (lib1_values.loc[shared_rows] - lib2_values.loc[shared_rows]).abs()

    outliers[column] = abs_diff.nlargest(5).index

In [105]:
# miRNAs referred to as "mutated" by the optional annotation code in the
# library comparison plots
mut_mirs = [
    'hsa-let-7a-5p',
    'hsa-let-7i-5p',
    'hsa-miR-16-5p',
    'hsa-miR-19b-3p',
    'hsa-miR-21-5p',
    'hsa-miR-22-3p',
    'hsa-miR-23a-3p',
    'hsa-miR-24-3p',
    'hsa-miR-31-3p',
    'hsa-miR-31-5p',
    'hsa-miR-365a-3p',
    'hsa-miR-107',
]

In [106]:
%%capture output
# create the output folder if it doesn't exist
library_comparison_dir = f"{plot_folder}/{used_mirna_name}/2.3_library_comparison"
os.makedirs(library_comparison_dir, exist_ok=True)

# one scatter plot (library 1 vs library 2) per shared column
for column in common_columns:
    df1 = measured_single_lib1[column]
    df2 = measured_single_lib2[column]

    # make them have the same index
    index_both = df1.index.intersection(df2.index)
    df1 = df1.loc[index_both]
    df2 = df2.loc[index_both]

    fig, ax = plt.subplots(figsize=(2.3, 1.8))
    # squared Pearson correlation between the two libraries, shown in the title
    r2 = stats.pearsonr(df1, df2)[0]**2
    ax.scatter(df1, df2, color="black", s=3)
    # dashed identity line (y = x) over the range of the library 1 values
    ax.plot(df1, df1, color="black", linestyle="--", linewidth=0.5)

    # # add a label for the five strongest outliers if desired
    # for i, txt in enumerate(outliers[column]):
    #     ax.annotate("-".join(txt.split("-")[2:]), (df1[txt], df2[txt]), fontsize=7, color="red")
    # add a label for the mutated miRNAs
    # for txt in mut_mirs:
    #     if txt in df1.index:
    #         ax.annotate("-".join(txt.split("-")[2:]), (df1[txt], df2[txt]), fontsize=7, color="red")

    ax.set_xlabel("stability (library 1)")
    ax.set_ylabel("stability (library 2)")
    ax.set_title(f"{column}, "+r"$r^2$=" + f"{round(r2, 2)}", fontsize=7)
    fig.tight_layout()
    fig.savefig(f"{library_comparison_dir}/{column}_library_comparison.png", dpi=300)
    # the figure is on disk and the cell output is captured; release it
    # instead of keeping one open figure per column
    plt.close(fig)

In [None]:
# Create the output folder if it doesn't exist
library_comparison_dir = f"{plot_folder}/{used_mirna_name}/2.3_library_comparison"
os.makedirs(library_comparison_dir, exist_ok=True)

# Prepare a grid of subplots
fig, axs = plt.subplots(2, 2, figsize=(4, 3.2))
axs = axs.flatten()

# Only columns that also exist in the expression table can be drawn, and no
# more of them than there are panels (otherwise indexing axs would fail)
plot_columns = [column for column in common_columns if column in used_mirna_data.columns][:len(axs)]

for current_ax, column in enumerate(plot_columns):
    ax = axs[current_ax]
    df1 = measured_single_lib1[column]
    df2 = measured_single_lib2[column]

    # Restrict to rows present in both libraries AND in the expression table;
    # looking up a label missing from the expression table raises a KeyError
    index_both = df1.index.intersection(df2.index).intersection(used_mirna_data.index)
    df1 = df1.loc[index_both]
    df2 = df2.loc[index_both]

    df_ex = used_mirna_data.loc[index_both, column]

    # Squared Spearman correlation of expression with each library's values
    rho2_1 = stats.spearmanr(df_ex, df1)[0]**2
    rho2_2 = stats.spearmanr(df_ex, df2)[0]**2

    ax.scatter(df_ex, df1, color="black", s=3, label="library 1, "+r"$\rho^2$=" + f"{round(rho2_1, 2)}")
    ax.scatter(df_ex, df2, color="red", s=3, label="library 2, "+r"$\rho^2$=" + f"{round(rho2_2, 2)}")

    ax.set_title(f"{column}", fontsize=7.5)
    ax.set_xlim(0, 5.5)
    ax.set_xticks([0, 1, 2, 3, 4, 5])
    ax.set_ylim(-1.7, 0.25)

    # Only set x and y labels for the outer plots
    if current_ax >= 2:
        ax.set_xlabel(r"log$_{10}$(miRNA expression)")
    if current_ax % 2 == 0:  # First column
        ax.set_ylabel(r"log$_{10}$(stability)")

    # don't have a box around the legend
    ax.legend(loc="lower left", fontsize=6, frameon=False)

# Hide any unused axes
for unused_ax in axs[len(plot_columns):]:
    unused_ax.axis('off')

fig.tight_layout()
# "file_format" rather than "format", which would shadow the builtin
for file_format in ["png", "svg"]:
    fig.savefig(f"{library_comparison_dir}/all_ex_vs_stability.{file_format}", dpi=300)

In [None]:
# Create the output folder if it doesn't exist
library_comparison_dir = f"{plot_folder}/{used_mirna_name}/2.3_library_comparison"
os.makedirs(library_comparison_dir, exist_ok=True)

# Prepare a grid of subplots
fig, axs = plt.subplots(2, 2, figsize=(4, 3.5))
axs = axs.flatten()

# Never draw more columns than there are panels (indexing axs beyond its
# length would raise an IndexError)
plot_columns = list(common_columns)[:len(axs)]

for current_ax, column in enumerate(plot_columns):
    ax = axs[current_ax]
    df1 = measured_single_lib1[column]
    df2 = measured_single_lib2[column]

    # Make them have the same index
    index_both = df1.index.intersection(df2.index)
    df1 = df1.loc[index_both]
    df2 = df2.loc[index_both]

    # Squared Pearson correlation between the two libraries, shown in the title
    r2 = stats.pearsonr(df1, df2)[0]**2
    ax.scatter(df1, df2, color="black", s=3)
    # Dashed identity line (y = x) over the range of the library 1 values
    ax.plot(df1, df1, color="black", linestyle="--", linewidth=0.5)

    ax.set_title(f"{column}, "+r"$r^2$=" + f"{round(r2, 2)}", fontsize=7.5)

    # Only set x and y labels for the outer plots
    if current_ax >= 2:  # Bottom row
        ax.set_xlabel(r"log$_{10}$(stability, library 1)")
    if current_ax % 2 == 0:  # First column
        ax.set_ylabel(r"log$_{10}$(stability, library 2)")

# Hide any unused axes
for unused_ax in axs[len(plot_columns):]:
    unused_ax.axis('off')

fig.tight_layout()
# "file_format" rather than "format", which would shadow the builtin
for file_format in ["png", "svg"]:
    fig.savefig(f"{library_comparison_dir}/all_library_comparison.{file_format}", dpi=300)

# 2.4 - Compare microRNA database sources (miRbase, mirgeneDB)

#### This code requires that the initial fitting in Notebook 3 was already performed!

In [109]:
# load the MirGeneDB table; the first CSV column becomes the index
mirgenedb = pd.read_csv("../microrna_data/mirgenedb.csv", index_col=0)

# rows of the miRBase table whose "confidence" entry is "high"
is_high_confidence = mirbase["confidence"] == "high"
mirbase_high_conf = mirbase[is_high_confidence]

In [110]:
# collect the unique 5p accessions followed by the unique 3p accessions listed
# in MirGeneDB, leaving out the "None" placeholder
mirgenedb_accessions = [
    accession
    for arm_column in ("5p accession", "3p accession")
    for accession in mirgenedb[arm_column].unique()
    if accession != "None"
]
# keep the miRBase rows whose "MIMAT" entry is one of these accessions
mirgenedb_mirnas = mirbase[mirbase["MIMAT"].isin(mirgenedb_accessions)]

In [111]:
# retrieve the three measured groups from single_dfs by their table keys
measured_high_conf, measured_mirgenedb_low_conf, measured_not_mirgenedb_low_conf = (
    single_dfs[table_key]
    for table_key in (
        "1_mirna_full_single_high_conf",
        "2_mirna_full_single_low_conf_mirgenedb",
        "3_mirna_full_single_low_conf_not_mirgenedb",
    )
)

In [112]:
# row labels of measured_high_conf that do not occur in the index of mirgenedb_mirnas
microRNAs_not_mirgenedb_high_conf = measured_high_conf.index.difference(mirgenedb_mirnas.index)

# split measured_high_conf into the rows with those labels and all remaining rows
absent_from_mirgenedb = measured_high_conf.index.isin(microRNAs_not_mirgenedb_high_conf)
measured_not_mirgenedb_high_conf = measured_high_conf.loc[microRNAs_not_mirgenedb_high_conf]
measured_mirgenedb_high_conf = measured_high_conf[~absent_from_mirgenedb]

In [None]:
# verify that the original classification is correct:
# the full high-confidence group is only compared against miRBase high confidence
print("Length of measured_high_conf: ", len(measured_high_conf))
print("Number of these in mirbase_high_conf: ", len(measured_high_conf.index.intersection(mirbase_high_conf.index)))

# every subgroup is compared against both reference tables
subgroups = [
    ("measured_mirgenedb_high_conf", measured_mirgenedb_high_conf),
    ("measured_not_mirgenedb_high_conf", measured_not_mirgenedb_high_conf),
    ("measured_mirgenedb_low_conf", measured_mirgenedb_low_conf),
    ("measured_not_mirgenedb_low_conf", measured_not_mirgenedb_low_conf),
]
for subgroup_name, subgroup in subgroups:
    print(f"Length of {subgroup_name}: ", len(subgroup))
    print("Number of these in mirgenedb: ", len(subgroup.index.intersection(mirgenedb_mirnas.index)))
    print("Number of these in mirbase_high_conf: ", len(subgroup.index.intersection(mirbase_high_conf.index)))

In [114]:
# map each plot title to its group of measurements; the insertion order is
# the order in which the groups are plotted
measured_data_source_dict = dict([
    ("High confidence in miRbase\n In MirGeneDB", measured_mirgenedb_high_conf),
    ("Low confidence in miRbase\n In MirGeneDB", measured_mirgenedb_low_conf),
    ("High confidence in miRbase\n Not in MirGeneDB", measured_not_mirgenedb_high_conf),
    ("Low confidence in miRbase\n Not in MirGeneDB", measured_not_mirgenedb_low_conf),
])

In [115]:
%%capture output
# create the plot folder if it doesn't exist
datasource_dir = f"{plot_folder}/{used_mirna_name}/2.4_datasources"
os.makedirs(datasource_dir, exist_ok=True)

# one figure per data-source group, all cell lines overlaid
for key, df in measured_data_source_dict.items():
    fig, ax = plt.subplots(figsize=(2.2, 1.7))

    # columns without expression data were dropped when single_dfs was built;
    # plot only the cell lines this table actually contains
    available_cell_lines = [
        cell_line for cell_line in cell_lines_measured
        if f"{cell_line}_exp" in df.columns and f"{cell_line}_3UTR" in df.columns
    ]

    for cell_line in available_cell_lines:
        # plot the data
        ax.scatter(df[f"{cell_line}_exp"], df[f"{cell_line}_3UTR"], color=cell_line_colors[cell_line],
                   s=1.5, marker=cell_line_symbols[cell_line], label=f"{cell_line}")
        # if cell_line_index == 0:
        #     y_transfer = np.log10(transfer_function(x_range_lin, *popt))
        #     plt.plot(x_range_log, y_transfer, ls="--", lw=1, color="black")

    ax.set_xlabel(r"log$_{10}$(miRNA expression)")
    ax.set_ylabel(r"log$_{10}$(stability)")

    ax.set_xlim(0, 5.5)
    ax.set_xticks([0, 1, 2, 3, 4, 5])
    ax.set_ylim(-2, 0.5)
    if available_cell_lines:
        ax.legend(loc="lower left", frameon=False, fontsize=7, ncol=1)
    ax.set_title(f"{key}", fontsize=7.5)
    # "file_format" rather than "format", which would shadow the builtin;
    # the newline of the title is removed for the file name
    for file_format in ["png", "svg"]:
        fig.savefig(f"{datasource_dir}/" + key.replace('\n', '') + f"_{used_mirna_name}.{file_format}", dpi=300, bbox_inches='tight')
    # the figure is on disk and the cell output is captured; release it
    plt.close(fig)

In [None]:
# Create the output folder if it doesn't exist, so that this cell does not
# depend on the per-group plotting cell having been run first
datasource_dir = f"{plot_folder}/{used_mirna_name}/2.4_datasources"
os.makedirs(datasource_dir, exist_ok=True)

# Prepare a 2x2 grid of subplots
fig, axs = plt.subplots(2, 2, figsize=(4, 3.5))
axs = axs.flatten()

# zip stops at the shorter input, so at most one group per panel is drawn
# even if the dictionary holds more than four groups
for current_ax, (ax, (key, df)) in enumerate(zip(axs, measured_data_source_dict.items())):
    for cell_line in cell_lines_measured:
        # Columns without expression data were dropped when single_dfs was
        # built; skip such cell lines instead of failing with a KeyError
        if f"{cell_line}_exp" not in df.columns or f"{cell_line}_3UTR" not in df.columns:
            continue
        ax.scatter(df[f"{cell_line}_exp"], df[f"{cell_line}_3UTR"],
                   color=cell_line_colors[cell_line], s=1.5, marker=cell_line_symbols[cell_line],
                   label=f"{cell_line}", rasterized=True)
        # if cell_line_index == 0:
        #     y_transfer = np.log10(transfer_function(x_range_lin, *popt))
        #     axs[current_ax].plot(x_range_log, y_transfer, ls="--", lw=1, color="black")

    # Set labels only on the edge subplots
    if current_ax >= 2:  # Bottom row
        ax.set_xlabel(r"log$_{10}$(miRNA expression)")
    else:
        ax.set_xticklabels([])
    if current_ax % 2 == 0:  # Left column
        ax.set_ylabel(r"log$_{10}$(RNA/DNA)")
    else:
        ax.set_yticklabels([])

    ax.set_xlim(0, 5.5)
    ax.set_xticks([0, 1, 2, 3, 4, 5])
    ax.set_ylim(-2, 0.5)
    ax.set_title(f"{key}", fontsize=7.5)

# A single legend, placed in the bottom-left panel
axs[2].legend(loc="lower left", frameon=False, fontsize=7)

fig.tight_layout()
# "file_format" rather than "format", which would shadow the builtin
for file_format in ["png", "svg"]:
    fig.savefig(f"{datasource_dir}/all_datasources.{file_format}", dpi=300, bbox_inches='tight')