In [78]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import scipy.optimize as opt
import os
import pickle
from library2_utils.color_scheme import cell_line_colors, cell_line_symbols
from library2_utils.mirna_levels import normalize_expr_df_to_rpm_with_index

# Global matplotlib styling: 7 pt Helvetica for all figures in this notebook.
plt.rcParams.update({'font.size': 7})
plt.rcParams['font.family'] = 'Helvetica'

# Cell lines handled in this notebook; the "subset" lines come first in the
# combined list, followed by the remaining ones.
cell_lines_subset = ["HEK293T", "HeLa", "SKNSH", "MCF7", "HUH7", "A549"]
cell_lines_rest = ["HaCaT", "JEG3", "Tera1", "PC3"]
cell_lines_measured = cell_lines_subset + cell_lines_rest

# Folder that receives the miRNA expression tables written by this notebook.
mirna_output_folder = "../microrna_data/3_output"
# exist_ok=True makes the creation idempotent and avoids the
# check-then-create race of `if not os.path.exists(...)`.
os.makedirs(mirna_output_folder, exist_ok=True)

### This notebook analyzes the data for a single full target site.

## Get the stability data

In [79]:
data_dir_input = "../measured_data/2_normalized_log10"

# list everything inside the input folder
reference_files = os.listdir(data_dir_input)

# Load every CSV into a dictionary keyed by the file name up to its first ".".
reference_dict = {}
for reference_file in reference_files:
    if not reference_file.endswith(".csv"):
        continue
    dataset_key = reference_file.split('.')[0]
    csv_path = os.path.join(data_dir_input, reference_file)
    reference_dict[dataset_key] = pd.read_csv(csv_path, index_col=0)

## Get the expression data

In [80]:
# Load the extended miRBase table and keep the rows whose "confidence"
# column equals "high".
mirbase = pd.read_csv("../microrna_data/mirbase_extended.csv", index_col=0)
is_high_confidence = mirbase["confidence"] == "high"
mirbase_high_confidence = mirbase.loc[is_high_confidence]

In [81]:
# Load both co-normalized expression tables and convert them to log10.
df_alles = np.log10(
    pd.read_csv("../microrna_data/2_output/Alles2019_conormalized.csv", index_col=0)
)
df_keller = np.log10(
    pd.read_csv('../microrna_data/2_output/Keller2023_conormalized.csv', index_col=0)
)

In [82]:
# Load the pickled collection of miRNAs considered likely real.
# NOTE(review): pickle can execute arbitrary code - only load trusted files.
likely_real_mirnas_path = "../microrna_data/likely_real_mirnas.pkl"
with open(likely_real_mirnas_path, "rb") as f:
    likely_real_mirnas = pickle.load(f)

In [83]:
# Unfiltered merge: element-wise mean of the two (log10) tables over the rows
# and columns that both of them contain.
common_index = df_alles.index.intersection(df_keller.index)
common_columns = df_alles.columns.intersection(df_keller.columns)
alles_shared = df_alles.loc[common_index, common_columns]
keller_shared = df_keller.loc[common_index, common_columns]
df_merge_unfiltered = (alles_shared + keller_shared) / 2

# normalize with the project helper, passing the likely-real miRNAs
df_merge_unfiltered = normalize_expr_df_to_rpm_with_index(df_merge_unfiltered, likely_real_mirnas)

# write the merged table into the miRNA output folder
unfiltered_merge_path = os.path.join(mirna_output_folder, "Alles_Keller_completely_unfiltered_merge.csv")
df_merge_unfiltered.to_csv(unfiltered_merge_path)

In [84]:
# NOTE: per the original author, these two files only exist after this
# notebook has been run once, so this cell fails on a completely fresh run.
df_merge = pd.read_csv(
    "../microrna_data/3_output/Alles_Keller_combined_expression_with_crosstalk.csv",
    index_col=0,
)
df_merge_crosstalk_filter = pd.read_csv(
    "../microrna_data/3_output/Alles_Keller_combined_expression_congruent.csv",
    index_col=0,
).dropna()

# drop "hsa-miR-3613-3p" when it is present; errors="ignore" makes this a no-op otherwise
df_merge = df_merge.drop(index="hsa-miR-3613-3p", errors="ignore")

In [85]:
# Expression dataset used for all fits below; `used_mirna_name` labels the
# plot and output folders as well as the files written into them.
used_mirna_data = df_keller
used_mirna_name = "Keller2023"

plot_folder = f"../plots/3_fit_microRNA_data/{used_mirna_name}"
output_folder = f"../outputs/3_fitting/{used_mirna_name}"

# exist_ok=True: idempotent creation without a check-then-create race
os.makedirs(plot_folder, exist_ok=True)
os.makedirs(output_folder, exist_ok=True)

## Add the expression to the dataframes

In [86]:
# collect a copy of every reference table whose key contains "single"
single_dfs = {}
for key, reference_df in reference_dict.items():
    if "single" in key:
        single_dfs[key] = reference_df.copy()

for key in list(single_dfs):
    # index by miRNA name and keep only the columns mentioning "3UTR"
    df = single_dfs[key].set_index("miRNA1").filter(regex='(3UTR)')

    # drop columns whose cell line (text before the first "_") has no
    # column in the expression table
    unmatched_columns = [column for column in df.columns
                         if column.split("_")[0] not in used_mirna_data.columns]
    df = df.drop(unmatched_columns, axis=1)

    # keep only miRNAs that also occur in the expression table
    shared_mirnas = df.index.intersection(used_mirna_data.index)
    df = df.loc[shared_mirnas]

    # append the expression of every remaining cell line as "<cell_line>_exp"
    cell_lines = [column.split("_")[0] for column in df.columns]
    for cell_line in cell_lines:
        df.loc[:, f"{cell_line}_exp"] = used_mirna_data.loc[df.index, cell_line]

    # remove rows that contain any missing value
    df = df.dropna()

    single_dfs[key] = df

In [87]:
# Use the high-confidence miRNAs together with the low-confidence ones that
# are listed in MirGeneDB.
measured_single = pd.concat(
    [
        single_dfs["1_mirna_full_single_high_conf"],
        single_dfs["2_mirna_full_single_low_conf_mirgenedb"],
    ],
    axis=0,
)

# Knockdown part: the columns containing "_3UTR", renamed to the text before
# the first "_" (the cell line name).
df_knockdown = measured_single.filter(regex='_3UTR')
df_knockdown.columns = list(map(lambda col: col.split("_")[0], df_knockdown.columns))

# Expression part: the columns containing "exp", renamed the same way
# (which drops the "_exp" suffix).
df_expression = measured_single.filter(regex='exp')
df_expression.columns = list(map(lambda col: col.split("_")[0], df_expression.columns))

In [88]:
# Persist the knockdown table next to the other fitting outputs.
knockdown_csv_path = os.path.join(output_folder, f"{used_mirna_name}_knockdown.csv")
df_knockdown.to_csv(knockdown_csv_path)

# 3.1 - Fit to the unscaled data

#### Create dataframes to record the rmsd and the correlation

In [89]:
# Empty tables (one column per cell line) that later cells fill with one row
# per fitting approach: r^2 values and RMSD values respectively.
correlation_dataframe, rmsd_dataframe = (
    pd.DataFrame(columns=cell_lines_measured) for _ in range(2)
)

In [90]:
def hill_func_log_scales(x_data, dataset_indices, c1, c2, *scales):
    """Evaluate the log10 transfer function with one additive offset per dataset.

    Each point ``x_data[k]`` (log10 miRNA expression) is shifted by the scale
    of the dataset it belongs to, ``scales[dataset_indices[k]]``, converted to
    linear scale and passed through ``(1 / (1 + x / c1)) * (1 + x / c2)``.

    The expression is assumed to be normalized to one.
    The microRNA data is assumed to be log10.
    The return value is also log10.

    Parameters
    ----------
    x_data : array-like
        log10 expression values of all datasets, stacked into one 1-D array.
    dataset_indices : array-like
        For every entry of ``x_data``, the index of the scale to apply.
    c1, c2 : float
        log10 of the two constants of the transfer function.
    *scales : float
        One log10 offset per dataset.

    Returns
    -------
    numpy.ndarray
        log10 output, in the SAME ORDER as ``x_data``. (The previous
        implementation concatenated the results dataset by dataset, which only
        matched the input order when ``dataset_indices`` was sorted.)

    Raises
    ------
    ValueError
        If ``dataset_indices`` refers to a scale that was not supplied.
        Previously such points were silently dropped from the output.
    """
    x_data = np.asarray(x_data, dtype=float)
    # astype(int): np.concatenate over plain lists can yield a float index array
    dataset_indices = np.asarray(dataset_indices).astype(int)
    scale_values = np.asarray(scales, dtype=float)

    if dataset_indices.size and (
        dataset_indices.min() < 0 or dataset_indices.max() >= scale_values.size
    ):
        raise ValueError(
            "dataset_indices must lie in [0, number of scales); "
            f"got {scale_values.size} scale(s)"
        )

    c1 = 10**c1
    c2 = 10**c2

    # look up the scale of every point at once; this keeps the input order
    x = x_data + scale_values[dataset_indices]
    x = 10**x
    result = (1 / (1 + x / c1)) * (1 + x / c2)
    return np.log10(result)


def hill_func_log_regular(x, c1, c2):
    """Evaluate the transfer function on log10 input and return log10 output.

    ``x``, ``c1`` and ``c2`` are all given in log10 units and converted to
    linear scale before computing ``(1 / (1 + x / c1)) * (1 + x / c2)``.
    The expression is assumed to be normalized to one.
    """
    linear_x = 10**x
    linear_c1 = 10**c1
    linear_c2 = 10**c2

    first_factor = 1 / (1 + linear_x / linear_c1)
    second_factor = 1 + linear_x / linear_c2
    return np.log10(first_factor * second_factor)

In [91]:
# Keep only the cell lines that actually have a knockdown column, preserving order.
cell_lines_measured = list(
    filter(lambda name: name in df_knockdown.columns, cell_lines_measured)
)

In [92]:
# Stack the per-cell-line expression (x) and knockdown (y) values into flat
# arrays; dataset_indices records which cell line each entry came from.
expression_per_line = [df_expression[cell_line].values for cell_line in cell_lines_measured]
knockdown_per_line = [df_knockdown[cell_line].values for cell_line in cell_lines_measured]

x_data = np.concatenate(expression_per_line)
y_data = np.concatenate(knockdown_per_line)
dataset_indices = np.concatenate(
    [[i] * len(values) for i, values in enumerate(expression_per_line)]
)

In [93]:
# Initial guesses and bounds for the two shared (log10) parameters c1 and c2.
# The bounds on c2 ([9.99, 10.01]) keep it essentially at its guess of 10.
p0 = [3, 10]
num_params = len(p0)
bounds = ([1, 9.99], [10, 10.01])

# One scale per cell line; the +/-0.001 bounds keep every scale at ~0 for
# this "unscaled" fit.
n_cell_lines = len(cell_lines_measured)
scale_guesses = [0] * n_cell_lines
scale_bounds_min = [-0.001] * n_cell_lines
scale_bounds_max = [0.001] * n_cell_lines

# full parameter vector: shared parameters first, then the scales
p0_scale = p0 + scale_guesses
lower_bounds = bounds[0] + scale_bounds_min
upper_bounds = bounds[1] + scale_bounds_max
bounds_scale = (lower_bounds, upper_bounds)

popt_scales, pcov = opt.curve_fit(
    lambda x, *params: hill_func_log_scales(x, dataset_indices, *params),
    x_data,
    y_data,
    p0=p0_scale,
    bounds=bounds_scale,
    maxfev=5000
)
# second name for the same optimum, as bound by the original chained assignment
popt_scales_filter = popt_scales

# split the optimum back into shared parameters and per-cell-line scales
hill_params = popt_scales[:num_params]
scales = list(popt_scales[num_params:])

In [94]:
# Map each cell line to its fitted scale and pickle both the shared
# parameters and this mapping.
scale_dict_unscaled = dict(zip(cell_lines_measured, scales))

popt_unscaled_path = os.path.join(output_folder, f"{used_mirna_name}_popt_unscaled.pkl")
with open(popt_unscaled_path, "wb") as f:
    pickle.dump(hill_params, f)

scale_dict_unscaled_path = os.path.join(output_folder, f"{used_mirna_name}_scale_dict_unscaled.pkl")
with open(scale_dict_unscaled_path, "wb") as f:
    pickle.dump(scale_dict_unscaled, f)

In [95]:
%%capture output
# Residuals (measured minus fitted log10 stability), one column per cell line.
df_deviation = pd.DataFrame(columns=df_knockdown.columns, index=df_knockdown.index)

# 5 x 2 grid of panels (one per cell line, up to ten) with shared axes
n_rows, n_cols = 5, 2
fig, axs = plt.subplots(n_rows, n_cols, figsize=(4, 6), sharex=True, sharey=True)

plt.xlim(0, 5.5)
plt.ylim(-1.7, 0.25)

# exist_ok=True replaces the check-then-create pattern
os.makedirs(f"{plot_folder}/3.1_global_fit_wo_scale", exist_ok=True)

# the fitted curve is the same in every panel, so evaluate it only once
x_range_log = np.linspace(0, 5.5, 1000)
fit_curve = hill_func_log_regular(x_range_log, *hill_params)

for i, cell_line in enumerate(cell_lines_measured):
    current_scale = scales[i]
    ax = axs[i // n_cols, i % n_cols]

    # evaluate the model once and reuse it for r2, rmsd and the residuals
    expression_scaled = df_expression[cell_line] + current_scale
    measured = df_knockdown[cell_line]
    predicted = hill_func_log_regular(expression_scaled, *hill_params)

    ax.scatter(expression_scaled, measured, s=5, color="tab:blue", rasterized=True)
    ax.plot(x_range_log, fit_curve, color="black", linewidth=1.5, label="fit", ls="--")

    # squared Pearson correlation between measurement and model
    r2 = stats.pearsonr(measured, predicted)[0]**2

    # root-mean-square deviation between measurement and model
    rmsd = np.sqrt(np.mean((measured - predicted)**2))

    # record both metrics in the row of this fitting approach
    correlation_dataframe.loc["3.1_no_scale", cell_line] = r2
    rmsd_dataframe.loc["3.1_no_scale", cell_line] = rmsd

    df_deviation[cell_line] = measured - predicted

    # x labels only on the bottom row, y labels only in the left column
    if i // n_cols == n_rows - 1:
        ax.set_xlabel(r"log$_{10}$(miRNA expression)")
    if i % n_cols == 0:
        ax.set_ylabel(r"log$_{10}$(stability)")
    ax.set_title(f"{cell_line}, r2 = {round(r2, 2)}, rmsd = {round(rmsd, 3)}", fontsize=8)

plt.tight_layout()
# `fmt` instead of `format`, so the builtin is not shadowed for later cells
for fmt in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/3.1_global_fit_wo_scale/3.1_global_fit_wo_scale.{fmt}", dpi=600)

In [96]:
%%capture output
# All cell lines of the 3.1 fit in a single panel, with one reference miRNA
# marked by black crosses.
fig = plt.figure(figsize=(2, 1.4))
label_mirna = "hsa-miR-100-5p"

# one scatter series per cell line, using the project colour / marker scheme
for i, cell_line in enumerate(cell_lines_measured):
    current_scale = scales[i]
    plt.scatter(df_expression[cell_line]+current_scale, df_knockdown[cell_line], color=cell_line_colors[cell_line],
        s=3, marker=cell_line_symbols[cell_line], rasterized=True, label=f"{cell_line}")

# mark the reference miRNA in every cell line; only the first marker carries
# a label so that the legend contains a single entry for it
if label_mirna in df_expression.index:
    for i, cell_line in enumerate(cell_lines_measured):
        current_scale = scales[i]
        plt.scatter(df_expression.loc[label_mirna, cell_line]+current_scale, df_knockdown.loc[label_mirna, cell_line],
            color="black", s=8, marker="x", label=label_mirna if i == 0 else None)

# fitted transfer function
plt.plot(x_range_log, hill_func_log_regular(x_range_log, *hill_params),
         color="black", linewidth=1, ls="--")

plt.xlabel(r"log$_{10}$"+"(miRNA expression)")
plt.ylabel(r"log$_{10}$(stability)")

plt.xlim(0, 5.5)
plt.ylim(-1.7, 0.25)
plt.tight_layout()
plt.legend(loc="lower left", frameon=False, fontsize=6.5)
# `fmt` instead of `format`, so the builtin is not shadowed for later cells
for fmt in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/3.1_global_fit_wo_scale/3.1-global_fit_without_scale_single_plot.{fmt}", dpi=600, bbox_inches='tight')

In [97]:
%%capture output
# Highlight a few named miRNAs on top of the 3.1 fit: the marker shape encodes
# the miRNA, the colour encodes the cell line.
fig = plt.figure(figsize=(2.0, 1.5))
label_mirnas = ["hsa-miR-100-5p", "hsa-let-7g-5p", "hsa-miR-218-5p", "hsa-miR-122-5p"]
markers = ["x", "o", "s", "D"]

plt.plot(x_range_log, hill_func_log_regular(x_range_log,
                *hill_params), color="black", linewidth=1, ls="--")

for i, cell_line in enumerate(cell_lines_measured):
    current_scale = scales[i]
    # draw one marker per highlighted miRNA that is present in the data
    for j, label_mirna in enumerate(label_mirnas):
        if label_mirna in df_expression.index:
            plt.scatter(df_expression.loc[label_mirna, cell_line]+current_scale, df_knockdown.loc[label_mirna, cell_line],
                color=cell_line_colors[cell_line], s=8, marker=markers[j], rasterized=False)

plt.xlabel(r"log$_{10}$"+"(miRNA expression)")
plt.ylabel(r"log$_{10}$(stability)")
plt.xticks([1,2,3,4,5])

# hand-made legend: one black marker plus the miRNA name (without the "hsa-"
# prefix) per row, stacked downwards in steps of `shift`
shift = -0.25
for i, label_mirna in enumerate(label_mirnas):
    plt.scatter(1.3, -0.8+i*shift, color="black", s=8, marker=markers[i])
    plt.text(1.5, -0.85+i*shift, f"{'-'.join(label_mirna.split('-')[1:])}", fontsize=7, ha="left")

plt.xlim(1, 5.5)
plt.ylim(-1.7, 0.25)
plt.tight_layout()
# `fmt` instead of `format`, so the builtin is not shadowed for later cells
for fmt in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/3.1_global_fit_wo_scale/3.1-global_fit_without_scale_label_mirnas.{fmt}", dpi=600, bbox_inches='tight')

In [None]:
# One-column heatmap of the r^2 values of the 3.1 fit (one row per cell line).
fig, ax = plt.subplots(figsize=(0.3, 1.65))
# Select the row by its label instead of by position (iloc[0:1]) so that the
# right values are shown regardless of the order in which the fitting
# sections were executed.
r2_no_scale = correlation_dataframe.loc[["3.1_no_scale"]].astype('float').T
sns.heatmap(r2_no_scale, cmap="viridis", annot=True, fmt=".2f", vmin=0.2, vmax=0.8, ax=ax,
            annot_kws={"size": 6},
            cbar_kws={'label': r'$r^2$', 'shrink': 1.4, 'aspect': 25, 'pad': 0.02, 'ticks': [0.2, 0.4, 0.6, 0.8]})

plt.xticks([])
plt.yticks([])
# `file_format` instead of `format`, so the builtin is not shadowed
for file_format in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/3.1_global_fit_wo_scale/3.1_correlations.{file_format}", dpi=600, bbox_inches='tight')

#### Illustrate the inverted transfer function

In [None]:
# Illustration: the fitted transfer function together with the measured
# points of two example miRNAs (all cell lines, expression without scale).
miRNAs = ["hsa-miR-100-5p", "hsa-miR-20a-5p"]
plt.figure(figsize=(2.4, 1.8))

transfer_curve = hill_func_log_regular(x_range_log, *hill_params)
plt.plot(
    x_range_log,
    transfer_curve,
    color="tab:blue",
    linewidth=1.5,
    ls="--",
    label="transfer function t(x)",
    zorder=1,
)

for miRNA in miRNAs:
    example_expression = df_expression.loc[miRNA]
    example_knockdown = df_knockdown.loc[miRNA]
    plt.scatter(example_expression, example_knockdown, color="black", s=12, marker="x", zorder=2)

plt.xlabel(r"log$_{10}$"+f"(miRNA expression)")
plt.ylabel(r"log$_{10}$(stability)")
plt.legend(loc="lower left", fontsize=7)
plt.tight_layout()
plt.savefig(f"{plot_folder}/3.1_global_fit_wo_scale/3.1_transfer_function.svg", dpi=300, bbox_inches='tight')

In [None]:
from library2_utils.transfer_functions import inverse_transfer

# Schematic of the inverted transfer function: measured stability on the x
# axis, inferred miRNA expression on the y axis.
plt.figure(figsize=(1.4, 1))
# despite its name, y_range_log holds the LINEAR-scale output of the fit
y_range_log = 10**hill_func_log_regular(x_range_log, *hill_params)
# NOTE(review): inverse_transfer is called with linear-scale stability and its
# result is log10-transformed here - confirm against library2_utils.
plt.plot(np.log10(y_range_log), np.log10(inverse_transfer(y_range_log, *hill_params)), color="tab:blue", linewidth=1.5,
         ls="--", label="t$^{-1}$(x)", zorder=1)
for miRNA in miRNAs:
    plt.scatter(df_knockdown.loc[miRNA], np.log10(inverse_transfer(10**df_knockdown.loc[miRNA], *hill_params)), color="black", s=12, marker="x", zorder=2)
# The axis labels used to be set twice; only the second (schematic) pair was
# ever visible, so the overwritten log10 labels were removed.
plt.xlabel(r"meas. stability")
plt.ylabel(r"miRNA expr.")
plt.xticks([])
plt.yticks([])
plt.legend(loc="lower left", fontsize=7)
plt.tight_layout()
plt.savefig(f"{plot_folder}/3.1_global_fit_wo_scale/3.1_inverse_transfer_function.svg", dpi=300, bbox_inches='tight')

# 3.2 - Fit to scaled expression data

In [101]:
# Rebuild the stacked fit inputs: expression (x), knockdown (y) and, for each
# entry, the position of its cell line in cell_lines_measured.
x_chunks, y_chunks, index_chunks = [], [], []
for i, cell_line in enumerate(cell_lines_measured):
    expression_values = df_expression[cell_line].values
    x_chunks.append(expression_values)
    y_chunks.append(df_knockdown[cell_line].values)
    index_chunks.append([i] * len(expression_values))

x_data = np.concatenate(x_chunks)
y_data = np.concatenate(y_chunks)
dataset_indices = np.concatenate(index_chunks)

In [102]:
# Initial guesses and bounds for the two shared (log10) parameters c1 and c2.
p0 = [3, 10]
num_params = len(p0)
bounds = ([1, 9.99], [10, 10.01])

# One free scale per cell line, allowed to vary within [-2, 2] ...
n_cell_lines = len(cell_lines_measured)
scale_guesses = [0] * n_cell_lines
scale_bounds_min = [-2] * n_cell_lines
scale_bounds_max = [2] * n_cell_lines

# ... except for the first cell line, which is pinned to ~0 as the reference
# (the original comment names HEK293T; this relies on it being first in
# cell_lines_measured)
scale_bounds_min[0] = -0.001
scale_bounds_max[0] = 0.001

# full parameter vector: shared parameters first, then the scales
p0_scale = p0 + scale_guesses
lower_bounds = bounds[0] + scale_bounds_min
upper_bounds = bounds[1] + scale_bounds_max
bounds_scale = (lower_bounds, upper_bounds)

popt_scales, pcov = opt.curve_fit(
    lambda x, *params: hill_func_log_scales(x, dataset_indices, *params),
    x_data,
    y_data,
    p0=p0_scale,
    bounds=bounds_scale,
    maxfev=5000
)
# second name for the same optimum, as bound by the original chained assignment
popt_scales_filter = popt_scales

# split the optimum back into shared parameters and per-cell-line scales
hill_params = popt_scales[:num_params]
scales = list(popt_scales[num_params:])

In [None]:
# Map each cell line to its fitted scale and display the mapping.
scale_dict = dict(zip(cell_lines_measured, scales))
scale_dict

In [104]:
# Pickle the shared fit parameters and the per-cell-line scales of the 3.2 fit.
popt_path = os.path.join(output_folder, f"{used_mirna_name}_popt.pkl")
with open(popt_path, "wb") as f:
    pickle.dump(hill_params, f)

scale_dict_path = os.path.join(output_folder, f"{used_mirna_name}_scale_dict.pkl")
with open(scale_dict_path, "wb") as f:
    pickle.dump(scale_dict, f)

### Plot the global fit data individually

In [105]:
%%capture output
# Residuals (measured minus fitted log10 stability), one column per cell line.
df_deviation = pd.DataFrame(columns=df_knockdown.columns, index=df_knockdown.index)

# 5 x 2 grid of panels (one per cell line, up to ten) with shared axes
n_rows, n_cols = 5, 2
fig, axs = plt.subplots(n_rows, n_cols, figsize=(4, 6), sharex=True, sharey=True)

plt.xlim(0, 5.5)
plt.ylim(-1.7, 0.25)

# Create the plot folder once, before the loop: it does not depend on the
# cell line, and this way it also exists if the loop body never runs.
os.makedirs(f"{plot_folder}/3.2_global_fit", exist_ok=True)

# the fitted curve is the same in every panel, so evaluate it only once
fit_curve = hill_func_log_regular(x_range_log, *hill_params)

for i, cell_line in enumerate(cell_lines_measured):
    current_scale = scales[i]
    ax = axs[i // n_cols, i % n_cols]

    # evaluate the model once and reuse it for r2, rmsd and the residuals
    expression_scaled = df_expression[cell_line] + current_scale
    measured = df_knockdown[cell_line]
    predicted = hill_func_log_regular(expression_scaled, *hill_params)

    ax.scatter(expression_scaled, measured, s=5, color="tab:blue", rasterized=True)
    ax.plot(x_range_log, fit_curve, color="black", linewidth=1.5, label="fit", ls="--")

    # squared Pearson correlation between measurement and model
    r2 = stats.pearsonr(measured, predicted)[0]**2

    # root-mean-square deviation between measurement and model
    rmsd = np.sqrt(np.mean((measured - predicted)**2))

    # record both metrics in the row of this fitting approach
    correlation_dataframe.loc["3.2_global_fit", cell_line] = r2
    rmsd_dataframe.loc["3.2_global_fit", cell_line] = rmsd

    df_deviation[cell_line] = measured - predicted

    # x labels only on the bottom row, y labels only in the left column
    if i // n_cols == n_rows - 1:
        ax.set_xlabel(r"log$_{10}$(miRNA expression)")
    if i % n_cols == 0:
        ax.set_ylabel(r"log$_{10}$(stability)")
    ax.set_title(f"{cell_line}, r2 = {round(r2, 2)}, rmsd = {round(rmsd, 3)}", fontsize=8)

plt.tight_layout()
# `fmt` instead of `format`, so the builtin is not shadowed for later cells
for fmt in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/3.2_global_fit/3.2_global_fit_with_scale.{fmt}", dpi=600)

### plot the data as a single plot

In [106]:
%%capture output
# All cell lines of the 3.2 (scaled) fit in a single panel.
fig = plt.figure(figsize=(2.5, 1.7))
label_mirna = "hsa-miR-100-5p"

# one scatter series per cell line, shifted by its fitted scale
for i, cell_line in enumerate(cell_lines_measured):
    current_scale = scales[i]
    plt.scatter(df_expression[cell_line]+current_scale, df_knockdown[cell_line], color=cell_line_colors[cell_line],
        s=3, marker=cell_line_symbols[cell_line], rasterized=True, label=f"{cell_line}")

# Fitted transfer function. The former second loop only drew this curve once;
# its r2 and label text were computed but never used and have been removed.
plt.plot(x_range_log, hill_func_log_regular(x_range_log, *hill_params),
         color="black", linewidth=1, ls="--")

plt.xlabel(r"log$_{10}$"+"(miRNA expression)")
plt.ylabel(r"log$_{10}$(stability)")

plt.xlim(0, 5.5)
plt.ylim(-1.7, 0.25)
plt.tight_layout()
plt.legend(loc="lower left", frameon=False, fontsize=6.5)
# `fmt` instead of `format`, so the builtin is not shadowed for later cells
for fmt in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/3.2_global_fit/global_fit_with_scale_single_plot.{fmt}", dpi=600, bbox_inches='tight')

In [None]:
# One-column heatmap of the r^2 values of the 3.2 fit (one row per cell line).
fig, ax = plt.subplots(figsize=(0.3, 1.65))
# Select the row by its label instead of by position (iloc[1:2]): the
# positional slice is empty or wrong whenever the 3.1 section was not
# executed first.
r2_global_fit = correlation_dataframe.loc[["3.2_global_fit"]].astype('float').T
sns.heatmap(r2_global_fit, cmap="viridis", annot=True, fmt=".2f", vmin=0.2, vmax=0.8, ax=ax,
            annot_kws={"size": 6},
            cbar_kws={'label': r'$r^2$', 'shrink': 1.4, 'aspect': 25, 'pad': 0.02, 'ticks': [0.2, 0.4, 0.6, 0.8]})
plt.xticks([])
plt.yticks([])
# `file_format` instead of `format`, so the builtin is not shadowed
for file_format in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/3.2_global_fit/3.2_correlations.{file_format}", dpi=600, bbox_inches='tight')

In [108]:
%%capture output
# Highlight a few named miRNAs on top of the 3.2 (scaled) fit: the marker
# shape encodes the miRNA, the colour encodes the cell line.
fig = plt.figure(figsize=(2.0, 1.5))
label_mirnas = ["hsa-miR-100-5p", "hsa-let-7g-5p", "hsa-miR-218-5p", "hsa-miR-122-5p"]
markers = ["x", "o", "s", "D"]

plt.plot(x_range_log, hill_func_log_regular(x_range_log,
                *hill_params), color="black", linewidth=1, ls="--")

for i, cell_line in enumerate(cell_lines_measured):
    current_scale = scales[i]
    # draw one marker per highlighted miRNA that is present in the data
    for j, label_mirna in enumerate(label_mirnas):
        if label_mirna in df_expression.index:
            plt.scatter(df_expression.loc[label_mirna, cell_line]+current_scale, df_knockdown.loc[label_mirna, cell_line],
                color=cell_line_colors[cell_line], s=8, marker=markers[j], rasterized=False)

plt.xlabel(r"log$_{10}$"+"(miRNA expression)")
plt.ylabel(r"log$_{10}$(stability)")
plt.xticks([1,2,3,4,5])

# hand-made legend: one black marker plus the miRNA name (without the "hsa-"
# prefix) per row, stacked downwards in steps of `shift`
shift = -0.25
for i, label_mirna in enumerate(label_mirnas):
    plt.scatter(1.3, -0.8+i*shift, color="black", s=8, marker=markers[i])
    plt.text(1.5, -0.85+i*shift, f"{'-'.join(label_mirna.split('-')[1:])}", fontsize=7, ha="left")

plt.xlim(1, 5.5)
plt.ylim(-1.7, 0.25)
plt.tight_layout()
# `fmt` instead of `format`, so the builtin is not shadowed for later cells
for fmt in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/3.2_global_fit/global_fit_with_scale_label_mirnas.{fmt}", dpi=600, bbox_inches='tight')

In [109]:
# Write the residuals of the scaled global fit (all miRNAs) to disk.
deviation_all_path = os.path.join(output_folder, f"{used_mirna_name}_deviation_all_mirnas.csv")
df_deviation.to_csv(deviation_all_path)

# 3.2.2 - Look at tissue data microRNAs
This should only be executed after 15_merge_tissue_datasets.ipynb

In [110]:
def _read_lines(path):
    """Return the lines of the text file at ``path`` with trailing whitespace removed."""
    with open(path, "r") as f:
        return [line.rstrip() for line in f]


# The five lists below were read by five copies of the same loop; they now
# share one helper. Each file is read as one entry per line.
input_folder = "../microrna_data/15_human_data_merge/processed"
ngs_larger = _read_lines(os.path.join(input_folder, "ngs_larger_in_multiple.txt"))
micro_larger = _read_lines(os.path.join(input_folder, "micro_larger_in_multiple.txt"))
tissue_mirnas = _read_lines(os.path.join(input_folder, "tissue_dataset_mirnas.txt"))
alles_larger = _read_lines(os.path.join(input_folder, "alles_larger_in_multiple.txt"))
keller_larger = _read_lines(os.path.join(input_folder, "keller_larger_in_multiple.txt"))

### Plot consistent outliers in the Alles-Keller data

In [111]:
# Working copy of the expression table without "hsa-miR-3613-3p" ...
is_excluded = df_expression.index.isin(["hsa-miR-3613-3p"])
df_expression_cell_lines = df_expression.loc[~is_excluded].copy()

# ... with the fitted per-cell-line scale added to every column.
for cell_line in cell_lines_measured:
    df_expression_cell_lines[cell_line] += scale_dict[cell_line]

In [112]:
# filter out potential crosstalk miRNAs
# load the crosstalk dict
# NOTE: this rebinds `input_folder`, which an earlier cell pointed at the
# tissue-merge folder.
input_folder = "../outputs/5_mutations"

# NOTE(review): pickle can execute arbitrary code - only load trusted files.
with open(f"{input_folder}/5.7_crosstalk_filter_dict.pkl", "rb") as f:
    crosstalk_filter_dict = pickle.load(f)

# Project helper; returns the (replaced) expression table and `groups`.
# presumably it collapses miRNAs with identical sequence - TODO confirm
# against library2_utils.crosstalk.
from library2_utils.crosstalk import merge_identical_mirnas
df_expression_cell_lines, groups = merge_identical_mirnas(df_expression_cell_lines, mirbase)

# For every cell line keep only the miRNAs that are not listed in its
# crosstalk entry.
allowed_mirnas_all = {}
for cell_line in cell_lines_measured:
    allowed_mirnas = []
    crosstalk_dict = crosstalk_filter_dict[cell_line]
    allowed_mirnas_all[cell_line] = list(df_expression_cell_lines.index.difference(crosstalk_dict))  
    # The right-hand side only contains the allowed miRNAs; pandas aligns it on
    # the index, so every other row of this column becomes NaN.
    df_expression_cell_lines.loc[:, cell_line] = df_expression_cell_lines.loc[allowed_mirnas_all[cell_line], cell_line]
 
# dropna() removes every miRNA that was set to NaN in at least one cell line
df_expression_cell_lines = df_expression_cell_lines.dropna()
# knockdown values restricted to the surviving miRNAs
df_knockdown_cell_lines = df_knockdown.copy()
df_knockdown_cell_lines = df_knockdown_cell_lines.loc[df_expression_cell_lines.index]

# restrict both disagreement lists to the surviving miRNAs; everything else is "other"
alles_larger = [miRNA for miRNA in alles_larger if miRNA in df_expression_cell_lines.index]
keller_larger = [miRNA for miRNA in keller_larger if miRNA in df_expression_cell_lines.index]
other_mirnas = df_expression_cell_lines.index.difference(alles_larger+keller_larger)

In [113]:
%%capture output
# Scaled expression vs. measured stability for all cell lines, highlighting
# the miRNAs listed in keller_larger / alles_larger, plus the residuals of
# every miRNA with respect to the fitted transfer function.
fig = plt.figure(figsize=(2.5, 1.7))
df_deviation = pd.DataFrame(index=df_expression_cell_lines.index, columns=df_expression_cell_lines.columns)
for i, cell_line in enumerate(cell_lines_measured):
    # ignore poorly behaving cell lines:
    if cell_line == "JEG3" or cell_line == "Tera1":
        continue

    # df_expression_cell_lines ALREADY contains the fitted scale of this cell
    # line (scale_dict[cell_line] was added when the table was built), so it
    # must not be added again anywhere in this cell.
    expression_scaled = df_expression_cell_lines[cell_line]

    plt.scatter(expression_scaled.loc[other_mirnas],
        df_knockdown[cell_line].loc[other_mirnas],
        color="tab:blue",
        s=3,
        alpha=1,
        edgecolors="none",
        rasterized=True,
        zorder = 1)

    plt.scatter(expression_scaled.loc[keller_larger],
        df_knockdown[cell_line].loc[keller_larger],
        color="tab:red",
        label="consistent NGS >> microarray" if i == 0 else None,
        s=5,
        alpha=1,
        edgecolors="none",
        rasterized=True,
        zorder = 2)

    plt.scatter(expression_scaled.loc[alles_larger],
        df_knockdown[cell_line].loc[alles_larger],
        color="tab:orange",
        label="consistent microarray >> NGS" if i == 0 else None,
        s=5,
        alpha=1,
        edgecolors="none",
        rasterized=True,
        zorder = 3)

    if i == 0:
        plt.plot(x_range_log, hill_func_log_regular(x_range_log,
                *hill_params), color="black", linewidth=1, ls="--")

    # BUG FIX: the residual used to be computed at `expression + scale` even
    # though the expression was already scaled, i.e. the scale was applied
    # twice. The residual is now evaluated at the plotted x position,
    # consistent with the tissue residuals computed further below.
    df_deviation.loc[:,cell_line] = df_knockdown_cell_lines.loc[df_expression_cell_lines.index, cell_line] \
        - hill_func_log_regular(expression_scaled, *hill_params)

translate_dataset_name = {
    "Alles2019": "microarray",
    "Keller2023": "NGS",
}

plt.xlabel(r"log$_{10}$"+f"(miRNA expression ({translate_dataset_name[used_mirna_name]}))")
plt.ylabel(r"log$_{10}$(stability)")

plt.xlim(0, 5.5)
plt.ylim(-1.7, 0.25)
plt.tight_layout()
plt.legend(loc="lower left", frameon=False, fontsize=6.5)
# `fmt` instead of `format`, so the builtin is not shadowed for later cells
for fmt in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/3.2_global_fit/3.2.2_outliers_alles_keller_mirnas_plot.{fmt}", dpi=600, bbox_inches='tight')

In [114]:
# Write the cell-line residuals (crosstalk-filtered miRNAs) to disk.
deviation_cell_line_path = os.path.join(
    output_folder, f"{used_mirna_name}_deviation_cell_line_consistent_outliers.csv"
)
df_deviation.to_csv(deviation_cell_line_path)

### Consistent outliers tissue

In [115]:
# Restrict expression and knockdown to the miRNAs listed in tissue_mirnas.
expression_in_tissue = df_expression_cell_lines.index.isin(tissue_mirnas)
df_expression_tissues = df_expression_cell_lines.loc[expression_in_tissue]

knockdown_in_tissue = df_knockdown_cell_lines.index.isin(tissue_mirnas)
df_knockdown_tissues = df_knockdown_cell_lines.loc[knockdown_in_tissue]

In [116]:
# Keep only the list entries that are present in the tissue expression table;
# every remaining miRNA of that table counts as "other".
micro_larger = list(filter(lambda name: name in df_expression_tissues.index, micro_larger))
ngs_larger = list(filter(lambda name: name in df_expression_tissues.index, ngs_larger))
highlighted_mirnas = micro_larger + ngs_larger
other_mirnas = df_expression_tissues.index.difference(highlighted_mirnas)

In [117]:
%%capture output
# Same plot as for the cell-line outliers, restricted to the tissue miRNAs and
# highlighting the entries of ngs_larger / micro_larger.
df_deviation = pd.DataFrame(index=df_expression_tissues.index, columns=df_expression_tissues.columns)
fig = plt.figure(figsize=(2.5, 1.7))
for i, cell_line in enumerate(cell_lines_measured):
    # ignore poorly behaving cell lines:
    if cell_line == "JEG3" or cell_line == "Tera1":
        continue

    # df_expression_tissues is derived from the already scaled expression
    # table, so no additional scale is applied here (the previously assigned
    # `current_scale` was unused and has been removed).
    expression_scaled = df_expression_tissues[cell_line]
    knockdown = df_knockdown_tissues[cell_line]

    plt.scatter(expression_scaled.loc[other_mirnas],
        knockdown.loc[other_mirnas],
        color="tab:blue",
        s=3,
        edgecolors="none",
        rasterized=True,
        zorder = 1)

    plt.scatter(expression_scaled.loc[ngs_larger],
        knockdown.loc[ngs_larger],
        color="tab:red",
        label="consistent NGS >> microarray" if i == 0 else None,
        s=5,
        edgecolors="none",
        rasterized=True,
        zorder = 2)

    plt.scatter(expression_scaled.loc[micro_larger],
        knockdown.loc[micro_larger],
        color="tab:orange",
        label="consistent microarray >> NGS" if i == 0 else None,
        s=5,
        edgecolors="none",
        rasterized=True,
        zorder = 3)

    if i == 0:
        plt.plot(x_range_log, hill_func_log_regular(x_range_log,
                *hill_params), color="black", linewidth=1, ls="--")

    # residual of every tissue miRNA with respect to the fitted curve
    df_deviation.loc[:,cell_line] = knockdown \
        - hill_func_log_regular(expression_scaled, *hill_params)

translate_dataset_name = {
    "Alles2019": "microarray",
    "Keller2023": "NGS",
}

plt.xlabel(r"log$_{10}$"+f"(miRNA expression ({translate_dataset_name[used_mirna_name]}))")
plt.ylabel(r"log$_{10}$(stability)")

plt.xlim(0, 5.5)
plt.ylim(-1.7, 0.25)
plt.tight_layout()
plt.legend(loc="lower left", frameon=False, fontsize=6.5)
# `fmt` instead of `format`, so the builtin is not shadowed for later cells
for fmt in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/3.2_global_fit/3.2.2_tissue_mirnas_plot.{fmt}", dpi=600, bbox_inches='tight')

In [118]:
# Write the tissue-miRNA residuals to disk.
deviation_tissue_path = os.path.join(output_folder, f"{used_mirna_name}_deviation_tissue_mirnas.csv")
df_deviation.to_csv(deviation_tissue_path)

### Try to infer correctness
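The code above needs to be run twice - once with `used_mirna_data = df_alles` / `used_mirna_name = "Alles2019"` and once with `used_mirna_data = df_keller` / `used_mirna_name = "Keller2023"` - so that the deviation files of both datasets exist before the cells below are executed.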

In [154]:
# Which residual tables to compare: "cell_line" or "tissue".
current_data_type = "cell_line"

# file name suffix of the residual table for each supported data type
deviation_file_suffixes = {
    "tissue": "deviation_tissue_mirnas.csv",
    "cell_line": "deviation_cell_line_consistent_outliers.csv",
}
# Fail early with a clear message; an unknown value used to leave both frames
# undefined and only surfaced later as a NameError.
if current_data_type not in deviation_file_suffixes:
    raise ValueError(
        f"current_data_type must be one of {sorted(deviation_file_suffixes)}, "
        f"got {current_data_type!r}"
    )
deviation_file_suffix = deviation_file_suffixes[current_data_type]

# residual tables written by the two runs of the code above
df_deviation_keller = pd.read_csv(os.path.join('../outputs/3_fitting/Keller2023',
                                               f"Keller2023_{deviation_file_suffix}"), index_col=0)
df_deviation_alles = pd.read_csv(os.path.join('../outputs/3_fitting/Alles2019',
                                              f"Alles2019_{deviation_file_suffix}"), index_col=0)

In [155]:
# Only the magnitude of the residuals matters for the comparison below.
df_deviation_keller, df_deviation_alles = (
    df_deviation_keller.abs(),
    df_deviation_alles.abs(),
)

In [156]:
# Exclude the two cell lines that were skipped in the plots above.
excluded_cell_lines = ["Tera1", "JEG3"]
df_deviation_alles = df_deviation_alles.drop(columns=excluded_cell_lines)
df_deviation_keller = df_deviation_keller.drop(columns=excluded_cell_lines)

In [160]:
# Element-wise |deviation(Alles)| - |deviation(Keller)|.
# Positive values: the Alles-based fit deviates more strongly for that miRNA / cell line.
df_diff = df_deviation_alles.sub(df_deviation_keller)

In [161]:
# Concatenate the two miRNA lists (defined earlier in the notebook) that belong to the
# data type under inspection.
if current_data_type == "tissue":
    decision_mirnas = ngs_larger + micro_larger
elif current_data_type == "cell_line":
    decision_mirnas = alles_larger + keller_larger

# df_diff = df_diff.loc[decision_mirnas]

In [None]:
# Collapse the miRNA x cell-line table into one long column of differences.
df_diff_flat = df_diff.melt()

plt.figure(figsize=(2, 1.4))
difference_bins = np.arange(-1, 1, 0.05)
plt.hist(x=df_diff_flat["value"], color="skyblue", edgecolor="black", bins=difference_bins)
# log-scaled counts so that the sparse tails stay visible
plt.yscale('log')
plt.xlabel("abs(deviation(micro)) - abs(deviation(ngs))")
plt.ylabel("Count")

# Dashed lines mark the +/-0.2 thresholds used for the per-dataset "wrong" calls below.
for threshold in (-0.2, 0.2):
    plt.axvline(x=threshold, color="tab:red", linestyle="--", linewidth=1.5)
for text_x, text_label, text_align in ((-0.25, "ngs wrong", "right"), (0.25, "micro wrong", "left")):
    plt.text(text_x, 1000, text_label, fontsize=7, ha=text_align, color="tab:red")

plt.savefig(f"{plot_folder}/3.2_global_fit/3.2.2_deviation_difference_histogram_{current_data_type}.svg", dpi=600, bbox_inches='tight')

In [140]:
# True where the Alles-based deviation exceeds the Keller-based one by more than 0.2
# (Alles judged wrong for that miRNA / cell line) ...
df_diff_bool_alles = df_diff.gt(0.2)
# ... and the mirror case, where Keller is judged wrong.
df_diff_bool_keller = df_diff.lt(-0.2)

In [141]:
# Per miRNA: in how many cell lines each dataset was judged wrong.
df_diff_bool_alles_sum = df_diff_bool_alles.sum(axis="columns")
df_diff_bool_keller_sum = df_diff_bool_keller.sum(axis="columns")

In [142]:
# Net call per miRNA: positive when Alles is wrong in more cell lines than Keller,
# negative for the opposite.
df_call = df_diff_bool_alles_sum.sub(df_diff_bool_keller_sum)
if current_data_type == "tissue":
    # keep only the miRNAs listed in tissue_mirnas
    df_call = df_call.loc[df_call.index.isin(tissue_mirnas)]

In [None]:
# Histogram of the net call per miRNA, one unit-wide bin per possible count.
# NOTE(review): the bin edges start at 0, so negative calls (Keller wrong more often)
# fall outside the histogram - confirm that this is intended.
n_compared_cell_lines = len(df_diff_bool_alles.columns)
call_bin_edges = np.arange(0, n_compared_cell_lines + 1, 1)

plt.figure(figsize=(1.4, 1.4))
plt.hist(x=df_call, color="skyblue", edgecolor="black", bins=call_bin_edges)
plt.xlabel("Consistent deviation difference")
# centre each tick label under its bin
plt.xticks(call_bin_edges + 0.5, labels=call_bin_edges)
plt.xlim(-0.5, n_compared_cell_lines + 1.5)
plt.yscale('log')
plt.ylabel("Count")
plt.savefig(f"{plot_folder}/3.2_global_fit/3.2.2_deviation_consistent_diff_count_{current_data_type}.svg", dpi=600, bbox_inches='tight')

In [None]:
# miRNAs for which Alles is wrong in more cell lines than Keller; show how often each net count occurs.
df_call_alles_wrong = df_call.loc[df_call > 0]
df_call_alles_wrong.value_counts()

In [None]:
# miRNAs for which Keller is wrong in more cell lines than Alles; show how often each net count occurs.
df_call_keller_wrong = df_call.loc[df_call < 0]
df_call_keller_wrong.value_counts()

In [147]:
# Partition the miRNAs in df_call into four groups, based on membership in decision_mirnas
# and on the sign of the net call:
# - decision_ngs_wrong:   in decision_mirnas and Keller (NGS) is wrong more often (call < 0)
# - decision_micro_wrong: in decision_mirnas and Alles (microarray) is wrong more often (call > 0)
# - undecided:            in decision_mirnas but in neither of the two groups above
# - all_others:           everything in df_call that is not in decision_mirnas
decision_ngs_wrong = df_call.index.intersection(df_call_keller_wrong.index.intersection(decision_mirnas))
decision_micro_wrong = df_call.index.intersection(df_call_alles_wrong.index.intersection(decision_mirnas))
undecided = df_call.index.intersection(set(decision_mirnas).difference(decision_ngs_wrong.union(decision_micro_wrong)))
all_others = df_call.index.difference(decision_mirnas)

In [148]:
%%capture output
# Overlay all cell lines in one panel and colour the decision miRNAs by the verdict reached above.
fig = plt.figure(figsize=(2.5, 1.7))

# (miRNA index, colour, legend label, z-order) - drawn in this order for every cell line
decision_groups = [
    (undecided, "tab:gray", "undecided", 1),
    (decision_ngs_wrong, "tab:red", "ngs wrong", 2),
    (decision_micro_wrong, "tab:orange", "microarray wrong", 3),
]

for i, cell_line in enumerate(cell_lines_measured):
    # ignore poorly behaving cell lines:
    if cell_line in ("JEG3", "Tera1"):
        continue

    for group_mirnas, group_color, group_label, group_zorder in decision_groups:
        plt.scatter(
            df_expression_cell_lines[cell_line].loc[group_mirnas],
            df_knockdown_cell_lines[cell_line].loc[group_mirnas],
            color=group_color,
            # label only the first cell line so every group appears once in the legend
            label=group_label if i == 0 else None,
            s=5,
            edgecolors="none",
            rasterized=True,
            zorder=group_zorder,
        )

    # the fitted curve is shared by all cell lines, so draw it once
    if i == 0:
        plt.plot(x_range_log, hill_func_log_regular(x_range_log,
                *hill_params), color="black", linewidth=1, ls="--")

translate_dataset_name = {
    "Alles2019": "microarray",
    "Keller2023": "NGS",
}

plt.xlabel(r"log$_{10}$"+f"(miRNA expression ({translate_dataset_name[used_mirna_name]}))")
plt.ylabel(r"log$_{10}$(stability)")

plt.xlim(0, 5.5)
plt.ylim(-1.7, 0.25)
plt.tight_layout()
plt.legend(loc="lower left", frameon=False, fontsize=6.5)
# `fmt` rather than `format`, so the builtin is not shadowed for the rest of the kernel session
for fmt in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/3.2_global_fit/3.2.2_outliers_{current_data_type}_decision.{fmt}", dpi=600, bbox_inches='tight')

In [149]:
%%capture output
# Plot the undecided miRNAs for every cell line and label the points whose name contains "320c".
fig = plt.figure(figsize=(2.5, 1.7))
for i, cell_line in enumerate(cell_lines_measured):
    # ignore poorly behaving cell lines:
    if cell_line in ("JEG3", "Tera1"):
        continue

    plt.scatter(df_expression_cell_lines[cell_line].loc[undecided],
        df_knockdown_cell_lines[cell_line].loc[undecided],
        color="tab:gray",
        label="undecided" if i == 0 else None,
        s=3,
        edgecolors="none",
        rasterized=True,
        zorder=1)

    # add text with the miRNA name
    for mirna in undecided:
        if "320c" in mirna:
            # x and y must both be the scalar values of this one miRNA; the x position
            # previously received the whole `undecided` Series.
            plt.text(df_expression_cell_lines.loc[mirna, cell_line],
                     df_knockdown_cell_lines.loc[mirna, cell_line],
                     "-".join(mirna.split("-")[2:]), fontsize=4, ha="left", va="center", color="black", zorder=2)

    # the fitted curve is shared by all cell lines, so draw it once
    if i == 0:
        plt.plot(x_range_log, hill_func_log_regular(x_range_log,
                *hill_params), color="black", linewidth=1, ls="--")

# translate_dataset_name is defined by the preceding plotting cell
plt.xlabel(r"log$_{10}$"+f"(miRNA expression ({translate_dataset_name[used_mirna_name]}))")
plt.ylabel(r"log$_{10}$(stability)")

plt.xlim(0, 5.5)
plt.ylim(-1.7, 0.25)
plt.tight_layout()
plt.legend(loc="lower left", frameon=False, fontsize=6.5)
plt.savefig(f"{plot_folder}/3.2_global_fit/3.2.2_outliers_{current_data_type}_decision_label_mirnas.png", dpi=600, bbox_inches='tight')

In [150]:
# Write the two call tables into a sub-folder named after the data type that was analysed.
calls_subfolder_by_type = {
    "tissue": "tissue_mirna_calls",
    "cell_line": "cell_line_mirna_calls",
}
if current_data_type in calls_subfolder_by_type:
    calls_dir = os.path.join(output_folder, calls_subfolder_by_type[current_data_type])
    os.makedirs(calls_dir, exist_ok=True)
    df_call_alles_wrong.to_csv(os.path.join(calls_dir, "calls_alles.csv"))
    df_call_keller_wrong.to_csv(os.path.join(calls_dir, "calls_keller.csv"))

# 3.3 - Plot outliers

In [None]:
# these are miRNAs which are used particularly often in this library
# can be used to check if these deviate more than others
# mut_mirs = ['hsa-let-7a-5p', 'hsa-let-7i-5p', 'hsa-miR-16-5p', 'hsa-miR-19b-3p',
#  'hsa-miR-21-5p', 'hsa-miR-22-3p', 'hsa-miR-23a-3p', 'hsa-miR-24-3p',
#  'hsa-miR-31-3p', 'hsa-miR-31-5p', 'hsa-miR-365a-3p', 'hsa-miR-107']

In [None]:
%%capture output
# One figure per cell line: all miRNAs, the fitted curve, and name labels for the ten
# miRNAs with the largest absolute deviation from the fit.

# create the plot folder if it doesn't exist (once, not once per cell line)
os.makedirs(f"{plot_folder}/3.3_outliers", exist_ok=True)

for cell_line_index, cell_line in enumerate(cell_lines_measured):
    # scales holds one fitted x-offset per entry of cell_lines_measured
    current_scale = scales[cell_line_index]

    # ten largest absolute deviations for this cell line
    deviation = df_deviation[cell_line].abs().sort_values(ascending=False).iloc[:10]

    # plot the data
    fig = plt.figure(figsize=(2.2, 1.8))
    plt.scatter(df_expression[cell_line]+current_scale, df_knockdown[cell_line], s=5, color="tab:blue", rasterized=True)
    plt.plot(x_range_log+current_scale,
            hill_func_log_regular(x_range_log+current_scale, *hill_params), color="black", linewidth=1.5, label="fit", rasterized=True)

    # add text with the miRNA names (name without the leading species / type prefix)
    for mirna in deviation.index.to_list():
        plt.text(df_expression.loc[mirna, cell_line]+current_scale, df_knockdown.loc[mirna, cell_line],
                 "-".join(mirna.split("-")[2:]), fontsize=8)

    plt.xlabel("miRNA expression")
    plt.ylabel(r"log$_{10}$(stability)")
    plt.title(f"{cell_line}", fontsize=8)

    plt.xlim(0, 5.5)
    plt.ylim(-1.7, 0.25)

    plt.tight_layout()
    # `fmt` rather than `format`, so the builtin is not shadowed for the rest of the kernel session
    for fmt in ["png", "svg"]:
        plt.savefig(os.path.join(plot_folder, f"3.3_outliers/outliers_{cell_line}.{fmt}"), dpi=600)

# 3.4 - Do low mRNA counts explain outliers?

In [None]:
# Read the count / log2fc table and keep only the columns whose name contains "count".
raw_counts = (
    pd.read_csv('../measured_data/1_count_data/library2_count_and_log2fc_data.csv', index_col=0)
    .filter(regex="count")
)

In [None]:
# Design information for the single full-target-site constructs (high- and low-confidence sets).
mirna_info_df = pd.concat([reference_dict["1_mirna_full_single_high_conf"],
                                reference_dict["2_mirna_full_single_low_conf_mirgenedb"]])
single_mirna_index = mirna_info_df.index

# Restrict the counts to those designs, then re-key the rows by the miRNA each design targets.
# NOTE: this overwrites raw_counts, so the cell is not meant to be run twice without
# re-running the loading cell first.
raw_counts = raw_counts.loc[raw_counts.index.intersection(single_mirna_index), :]
raw_counts = (
    raw_counts
    .assign(miRNA=mirna_info_df.loc[raw_counts.index, "miRNA1"])
    .set_index("miRNA")
)

In [None]:
%%capture output
# Scatter the deviation from the fit against log10 of the raw sequencing counts, pooling
# all cell lines and both replicates.
# create the plot folder if it doesn't exist
os.makedirs(f"{plot_folder}/3.4_counts_vs_deviation", exist_ok=True)

fig = plt.figure(figsize=(2.2, 1.6))
all_points_x = []
all_poins_y = []
for cell_line in cell_lines_measured:
    for replicate in ["r1", "r2"]:
        # compute the log counts once and reuse them for plotting and pooling
        log_counts = np.log10(raw_counts.loc[df_deviation.index, f"count_{cell_line}_3UTR_{replicate}"])
        plt.scatter(log_counts, df_deviation[cell_line],
                    s=3, color="tab:blue", rasterized=True, edgecolor="none", alpha=0.5)
        all_points_x.extend(log_counts)
        all_poins_y.extend(df_deviation[cell_line])

# pooled points, kept for the (currently disabled) correlation below
all_points_x = np.array(all_points_x)
all_poins_y = np.array(all_poins_y)
# r2 = stats.pearsonr(all_points_x, all_poins_y)[0]**2

# plt.text(1.1, 0.7, f"r$^2$={round(r2, 2)}", fontsize=8)
plt.xticks(np.arange(1, 5, 1))
plt.xlabel(r"log$_{10}$(RNA counts in sequencing data)")
plt.ylabel("deviation from fit")

plt.tight_layout()
# `fmt` rather than `format`, so the builtin is not shadowed for the rest of the kernel session
for fmt in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/3.4_counts_vs_deviation/3.4_counts_vs_deviation.{fmt}", dpi=300)

In [None]:
%%capture output
# Scatter log10(stability) against log10 of the raw sequencing counts, pooling all cell
# lines and both replicates.
# create the plot folder if it doesn't exist
os.makedirs(f"{plot_folder}/3.4_counts_vs_deviation", exist_ok=True)

fig = plt.figure(figsize=(2.2, 1.6))
for cell_line in cell_lines_measured:
    # stability values are the same for both replicates; only the counts differ
    knockdown_values = df_knockdown.loc[df_deviation.index, cell_line]
    for replicate in ["r1", "r2"]:
        plt.scatter(np.log10(raw_counts.loc[df_deviation.index, f"count_{cell_line}_3UTR_{replicate}"]),
                    knockdown_values, s=3, color="tab:blue",
                    rasterized=True, edgecolor="none", alpha=0.5)

plt.xticks(np.arange(1, 5, 1))
plt.xlabel(r"log$_{10}$(RNA counts in sequencing data)")
plt.ylabel(r"log$_{10}$(stability)")

plt.tight_layout()
# `fmt` rather than `format`, so the builtin is not shadowed for the rest of the kernel session
for fmt in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/3.4_counts_vs_deviation/3.4_counts_vs_knockdown.{fmt}", dpi=300)

# 3.5 - Does miRNA GC content explain outliers?

In [86]:
%%capture output
# Scatter the deviation from the fit against the miRNA GC content, pooling all cell lines.
# create the plot folder if it doesn't exist
os.makedirs(f"{plot_folder}/3.5_GC_content_vs_deviation", exist_ok=True)

fig = plt.figure(figsize=(2.2, 1.6))
# x values are identical for every cell line, so look them up once
# (scaled by 100 to match the 0-100 x-axis; presumably GC_content is stored as a fraction - TODO confirm)
gc_percent = mirbase.loc[df_deviation.index, "GC_content"]*100
for cell_line in cell_lines_measured:
    plt.scatter(gc_percent, df_deviation[cell_line], s=3, color="tab:blue",
                rasterized=True, edgecolor="none", alpha=0.5)

plt.xlabel("GC content in the microRNA")
plt.ylabel("deviation from fit")

# # add text with the miRNA names
# for i, row in df_deviation.iterrows():
# plt.text(mirbase.loc[i, "GC_content"]-0.05, row[cell_line], "-".join(i.split("-")[2:]), fontsize=8)
plt.xlim(0,100)

plt.tight_layout()
# `fmt` rather than `format`, so the builtin is not shadowed for the rest of the kernel session
for fmt in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/3.5_GC_content_vs_deviation/GC_content_vs_deviation.{fmt}", dpi=300)

In [87]:
%%capture output
# Scatter log10(stability) against the miRNA GC content, pooling all cell lines.
# create the plot folder if it doesn't exist
os.makedirs(f"{plot_folder}/3.5_GC_content_vs_deviation", exist_ok=True)

fig = plt.figure(figsize=(2.2, 1.6))
# x values are identical for every cell line, so look them up once
# (scaled by 100 to match the 0-100 x-axis; presumably GC_content is stored as a fraction - TODO confirm)
gc_percent = mirbase.loc[df_deviation.index, "GC_content"]*100
for cell_line in cell_lines_measured:
    plt.scatter(gc_percent, df_knockdown.loc[df_deviation.index, cell_line], s=3, color="tab:blue",
                rasterized=True, edgecolor="none", alpha=0.5)

plt.xlabel("GC content in the microRNA")
plt.ylabel(r"log$_{10}$(stability)")

# # add text with the miRNA names
# for i, row in df_deviation.iterrows():
# plt.text(mirbase.loc[i, "GC_content"]-0.05, row[cell_line], "-".join(i.split("-")[2:]), fontsize=8)
plt.xlim(0,100)

plt.tight_layout()
# `fmt` rather than `format`, so the builtin is not shadowed for the rest of the kernel session
for fmt in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/3.5_GC_content_vs_deviation/GC_content_vs_knockdown.{fmt}", dpi=300)

# 3.6 - Do weak polyA signals lead to undercounting?

In [88]:
# Load the poly(A)-signal design sheet and keep the rows whose id names one of the two
# single full-target-site sub-libraries.
polyA_designs = pd.read_excel("../design_info/polyA_signals.xlsx", index_col=0)
is_high_conf_design = polyA_designs.index.str.contains("1_mirna_full_single_high_conf")
is_low_conf_design = polyA_designs.index.str.contains("2_mirna_full_single_low_conf_mirgenedb")
polyA_designs = polyA_designs[is_high_conf_design | is_low_conf_design]

In [89]:
# Designs containing the canonical AATAAA signal were excluded from the library, so flag
# them and drop them here as well - they were never measured.
polyA_designs = polyA_designs.assign(AATAAA=polyA_designs["seq"].str.contains("AATAAA"))
polyA_designs = polyA_designs.loc[~polyA_designs["AATAAA"]]

In [90]:
# Stack both single-target sub-libraries into one design table and keep the design id as a column.
design_df = pd.concat(
    [reference_dict["1_mirna_full_single_high_conf"],
     reference_dict["2_mirna_full_single_low_conf_mirgenedb"]],
    axis=0,
)
design_df["id"] = design_df.index

# Re-key the poly(A) designs by the miRNA each design targets.
polyA_designs = (
    polyA_designs
    .assign(miRNA1=design_df.loc[polyA_designs.index, "miRNA1"])
    .set_index("miRNA1")
)

In [91]:
# Candidate poly(A) signal hexamers to look for in each design sequence.
polyA_signals = ["AATAAA", "ATTAAA", "AGTAAA", "TATAAA", "ACTAAA"]
# for each design, add whichever signal is found
# ("None" as a string marks designs in which none of the hexamers occurs)
polyA_designs["polyA_signal"] = "None"
for index, row in polyA_designs.iterrows():
    seq = row["seq"]
    # check if any of the polyA signals are in the sequence
    # There is no `break`: if several hexamers occur, the one listed last in polyA_signals wins.
    for signal in polyA_signals:
        if signal in seq:
            # label-based assignment: writes to every row that carries this index label
            polyA_designs.loc[index, "polyA_signal"] = signal

In [None]:
# Deviation from the fit for the miRNAs whose target carries a (partial) poly(A) signal.
polyA_deviation = df_deviation.loc[polyA_designs.index]
# create the plot folder if it doesn't exist
os.makedirs(os.path.join(plot_folder, "3.6_polyA_deviation"), exist_ok=True)

# plot a histogram over all miRNAs and cell lines
plt.figure(figsize=(2.4, 1.8))
plt.hist(polyA_deviation.values.flatten(), bins=20, color="skyblue")
plt.xlabel("deviation from fit")
plt.ylabel("count")
plt.title("weak poly(A)-signals\nin microRNA targets")

# add text labeling unique polyA signals
unique_signals = polyA_designs["polyA_signal"].unique()
signal_text = "\n".join(unique_signals)
plt.text(0.1, 20, f"poly(A) signals:\n{signal_text}", fontsize=7)

plt.tight_layout()
# `fmt` rather than `format`, so the builtin is not shadowed for the rest of the kernel session
for fmt in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/3.6_polyA_deviation/3.6_polyA_deviation_histogram.{fmt}", dpi=300)

There's nothing there! A partial polyA signal makes no difference whatsoever.

# 3.7 - Inspect family outliers

First, determine which microRNAs of each family are maximally expressed in each cell line

In [117]:
def get_family_mirnas(family):
    """Return the names (index labels of `mirbase`) of all miRNAs whose
    `family_extended` entry equals `family`, as a list."""
    in_family = mirbase['family_extended'] == family
    return mirbase.index[in_family].to_list()

In [118]:
# for each microRNA, find the family member with the highest expression
# get a list of family members
families = mirbase.loc[used_mirna_data.index, "family_extended"].unique()

# for each family, find the highest expressed miRNA across cell lines
# the output is a dataframe that only contains the highest expressed miRNA for each family
# all other values are NaN
family_max_df = pd.DataFrame(columns=cell_lines_measured, index=used_mirna_data.index)
for family in families:
    # get the microRNA names
    family_mirnas = get_family_mirnas(family)
    # keep only the family members that are present in the expression table
    df_family = used_mirna_data[used_mirna_data.index.isin(family_mirnas)]
    # NOTE(review): this iterates `cell_lines`, while the frame above is built with
    # `cell_lines_measured` columns - confirm that `cell_lines` is defined earlier in the
    # notebook and that the two lists are meant to differ.
    for cell_line in cell_lines:
        # copy only the value of the top-expressed member; everything else stays NaN
        max_id = df_family[cell_line].idxmax(axis=0)
        family_max_df.loc[max_id, cell_line] = df_family.loc[max_id, cell_line]

Look at let-7-5p, specifically

In [119]:
# names of all miRNAs annotated with family_extended == "let-7-5p" in mirbase
let7_family = get_family_mirnas("let-7-5p")

In [None]:
# Reference sequence: the first family member in mirbase order, written as RNA (T -> U).
# (presumably let-7a, going by the variable name - TODO confirm)
let7a_mirna = mirbase.loc[let7_family, "sequence_orig"].str.replace("T", "U").iloc[0]
# print the number of positions at which each of these differs from the first one
for mirna in let7_family:
    mirna_seq = mirbase.loc[mirna, "sequence_orig"].replace("T", "U")
    # zip stops at the shorter sequence, so a family member that is shorter than the
    # reference no longer raises IndexError; only the overlapping positions are compared.
    n_mismatches = sum(ref_base != base for ref_base, base in zip(let7a_mirna, mirna_seq))
    print(mirna, n_mismatches)

In [None]:
# Compare the two expression datasets for the let-7 family members across all cell lines.
# make the plot folder if it doesn't exist
os.makedirs(f"{plot_folder}/3.7_family_outliers", exist_ok=True)

# plot df_alles versus df_keller across cell line for these
mirnas = list(let7_family)
plt.figure(figsize=(2, 1.8))
data_x = []
data_y = []
for cell_line in cell_lines_measured:
    data_x.append(df_alles.loc[mirnas, cell_line])
    data_y.append(df_keller.loc[mirnas, cell_line])

# pool all cell lines once and reuse the arrays for plotting and for the correlation
microarray_values = np.concatenate(data_x)
sequencing_values = np.concatenate(data_y)

plt.scatter(microarray_values, sequencing_values, s=5, color="tab:blue", rasterized=False, edgecolor="none", alpha=1)
# squared Pearson correlation between the two datasets
r2 = stats.pearsonr(microarray_values, sequencing_values)[0]**2

plt.text(2, 4.5, f"r2 = {round(r2, 2)}", fontsize=7)

plt.title("5p-arm of the let-7 family", fontsize=7)
plt.xlabel(r"log$_{10}$(tpm microarray)")
plt.ylabel(r"log$_{10}$(tpm sequencing)")
plt.xticks(np.arange(2, 6, 1))
plt.yticks(np.arange(2, 6, 1))
plt.tight_layout()
# `fmt` rather than `format`, so the builtin is not shadowed for the rest of the kernel session
for fmt in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/3.7_family_outliers/3.7_let7_family_expression.{fmt}", dpi=300)

In [122]:
# Reference let-7 sequence (first family member, T -> U) without its last nucleotide.
let7_sequences_rna = mirbase.loc[let7_family, "sequence_orig"].str.replace("T", "U")
let7a_mirna = let7_sequences_rna.iloc[0][:-1]

# Target sequences (T -> U) of every family member except the first one.
let7_targets_rna = mirbase.loc[let7_family, "target"].str.replace("T", "U")
other_targets = let7_targets_rna.iloc[1:]

In [123]:
%%capture output
# 5 x 2 grid with shared axes: one panel per measured cell line, let-7 family members in red.
fig, axs = plt.subplots(5, 2, figsize=(4.5, 7), sharex=True, sharey=True)

# make the plot folder if it doesn't exist
os.makedirs(f"{plot_folder}/3.7_family_outliers", exist_ok=True)

# let-7 family members that are present in the data (identical for every cell line)
let7_family_filter = [mirna for mirna in let7_family if mirna in df_expression.index]

for i, cell_line in enumerate(cell_lines_measured):
    current_scale = scales[i]
    ax = axs[i//2, i%2]

    # all miRNAs, shifted by the fitted scale of this cell line
    ax.scatter(df_expression[cell_line]+current_scale, df_knockdown[cell_line], s=5, color="tab:blue", rasterized=True)

    # highlight the let-7 family members
    ax.scatter(df_expression[cell_line].loc[let7_family_filter]+current_scale,
               df_knockdown[cell_line].loc[let7_family_filter], color="red", s=6, rasterized=True)

    # add text with the abbreviated name, offset slightly below and to the left of the point
    for mirna in let7_family_filter:
        ax.text(
            df_expression[cell_line].loc[mirna] + current_scale - 0.15,
            df_knockdown[cell_line].loc[mirna] - 0.2,
            mirna.split("-")[2],
            fontsize=7,
            bbox=dict(facecolor='none', alpha=0.5, edgecolor='none', pad=0.1)
        )

    # plot the fitting curve
    ax.plot(x_range_log, hill_func_log_regular(x_range_log,
            *hill_params), color="black", linewidth=1.5, label="fit", ls="--")

    # set the limits on the panel itself rather than through the pyplot "current axes"
    ax.set_xlim(0, 5.5)
    ax.set_ylim(-1.7, 0.25)

    # axis labels only on the left column / bottom row
    if i % 2 == 0:
        ax.set_ylabel(r"log$_{10}$(stability)")
    if i // 2 == 4:
        ax.set_xlabel(r"log$_{10}$(miRNA expression)")
    ax.set_title(f"{cell_line}", fontsize=7)

plt.tight_layout()
# `fmt` rather than `format`, so the builtin is not shadowed for the rest of the kernel session
for fmt in ["png", "svg"]:
    plt.savefig(os.path.join(plot_folder, f"3.7_family_outliers/3.7_let7_mirnas.{fmt}"), dpi=600)

# 3.8 - Filter the crosstalk
### This requires input files generated in Notebook 5.

In [124]:
# Load the per-cell-line crosstalk filter written by Notebook 5.
# NOTE: pickle.load executes arbitrary code - only load files generated by this project.
input_folder = "../outputs/5_mutations"

with open(f"{input_folder}/5.7_crosstalk_filter_dict.pkl", "rb") as crosstalk_file:
    crosstalk_filter_dict = pickle.load(crosstalk_file)

In [125]:
# Work on copies so the unfiltered frames stay available to the later cells.
used_mirna_data_filter, df_expression_filter, df_knockdown_filter = (
    frame.copy() for frame in (used_mirna_data, df_expression, df_knockdown)
)

In [126]:
# make a list of allowed mirnas per cell line:
# every measured miRNA that is not listed in the crosstalk filter of that cell line
allowed_mirnas_all = {
    cell_line: list(df_knockdown_filter.index.difference(crosstalk_filter_dict[cell_line]))
    for cell_line in cell_lines_measured
}

In [127]:
# Assemble the flat input arrays for the global fit: expression (x), stability (y) and,
# for every point, the position of its cell line in cell_lines_measured.
x_chunks = []
y_chunks = []
index_chunks = []
allowed_mirnas_all_measured = {}
for i, cell_line in enumerate(cell_lines_measured):
    # check which miRNAs I actually measured of those allowed
    allowed_mirnas_measured = [
        mirna for mirna in allowed_mirnas_all[cell_line]
        if mirna in df_expression_filter.index
    ]
    allowed_mirnas_all_measured[cell_line] = allowed_mirnas_measured

    # add those to the dataset
    ex_df = df_expression_filter.loc[allowed_mirnas_measured, cell_line].values
    knock_df = df_knockdown_filter.loc[allowed_mirnas_measured, cell_line].values
    x_chunks.append(ex_df)
    y_chunks.append(knock_df)
    index_chunks.append([i] * len(ex_df))

x_data = np.concatenate(x_chunks)
y_data = np.concatenate(y_chunks)
dataset_indices = np.concatenate(index_chunks)

In [128]:
# Refit the global curve on the crosstalk-filtered data.
# The result is bound to `popt_scales_filter`, which is the name the two lines below read;
# it was previously bound to `popt_scales`, so the reads hit an undefined or stale variable.
popt_scales_filter, pcov = opt.curve_fit(
    # dataset_indices tells the model which cell line each point belongs to
    lambda x, *params: hill_func_log_scales(x, dataset_indices, *params),
    x_data,
    y_data,
    p0=p0_scale,
    bounds=bounds_scale,
    maxfev=5000
)

# the first num_params entries are the shared curve parameters; the remaining ones are the
# per-cell-line scales (indexed by position in cell_lines_measured by the plotting cells)
scales = list(popt_scales_filter[num_params:])
hill_params = popt_scales_filter[:num_params]

In [None]:
# display the shared curve parameters obtained from the filtered fit
hill_params

In [130]:
%%capture output

# Per-cell-line fit quality after removing crosstalk miRNAs, plus the deviation of every
# retained miRNA from the fitted curve.
r2_vals_scales = {}
rmsd_vals_scales = {}
df_deviation_filter = {}

# 5 x 2 grid with shared axes: one panel per measured cell line
fig, axs = plt.subplots(5, 2, figsize=(6, 10), sharex=True, sharey=True)
# make the plot folder if it doesn't exist
os.makedirs(f"{plot_folder}/3.8_crosstalk_removed", exist_ok=True)

for i, cell_line in enumerate(cell_lines_measured):
    current_scale = scales[i]
    ax = axs[i//2, i%2]

    # miRNAs that passed the crosstalk filter and were measured in this cell line
    allowed_mirnas_measured = allowed_mirnas_all_measured[cell_line]

    # get the expression and knockdown data for these microRNAs
    knockdown_allowed = df_knockdown_filter.loc[allowed_mirnas_measured, cell_line]
    ex_df = df_expression_filter.loc[allowed_mirnas_measured, cell_line].values
    knock_df = knockdown_allowed.values

    # fitted stability at the scaled expression values - computed once, used three times below
    predicted = hill_func_log_regular(ex_df+current_scale, *hill_params)

    ax.scatter(ex_df+current_scale, knock_df, s=5, color="tab:blue", rasterized=True)
    ax.plot(x_range_log, hill_func_log_regular(x_range_log,
            *hill_params), color="black", linewidth=1, label="fit", ls="--")

    # squared Pearson correlation between measurement and fit
    r2 = stats.pearsonr(knock_df, predicted)[0]**2
    r2_vals_scales[cell_line] = r2

    # root-mean-square deviation between measurement and fit
    rmsd = np.sqrt(np.mean((knock_df - predicted)**2))
    rmsd_vals_scales[cell_line] = rmsd

    # deviation per miRNA (measured minus fitted), kept as a Series indexed by miRNA
    df_deviation_filter[cell_line] = knockdown_allowed - predicted

    # also plot the microRNAs that were filtered
    forbidden_mirnas = df_expression_filter.index.difference(allowed_mirnas_measured)
    ax.scatter(df_expression_filter.loc[forbidden_mirnas, cell_line]+current_scale,
               df_knockdown_filter.loc[forbidden_mirnas, cell_line],
               s=5, color="tab:red", rasterized=True)

    ax.set_xlabel("miRNA expression")
    ax.set_ylabel(r"log$_{10}$(stability)")
    ax.set_title(f"{cell_line}, r2 = {round(r2, 2)}, rmsd = {round(rmsd, 2)}", fontsize=7.5)

plt.tight_layout()
# `fmt` rather than `format`, so the builtin is not shadowed for the rest of the kernel session
for fmt in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/3.8_crosstalk_removed/global_fit_crosstalk_removed.{fmt}", dpi=300)

In [131]:
# Combine the per-cell-line deviation Series into one table (miRNAs x cell lines) and save it.
df_deviation_filter = pd.DataFrame(df_deviation_filter)
deviation_filtered_path = os.path.join(output_folder, f"{used_mirna_name}_deviation_crosstalk_filtered.csv")
df_deviation_filter.to_csv(deviation_filtered_path)

In [132]:
# Blank out (NaN) every value that did not pass the crosstalk filter: re-assigning the
# column from its allowed subset leaves all other rows without a value.
for cell_line in cell_lines_measured:
    kept_mirnas = allowed_mirnas_all_measured[cell_line]
    for frame in (df_knockdown_filter, df_expression_filter):
        frame[cell_line] = frame.loc[kept_mirnas, cell_line]

# 3.9 - Investigate false positives

## Generate false positive info for both datasets and save it

In [133]:
# A miRNA is a false-positive candidate in a cell line when it is highly expressed (> 3.8),
# shows little knockdown (> -0.25) and lies more than 0.3 above the fitted curve.
false_positives = {}
for cell_line in cell_lines_measured:
    knockdown_values = df_knockdown_filter[cell_line].dropna()
    expression_values = df_expression_filter[cell_line].dropna()

    # miRNAs with little knockdown / with high expression
    weak_knockdown_index = knockdown_values.index[knockdown_values > -0.25]
    high_expression_index = expression_values.index[expression_values > 3.8]

    # miRNAs fulfilling both criteria, and their deviation from the fit
    common_indices = weak_knockdown_index.intersection(high_expression_index)
    candidate_deviation = df_deviation_filter.loc[common_indices, cell_line]

    # restrain to those that knock down less than expected
    false_positives[cell_line] = candidate_deviation[candidate_deviation > 0.3].index.to_list()

In [134]:
# Tally in how many cell lines each miRNA was flagged as a false positive.
false_positives_count = {}
for cell_line in cell_lines_measured:
    for mirna in false_positives[cell_line]:
        false_positives_count[mirna] = false_positives_count.get(mirna, 0) + 1

In [135]:
# Save the per-miRNA tally ...
false_positives_count = pd.Series(false_positives_count)
count_path = os.path.join(output_folder, f"{used_mirna_name}_false_positives_count.csv")
false_positives_count.to_csv(count_path)

# ... and a miRNA x cell-line table holding 1 where the miRNA was flagged (empty elsewhere).
false_positives_df = pd.DataFrame(columns=cell_lines_measured, index=false_positives_count.index)
for cell_line in cell_lines_measured:
    flagged_mirnas = false_positives[cell_line]
    false_positives_df.loc[flagged_mirnas, cell_line] = 1
table_path = os.path.join(output_folder, f"{used_mirna_name}_false_positives.csv")
false_positives_df.to_csv(table_path)

## Load the filtered false positive data
### The data loaded here is generated using Notebook 4.

In [136]:
# Load the filtered false-positive table for the current dataset (per the section note, this
# file is produced by Notebook 4 and must exist before this cell is run).
false_positives_df_filter = pd.read_csv(os.path.join(output_folder, f"{used_mirna_name}_false_positives_filtered.csv"), index_col=0)

In [None]:
# display the loaded table for inspection
false_positives_df_filter

In [None]:
# Keep the miRNAs flagged in at least four cell lines: the row sum over all columns is
# stored in a "sum" column and must exceed 3.
# NOTE: the sum runs over every column present, so re-running this cell without reloading
# the table would also add up the previous "sum" column.
false_positives_df_filter = (
    false_positives_df_filter
    .assign(sum=false_positives_df_filter.sum(axis=1))
    .loc[lambda table: table["sum"] > 3]
)
false_positives_df_filter

In [139]:
%%capture output
# 5 x 2 grid with shared axes: one panel per measured cell line, filtered false positives in red.
fig, axs = plt.subplots(5, 2, figsize=(6, 8), sharex=True, sharey=True)
# make the plot folder if it doesn't exist
os.makedirs(f"{plot_folder}/3.9_false_positives", exist_ok=True)

for i, cell_line in enumerate(cell_lines_measured):
    current_scale = scales[i]
    ax = axs[i//2, i%2]

    # there should be not filtering for NaN values - a false positive in one cell line is probably one elsewhere as well
    # (only the index of this column is used: every listed miRNA is treated as a false positive)
    false_pos_df_cell_line = false_positives_df_filter[cell_line]
    index_wo_false_pos = df_expression_filter.index.difference(false_pos_df_cell_line.index)
    df_ex = df_expression_filter.loc[index_wo_false_pos, cell_line].dropna()
    df_knock = df_knockdown_filter.loc[index_wo_false_pos, cell_line].dropna()

    # remove "hsa-miR-3613-3p" from df_ex and df_knock for plotting purposes
    df_ex = df_ex[df_ex.index != "hsa-miR-3613-3p"]
    df_knock = df_knock[df_knock.index != "hsa-miR-3613-3p"]

    ax.scatter(df_ex+current_scale, df_knock, s=5, color="tab:blue", rasterized=True)
    ax.scatter(df_expression_filter[cell_line].loc[false_pos_df_cell_line.index]+current_scale,
               df_knockdown_filter[cell_line].loc[false_pos_df_cell_line.index], color="red", s=5, rasterized=True)

    ax.plot(x_range_log, hill_func_log_regular(x_range_log,
            *hill_params), color="black", linewidth=1.5, label="fit")

    # fitted stability for the retained miRNAs - computed once for both metrics
    predicted = hill_func_log_regular(df_ex+current_scale, *hill_params)

    # squared Pearson correlation between measurement and fit
    r2 = stats.pearsonr(df_knock, predicted)[0]**2

    # root-mean-square deviation between measurement and fit
    rmsd = np.sqrt(np.mean((df_knock - predicted)**2))

    # set the limits on the panel itself rather than through the pyplot "current axes"
    ax.set_xlim(0, 5.5)
    ax.set_ylim(-1.7, 0.25)

    ax.set_xlabel(r"log$_{10}$(miRNA expression)")
    ax.set_ylabel(r"log$_{10}$(stability)")
    ax.set_title(f"{cell_line}, r2 = {round(r2, 2)}, rmsd = {round(rmsd, 2)}", fontsize=7.5)

plt.tight_layout()
# `fmt` rather than `format`, so the builtin is not shadowed for the rest of the kernel session
for fmt in ["png", "svg"]:
    plt.savefig(os.path.join(plot_folder, f"3.9_false_positives/3.9.1_false_positives_highlighted.{fmt}"), dpi=600)

In [140]:
%%capture output
# Single-panel version: all cell lines overlaid, filtered false positives highlighted.
fig = plt.figure(figsize=(3, 1.8))

# colour and wording depend only on which expression dataset is in use
if used_mirna_name == "Alles2019":
    false_pos_color, false_pos_source = "tab:orange", "microarray"
else:
    false_pos_color, false_pos_source = "tab:red", "sequencing"
plt.title(f"filtered false positives ({false_pos_source})", fontsize=8)

for i, cell_line in enumerate(cell_lines_measured):
    current_scale = scales[i]

    # Background points come from the unfiltered df_expression / df_knockdown tables,
    # excluding every miRNA listed in the filtered false-positive table.
    false_pos_df_cell_line = false_positives_df_filter[cell_line]
    index_wo_false_pos = df_expression.index.difference(false_pos_df_cell_line.index)
    df_ex = df_expression.loc[index_wo_false_pos, cell_line].dropna()
    df_knock = df_knockdown.loc[index_wo_false_pos, cell_line].dropna()

    # remove "hsa-miR-3613-3p" from df_ex and df_knock for plotting purposes
    df_ex = df_ex[df_ex.index != "hsa-miR-3613-3p"]
    df_knock = df_knock[df_knock.index != "hsa-miR-3613-3p"]

    plt.scatter(df_ex+current_scale, df_knock, s=4, color="tab:blue", rasterized=True, zorder=1, edgecolors="none")

    # false positives on top; label only the first cell line so the legend has one entry
    plt.scatter(df_expression_filter[cell_line].loc[false_pos_df_cell_line.index]+current_scale,
                df_knockdown_filter[cell_line].loc[false_pos_df_cell_line.index],
                color=false_pos_color, s=5, rasterized=True, zorder=2,
                label=f"false positives {false_pos_source}" if i == 0 else None)

    # the fitted curve is shared by all cell lines, so draw it once
    if i == 0:
        plt.plot(x_range_log, hill_func_log_regular(x_range_log,
                *hill_params), color="black", linewidth=1, ls="--")

plt.xlabel(r"log$_{10}$"+f"(miRNA expression)")
plt.ylabel(r"log$_{10}$(stability)")

plt.xlim(0, 5.5)
plt.ylim(-1.7, 0.25)
plt.tight_layout()
plt.legend(loc="lower left", frameon=False, fontsize=7)
# `fmt` rather than `format`, so the builtin is not shadowed for the rest of the kernel session
for fmt in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/3.9_false_positives/3.9.1_false_positives_highlighted_single_plot.{fmt}", dpi=600, bbox_inches='tight')

In [141]:
# Drop every miRNA listed in the filtered false-positive table from the working frames.
false_positive_mirnas = false_positives_df_filter.index
df_knockdown_filter = df_knockdown_filter.loc[~df_knockdown_filter.index.isin(false_positive_mirnas)]
df_expression_filter = df_expression_filter.loc[~df_expression_filter.index.isin(false_positive_mirnas)]
# this one is rebuilt from the unfiltered used_mirna_data
used_mirna_data_filter = used_mirna_data.loc[~used_mirna_data.index.isin(false_positive_mirnas)]
df_deviation_filter = df_deviation_filter.loc[~df_deviation_filter.index.isin(false_positive_mirnas)]

# 3.10 - Look at homopolymers / Investigate the remaining false positives

In [142]:
# Keep only the false-positive candidates that were not already removed by the filter above.
# errors="ignore" makes the cell safe to re-run and tolerates filtered miRNAs that are
# absent from this dataset's candidate table (both cases previously raised KeyError).
false_positives_df = false_positives_df.drop(index=false_positives_df_filter.index, errors="ignore")

In [None]:
# Print the name and sequence of every remaining false-positive candidate.
for mirna_name in false_positives_df.index:
    print(mirna_name, mirbase.loc[mirna_name, "sequence_norm"])

In [144]:
def find_largest_homopolymer_stretch(sequence, min_length=5, max_gap=0):
    """
    Scan every window of at least `min_length` characters and report the one that contains
    the most copies of a single base while holding at most `max_gap` other characters.

    Args:
    - sequence (str): The nucleotide sequence to search within.
    - min_length (int): Minimum window length that is considered.
    - max_gap (int): Maximum number of non-dominant characters tolerated inside a window.

    Returns:
    - tuple or None: (start index, end index (exclusive), window sequence, dominant base) of the
      best window; None if no window satisfies the constraints. Among windows with the same
      dominant count, the first one encountered (smallest start, then shortest) is kept.
    """
    best_window = None
    best_count = 0
    seq_len = len(sequence)

    for start in range(seq_len):
        for stop in range(start + min_length, seq_len + 1):
            window = sequence[start:stop]

            # most frequent character of the window and how often it occurs
            dominant_base = max(set(window), key=window.count)
            dominant_count = window.count(dominant_base)

            # too many foreign characters -> not a homopolymer stretch
            if len(window) - dominant_count > max_gap:
                continue
            # only a strictly better window replaces the current best
            if dominant_count <= best_count:
                continue

            best_window = (start, stop, window, dominant_base)
            best_count = dominant_count

    return best_window

In [None]:
# list every remaining false positive together with its largest homopolymer stretch (if any)
for mirna_name in false_positives_df.index:
    sequence_norm = mirbase.loc[mirna_name, "sequence_norm"]
    print(mirna_name, sequence_norm, find_largest_homopolymer_stretch(sequence_norm))

In [146]:
# Fall back to the unfiltered frames when the false-positive filtering cells were skipped.
if "df_deviation_filter" not in locals():
    df_deviation_filter = df_deviation.copy()
    df_expression_filter = df_expression.copy()
    df_knockdown_filter = df_knockdown.copy()

In [147]:
# flag each miRNA in df_deviation_filter that contains a homopolymer stretch
# (default settings of find_largest_homopolymer_stretch: >= 5 identical bases, no gaps)
for mirna_name in df_deviation_filter.index:
    stretch = find_largest_homopolymer_stretch(mirbase.loc[mirna_name, "sequence_norm"])
    df_deviation_filter.loc[mirna_name, "homopolymer"] = stretch is not None

In [148]:
# unroll df_deviation along cell lines: one row per "<miRNA>_<cell line>" pair
# The rows are collected first and the frame is built once. Growing a DataFrame through
# repeated `.loc[new_label] = ...` re-allocates on every insert (quadratic in the number
# of rows) and leaves both columns with object dtype.
unrolled_rows = {
    f"{mirna}_{cell_line}": (
        df_deviation_filter.loc[mirna, cell_line],
        df_deviation_filter.loc[mirna, "homopolymer"],
    )
    for mirna in df_deviation_filter.index
    for cell_line in cell_lines_measured
}
df_deviation_filter_unroll = pd.DataFrame.from_dict(
    unrolled_rows, orient="index", columns=["deviation", "homopolymer"]
)

In [None]:
# Boxplot of the deviation from the fit, split by whether the miRNA contains a homopolymer
# stretch (column "homopolymer" of df_deviation_filter_unroll). Saved as PNG and SVG.
# create the output folder if necessary
if not os.path.exists(os.path.join(plot_folder, "3.10_remaining_false_positives")):
    os.makedirs(os.path.join(plot_folder, "3.10_remaining_false_positives"))

fig = plt.figure(figsize=(1.6, 1.6))
sns.boxplot(data=df_deviation_filter_unroll, x="homopolymer", y="deviation", flierprops=dict(marker='o', markersize=2))
plt.ylabel("deviation from fit")
plt.xlabel("homopolymers >=5 nt")

# get all hsa-miR-3613-3p entries
# NOTE(review): index_3613 is only referenced by the commented-out scatter below,
# so it currently has no effect on the figure.
index_3613 = df_deviation_filter_unroll.index[df_deviation_filter_unroll.index.str.contains("hsa-miR-3613-3p")]

# label these as red dots
#plt.scatter(df_deviation_filter_unroll.loc[index_3613, "homopolymer"], df_deviation_filter_unroll.loc[index_3613, "deviation"], color="red", s=10)
plt.tight_layout()
# NOTE: the loop variable `format` shadows the Python builtin of the same name.
for format in ["png", "svg"]:
    plt.savefig(os.path.join(plot_folder, f"3.10_remaining_false_positives/3.10_homopolymer_boxplot.{format}"), dpi=300)

In [151]:
# per cell line: the miRNAs whose stability lies more than 0.5 log10 units below the fit
strong_deviation_dict = {
    cell_line: df_deviation_filter.loc[df_deviation_filter[cell_line] < -0.5].index
    for cell_line in cell_lines_measured
}

In [None]:
# tally in how many cell lines each miRNA deviates strongly
strong_deviation_count = {}
for cell_line in cell_lines_measured:
    for mirna in strong_deviation_dict[cell_line]:
        strong_deviation_count[mirna] = strong_deviation_count.get(mirna, 0) + 1

# most frequent offenders first
strong_deviation_count = pd.Series(strong_deviation_count).sort_values(ascending=False)
strong_deviation_count

#### hsa-miR-3613-3p stands out in every single measurement!

In [155]:
# drop the consistent outlier "hsa-miR-3613-3p" from all three working frames
df_expression_filter = df_expression_filter.loc[df_expression_filter.index != "hsa-miR-3613-3p"]
df_knockdown_filter = df_knockdown_filter.loc[df_knockdown_filter.index != "hsa-miR-3613-3p"]
df_deviation_filter = df_deviation_filter.loc[df_deviation_filter.index != "hsa-miR-3613-3p"]

### Redo the fitting

In [156]:
# Fall back to the unfiltered data if either filtered frame has not been created yet.
if "df_expression_filter" not in locals() or "df_knockdown_filter" not in locals():
    df_expression_filter = df_expression.copy()
    df_knockdown_filter = df_knockdown.copy()

In [None]:
# Pool the (expression, knockdown) pairs of all cell lines for one global Hill fit in which
# every cell line additionally gets its own additive scale on the log10 expression axis.
x_data = []
y_data = []
dataset_indices = []
for i, cell_line in enumerate(cell_lines_measured):
    # Drop missing values JOINTLY and aligned on the miRNA label. Calling `.dropna()` on the
    # two columns separately pairs x and y by position only, which silently mismatches them
    # (or yields arrays of different length) as soon as their NaN patterns differ.
    paired = pd.DataFrame({
        "expression": df_expression_filter[cell_line],
        "knockdown": df_knockdown_filter[cell_line],
    }).dropna()
    x_data.append(paired["expression"].values)
    y_data.append(paired["knockdown"].values)
    # remember which cell line every point belongs to so the fit can apply the right scale
    dataset_indices.append([i] * len(paired))

x_data = np.concatenate(x_data)
y_data = np.concatenate(y_data)
dataset_indices = np.concatenate(dataset_indices)

# parameter vector layout: [Hill parameters (num_params entries), one scale per cell line]
popt_scales, pcov = opt.curve_fit(
    lambda x, *params: hill_func_log_scales(x, dataset_indices, *params),
    x_data,
    y_data,
    p0=p0_scale,
    bounds=bounds_scale,
    maxfev=5000
)
# the fit result is kept under both names, as before
popt_scales_filter = popt_scales

scales = list(popt_scales[num_params:])
hill_params = popt_scales[:num_params]
hill_params

In [158]:
# map every cell line to its fitted expression scale
scale_dict = dict(zip(cell_lines_measured, scales))

In [159]:
%%capture output
# Per-cell-line view of the refitted global Hill curve on the filtered data.
# Side effect: df_deviation_filter is rebuilt here as (measured - predicted) stability.
# The cell magic above stores the cell's output in the variable `output` instead of showing it.
df_deviation_filter = pd.DataFrame(columns=df_knockdown_filter.columns, index=df_knockdown_filter.index)

# create a 5x2 grid of subplots (one panel per cell line) with shared axes
fig, axs = plt.subplots(5, 2, figsize=(6, 8), sharex=True, sharey=True)
# make the plot folder if it doesn't exist
if not os.path.exists(f"{plot_folder}/3.9_false_positives"):
    os.makedirs(f"{plot_folder}/3.9_false_positives")

for i, cell_line in enumerate(cell_lines_measured):
    # fitted additive offset of this cell line on the log10 expression axis
    current_scale = scales[i]
    
    # NOTE(review): expression and knockdown are dropna()-ed independently; the statistics
    # below assume both columns are missing for the same miRNAs - confirm.
    df_ex = df_expression_filter.loc[:, cell_line].dropna()
    df_knock = df_knockdown_filter.loc[:, cell_line].dropna()
    
    # measured points (expression shifted by the scale) and the shared fitted curve
    axs[i//2, i%2].scatter(df_ex+current_scale,df_knock, s=5, color="tab:blue")
    axs[i//2, i%2].plot(x_range_log, hill_func_log_regular(x_range_log,
                        *hill_params), color="black", linewidth=1.5, label="fit", ls="--")

    # calculate the R2 value (squared Pearson correlation of measurement vs. prediction)
    r2 = stats.pearsonr(df_knock,
                        hill_func_log_regular(df_ex+current_scale,
                        *hill_params))[0]**2
    
    # calculate the RMSD value between measurement and prediction
    rmsd = np.sqrt(np.mean((df_knock-
                            hill_func_log_regular(df_ex+current_scale,
                            *hill_params))**2))

    
    # calculate the deviation (assignment aligns on the miRNA index; others stay NaN)
    df_deviation_filter[cell_line] = (df_knock - hill_func_log_regular(df_ex+current_scale, *hill_params))
    
    # pyplot calls act on the current axes; with sharex/sharey the limits apply to every panel
    plt.xlim(0, 5.5)
    plt.ylim(-1.7, 0.25)
    
    # y label only on the left column, x label only on the bottom row
    if i % 2 == 0:
        axs[i//2, i%2].set_ylabel(r"log$_{10}$(stability)")
    if i // 2 == 4:
        axs[i//2, i%2].set_xlabel(r"log$_{10}$(miRNA expression)")
    axs[i//2, i%2].set_title(f"{cell_line}, r2 = {round(r2, 2)}, rmsd = {round(rmsd, 2)}", fontsize=7.5)

plt.tight_layout()
# NOTE: the loop variable `format` shadows the Python builtin of the same name.
for format in ["png", "svg"]:
    plt.savefig(os.path.join(plot_folder, f"3.9_false_positives/3.9.2_global_fit_false_positives_filtered.{format}"), dpi=300)

In [160]:
# persist the per-miRNA deviations from the refitted curve
deviation_csv_path = os.path.join(output_folder, f"{used_mirna_name}_deviation_step_3.10.csv")
df_deviation_filter.to_csv(deviation_csv_path)

# 3.11 - Investigate the remaining deviating miRNAs

## 3.11.1 - False negatives

### Get false negative info for both datasets and save them

In [161]:
# A miRNA counts as a false negative in a cell line when it shows a clear knockdown although
# its measured expression is low, and it sits well below the fitted curve.
false_negatives = {}
for cell_line in cell_lines_measured:
    knockdown = df_knockdown_filter.loc[:, cell_line].dropna()
    expression = df_expression_filter.loc[:, cell_line].dropna()

    # candidates: significant knockdown (< -0.25) combined with low expression (< 3.5)
    strong_knockdown = knockdown[knockdown < -0.25]
    low_expression = expression[expression < 3.5]
    candidates = strong_knockdown.index.intersection(low_expression.index)

    # of those, keep the ones knocked down more than expected (deviation < -0.3)
    candidate_deviation = df_deviation_filter[cell_line].loc[candidates]
    false_negatives[cell_line] = candidate_deviation[candidate_deviation < - 0.3].index.to_list()

In [162]:
# tally in how many cell lines each microRNA was called a false negative
false_negatives_count = {}
for cell_line in cell_lines_measured:
    for mirna in false_negatives[cell_line]:
        false_negatives_count[mirna] = false_negatives_count.get(mirna, 0) + 1

In [163]:
# write the per-miRNA counts ...
false_negatives_count = pd.Series(false_negatives_count)
count_csv_path = os.path.join(output_folder, f"{used_mirna_name}_false_negatives_count.csv")
false_negatives_count.to_csv(count_csv_path)

# ... and a miRNA x cell line table holding 1 where a false negative was called (NaN elsewhere)
false_negatives_df = pd.DataFrame(columns=cell_lines_measured, index=false_negatives_count.index)
for cell_line in cell_lines_measured:
    flagged_mirnas = false_negatives[cell_line]
    false_negatives_df.loc[flagged_mirnas, cell_line] = 1
table_csv_path = os.path.join(output_folder, f"{used_mirna_name}_false_negatives.csv")
false_negatives_df.to_csv(table_csv_path)

## Load the filtered false negative data
### The processing and filtering occur in Notebook 4

In [164]:
# read the filtered false-negative table
filtered_fn_path = os.path.join(output_folder, f"{used_mirna_name}_false_negatives_filtered.csv")
false_negative_df_filter = pd.read_csv(filtered_fn_path, index_col=0)

In [None]:
# keep the miRNAs whose flags add up to more than 3, i.e. flagged in at least four cell lines
# The helper column "sum" is excluded from its own computation so the cell can be re-run:
# previously a second execution added the existing "sum" column into the new row sums.
flag_columns = false_negative_df_filter.drop(columns="sum", errors="ignore")
false_negative_df_filter = false_negative_df_filter.assign(**{"sum": flag_columns.sum(axis=1)})
false_negative_df_filter = false_negative_df_filter[false_negative_df_filter["sum"] > 3]
false_negative_df_filter

In [166]:
%%capture output
# One panel per cell line: filtered data in blue, the filtered false negatives in red,
# plus the globally fitted Hill curve. R2/RMSD are computed WITHOUT the false negatives.
fig, axs = plt.subplots(5, 2, figsize=(6, 8), sharex=True, sharey=True)
# make the plot folder if it doesn't exist
if not os.path.exists(f"{plot_folder}/3.11.1_false_negatives"):
    os.makedirs(f"{plot_folder}/3.11.1_false_negatives")

for i, cell_line in enumerate(cell_lines_measured):
    # fitted additive offset of this cell line on the log10 expression axis
    current_scale = scales[i]
    
    # if it's a false negative in one cell line, it needs to be one in the other cell lines as well!
    # (hence no dropna(): every miRNA of the filtered table is highlighted in every panel)
    #false_neg_df_cell_line = false_negative_df_filter[cell_line].dropna()
    false_neg_df_cell_line = false_negative_df_filter[cell_line]
    index_wo_false_neg = df_expression_filter.index.difference(false_neg_df_cell_line.index)
    df_ex = df_expression_filter.loc[index_wo_false_neg, cell_line].dropna()
    df_knock = df_knockdown_filter.loc[index_wo_false_neg, cell_line].dropna()
    
    axs[i//2, i%2].scatter(df_ex+current_scale,df_knock, s=5, color="tab:blue", rasterized=True)
    # NOTE(review): the label-based lookup below requires every false negative to still be
    # present in df_expression_filter / df_knockdown_filter - confirm.
    axs[i//2, i%2].scatter(df_expression_filter[cell_line].loc[false_neg_df_cell_line.index,]+current_scale,
                df_knockdown_filter[cell_line].loc[false_neg_df_cell_line.index,], color="red", s=5, rasterized=True)
    
    axs[i//2, i%2].plot(x_range_log, hill_func_log_regular(x_range_log,
                        *hill_params), color="black", linewidth=1.5, label="fit", ls="--")

    # calculate the R2 value (squared Pearson correlation of measurement vs. prediction)
    r2 = stats.pearsonr(df_knock,
                        hill_func_log_regular(df_ex+current_scale,
                        *hill_params))[0]**2
    
    # calculate the RMSD value between measurement and prediction
    rmsd = np.sqrt(np.mean((df_knock-
                            hill_func_log_regular(df_ex+current_scale,
                            *hill_params))**2))

    # pyplot calls act on the current axes; with sharex/sharey the limits apply to every panel
    plt.xlim(0, 5.5)
    plt.ylim(-1.7, 0.25)
    
    # y label only on the left column, x label only on the bottom row
    if i % 2 == 0:
        axs[i//2, i%2].set_ylabel(r"log$_{10}$(stability)")
    if i // 2 == 4:
        axs[i//2, i%2].set_xlabel(r"log$_{10}$(miRNA expression)")
    axs[i//2, i%2].set_title(f"{cell_line}, r2 = {round(r2, 2)}, rmsd = {round(rmsd, 2)}", fontsize=7.5)

plt.tight_layout()
# NOTE: the loop variable `format` shadows the Python builtin of the same name.
for format in ["png", "svg"]:
    plt.savefig(os.path.join(plot_folder, f"3.11.1_false_negatives/3.11.1_false_negatives_highlighted.{format}"), dpi=600)

In [167]:
%%capture output
# All cell lines overlaid in one compact panel with the false negatives drawn as crosses.
# NOTE: this cell reads the UNFILTERED df_expression / df_knockdown, unlike the grid plot
# of the false negatives, which uses the *_filter frames.
fig = plt.figure(figsize=(3, 1.8))

for i, cell_line in enumerate(cell_lines_measured):
    # fitted additive offset of this cell line on the log10 expression axis
    current_scale = scales[i]
    
    false_neg_df_cell_line = false_negative_df_filter[cell_line]
    index_wo_false_neg = df_expression.index.difference(false_neg_df_cell_line.index)
    
    df_ex = df_expression.loc[index_wo_false_neg, cell_line].dropna()
    df_knock = df_knockdown.loc[index_wo_false_neg, cell_line].dropna()
    
    plt.scatter(df_ex+current_scale,df_knock, s=3, color="tab:blue", rasterized=True, zorder=1, edgecolors="none")
    
    # colour and legend text depend on the expression dataset in use;
    # the label is only attached for i == 0 so the legend shows a single entry
    if used_mirna_name == "Alles2019":
        # plt.scatter(df_expression[cell_line].loc[false_pos_df_cell_line.index,]+current_scale,
        #             df_knockdown[cell_line].loc[false_pos_df_cell_line.index,], color="tab:orange", s=6, rasterized=True, zorder=2, edgecolors="none")

        plt.scatter(df_expression[cell_line].loc[false_neg_df_cell_line.index,]+current_scale,
                    df_knockdown[cell_line].loc[false_neg_df_cell_line.index,],
                    color="tab:orange", s=5, rasterized=True, zorder=2, marker="x", label="false negatives microarray" if i == 0 else None)
        plt.title("filtered false negatives (microarray)", fontsize=8)
    else:
        plt.scatter(df_expression[cell_line].loc[false_neg_df_cell_line.index,]+current_scale,
                    df_knockdown[cell_line].loc[false_neg_df_cell_line.index,],
                    color="tab:red", s=5, rasterized=True, zorder=2, marker="x", label="false negatives sequencing" if i == 0 else None)
        plt.title("filtered false negatives (sequencing)", fontsize=8)

    # the fitted curve is shared by all cell lines, so it is drawn only once
    if i == 0:
        plt.plot(x_range_log, hill_func_log_regular(x_range_log,
                *hill_params), color="black", linewidth=1, ls="--")
    
plt.xlabel(r"log$_{10}$"+f"(miRNA expression)")
plt.ylabel(r"log$_{10}$(stability)")

# plt.scatter(0.5, -1.2, color="black", s=10, marker="x")
# plt.text(0.6, -1.25, f"{label_mirna}", fontsize=7, ha="left")
# plt.text(0.6, -1.25, r"$t(x)=1-\frac{x}{c+x}$", fontsize=7, ha="left")

plt.xlim(0, 5.5)
plt.ylim(-1.7, 0.25)
plt.tight_layout()
plt.legend(loc="lower left", frameon=False, fontsize=6.5)
# NOTE: the loop variable `format` shadows the Python builtin of the same name.
for format in ["png", "svg"]:
    plt.savefig(os.path.join(plot_folder, f"3.11.1_false_negatives/3.11.1_false_pos_and_neg_single_plot.{format}"), dpi=600)

## 3.11.2 - Does the length of the microRNA matter?

In [None]:
# look up the original sequence of every remaining miRNA and record its length;
# the length is also attached to df_expression_filter as an extra (non cell line) column
original_sequences = mirbase.loc[df_expression_filter.index, ["sequence_orig"]]
original_sequences["length"] = original_sequences["sequence_orig"].map(len)
df_expression_filter["length"] = original_sequences["length"]
original_sequences["length"]

In [169]:
# get a boxplot of the deviation for each length: one row per "<miRNA>_<cell line>" pair
df_deviation_filter["length"] = original_sequences["length"]

# The rows are collected first and the frame is built once. Growing a DataFrame through
# repeated `.loc[new_label] = ...` re-allocates on every insert (quadratic in the number
# of rows) and leaves both columns with object dtype.
flattened_rows = {
    f"{mirna}_{cell_line}": (
        df_deviation_filter.loc[mirna, cell_line],
        df_deviation_filter.loc[mirna, "length"],
    )
    for mirna in df_deviation_filter.index
    for cell_line in cell_lines_measured
}
df_deviation_filter_flattened = pd.DataFrame.from_dict(
    flattened_rows, orient="index", columns=["deviation", "length"]
)

In [None]:
# Boxplot of the deviation from the fit grouped by miRNA length; saved as PNG and SVG.
# create a plot folder if none exists
if not os.path.exists(os.path.join(plot_folder, "3.11_length_deviation")):
    os.makedirs(os.path.join(plot_folder, "3.11_length_deviation"))

plt.figure(figsize=(2.5, 1.6))
sns.boxplot(data=df_deviation_filter_flattened, x="length", y="deviation", flierprops=dict(marker='o', markersize=1, color="blue", linewidth=0.0))
plt.xlabel("microRNA length")
plt.ylabel("deviation from fit")
plt.tight_layout()
# NOTE: the loop variable `format` shadows the Python builtin of the same name.
for format in ["png", "svg"]:
    plt.savefig(os.path.join(plot_folder, f"3.11_length_deviation/3.11_length_deviation_boxplot.{format}"), dpi=300)

In [None]:
# Scatter of stability vs. scaled expression, coloured by miRNA length, with the fitted curve.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
from library2_utils.plotting_utilities import HandlerSize

# create the plot
plt.clf()
fig = plt.figure(figsize=(3, 2))

# get tab10 as a color palette
# NOTE(review): tab10 provides ten colours, so colors[length_i] assumes at most ten
# distinct miRNA lengths - confirm.
colors = sns.color_palette("tab10")
# create a scatterplot for each of these
sc_plots = []
for length_i, length in enumerate(sorted(df_expression_filter["length"].unique())):
    # (the output folder itself is not created in this cell)
    
    # get the indices of the current length
    current_indices = df_expression_filter[df_expression_filter["length"] == length].index
    
    
    for i, cell_line in enumerate(cell_lines_measured):
        current_scale = scales[i]
        # only the first cell line carries the legend label, giving one entry per length
        sc = plt.scatter(df_expression_filter.loc[current_indices, cell_line]+current_scale,
                    df_knockdown_filter.loc[current_indices, cell_line],
                    s=3, edgecolors="none", color=colors[length_i], rasterized=True, label = str(length)+" nt" if i == 0 else None)
        if i == 0:
            sc_plots.append(sc)
    # the fitted curve is independent of the length, so it is drawn only once
    if length_i == 0:
        plt.plot(x_range_log, hill_func_log_regular(x_range_log,
                        *hill_params), color="black", linewidth=1.5, linestyle="--")
plt.xlim(0, 5.5)
plt.ylim(-1.7, 0.25)

plt.xlabel(r"log$_{10}$(miRNA expression)")
plt.ylabel(r"log$_{10}$(stability)")
# HandlerSize is a project helper; presumably it controls the legend marker size - TODO confirm
plt.legend(loc="lower left", frameon=False, fontsize=7, ncols=2, handler_map={sc: HandlerSize(12) for sc in sc_plots})
plt.tight_layout()
# NOTE: the loop variable `format` shadows the Python builtin of the same name.
for format in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/3.11_length_deviation/3.11_mirna_length.{format}", dpi=600)

# 3.12 - False positive and negative aware merging

## This requires that the false positives and false negatives have already been identified.

In [81]:
# make sure the output folder for the section 3.12 figures exists
congruence_plot_dir = f"{plot_folder}/3.12_data_congruence"
if not os.path.exists(congruence_plot_dir):
    os.makedirs(congruence_plot_dir)

In [82]:
def sum_filtering(df, threshold=3):
    """
    Keep the rows of a dataframe whose row-wise sum is strictly greater than `threshold`.

    Unlike the previous implementation, the input frame is not modified: no temporary
    "sum" column is written into the caller's dataframe.

    Args:
    - df (pd.DataFrame): Table whose values are summed per row (missing values are skipped
      by pandas' default `skipna=True`).
    - threshold (float): Exclusive lower bound on the row sum. Defaults to 3.

    Returns:
    - pd.DataFrame: The rows of `df` with a sum above `threshold`, with the original columns.
    """
    row_sums = df.sum(axis=1)
    return df[row_sums > threshold]

# Load the filtered false negative / false positive tables of both expression datasets and
# immediately apply the sum filtering to each of them.
false_negatives_keller = sum_filtering(
    pd.read_csv("../outputs/3_fitting/Keller2023/Keller2023_false_negatives_filtered.csv", index_col=0)
)
false_negatives_alles = sum_filtering(
    pd.read_csv("../outputs/3_fitting/Alles2019/Alles2019_false_negatives_filtered.csv", index_col=0)
)
false_positives_keller = sum_filtering(
    pd.read_csv("../outputs/3_fitting/Keller2023/Keller2023_false_positives_filtered.csv", index_col=0)
)
false_positives_alles = sum_filtering(
    pd.read_csv("../outputs/3_fitting/Alles2019/Alles2019_false_positives_filtered.csv", index_col=0)
)

# parallel lists: table, legend label, marker and colour used when plotting each category
false_positives_and_negatives = [
    false_negatives_keller,
    false_negatives_alles,
    false_positives_keller,
    false_positives_alles,
]
false_positives_and_negatives_label = [
    "false negatives sequencing",
    "false negatives microarray",
    "false positives sequencing",
    "false positives microarray",
]
false_positives_and_negatives_symbols = ["x", "x", "o", "o"]
false_positives_and_negatives_colors = ["tab:red", "tab:orange", "tab:red", "tab:orange"]

In [83]:
# make a list of allowed mirnas per cell line (crosstalk)
# Starts from a copy of the unfiltered knockdown data; with crosstalk_filtered=True the
# miRNAs listed in the crosstalk dict are set to NaN per cell line.
df_knockdown_filter_congruence = df_knockdown.copy()

# Run first with false-false, then with false-true, then with true-true
# (these two switches also select which output files are written further below)
crosstalk_filtered = True
bias_aware = True

# load the crosstalk dict
input_folder = "../outputs/5_mutations"

# NOTE: pickle.load executes arbitrary code from the file - only load trusted project outputs.
with open(f"{input_folder}/5.7_crosstalk_filter_dict.pkl", "rb") as f:
    crosstalk_filter_dict = pickle.load(f)

allowed_mirnas_all = {}
for cell_line in cell_lines_measured:
    # NOTE(review): `allowed_mirnas` is assigned but never used in this cell.
    allowed_mirnas = []
    crosstalk_dict = crosstalk_filter_dict[cell_line]
    # allowed = every measured miRNA that is not listed for this cell line
    allowed_mirnas_all[cell_line] = list(df_knockdown.index.difference(crosstalk_dict))
if crosstalk_filtered:        
    for cell_line in cell_lines_measured:
        # column assignment aligns on the index: rows that are not allowed become NaN
        df_knockdown_filter_congruence[cell_line] = df_knockdown_filter_congruence.loc[allowed_mirnas_all[cell_line], cell_line]
 
# drop the consistent outlier identified in section 3.10
df_knockdown_filter_congruence = df_knockdown_filter_congruence[df_knockdown_filter_congruence.index != "hsa-miR-3613-3p"]

In [84]:
# Merge the two expression datasets (both already log10-transformed when loaded).
# Averaging log10 values corresponds to the geometric mean of the raw values.
df_alles_filter = df_alles[cell_lines_measured].copy()
df_keller_filter = df_keller[cell_lines_measured].copy()

# harmonize the indices (keep only miRNAs present in both datasets)
common_index = df_alles_filter.index.intersection(df_keller_filter.index)
df_alles_filter = df_alles_filter.loc[common_index, :]
df_keller_filter = df_keller_filter.loc[common_index, :]

# created without a dtype, so the frame holds Python objects until it is cast later
mirna_ex_geometric_mean = pd.DataFrame(columns = cell_lines_measured, index = df_alles_filter.index)

if bias_aware:
    # miRNAs flagged as false positive or false negative in a dataset are not trusted there
    forbidden_Alles = false_positives_alles.index.union(false_negatives_alles.index)
    forbidden_Keller = false_positives_keller.index.union(false_negatives_keller.index)

    for mirna in mirna_ex_geometric_mean.index:
        # Alles is checked first: a miRNA flagged in BOTH datasets takes the Keller values
        if mirna in forbidden_Alles:
            mirna_ex_geometric_mean.loc[mirna, :] = df_keller_filter.loc[mirna, :]
        elif mirna in forbidden_Keller:
            mirna_ex_geometric_mean.loc[mirna, :] = df_alles_filter.loc[mirna, :]
        else:
            mirna_ex_geometric_mean.loc[mirna, :] = (df_alles_filter.loc[mirna, :] + df_keller_filter.loc[mirna, :])/2
else:
    # plain average of both datasets for every miRNA
    mirna_ex_geometric_mean = (df_alles_filter + df_keller_filter)/2.0

In [85]:
# reload the list of likely real miRNAs (a pickle: only load trusted project files)
with open("../microrna_data/likely_real_mirnas.pkl", "rb") as pkl_file:
    likely_real_mirnas = pickle.load(pkl_file)

# cast the merged table to float and normalize it with the project helper
merged_expression_float = mirna_ex_geometric_mean.astype(float)
mirna_ex_geometric_mean = normalize_expr_df_to_rpm_with_index(merged_expression_float, likely_real_mirnas)

In [None]:
# Microarray (Alles) vs. sequencing (Keller) log10 expression, one panel per cell line,
# with the false positives / negatives of either dataset highlighted on top.
fig, axs = plt.subplots(5, 2, figsize=(4, 6), sharex=True, sharey=True)

for i, cell_line in enumerate(cell_lines_measured):
    # NOTE(review): current_scale is assigned but not used anywhere in this cell.
    current_scale = scales[i]

    # specifically plot false positives and negatives
    excluded_points = []
    for curr_index, df in enumerate(false_positives_and_negatives):
        # NOTE(review): the label-based lookup requires every flagged miRNA to be part of
        # the common Alles/Keller index - confirm.
        alles = df_alles_filter.loc[df.index, cell_line]
        keller = df_keller_filter.loc[df.index, cell_line]
        # legend labels are only attached in the first panel
        if i == 0:
            axs[i//2, i%2].scatter(alles, keller, s=6, color=false_positives_and_negatives_colors[curr_index],
                                label=false_positives_and_negatives_label[curr_index], marker=false_positives_and_negatives_symbols[curr_index],
                                rasterized=True, zorder=2)
        else:
            axs[i//2, i%2].scatter(alles, keller, s=6, color=false_positives_and_negatives_colors[curr_index],
                                marker=false_positives_and_negatives_symbols[curr_index],
                                rasterized=True, zorder=2)
        excluded_points.append(df.index)
        
    # flatten excluded points
    excluded_points = [item for sublist in excluded_points for item in sublist]
    alles = df_alles_filter.loc[:, cell_line]
    keller = df_keller_filter.loc[:, cell_line]
    
    # remove excluded points
    alles = alles[~alles.index.isin(excluded_points)]
    keller = keller[~keller.index.isin(excluded_points)]
    
    # remaining miRNAs in blue, plus the identity line as a visual reference
    axs[i//2, i%2].scatter(alles,keller, s=2, color="tab:blue", rasterized=True, zorder=1, edgecolors="none")
    axs[i//2, i%2].plot([0,6], [0,6], color="black", linewidth=1.5, ls="--", zorder=1)
    
    if i == 0:
        axs[i//2, i%2].legend()
    # x label only on the bottom row, y label only on the left column
    if i//2 == 4:
        axs[i//2, i%2].set_xlabel(r"log$_{10}$(expr. microarray)")
    if i % 2 == 0:
        axs[i//2, i%2].set_ylabel(r"log$_{10}$(expr. sequencing)")
    
plt.tight_layout()
# NOTE: the loop variable `format` shadows the Python builtin of the same name.
for format in ["png", "svg"]:
    plt.savefig(os.path.join(plot_folder, f"3.12_data_congruence/3.12.2_alles_keller.{format}"), dpi=600)

In [87]:
# restrict the merged expression table to the miRNAs / cell lines of the knockdown table
df_expression_filter_congruence = mirna_ex_geometric_mean.loc[df_knockdown_filter_congruence.index, cell_lines_measured]

# blank out the expression wherever the knockdown value of that cell line is missing
for cell_line in cell_lines_measured:
    valid_mirnas = df_knockdown_filter_congruence[cell_line].dropna().index
    has_knockdown = df_expression_filter_congruence.index.isin(valid_mirnas)
    df_expression_filter_congruence.loc[~has_knockdown, cell_line] = np.nan

df_expression_filter_congruence = df_expression_filter_congruence.astype(float)

In [None]:
# set bounds and initial guesses for non-scale fitting parameters
# (the second parameter is bounded to [9.99, 10.01], i.e. effectively fixed at 10)
p0 = [3, 10]
num_params = len(p0)
bounds = ([1, 9.99], [10, 10.01])

# Guess initial scale values for all datasets
n_cell_lines = len(cell_lines_measured)
scale_guesses = [0] * n_cell_lines
scale_bounds_min = [-2] * n_cell_lines
scale_bounds_max = [2] * n_cell_lines

# set scale for HEK293T to 0 (the first entry of cell_lines_measured is the reference;
# a tiny interval is used because curve_fit needs lower < upper for every parameter)
scale_bounds_min[0] = -0.001
scale_bounds_max[0] = 0.001

# set up parameters: [Hill parameters, one scale per cell line]
p0_scale = p0 + scale_guesses
bounds_scale = (bounds[0]+scale_bounds_min, bounds[1]+scale_bounds_max)

x_data = []
y_data = []
dataset_indices = []
for i, cell_line in enumerate(cell_lines_measured):
    # Drop missing values JOINTLY and aligned on the miRNA label. Calling `.dropna()` on the
    # two columns separately pairs x and y by position only, which silently mismatches them
    # (or yields arrays of different length) as soon as their NaN patterns differ.
    paired = pd.DataFrame({
        "expression": df_expression_filter_congruence[cell_line],
        "knockdown": df_knockdown_filter_congruence[cell_line],
    }).dropna()
    x_data.append(paired["expression"].values)
    y_data.append(paired["knockdown"].values)
    # remember which cell line every point belongs to so the fit can apply the right scale
    dataset_indices.append([i] * len(paired))

x_data = np.concatenate(x_data)
y_data = np.concatenate(y_data)
dataset_indices = np.concatenate(dataset_indices)

popt_scales, pcov = opt.curve_fit(
    lambda x, *params: hill_func_log_scales(x, dataset_indices, *params),
    x_data,
    y_data,
    p0=p0_scale,
    bounds=bounds_scale,
    maxfev=5000
)
# the fit result is kept under both names, as before
popt_scales_filter = popt_scales

scales = list(popt_scales[num_params:])
hill_params = popt_scales[:num_params]
hill_params

In [None]:
# map every cell line to its fitted expression scale and display the mapping
scale_dict = dict(zip(cell_lines_measured, scales))
scale_dict

In [None]:
from library2_utils.crosstalk import merge_identical_mirnas

# Shift every cell line's expression column by its fitted scale.
# NOTE: the frame is updated in place, so running this cell a second time
# applies the shift again.
for cell_line in cell_lines_measured:
    shifted_expression = df_expression_filter_congruence[cell_line] + scale_dict[cell_line]
    df_expression_filter_congruence[cell_line] = shifted_expression

# For the crosstalk-filtered analysis the expression table is additionally
# passed through merge_identical_mirnas together with the miRBase annotation.
if crosstalk_filtered:
    merge_result = merge_identical_mirnas(df_expression_filter_congruence, mirbase)
    df_expression_filter_congruence, groups = merge_result

In [99]:
# Persist the fit (shared Hill parameters + per-cell-line scales) together with
# the expression / knockdown tables it was computed from.
output_dir = "../outputs/3_fitting/combined_dataset"
os.makedirs(output_dir, exist_ok=True)

# File-name suffix for the current analysis variant; runs that are neither
# crosstalk-filtered nor bias-aware are not written to disk.
if crosstalk_filtered:
    output_suffix = "wo_crosstalk_scaled"
elif bias_aware:
    output_suffix = "with_crosstalk"
else:
    output_suffix = None

if output_suffix is not None:
    with open(os.path.join(output_dir, f"combined_dataset_popt_{output_suffix}.pkl"), "wb") as f:
        pickle.dump(hill_params, f)

    with open(os.path.join(output_dir, f"combined_dataset_scale_dict_{output_suffix}.pkl"), "wb") as f:
        pickle.dump(scale_dict, f)

    # also save the expression and knockdown data next to the other miRNA outputs
    df_expression_filter_congruence.to_csv(
        os.path.join(mirna_output_folder, f"Alles_Keller_combined_expression_{output_suffix}.csv"))
    df_knockdown_filter_congruence.to_csv(
        os.path.join(mirna_output_folder, f"Alles_Keller_combined_knockdown_{output_suffix}.csv"))

### plot outlier-aware fit

In [95]:
%%capture output
# create a figure with 4 subplots
fig, axs = plt.subplots(5, 2, figsize=(6, 8), sharex=True, sharey=True)
df_deviation_congruent = pd.DataFrame(columns=cell_lines_measured, index=df_knockdown_filter_congruence.index)
x_range_log = np.linspace(0, 5.5, 1000)

# create the plot folder if it doesn't exist
if not os.path.exists(f"{plot_folder}/3.12_data_congruence"):
    os.makedirs(f"{plot_folder}/3.12_data_congruence")

for i, cell_line in enumerate(cell_lines_measured):
    current_scale = scales[i]

    df_knock = df_knockdown_filter_congruence.loc[:, cell_line].dropna()
    df_ex = df_expression_filter_congruence.loc[:, cell_line].dropna()
    
    axs[i//2, i%2].scatter(df_ex,df_knock, s=5, color="tab:blue", rasterized=True)
    axs[i//2, i%2].plot(x_range_log, hill_func_log_regular(x_range_log,
                        *hill_params), color="black", linewidth=1.5, label="fit", ls="--")

    # calculate the R2 value
    r2 = stats.pearsonr(df_knock,
                        hill_func_log_regular(df_ex,
                        *hill_params))[0]**2
    
    # calculate the RMSD value
    rmsd = np.sqrt(np.mean((df_knock-
                            hill_func_log_regular(df_ex,
                            *hill_params))**2))

    if crosstalk_filtered:
        correlation_dataframe.loc["3.12.3-crosstalk-filtered", cell_line] = r2
        rmsd_dataframe.loc["3.12.3-crosstalk-filtered", cell_line] = rmsd
    else:
        if bias_aware:
            correlation_dataframe.loc["3.12.3-bias-aware", cell_line] = r2
            rmsd_dataframe.loc["3.12.3-bias-aware", cell_line] = rmsd
        else:
            correlation_dataframe.loc["3.12.3-congruent", cell_line] = r2
            rmsd_dataframe.loc["3.12.3-merged", cell_line] = rmsd
    
    df_deviation_congruent.loc[df_knock.index, cell_line] = df_knock - hill_func_log_regular(df_ex, *hill_params)
    
    plt.xlim(0, 5.5)
    plt.ylim(-1.7, 0.25)
    
    if i //2 == 4:
        axs[i//2, i%2].set_xlabel(r"log$_{10}$(miRNA expression)")
    if i % 2 == 0:
        axs[i//2, i%2].set_ylabel(r"log$_{10}$(stability)")
    axs[i//2, i%2].set_title(f"{cell_line}, r2 = {round(r2, 2)}, rmsd = {round(rmsd, 2)}", fontsize=8)

plt.tight_layout()
if crosstalk_filtered:   
    for format in ["png", "svg"]:
        plt.savefig(os.path.join(plot_folder, f"3.12_data_congruence/3.12.3_crosstalk_filtered.{format}"), dpi=600)
elif bias_aware:
    for format in ["png", "svg"]:
        plt.savefig(os.path.join(plot_folder, f"3.12_data_congruence/3.12.3_geometric_mean_outlier_aware.{format}"), dpi=600)
else:
    for format in ["png", "svg"]:
        plt.savefig(os.path.join(plot_folder, f"3.12_data_congruence/3.12.3_geometric_mean_merged.{format}"), dpi=600)

In [68]:
# The residual table is only exported for the bias-aware run that keeps crosstalk.
if bias_aware and not crosstalk_filtered:
    deviation_path = os.path.join(output_dir, "combined_dataset_deviation_bias_aware.csv")
    df_deviation_congruent.to_csv(deviation_path)

### plot outlier-aware fit with false positives and negatives

In [72]:
%%capture output
# create a figure with 10 subplots
fig, axs = plt.subplots(5, 2, figsize=(6, 8), sharex=True, sharey=True)

for i, cell_line in enumerate(cell_lines_measured):
    current_scale = scales[i]

    # specifically plot false positives and negatives
    excluded_points = []
    for curr_index, df in enumerate(false_positives_and_negatives):
        #df = df[df[cell_line].notna()]
        df_ex = df_expression_filter_congruence.loc[df.index, cell_line]
        df_knock = df_knockdown_filter_congruence.loc[df.index, cell_line]
        if i == 0:
            axs[i//2, i%2].scatter(df_ex, df_knock, s=5, color=false_positives_and_negatives_colors[curr_index],
                                label=false_positives_and_negatives_label[curr_index], marker=false_positives_and_negatives_symbols[curr_index],
                                rasterized=True, zorder=2)
        else:
            axs[i//2, i%2].scatter(df_ex, df_knock, s=5, color=false_positives_and_negatives_colors[curr_index],
                                marker=false_positives_and_negatives_symbols[curr_index],
                                rasterized=True, zorder=2)
        excluded_points.append(df.index)
        
    # flatten excluded points
    excluded_points = [item for sublist in excluded_points for item in sublist]

    df_knock = df_knockdown_filter_congruence.loc[:, cell_line].dropna()
    df_ex = df_expression_filter_congruence.loc[:, cell_line].dropna()
    
    # These should be calculated BEFORE exluding points for plotting purposes
    # calculate the R2 value
    r2 = stats.pearsonr(df_knock,
                        hill_func_log_regular(df_ex,
                        *hill_params))[0]**2
    
    # calculate the RMSD value
    rmsd = np.sqrt(np.mean((df_knock-
                            hill_func_log_regular(df_ex,
                            *hill_params))**2))
    
    # remove excluded points
    df_ex = df_ex[~df_ex.index.isin(excluded_points)]
    df_knock = df_knock[~df_knock.index.isin(excluded_points)]
    
    axs[i//2, i%2].scatter(df_ex,df_knock, s=5, color="tab:blue", rasterized=True, zorder=1)
    axs[i//2, i%2].plot(x_range_log, hill_func_log_regular(x_range_log,
                        *hill_params), color="black", linewidth=1.5, label="fit", ls="--", zorder=1)
    
    plt.xlim(0, 5.5)
    plt.ylim(-1.7, 0.25)
    
    if i//2 == 4:
        axs[i//2, i%2].set_xlabel(r"log$_{10}$(miRNA expression)")
    if i % 2 == 0:
        axs[i//2, i%2].set_ylabel(r"log$_{10}$(stability)")
    axs[i//2, i%2].set_title(f"{cell_line}, r2 = {round(r2, 2)}", fontsize=7.5)
    if i == 0:
        axs[i//2, i%2].legend()

plt.tight_layout()
for format in ["png", "svg"]:
    plt.savefig(os.path.join(plot_folder, f"3.12_data_congruence/3.12.3_geometric_mean_aware_with_outliers.{format}"), dpi=600)

## create a plot showing the filtered crosstalk

In [73]:
# For every cell line: the miRNAs in the knockdown table that are NOT listed in
# that cell line's crosstalk filter.
allowed_mirnas_all = {
    cell_line: list(df_knockdown.index.difference(crosstalk_filter_dict[cell_line]))
    for cell_line in cell_lines_measured
}

In [74]:
# Single-panel overview of all cell lines: points whose miRNA is in the
# crosstalk filter are drawn in red, all other points in blue.
if bias_aware and not crosstalk_filtered:
    fig = plt.figure(figsize=(3, 2.1))
    x_range_log = np.linspace(0, 5.5, 1000)

    for i, cell_line in enumerate(cell_lines_measured):
        # Pair expression and knockdown by miRNA and keep rows where both exist,
        # so the label-based selections below cannot hit a missing miRNA.
        paired = pd.concat(
            [df_expression_filter_congruence[cell_line], df_knockdown_filter_congruence[cell_line]],
            axis=1,
            join="inner",
            keys=["expression", "knockdown"],
        ).dropna()

        # split the points into miRNAs that pass the crosstalk filter and those that do not
        allowed_mirnas = allowed_mirnas_all[cell_line]
        non_crosstalk = paired.index.intersection(allowed_mirnas)
        crosstalk = paired.index.difference(allowed_mirnas)

        # Label the two groups once (first cell line only) so that the legend
        # requested below actually has entries.
        blue_kwargs = {"label": "no crosstalk"} if i == 0 else {}
        red_kwargs = {"label": "crosstalk"} if i == 0 else {}

        plt.scatter(paired.loc[non_crosstalk, "expression"], paired.loc[non_crosstalk, "knockdown"], s=3,
                    color="tab:blue", rasterized=True, zorder=1, edgecolors="none", **blue_kwargs)
        plt.scatter(paired.loc[crosstalk, "expression"], paired.loc[crosstalk, "knockdown"], s=3,
                    color="tab:red", rasterized=True, zorder=2, edgecolors="none", **red_kwargs)

        # the fitted curve is shared by all cell lines, so draw it only once
        if i == 0:
            plt.plot(x_range_log, hill_func_log_regular(x_range_log,
                    *hill_params), color="black", linewidth=1, ls="--")

    plt.xlabel(r"log$_{10}$(miRNA expression)")
    plt.ylabel(r"log$_{10}$(stability)")

    plt.xlim(0, 5.5)
    plt.ylim(-1.7, 0.25)
    plt.tight_layout()
    plt.legend(loc="lower left", frameon=False, fontsize=6.5)
    # `fmt` instead of `format` so the builtin is not shadowed for later cells
    for fmt in ["png", "svg"]:
        plt.savefig(os.path.join(plot_folder, f"3.12_data_congruence/3.12.3_show_crosstalk.{fmt}"), dpi=600, bbox_inches='tight')

### plot outlier-aware fit as a single plot

In [97]:
%%capture output
fig = plt.figure(figsize=(2.5, 1.7))

x_range_log = np.linspace(0, 5.5, 1000)
for i, cell_line in enumerate(cell_lines_measured):
    current_scale = scales[i]
    
    df_knock = df_knockdown_filter_congruence.loc[:, cell_line].dropna()
    df_ex = df_expression_filter_congruence.loc[:, cell_line].dropna()
    
    # calculate the R2 value
    r2 = stats.pearsonr(df_knock,
                        hill_func_log_regular(df_ex,
                        *hill_params))[0]**2
    plt.scatter(df_ex, df_knock, color=cell_line_colors[cell_line],
        s=3, marker=cell_line_symbols[cell_line], rasterized=True, label=f"{cell_line}")#, " + r"$r^2$=" + f"{round(r2, 2)}")

plt.plot(x_range_log, hill_func_log_regular(x_range_log,
        *hill_params), color="black", linewidth=1, ls="--")
plt.text(0.5, -1.2, r"$\frac{k_{\mathrm{deg}}}{k_{\mathrm{on}}}=$"+f"{10**hill_params[0]:.0f}", fontsize=7, ha="left")
    
plt.xlabel(r"log$_{10}$"+f"(merged miRNA expression)")
plt.ylabel(r"log$_{10}$(stability)")
plt.xlim(0, 5.5)
plt.xticks([0, 1, 2, 3, 4, 5])
plt.ylim(-1.7, 0.25)

plt.tight_layout()
plt.legend(loc = [1,-0.05], frameon=False, fontsize=6.5)
for format in ["png", "svg"]:
    plt.savefig(os.path.join(plot_folder, f"3.12_data_congruence/3.12.3_geometric_mean_single_plot_aware_merging.{format}"), dpi=600, bbox_inches='tight')

In [111]:
%%capture output
fig = plt.figure(figsize=(2.5, 1.7))

for i, cell_line in enumerate(cell_lines_measured):
    current_scale = scales[i]
    
    df_knock = 1/10**df_knockdown_filter_congruence.loc[:, cell_line].dropna()
    df_ex = df_expression_filter_congruence.loc[:, cell_line].dropna()
    
    # calculate the R2 value
    r2 = stats.pearsonr(df_knock,
                        10**hill_func_log_regular(df_ex,
                        *hill_params))[0]**2
    plt.scatter(df_ex, df_knock, color=cell_line_colors[cell_line],
        s=1, edgecolor="none", marker=cell_line_symbols[cell_line], rasterized=True, label=f"{cell_line}")#, " + r"$r^2$=" + f"{round(r2, 2)}")
     
plt.plot(x_range_log, 1/10**hill_func_log_regular(x_range_log,
        *hill_params), color="black", linewidth=1, ls="--")
    
plt.xlabel(r"log$_{10}$"+f"(merged miRNA expression)")
plt.ylabel(r"log$_{10}$(stability)")
plt.xlim(0, 5.5)
plt.xticks([0, 1, 2, 3, 4, 5])
#plt.ylim(-1.7, 0.25)

plt.tight_layout()
plt.legend(loc = [1,-0.05], frameon=False, fontsize=6.5)
for format in ["png", "svg"]:
    plt.savefig(os.path.join(plot_folder, f"3.12_data_congruence/3.12.3_geometric_mean_linear.{format}"), dpi=600, bbox_inches='tight')

In [76]:
# Single-panel overview of all cell lines in which the miRNAs listed in
# false_positives_and_negatives are highlighted with their own colour/marker.
if bias_aware and not crosstalk_filtered:
    fig = plt.figure(figsize=(3, 2.1))

    # defined here so the cell does not depend on another plotting cell having run
    x_range_log = np.linspace(0, 5.5, 1000)

    for i, cell_line in enumerate(cell_lines_measured):
        # highlight the false positives and negatives
        excluded_points = []
        for curr_index, df in enumerate(false_positives_and_negatives):
            # label each group only once (first cell line) to avoid duplicate legend entries
            legend_kwargs = {"label": false_positives_and_negatives_label[curr_index]} if i == 0 else {}
            plt.scatter(df_expression_filter_congruence.loc[df.index, cell_line],
                        df_knockdown_filter_congruence.loc[df.index, cell_line],
                        s=5, color=false_positives_and_negatives_colors[curr_index],
                        marker=false_positives_and_negatives_symbols[curr_index],
                        rasterized=True, zorder=2, **legend_kwargs)
            excluded_points.extend(df.index)

        df_knock = df_knockdown_filter_congruence.loc[:, cell_line].dropna()
        df_ex = df_expression_filter_congruence.loc[:, cell_line].dropna()

        # the highlighted points were already drawn above; plot only the rest in blue
        df_ex = df_ex[~df_ex.index.isin(excluded_points)]
        df_knock = df_knock[~df_knock.index.isin(excluded_points)]

        plt.scatter(df_ex, df_knock, s=3, color="tab:blue", rasterized=True, zorder=1, edgecolors="none")

        # the fitted curve is shared by all cell lines, so draw it only once
        if i == 0:
            plt.plot(x_range_log, hill_func_log_regular(x_range_log,
                    *hill_params), color="black", linewidth=1, ls="--")

    plt.xlabel(r"log$_{10}$(merged miRNA expression)")
    plt.ylabel(r"log$_{10}$(stability)")

    plt.xlim(0, 5.5)
    plt.ylim(-1.7, 0.25)
    plt.tight_layout()
    plt.legend(loc="lower left", frameon=False, fontsize=6.5)
    # `fmt` instead of `format` so the builtin is not shadowed for later cells
    for fmt in ["png", "svg"]:
        plt.savefig(os.path.join(plot_folder, f"3.12_data_congruence/3.12.3_geometric_mean_aware_show_fals_pos_and_neg.{fmt}"), dpi=600, bbox_inches='tight')

In [None]:
# Replace the row labels of the RMSD table with short step names for the plot.
# NOTE: this assignment only works while the table has exactly five rows.
rmsd_dataframe.index = ["step 1", "2", "3", "4", "5"]

# Heatmap of 10 x RMSD with cell lines as rows and steps as columns
fig, ax = plt.subplots(figsize=(1.6, 1.4))
rmsd_times_ten = rmsd_dataframe.T.astype('float') * 10
colorbar_options = {'label': r'10x rmsd', 'shrink': 1, 'aspect': 15, 'pad': 0.02}
sns.heatmap(
    rmsd_times_ten,
    ax=ax,
    cmap="cividis_r",
    vmin=0.5,
    vmax=1.3,
    annot=False,
    fmt=".2f",
    annot_kws={"size": 7},
    cbar=True,
    cbar_kws=colorbar_options,
    square=False,
)

# keep the x tick labels horizontal
plt.xticks(rotation=0, ha='right')
for extension in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/3.12_data_congruence/3.12.3_rmsd_all_single_alt_format.{extension}", dpi=600, bbox_inches='tight')

In [None]:
# RMSD per step as one line per cell line, using the shared colour/marker scheme
fig, ax = plt.subplots(figsize=(1.6, 1.4))
step_labels = ["1", "2", "3", "4", "5"]

for cell_line_name in rmsd_dataframe.columns:
    ax.plot(
        rmsd_dataframe.loc[:, cell_line_name].values,
        label=cell_line_name,
        color=cell_line_colors[cell_line_name],
        marker=cell_line_symbols[cell_line_name],
        markersize=4,
        lw=1,
    )

# one tick per step, labelled 1..5
ax.set_xticks(range(5))
ax.set_xticklabels(step_labels)
ax.set_xlabel("step")
ax.set_ylabel("rmsd")
ax.legend(loc=[1.05, -0.1], frameon=False, fontsize=6)
plt.savefig(f"{plot_folder}/3.12_data_congruence/3.12.3_rmsd_all_single_alt_format_lineplot.png", dpi=600, bbox_inches='tight')

In [None]:
# Display the table of squared Pearson correlations (one row per analysis variant,
# one column per cell line) that the fitting/plotting cells filled in.
correlation_dataframe

In [None]:
# Narrow one-column heatmap of the most recent row of the correlation table,
# annotated with the r2 value of each cell line.
fig, ax = plt.subplots(figsize=(0.3, 1.65))
latest_correlations = correlation_dataframe.iloc[-1:].astype('float').T
colorbar_options = {
    'label': r'$r^2$',
    'shrink': 1.4,
    'aspect': 25,
    'pad': 0.02,
    'ticks': [0.2, 0.4, 0.6, 0.8],
}
sns.heatmap(
    latest_correlations,
    ax=ax,
    cmap="viridis",
    vmin=0.2,
    vmax=0.8,
    annot=True,
    fmt=".2f",
    annot_kws={"size": 6},
    cbar_kws=colorbar_options,
)

# no tick marks or labels on either axis
plt.xticks([])
plt.yticks([])
for extension in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/3.12_data_congruence/3.12.3_correlations_aware.{extension}", dpi=600, bbox_inches='tight')