In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import pickle
import os
from library2_utils.color_scheme import cell_line_colors, cell_line_symbols
import matplotlib.patches as patches

# Global matplotlib style used by every figure in this notebook: 7 pt Helvetica.
plt.rcParams.update({'font.size': 7, 'font.family': 'Helvetica'})

# Cell lines compared between the two datasets; these are used as column names
# of the deviation tables throughout the notebook.
cell_lines_measured = ["HEK293T", "HeLa", "MCF7", "A549", "HaCaT", "HUH7", "PC3", "JEG3", "Tera1", "SKNSH"]

# Folder that receives all figures written by this notebook.
# exist_ok=True replaces the separate "exists?" check, so re-running the cell is safe.
plot_folder = "../plots/4_compare_data_sources"
os.makedirs(plot_folder, exist_ok=True)

### This notebook compares outliers between the two datasets to find consistent outliers (i.e., real false positives and negatives)

# Get the necessary data

In [2]:
# Dataset identifiers, used to build the input file paths.
# The order matters: index 0 is plotted as "microarray", index 1 as "sequencing".
mirna_names = [
    "Alles2019",
    "Keller2023",
]

In [6]:
# Deviation tables per dataset, appended in the order of `mirna_names`.
# Both lists must be initialised here: `filtered_dfs` is appended to in the loop
# below, so leaving it undefined raises NameError on a fresh kernel.
unfiltered_dfs = []
filtered_dfs = []

# The input folder does not depend on the dataset, so it is set once outside the loop.
data_dir_input = "../outputs/3_fitting"

for mirna_name in mirna_names:
    dataset_dir = f"{data_dir_input}/{mirna_name}"

    # first CSV column is used as the row index
    unfiltered_dfs.append(pd.read_csv(f"{dataset_dir}/{mirna_name}_deviation_all_mirnas.csv", index_col=0))
    filtered_dfs.append(pd.read_csv(f"{dataset_dir}/{mirna_name}_deviation_crosstalk_filtered.csv", index_col=0))

In [7]:
def _restrict_to_shared_index(frame_a, frame_b):
    """Return ``[frame_a, frame_b]`` restricted to the index labels present in both.

    The row selection uses ``frame_a.index.intersection(frame_b.index)``, so both
    returned frames carry exactly the same index.
    """
    shared_index = frame_a.index.intersection(frame_b.index)
    return [frame_a.loc[shared_index], frame_b.loc[shared_index]]


# ------- unfiltered -------------
# harmonize their indices
unfiltered_dfs = _restrict_to_shared_index(unfiltered_dfs[0], unfiltered_dfs[1])

# ------- filtered -------------
filtered_dfs = _restrict_to_shared_index(filtered_dfs[0], filtered_dfs[1])

In [8]:
# Per cell line, split the unfiltered deviations of each dataset into
#   * "diff"      - entries that have no (non-NaN) value in the crosstalk-filtered data,
#                   i.e. the ones removed by the crosstalk filtering
#   * "left over" - entries that remain after filtering
# Suffix 1 refers to dataset 0, suffix 2 to dataset 1 of `unfiltered_dfs` / `filtered_dfs`.
df_diff_dict1, df_diff_dict2 = {}, {}
df_left_over1, df_left_over2 = {}, {}

for cell_line in cell_lines_measured:
    removed = []
    retained = []
    for dataset_idx in (0, 1):
        # labels that survived the filter: NaN entries count as filtered out
        surviving_labels = filtered_dfs[dataset_idx][cell_line].dropna().index
        unfiltered_column = unfiltered_dfs[dataset_idx][cell_line]

        removed.append(unfiltered_column.loc[unfiltered_column.index.difference(surviving_labels)])
        retained.append(unfiltered_column.loc[unfiltered_column.index.intersection(surviving_labels)])

    # keep only entries removed in BOTH datasets so the two series can be plotted against each other
    shared_removed = removed[0].index.intersection(removed[1].index)
    df_diff_dict1[cell_line] = removed[0].loc[shared_removed]
    df_diff_dict2[cell_line] = removed[1].loc[shared_removed]

    # likewise, keep only entries retained in BOTH datasets
    shared_retained = retained[0].index.intersection(retained[1].index)
    df_left_over1[cell_line] = retained[0].loc[shared_retained]
    df_left_over2[cell_line] = retained[1].loc[shared_retained]

In [9]:
# Load the pickled crosstalk filter dictionary.
# NOTE(review): pickle.load executes arbitrary code from the file; only use with trusted project outputs.
input_folder = "../outputs/5_mutations"
crosstalk_pickle_path = f"{input_folder}/5.7_crosstalk_filter_dict.pkl"

with open(crosstalk_pickle_path, "rb") as pickle_file:
    crosstalk_filter_dict = pickle.load(pickle_file)

## Compare the two datasets

In [10]:
def add_rectangle(ax, x, y, width, height, color='black', label=None):
    """Draw an unfilled rectangle outline on ``ax`` and optionally annotate it.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    x, y : lower-left corner of the rectangle in data coordinates.
    width, height : extent of the rectangle.
    color : edge color of the outline and color of the label text.
    label : text placed near the upper-left corner; nothing is written when None.
    """
    outline = patches.Rectangle((x, y), width, height, edgecolor=color, facecolor='none', zorder=3)
    ax.add_patch(outline)

    if label is None:
        return

    # text is centred on a point 0.1 right of the left edge and 0.15 below the top edge
    ax.text(x + 0.1, y + height - 0.15, label, ha='center', va='center', color=color)
        
def add_rectangles(ax):
    """Draw the 3x3 grid of numbered boxes ("1" to "9") covering [-1, 1] x [-1, 1] on ``ax``.

    Boxes are numbered row by row from the top-left ("1") to the bottom-right ("9").
    Each box is drawn exactly once; the middle row was previously drawn twice.
    """
    # (left edge, width) of the three columns, left to right
    columns = [(-1, 0.7), (-0.3, 0.6), (0.3, 0.7)]
    # (bottom edge, height) of the three rows, top to bottom
    rows = [(0.3, 0.7), (-0.3, 0.6), (-1, 0.7)]

    box_number = 0
    for bottom, height in rows:
        for left, width in columns:
            box_number += 1
            add_rectangle(ax, left, bottom, width, height, label=str(box_number))
    
# Box definitions used for labelling points (not for drawing).
# The outer bounds reach to +/-3 rather than the +/-1 that is drawn on the plots, so that
# points outside the plotted range are still assigned to the adjacent box.
# Labels run row by row from the top-left ("1") to the bottom-right ("9").
_x_bands = [(-3, -0.3), (-0.3, 0.3), (0.3, 3)]   # left, middle, right column
_y_bands = [(0.3, 3), (-0.3, 0.3), (-3, -0.3)]   # top, middle, bottom row

rectangles_label = [
    {"xmin": x_low, "xmax": x_high, "ymin": y_low, "ymax": y_high, "label": str(3 * row + col + 1)}
    for row, (y_low, y_high) in enumerate(_y_bands)
    for col, (x_low, x_high) in enumerate(_x_bands)
]

def find_rectangle_for_point(x, y, rectangles):
    """Return the label of the first rectangle that contains the point ``(x, y)``.

    ``rectangles`` is an iterable of dicts with the keys ``xmin``, ``xmax``, ``ymin``,
    ``ymax`` and ``label``. Bounds are inclusive on both sides, so a point lying on a
    shared edge is assigned to whichever rectangle comes first in the iterable.
    Returns None when no rectangle contains the point.
    """
    matching_labels = (
        box["label"]
        for box in rectangles
        if box["xmin"] <= x <= box["xmax"] and box["ymin"] <= y <= box["ymax"]
    )
    return next(matching_labels, None)
        
def get_label_df(df1, df2):
    """Label every (row, cell line) point with the box it falls into.

    For each cell line in ``cell_lines_measured`` the values of ``df1`` (x) and ``df2`` (y)
    are paired positionally and looked up in ``rectangles_label``. The result has the
    index of ``df1`` and one column per cell line; entries are the box label, or None
    when the point lies in no box.

    NOTE(review): pairing is by position, not by index label, so both frames are
    expected to share the same row order.
    """
    labels_by_cell_line = {}
    for cell_line in cell_lines_measured:
        points = zip(df1[cell_line], df2[cell_line])
        labels_by_cell_line[cell_line] = [
            find_rectangle_for_point(x, y, rectangles_label) for x, y in points
        ]
    return pd.DataFrame(labels_by_cell_line, index=df1.index, columns=cell_lines_measured)

In [None]:
# Figure 4.1: deviation from fit, dataset 0 (x, microarray) vs dataset 1 (y, sequencing),
# one scatter series per cell line on the unfiltered data.
fig, ax = plt.subplots(figsize=(3,2.4))

for cell_line in cell_lines_measured:
    dev_microarray = unfiltered_dfs[0][cell_line]
    dev_sequencing = unfiltered_dfs[1][cell_line]

    # squared Pearson correlation, shown in the legend entry
    # NOTE(review): assumes the two columns contain no NaN values - confirm
    r2 = stats.pearsonr(dev_microarray, dev_sequencing)[0]**2
    ax.scatter(dev_microarray, dev_sequencing, label=f"{cell_line}, "+r"$r^2$"+f"={r2:.2f}", s=3, rasterized=True)

ax.set_xlabel("deviation from fit (microarray)")
ax.set_ylabel("deviation from fit (sequencing)")

# add rectangles
add_rectangles(ax)

ax.set_title("Unfiltered data", fontsize=8)
ax.set_xlim(-1,1)
ax.set_ylim(-1,1)
ax.set_xticks(np.arange(-1,1.1,0.5))
ax.set_yticks(np.arange(-1,1.1,0.5))

fig.tight_layout()
# NOTE(review): the legend is anchored outside the axes and is added after tight_layout;
# it may be cropped in the saved files - confirm this is intended.
ax.legend(ncol=1, bbox_to_anchor=(1.05, 1), loc='upper left')

# `ext` instead of `format`: the loop variable lives in the kernel namespace and
# would otherwise shadow the builtin format() for every later cell.
for ext in [".png", ".svg"]:
    fig.savefig(f"{plot_folder}/4.1_compare_deviation_unfiltered_all{ext}", dpi=600)

In [None]:
# Figure 4.2: points retained (blue) vs removed (red) by the crosstalk filtering,
# pooled over all cell lines.
fig, ax = plt.subplots(figsize=(3,2.4))

# add rectangles
add_rectangles(ax)

for i, cell_line in enumerate(cell_lines_measured):
    # only the first series of each kind gets a label, so the legend shows one entry per category
    is_first = i == 0

    ax.scatter(df_left_over1[cell_line], df_left_over2[cell_line], s=3, color="tab:blue",
               rasterized=True, label="retained" if is_first else None, zorder=1)

    # removed points are drawn above the retained ones
    ax.scatter(df_diff_dict1[cell_line], df_diff_dict2[cell_line], s=3, color="red",
               rasterized=True, label="removed" if is_first else None, zorder=2)

ax.set_xlabel("deviation from fit (microarray)")
ax.set_ylabel("deviation from fit (sequencing)")

ax.set_title("Crosstalk filtering process", fontsize=8)
ax.set_xlim(-1,1)
ax.set_ylim(-1,1)
ax.set_xticks(np.arange(-1,1.1,0.5))
ax.set_yticks(np.arange(-1,1.1,0.5))

fig.tight_layout()
# NOTE(review): the legend sits outside the axes; it may be cropped in the saved files - confirm.
ax.legend(ncol=1, bbox_to_anchor=(1.05, 1), loc='upper left')

# `ext` instead of `format`: avoids shadowing the builtin format() in the kernel namespace
for ext in [".png", ".svg"]:
    fig.savefig(f"{plot_folder}/4.2_compare_deviation_crosstalk_filtered_all{ext}", dpi=600)

In [None]:
# Figure 4.3: same comparison as figure 4.1, restricted to the points that remain
# after crosstalk filtering; one scatter series per cell line.
fig, ax = plt.subplots(figsize=(3,2.4))

for cell_line in cell_lines_measured:
    retained_microarray = df_left_over1[cell_line]
    retained_sequencing = df_left_over2[cell_line]

    # squared Pearson correlation, shown in the legend entry
    r2 = stats.pearsonr(retained_microarray, retained_sequencing)[0]**2
    # NOTE(review): the 4.1 figure writes the legend as "$r^2$" (mathtext), this one as plain "r2" - confirm
    ax.scatter(retained_microarray, retained_sequencing, label=f"{cell_line}, r2={r2:.2f}", s=3, rasterized=True)

ax.set_xlabel("deviation from fit (microarray)")
ax.set_ylabel("deviation from fit (sequencing)")

# add rectangles
add_rectangles(ax)
ax.set_title("After crosstalk filtering")
ax.set_xlim(-1,1)
ax.set_ylim(-1,1)
ax.set_xticks(np.arange(-1,1.1,0.5))
ax.set_yticks(np.arange(-1,1.1,0.5))

fig.tight_layout()
# NOTE(review): the legend sits outside the axes; it may be cropped in the saved files - confirm.
ax.legend(ncol=1, bbox_to_anchor=(1.05, 1), loc='upper left')

# `ext` instead of `format`: avoids shadowing the builtin format() in the kernel namespace
# NOTE(review): dpi is 300 here but 600 for the other figures of this notebook - confirm
for ext in [".png", ".svg"]:
    fig.savefig(f"{plot_folder}/4.3_compare_deviation_leftover_all{ext}", dpi=300)

# 4.4 - False Positives

In [15]:
# Read the false-positive tables of both datasets (first CSV column becomes the index).
input_folder = "../outputs/3_fitting"

keller_false_positives_csv = f"{input_folder}/Keller2023/Keller2023_false_positives.csv"
alles_false_positives_csv = f"{input_folder}/Alles2019/Alles2019_false_positives.csv"

false_positives_keller = pd.read_csv(keller_false_positives_csv, index_col=0)
false_positives_alles = pd.read_csv(alles_false_positives_csv, index_col=0)

In [16]:
# Cross-check the false positives of each dataset against the other dataset:
# a candidate is kept only if its deviation in the OTHER dataset is below 0.3.
# The result frames start empty (cell lines as columns) and are filled column by column.
false_positives_keller_filter = pd.DataFrame(columns=cell_lines_measured)
false_positives_alles_filter = pd.DataFrame(columns=cell_lines_measured)

for cell_line in cell_lines_measured:
    # deviation from fit for this cell line:
    # dataset 0 is Alles2019, dataset 1 is Keller2023 (order of `mirna_names`)
    df_deviation_alles = unfiltered_dfs[0][cell_line]
    df_deviation_keller = unfiltered_dfs[1][cell_line]
    
    # get the false positives for this cell line
    false_pos_keller_cell_line = false_positives_keller[cell_line]
    false_pos_alles_cell_line = false_positives_alles[cell_line]
    
    # For the Keller candidates keep the index labels whose Alles deviation is < 0.3, and vice versa.
    # The comparison is signed (no absolute value); NaN deviations compare False and are dropped.
    # NOTE(review): .loc presumably raises KeyError if a candidate is missing from the
    # harmonized deviation table - confirm all candidates are present.
    false_pos_keller_cell_line_index = false_pos_keller_cell_line.index[df_deviation_alles.loc[false_pos_keller_cell_line.index] < 0.3]
    false_pos_alles_cell_line_index = false_pos_alles_cell_line.index[df_deviation_keller.loc[false_pos_alles_cell_line.index] < 0.3]
    
    # Merge this cell line's surviving (non-NaN) candidates into the result frame as a
    # one-column frame; combine_first adds new index labels and fills only missing cells.
    false_positives_keller_filter = false_positives_keller_filter.combine_first(
        false_pos_keller_cell_line.loc[false_pos_keller_cell_line_index].dropna().to_frame(cell_line)
    )
    false_positives_alles_filter = false_positives_alles_filter.combine_first(
        false_pos_alles_cell_line.loc[false_pos_alles_cell_line_index].dropna().to_frame(cell_line)
    )

In [19]:
# Write the filtered false-positive tables next to the unfiltered ones.
# NOTE: `input_folder` is reassigned in several cells; this relies on its most recent value.
keller_fp_filtered_csv = f"{input_folder}/Keller2023/Keller2023_false_positives_filtered.csv"
alles_fp_filtered_csv = f"{input_folder}/Alles2019/Alles2019_false_positives_filtered.csv"

false_positives_keller_filter.to_csv(keller_fp_filtered_csv)
false_positives_alles_filter.to_csv(alles_fp_filtered_csv)

In [None]:
# Figure 4.4: all unfiltered points (blue) with the filtered false positives of the
# sequencing (red) and microarray (orange) data highlighted on top.
fig, ax = plt.subplots(figsize=(3,2.4))

# add rectangles
add_rectangles(ax)

for i, cell_line in enumerate(cell_lines_measured):
    # only the first series of each kind gets a label, so the legend shows one entry per category
    is_first = i == 0

    # plot all data
    dev_microarray = unfiltered_dfs[0][cell_line]
    dev_sequencing = unfiltered_dfs[1][cell_line]

    ax.scatter(dev_microarray, dev_sequencing, s=3, color="tab:blue", rasterized=True, zorder=1)

    # plot sequencing false positives (non-NaN entries of the filtered Keller table)
    sequencing_hits = false_positives_keller_filter[cell_line].dropna().index
    ax.scatter(dev_microarray.loc[sequencing_hits], dev_sequencing.loc[sequencing_hits], s=5, color="tab:red",
               rasterized=True, label="false positives sequencing" if is_first else None, zorder=2)

    # plot microarray false positives (non-NaN entries of the filtered Alles table)
    microarray_hits = false_positives_alles_filter[cell_line].dropna().index
    ax.scatter(dev_microarray.loc[microarray_hits], dev_sequencing.loc[microarray_hits], s=5, color="tab:orange",
               rasterized=True, label="false positives microarray" if is_first else None, zorder=2)

ax.set_title("Identification of possible false positives")
ax.set_xlabel("deviation from fit (microarray)")
ax.set_ylabel("deviation from fit (sequencing)")
ax.set_xticks(np.arange(-1,1.1,0.5))
ax.set_yticks(np.arange(-1,1.1,0.5))

ax.set_xlim(-1,1)
ax.set_ylim(-1,1)
fig.tight_layout()
# NOTE(review): the legend sits outside the axes; it may be cropped in the saved files - confirm.
ax.legend(ncol=1, bbox_to_anchor=(1.05, 1), loc='upper left')

# `ext` instead of `format`: avoids shadowing the builtin format() in the kernel namespace
for ext in [".png", ".svg"]:
    fig.savefig(f"{plot_folder}/4.4_false_positives{ext}", dpi=600)

# 4.5 - False Negatives

In [39]:
# # reload the deviation df
# filtered_dfs = []

# for mirna_name in mirna_names:
#     data_dir_input = f"../outputs/3_fitting"
#     filtered_dfs.append(pd.read_csv(f"{data_dir_input}/{mirna_name}_deviation_step_3.10.csv", index_col=0))
    
# # ------- filtered -------------
# df1 = filtered_dfs[0]
# df2 = filtered_dfs[1]

# common_index = df1.index.intersection(df2.index)
# df1 = df1.loc[common_index]
# df2 = df2.loc[common_index]

# filtered_dfs = [df1, df2]

In [21]:
# Assign every point of the unfiltered data to one of the nine boxes
# (author's note: pretty much the same as looking at the expression levels).
# Dataset 0 provides the x values, dataset 1 the y values.
deviation_x, deviation_y = unfiltered_dfs[0], unfiltered_dfs[1]
df_label = get_label_df(deviation_x, deviation_y)

In [24]:
# Read the false-negative tables of both datasets (first CSV column becomes the index).
input_folder = "../outputs/3_fitting"

keller_false_negatives_csv = f"{input_folder}/Keller2023/Keller2023_false_negatives.csv"
alles_false_negatives_csv = f"{input_folder}/Alles2019/Alles2019_false_negatives.csv"

false_negatives_keller = pd.read_csv(keller_false_negatives_csv, index_col=0)
false_negatives_alles = pd.read_csv(alles_false_negatives_csv, index_col=0)

In [25]:
# Filter the original false negatives by the box each point falls into.
# Result frames keep the full index of the input tables; entries that do not pass stay NaN.
false_negatives_keller_filter = pd.DataFrame(columns=cell_lines_measured, index=false_negatives_keller.index)
false_negatives_alles_filter = pd.DataFrame(columns=cell_lines_measured, index=false_negatives_alles.index)

# Boxes "8"/"9": y (sequencing) at or below -0.3 while x (microarray) is at or above -0.3.
# Boxes "1"/"4": x (microarray) at or below -0.3 while y (sequencing) is at or above -0.3.
for cell_line in cell_lines_measured:
    candidates_keller = false_negatives_keller[cell_line]
    candidates_alles = false_negatives_alles[cell_line]

    # box label of every candidate for this cell line
    boxes_keller = df_label.loc[candidates_keller.index, cell_line]
    boxes_alles = df_label.loc[candidates_alles.index, cell_line]

    # index labels of the candidates lying in the accepted boxes
    accepted_keller = boxes_keller[boxes_keller.isin(["8", "9"])].index
    accepted_alles = boxes_alles[boxes_alles.isin(["1", "4"])].index

    # drop candidates without a value in the input table
    confirmed_alles = candidates_alles.loc[accepted_alles].dropna()
    confirmed_keller = candidates_keller.loc[accepted_keller].dropna()

    false_negatives_keller_filter.loc[confirmed_keller.index, cell_line] = confirmed_keller
    false_negatives_alles_filter.loc[confirmed_alles.index, cell_line] = confirmed_alles

In [26]:
# Write the filtered false-negative tables next to the unfiltered ones.
# NOTE: `input_folder` is reassigned in several cells; this relies on its most recent value.
keller_fn_filtered_csv = f"{input_folder}/Keller2023/Keller2023_false_negatives_filtered.csv"
alles_fn_filtered_csv = f"{input_folder}/Alles2019/Alles2019_false_negatives_filtered.csv"

false_negatives_keller_filter.to_csv(keller_fn_filtered_csv)
false_negatives_alles_filter.to_csv(alles_fn_filtered_csv)

In [None]:
# Figure 4.5: all unfiltered points (blue) with the filtered false negatives of the
# sequencing (red crosses) and microarray (orange crosses) data highlighted.
fig, ax = plt.subplots(figsize=(3,2.4))

# add rectangles
add_rectangles(ax)

for i, cell_line in enumerate(cell_lines_measured):
    # only the first series of each kind gets a label, so the legend shows one entry per category
    is_first = i == 0

    dev_microarray = unfiltered_dfs[0][cell_line]
    dev_sequencing = unfiltered_dfs[1][cell_line]

    ax.scatter(dev_microarray, dev_sequencing, s=3, color="tab:blue", rasterized=True)

    # sequencing false negatives (non-NaN entries of the filtered Keller table)
    sequencing_hits = false_negatives_keller_filter[cell_line].dropna().index
    ax.scatter(dev_microarray.loc[sequencing_hits], dev_sequencing.loc[sequencing_hits],
               label="false negatives sequencing" if is_first else None,
               s=5, color="tab:red", rasterized=True, marker="x")

    # microarray false negatives (non-NaN entries of the filtered Alles table)
    microarray_hits = false_negatives_alles_filter[cell_line].dropna().index
    ax.scatter(dev_microarray.loc[microarray_hits], dev_sequencing.loc[microarray_hits],
               label="false negatives microarray" if is_first else None,
               s=5, color="tab:orange", rasterized=True, marker="x")

ax.set_title("Identification of possible false negatives")
ax.set_xlabel("deviation from fit (microarray)")
ax.set_ylabel("deviation from fit (sequencing)")
ax.set_xticks(np.arange(-1,1.1,0.5))
ax.set_yticks(np.arange(-1,1.1,0.5))

ax.set_xlim(-1,1)
ax.set_ylim(-1,1)
fig.tight_layout()
# NOTE(review): the legend sits outside the axes; it may be cropped in the saved files - confirm.
ax.legend(ncol=1, bbox_to_anchor=(1.05, 1), loc='upper left')

# `ext` instead of `format`: avoids shadowing the builtin format() in the kernel namespace
for ext in [".png", ".svg"]:
    fig.savefig(f"{plot_folder}/4.5_false_negatives{ext}", dpi=600)