In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import scipy.optimize as opt
import matplotlib.pyplot as plt
import seaborn as sns
from functools import reduce
import os
import pickle
from library2_utils.transfer_functions import transfer_function
from library2_utils.mirna_levels import normalize_expr_df_to_rpm, normalize_expr_df_to_rpm_with_partner

# Cell lines listed as measured.
cell_lines_measured = [
    "HEK293T", "HeLa", "SKNSH", "MCF7", "HUH7",
    "A549", "HaCaT", "JEG3", "Tera1", "PC3",
]

# Global figure typography: font size 7, Helvetica.
plt.rcParams.update({'font.size': 7, 'font.family': 'Helvetica'})

We use four datasets:  
Two human tissue atlases (Ludwig 2016, Keller2022), which are microarray and ngs-based, respectively. These datasets have to be downloaded to run this notebook.  
Two human cell line datasets (Alles2019, Keller2023 (measured in this publication)), which are microarray and ngs-based, respectively. 

The cell line dataset is included in the repository. The tissue atlas data can be downloaded based on the info in the respective publications. The necessary files, placed under `../microrna_data/15_human_data_merge/`, are:
NGS - raw/ngs/hsa_snc_expression.csv
microarray - raw/microarray/data_matrix_quantile.txt

The tissue atlases are named microarray_tissue and ngs_tissue, while the cell line data is referred to as alles and keller. Thus "keller" always refers to the cell line data in this publication, rather than to the 2022 tissue atlas.

In [2]:
# miRBase table, indexed by its first column; later cells read its
# "confidence" and "MIMAT" columns.
mirbase = pd.read_csv("../microrna_data/mirbase_extended.csv", index_col=0)

# MirGeneDB table, indexed by its first column; later cells read its
# "5p accession" and "3p accession" columns.
mirgenedb = pd.read_csv("../microrna_data/mirgenedb.csv", index_col=0)

# Root folder for every figure written by this notebook.
base_plot_folder = "../plots/15_tissue_data_merge"
# exist_ok=True makes the call idempotent without a separate existence check
# (no check-then-create race, and a stray file of that name fails loudly here).
os.makedirs(base_plot_folder, exist_ok=True)

In [3]:
# miRBase entries whose "confidence" column is "high"
mirbase_high_conf = mirbase.loc[mirbase["confidence"] == "high"]

# MirGeneDB: collect the unique 5p accessions followed by the unique 3p
# accessions, skipping the placeholder string "None" ...
mirgenedb_mirnas = [
    accession
    for arm_column in ("5p accession", "3p accession")
    for accession in mirgenedb[arm_column].unique()
    if accession != "None"
]
# ... and keep the miRBase rows whose MIMAT accession is among them
mirgenedb_mirnas = mirbase.loc[mirbase["MIMAT"].isin(mirgenedb_mirnas)]

# union of both selections (miRBase index labels); only used for membership tests
allowed_mirnas = list(set([*mirbase_high_conf.index.tolist(), *mirgenedb_mirnas.index.tolist()]))

# 15.1 - Dataset preprocessing

In [4]:
# Raw input data live under dataset_folder; everything this notebook derives
# from them is written to dataset_processed_folder.
dataset_folder = "../microrna_data/15_human_data_merge"
dataset_processed_folder = "../microrna_data/15_human_data_merge/processed"
# exist_ok=True makes the call idempotent without a separate existence check
os.makedirs(dataset_processed_folder, exist_ok=True)

## 15.1.1 - NGS data (Keller 2022)

In [5]:
# Load the raw NGS tissue-atlas table. The following cells read its
# 'type', 'body', 'acc', 'tissue' and 'expression' columns.
df_ngs = pd.read_csv(os.path.join(dataset_folder, 'raw/ngs/hsa_snc_expression.csv'))

In [6]:
# keep only the rows annotated as type "mirna"
df_ngs = df_ngs.loc[df_ngs['type'].eq('mirna')]

In [7]:
# Split the table into one dataframe per body.
# Iterate over the body ids that are actually present (sorted) instead of
# assuming they are exactly 1..N; for ids 1..N the result is unchanged.
body_ids = sorted(df_ngs['body'].dropna().unique())
df_ngs_list = [df_ngs[df_ngs['body'] == body_id] for body_id in body_ids]
# Restrict every per-body frame to the allowed miRNA accessions
# (a distinct loop name avoids shadowing the full df_ngs table).
df_ngs_list = [body_df[body_df['acc'].isin(allowed_mirnas)] for body_df in df_ngs_list]

In [8]:
%%capture output
# create folder if it does not exist
if not os.path.exists(os.path.join(base_plot_folder, "15.1.1_ngs_correlation")):
    os.makedirs(os.path.join(base_plot_folder, "15.1.1_ngs_correlation"))
    
# Per-tissue outputs, both keyed by tissue name:
#   correlations          - squared Pearson correlation matrix between the per-body columns
#   tissue_expression_ngs - geometric mean across the per-body columns
correlations = {}
tissue_expression_ngs = {}
# the tissue list is taken from the first per-body frame only
tissues = df_ngs_list[0]["tissue"].unique()
for tissue in tissues:
    # one (acc, expression) frame per body, restricted to the current tissue
    expression_list = [df_ngs_list[i][df_ngs_list[i]["tissue"] == tissue][["acc","expression"]] for i in range(len(df_ngs_list))]
    # remove any empty dataframes
    expression_list = [expression for expression in expression_list if len(expression) > 0]
    # make "acc" (accession) the index
    expression_list = [expression.set_index("acc") for expression in expression_list]
    # rename "expression" column to a running integer.
    # NOTE(review): empty frames were dropped above, so this integer is the position
    # among the remaining frames, not necessarily the original body number.
    expression_list = [expression.rename(columns={"expression": i}) for i, expression in enumerate(expression_list)]
    # merge the dataframes using pd.merge (default inner join on the index, so only
    # accessions present in every remaining frame are kept)
    expression_df = reduce(lambda x, y: pd.merge(x, y, left_index=True, right_index=True), expression_list)
    # average values with identical indices
    expression_df = expression_df.groupby(expression_df.index).mean()
    # normalize to rpm (helper from library2_utils.mirna_levels; not shown in this notebook)
    expression_df = normalize_expr_df_to_rpm(expression_df)
    # get the geometric mean over bodies
    tissue_expression_ngs[tissue] = expression_df.apply(stats.gmean, axis=1)

    # get log 10 values
    expression_df = np.log10(expression_df)
    # filter to allowed mirnas
    expression_df = expression_df[expression_df.index.isin(allowed_mirnas)]
    # for the correlation, raise every log10 value below 2 to 2 on a copy
    expression_df_higher = expression_df.copy()
    expression_df_higher[expression_df_higher < 2] = 2.0
    # get the (squared) correlation matrix
    correlations[tissue] = expression_df_higher.corr()**2

    # scatter plot of columns 0 and 1 (the unclamped log10 values)
    if len(expression_df.columns) > 1:
        plt.clf()
        plt.scatter(expression_df[0], expression_df[1], s=1)
        plt.xlabel("Body 1")
        plt.ylabel("Body 2")
        plt.title(tissue)
        plt.tight_layout()
        plt.savefig(os.path.join(base_plot_folder, f"15.1.1_ngs_correlation/15.1.1_ngs_correlation_{tissue}_scatter.png"))    
        
    # plot the correlation matrices
    plt.clf()
    # set the global sns font scale
    # NOTE(review): sns.set changes global plotting settings inside the loop, so it
    # presumably also affects the scatter plots of all later iterations - confirm intended.
    sns.set(font_scale=1.5)
    # plot a heatmap of the correlation matrix
    # also insert the actual values in white font
    heat = sns.heatmap(correlations[tissue], vmax=1, vmin=0, cmap="viridis", annot=True, annot_kws={"color": "white"},
                       cbar_kws={'label': 'Pearson R2', })
    # set the title to the tissue type
    plt.title(tissue)
    # save the figure
    plt.savefig(os.path.join(base_plot_folder, f"15.1.1_ngs_correlation/15.1.1_ngs_correlation_{tissue}.png"))

In [9]:
# Blank out (NaN) the diagonal entries of every per-tissue correlation matrix
# so that only correlations between different columns remain.
correlations_no_diag = {}
for tissue in tissues:
    tissue_corr = correlations[tissue]
    on_diagonal = np.eye(len(tissue_corr)) == 1
    correlations_no_diag[tissue] = tissue_corr.mask(on_diagonal)

In [10]:
# plot a histogram across all correlation values
# flatten every per-tissue matrix and pool the values; NaNs (e.g. the masked diagonal) are dropped
flattenend_correlations = [correlations_no_diag[tissue].values.flatten() for tissue in tissues]
flattenend_correlations = [correlation for sublist in flattenend_correlations for correlation in sublist if not np.isnan(correlation)]
# reset matplotlib parameters
plt.rcParams.update(plt.rcParamsDefault)
# set the font size
plt.rcParams.update({'font.size': 7})
# set Helvetica globally
plt.rcParams['font.family'] = 'Helvetica'
plt.clf()
plt.figure(figsize=(2, 1.8))
# density histogram with 0.05-wide bins from 0 to 1
plt.hist(flattenend_correlations, bins=np.arange(0,1.05,0.05), density=True, color="skyblue")
# annotate the median at data coordinates (0, 5)
plt.text(0, 5, f"median: {np.median(flattenend_correlations):.2f}", fontsize=7, ha='left')
plt.xlabel(f"Pearson $r^2$")
plt.ylabel("frequency")
# NOTE(review): the number of bodies in the title is hard-coded
plt.title("NGS expression correlation\nbetween 6 bodies", fontsize=7)
plt.tight_layout()
# NOTE(review): the loop variable shadows the builtin `format`
for format in ["png", "svg"]:
    plt.savefig(os.path.join(base_plot_folder, f"15.1.1_ngs_correlation/15.1.1_ngs_correlation_histogram.{format}"), dpi=300)

In [11]:
# one column per tissue, built from the per-tissue geometric-mean Series
tissue_expression_ngs_df = pd.DataFrame(tissue_expression_ngs)

In [12]:
# normalize the tissue-level table again after averaging over bodies
# (normalize_expr_df_to_rpm comes from library2_utils.mirna_levels; its exact
# scaling is not visible here - presumably per-column reads per million, TODO confirm)
tissue_expression_ngs_df = normalize_expr_df_to_rpm(tissue_expression_ngs_df)

In [13]:
# write the processed NGS tissue table; it is read back in the tissue cross-correlation section
tissue_expression_ngs_df.to_csv(os.path.join(dataset_processed_folder, "ngs_expression.csv"))

## 15.1.2 - Microarray data (Ludwig 2016)

In [14]:
# read microarray data (tab-separated; values are converted from decimal commas in the next cell)
df_microarray = pd.read_csv(os.path.join(dataset_folder, 'raw/microarray/data_matrix_quantile.txt'), sep="\t")

In [15]:
# The raw values use "," as decimal separator: replace it with "." and convert to float.
# str(x) keeps the conversion working for cells pandas already parsed as numbers
# (the original lambda called .replace on them and raised AttributeError), and the
# column-wise Series.map avoids DataFrame.applymap, which newer pandas deprecates.
df_microarray = df_microarray.apply(
    lambda column: column.map(lambda x: float(str(x).replace(",", ".")))
)
df_microarray = df_microarray.astype(float)

In [16]:
# keep the allowed miRNAs (matched on the index), then normalize the table
is_allowed_mirna = df_microarray.index.isin(allowed_mirnas)
df_microarray = normalize_expr_df_to_rpm(df_microarray[is_allowed_mirna])

In [17]:
def split_and_rename_df(df):
    """Split a wide table into one dataframe per sample ('1' and '2').

    Every column name is split on '.' into at most three parts (remaining dots
    are stripped from each part). If the third part is non-empty the parts are
    read as (organ, tissue, sample) and the column is renamed to the tissue;
    otherwise they are read as (organ, sample, '') and the column is renamed to
    the organ. Columns whose sample is neither '1' nor '2' are ignored.

    Returns:
        (df1, df2): the renamed columns of sample '1' and of sample '2'.
    """
    per_sample = {'1': pd.DataFrame(), '2': pd.DataFrame()}

    for column in df.columns:
        parts = [part.replace(".", "") for part in column.split('.', 2)]

        if len(parts[2]) > 0:
            # a tissue is specified
            organ, tissue, sample = parts
            new_column_name = tissue
        else:
            # no tissue specified: fall back to the organ name
            organ, sample, _ = parts
            new_column_name = organ

        target = per_sample.get(sample)
        if target is not None:
            target[new_column_name] = df[column]

    return per_sample['1'], per_sample['2']

In [18]:
# one dataframe per sample number: body1 holds the columns of sample '1', body2 those of sample '2'
body1, body2 = split_and_rename_df(df_microarray)

In [19]:
# create a scatterplot of each tissue type for the two bodies
# reset the rcparams
plt.rcParams.update(plt.rcParamsDefault)
# set the font size
plt.rcParams.update({'font.size': 7})
# set Helvetica globally
plt.rcParams['font.family'] = 'Helvetica'

# first, establish which tissues are present in both bodies
tissues_both = [tissue for tissue in body1.keys() if tissue in body2.keys()]
# clamped r^2 per tissue, collected for the histogram in the next cell
correlation_vals = []
# create a folder for the scatterplots
os.makedirs(os.path.join(base_plot_folder, "15.1.2_microarray"), exist_ok=True)

# log10 values and their clamped copies (everything below 2 raised to 2) do not
# depend on the tissue, so they are computed once instead of once per iteration
body1_log = np.log10(body1)
body2_log = np.log10(body2)
body1_higher = body1_log.copy()
body2_higher = body2_log.copy()
body1_higher[body1_higher < 2] = 2.0
body2_higher[body2_higher < 2] = 2.0

for tissue in tissues_both:
    # correlation of the unclamped log10 values (shown in the title)
    r, p = stats.pearsonr(body1_log[tissue], body2_log[tissue])
    fig = plt.figure(figsize=(3,2.5))
    plt.scatter(body1_log[tissue], body2_log[tissue], s=3, edgecolors="none")
    plt.title(f"{tissue} (r2={r**2:.2f})")
    plt.xlabel("Body 1")
    plt.ylabel("Body 2")

    # correlation of the clamped values (collected for the histogram)
    r, p = stats.pearsonr(body1_higher[tissue], body2_higher[tissue])
    correlation_vals.append(r**2)

    plt.tight_layout()
    plt.savefig(os.path.join(base_plot_folder, f"15.1.2_microarray/15.1.2_scatter_{tissue}.png"))
    # close the figure once it is saved; otherwise one figure per tissue stays open
    plt.close(fig)

In [20]:
# reset matplotlib parameters
plt.rcParams.update(plt.rcParamsDefault)
# set the font size
plt.rcParams.update({'font.size': 7})
# set Helvetica globally
plt.rcParams['font.family'] = 'Helvetica'
plt.clf()
plt.figure(figsize=(2, 1.8))
# density histogram of the per-tissue r^2 values (clamped data), 0.05-wide bins from 0 to 1
plt.hist(correlation_vals, bins=np.arange(0,1.05,0.05), density=True, color="skyblue")
# annotate the median at data coordinates (0, 5)
plt.text(0, 5, f"median: {np.median(correlation_vals):.2f}", fontsize=7, ha='left')
plt.xlabel(f"Pearson $r^2$")
plt.ylabel("frequency")
plt.title("Microarray expression correlation\nbetween 2 bodies", fontsize=7)
plt.tight_layout()
# NOTE(review): the loop variable shadows the builtin `format`
for format in ["png", "svg"]:
    plt.savefig(os.path.join(base_plot_folder, f"15.1.2_microarray/15.1.2_microarray_correlation_histogram.{format}"))

In [21]:
# for the two bodies, create a new dataframe with the average expression values for each tissue
# if a tissue is present in one body but not the other, use the value from the body where it is present

# Tissues present in at least one body, in a reproducible order (body1's columns
# first, then the ones only body2 has). The previous list(set(...)) order changed
# between kernel sessions.
tissues_at_least_one = list(dict.fromkeys([*body1.keys(), *body2.keys()]))

# Switch: True -> take body 1 only; False -> combine both bodies.
use_body_1 = False
df_microarray_tissue = {}
if use_body_1:
    # This branch previously copied body2 although the flag names body 1.
    for tissue in body1.keys():
        df_microarray_tissue[tissue] = body1[tissue]
else:
    for tissue in tissues_at_least_one:
        in_body1 = tissue in body1.keys()
        in_body2 = tissue in body2.keys()
        if in_body1 and in_body2:
            # inner join of the two samples on the miRNA index ...
            both_bodies = pd.merge(body1[tissue], body2[tissue], left_index=True, right_index=True)
            # ... then the geometric mean of the two values per miRNA
            df_microarray_tissue[tissue] = both_bodies.apply(stats.gmean, axis=1)
        elif in_body1:
            df_microarray_tissue[tissue] = body1[tissue]
        elif in_body2:
            df_microarray_tissue[tissue] = body2[tissue]

In [22]:
# turn the tissue -> Series dictionary into a dataframe with one column per tissue
df_microarray_tissue = pd.DataFrame(df_microarray_tissue)

In [23]:
# normalize the tissue-level table again after combining the two bodies
# (normalize_expr_df_to_rpm comes from library2_utils.mirna_levels; its exact
# scaling is not visible here - TODO confirm)
df_microarray_tissue = normalize_expr_df_to_rpm(df_microarray_tissue)

In [24]:
# save to csv; the file is read back in the tissue cross-correlation section
df_microarray_tissue.to_csv(os.path.join(dataset_processed_folder, "microarray_expression.csv"))

## Tissue cross-correlation

In [25]:
# Reload both processed tissue tables (first column as index) and work on
# their log10 values from here on.
df_ngs_tissue = np.log10(
    pd.read_csv(os.path.join(dataset_processed_folder, "ngs_expression.csv"), index_col=0)
)
df_microarray_tissue = np.log10(
    pd.read_csv(os.path.join(dataset_processed_folder, "microarray_expression.csv"), index_col=0)
)

In [26]:
# Group the NGS tissues by organ system; the column order of df_ngs_tissue
# follows the order of this dictionary.
organ_dict_ngs = {
    'brain': ['spinal_cord', 'temporal_lobe', 'occipital_lobe', 'cerebellum',
              'frontal_lobe', 'thalamus', 'white_matter',
              'nucleus_caudatus', 'grey_matter', 'dura_mater'],
    'nervous': ['nerve'],
    'circulatory': ['artery', 'vein'],
    'musculoskeletal': ['muscle', 'bone'],
    'digestive': ['esophagus', 'stomach', 'colon', 'duodenum', 'jejunum', 'liver', 'pancreas'],
    'respiratory': ['pleurae', 'lung'],
    'endocrine': ['pituitary_gland', 'adrenal_gland', 'thyroid'],
    'lymphatic': ['lymph_node', 'spleen'],
    'urinary': ['kidney'],
    'reproductive': ['prostate', 'testis'],
    'integumentary': ['skin', 'adipocyte'],
}

# flatten the dictionary values in order
ordered_columns_ngs = [
    tissue_name
    for system_tissues in organ_dict_ngs.values()
    for tissue_name in system_tissues
]

# selecting by label raises a KeyError if any listed tissue is missing
df_ngs_tissue = df_ngs_tissue[ordered_columns_ngs]

In [27]:
# calculate the correlation between all tissues and plot a heatmap
corr_ngs = df_ngs_tissue.corr()

# create a list to store the start index of each organ system
# (cumulative sum of the group sizes in organ_dict_ngs)
organ_indices = [0]
for organ_system in organ_dict_ngs.values():
    organ_indices.append(organ_indices[-1] + len(organ_system))
organ_indices = organ_indices[:-1]  # remove the last index, which is out of bounds

# plot the heatmap
plt.close()
plt.clf()
fig, ax = plt.subplots(figsize=(5,5))


# squared correlation; the colour scale covers 0.3 to 1
heatmap = sns.heatmap(corr_ngs**2, ax=ax, vmin=0.3, vmax=1, cmap="viridis", 
                      square=True, cbar_kws={"shrink": 0.7, "label": "r2"},
                      linewidths=.5, linecolor='grey')  # add lines between cells
# set sns font size
# NOTE(review): called after the heatmap was drawn, so it presumably only affects
# elements created afterwards and later figures - confirm intended
sns.set(font_scale=0.5)

# draw organ system boundaries
for idx in organ_indices[1:]:  # skip the first index (0)
    heatmap.hlines(idx, *heatmap.get_xlim(), colors='black', linewidth=1)
    heatmap.vlines(idx, *heatmap.get_ylim(), colors='black', linewidth=1)

# label organ systems on x axis (centred over each group, at y = -1 in data coordinates)
for i in range(len(organ_indices)):
    start = organ_indices[i]
    if i < len(organ_indices) - 1:
        end = organ_indices[i + 1]
    else:
        end = len(ordered_columns_ngs)
    
    label_position = (start + end) / 2
    plt.text(label_position, -1, list(organ_dict_ngs.keys())[i],
             ha='center', va='center', rotation=0, fontsize=8) 
    
# move x and y ticks to the middle of the cells
ax.set_xticks(np.arange(corr_ngs.shape[1]) + 0.5, minor=False)
ax.set_yticks(np.arange(corr_ngs.shape[0]) + 0.5, minor=False)
ax.tick_params(which='both', length=0)  # hide tick marks

#plt.title("NGS")
plt.tight_layout()
# NOTE(review): the loop variable shadows the builtin `format`
for format in ["png", "svg"]:
    plt.savefig(os.path.join(base_plot_folder, f"15.1.1_ngs_correlation/15.1.1_ngs_correlation_heatmap.{format}"), format=format)

In [28]:
# Rename four microarray tissue columns to the names used in the NGS
# organ dictionary.
df_microarray_tissue = df_microarray_tissue.rename(columns={
    "gray_matter": "grey_matter",
    "nervus_intercostalis": "nerve",
    "pleura": "pleurae",
    "glandula_suprarenalis": "adrenal_gland",
})

# Microarray tissues grouped by organ system, in plotting order.
organ_dict_micro = {
    'brain': ['brain', 'thalamus', 'white_matter', 'grey_matter', 'nucleus_caudatus', 'cerebellum', 'spinal_cord', 'arachnoid_mater', 'dura_mater'],
    'nervous': ['nerve'],
    'circulatory': ['vein', 'artery', 'marrow'],
    'musculoskeletal': ['bone', 'muscle', 'fascia'],
    'digestive': ['esophagus', 'stomach', 'duodenum', 'jejunum', 'small_intestine', 'colon', 'liver', 'gallbladder', 'pancreas'],
    'respiratory': ['pleurae', 'lung'],
    'endocrine': ['pituitary_gland', "adrenal_gland", 'thyroid'],
    'lymphatic': ['lymph_node', 'spleen'],
    'urinary': ['kidney', 'cortex_renalis', 'medulla_renalis', 'bladder'],
    'reproductive': ['testis', 'epididymis', 'tunica_albuginea', 'prostate'],
    'integumentary': ['skin', 'adipocyte'],
}

# Keep the listed organs that exist as columns, in dictionary order;
# report every listed organ that is absent.
ordered_columns_micro = []
for system_organs in organ_dict_micro.values():
    for organ in system_organs:
        if organ not in df_microarray_tissue.columns:
            print(f"organ {organ} not found in microarray data")
            continue
        ordered_columns_micro.append(organ)

df_microarray_tissue = df_microarray_tissue[ordered_columns_micro]

In [29]:
# calculate the correlation between all tissues and plot a heatmap
corr_micro = df_microarray_tissue.corr()

# Size of every organ system, counting only the organs that are actually columns
# of the data. Organs listed in organ_dict_micro but missing from the data are
# skipped when the columns are ordered, so using the raw list lengths would shift
# every boundary and label after the first missing organ. Systems without any
# present organ get neither a boundary nor a label.
organ_system_sizes = {}
for system_name, system_organs in organ_dict_micro.items():
    n_present = sum(organ in corr_micro.columns for organ in system_organs)
    if n_present > 0:
        organ_system_sizes[system_name] = n_present
organ_system_labels = list(organ_system_sizes)

# start index of each organ system (cumulative sum of the sizes)
organ_indices = [0]
for n_present in organ_system_sizes.values():
    organ_indices.append(organ_indices[-1] + n_present)
organ_indices = organ_indices[:-1]  # remove the last index, which is out of bounds

# plot the heatmap
plt.clf()
fig, ax = plt.subplots(figsize=(5.5,5.5))

# squared correlation; the colour scale covers 0.3 to 1
heatmap = sns.heatmap(corr_micro**2, ax=ax, vmin=0.3, vmax=1, cmap="viridis",
                      square=True, cbar_kws={"shrink": 0.7, "label": "r2"},
                      linewidths=.5, linecolor='grey')  # add lines between cells
# set sns font size
sns.set(font_scale=0.5)

# draw organ system boundaries
for idx in organ_indices[1:]:  # skip the first index (0)
    heatmap.hlines(idx, *heatmap.get_xlim(), colors='black', linewidth=1)
    heatmap.vlines(idx, *heatmap.get_ylim(), colors='black', linewidth=1)

# label organ systems on x axis, centred over each group
for i in range(len(organ_indices)):
    start = organ_indices[i]
    if i < len(organ_indices) - 1:
        end = organ_indices[i + 1]
    else:
        end = corr_micro.shape[1]

    label_position = (start + end) / 2
    plt.text(label_position, -1, organ_system_labels[i],
             ha='center', va='center', rotation=0, fontsize=7)

# move x and y ticks to the middle of the cells
ax.set_xticks(np.arange(corr_micro.shape[1]) + 0.5, minor=False)
ax.set_yticks(np.arange(corr_micro.shape[0]) + 0.5, minor=False)
ax.tick_params(which='both', length=0)  # hide tick marks

plt.tight_layout()
# `fmt` instead of `format`, so the builtin is not shadowed
for fmt in ["png", "svg"]:
    plt.savefig(os.path.join(base_plot_folder, f"15.1.2_microarray/15.1.2_microarray_correlation_heatmap.{fmt}"), format=fmt)

# 15.2 - Merge the two datasets

In [30]:
# Folder for the figures of section 15.2; exist_ok=True makes the call
# idempotent without a separate existence check.
os.makedirs(os.path.join(base_plot_folder, "15.2_merge_datasets"), exist_ok=True)

## 15.2.1 - Compare the unprocessed two datasets

In [31]:
# Add a "brain" column to the NGS table: the row-wise mean of the temporal,
# occipital and frontal lobes plus white and grey matter.
df_ngs_tissue = df_ngs_tissue.assign(
    brain=df_ngs_tissue[
        ['temporal_lobe', 'occipital_lobe', 'frontal_lobe', 'white_matter', 'grey_matter']
    ].mean(axis=1)
)

In [32]:
# get the list of column names for the microarray data
micro_columns = df_microarray_tissue.columns

# get the list of column names for the NGS data
ngs_columns = df_ngs_tissue.columns

# Column names present in both datasets, in microarray column order.
# list(set(...)) gave an order that changes between kernel sessions (string
# hashing is randomised); that order drives the per-tissue loops below and,
# through them, the line order of the saved miRNA lists.
common_columns = list(dict.fromkeys(
    column for column in micro_columns if column in ngs_columns
))

In [None]:
# columns of each dataset that the other one lacks (original column order kept)
micro_only = [name for name in micro_columns if name not in ngs_columns]
ngs_only = [name for name in ngs_columns if name not in micro_columns]

# report both lists
print("Microarray only:", micro_only, sep="\n")
print("NGS only:", ngs_only, sep="\n")

In [34]:
# restrict both tables to the shared tissues, using the same column order for both
df_microarray_tissue = df_microarray_tissue.loc[:, common_columns]
df_ngs_tissue = df_ngs_tissue.loc[:, common_columns]

In [35]:
# harmonize their indices: put the NGS rows into the microarray row order.
# NOTE(review): .loc with a list of labels raises a KeyError if a microarray miRNA
# is absent from the NGS table, so this relies on the microarray index being a
# subset of the NGS index - confirm.
df_ngs_tissue = df_ngs_tissue.loc[df_microarray_tissue.index]

In [36]:
# keep untouched copies before clamping
df_ngs_tissue_orig = df_ngs_tissue.copy()
df_microarray_tissue_orig = df_microarray_tissue.copy()

# raise every value below 2 to 2 so that small values do not dominate the
# correlation, and make sure both tables are float
df_ngs_tissue = df_ngs_tissue.clip(lower=2.0).astype(float)
df_microarray_tissue = df_microarray_tissue.clip(lower=2.0).astype(float)

In [37]:
# renormalize the two clamped tables
# NOTE(review): the second positional argument (2) presumably is the floor used
# above, cf. minimum=2 in the normalize_expr_df_to_rpm_with_partner calls further
# down; the helper is defined in library2_utils.mirna_levels - confirm.
df_ngs_tissue = normalize_expr_df_to_rpm(df_ngs_tissue, 2)
df_microarray_tissue = normalize_expr_df_to_rpm(df_microarray_tissue, 2)

In [None]:
# Hand-maintained list of the tissues expected to be shared by both datasets.
organs = [
    'white_matter', 'pituitary_gland', 'colon', 'skin', 'lymph_node',
    'nerve', 'nucleus_caudatus', 'stomach', 'thalamus', 'kidney', 'testis',
    'vein', 'muscle', 'lung', 'prostate', 'jejunum', 'adrenal_gland',
    'thyroid', 'spinal_cord', 'adipocyte', 'grey_matter', 'duodenum',
    'spleen', 'cerebellum', 'dura_mater', 'bone', 'liver', 'artery',
    'esophagus', 'pancreas', 'pleurae', 'brain',
]

# sanity check: shared columns that are missing from the list above
print(
    "Columns left out:",
    [name for name in common_columns if name not in organs],
    sep="\n",
)

In [39]:
# Shared tissues grouped by organ system, in plotting order.
organ_dict = {
    'brain': ['brain', 'thalamus', 'white_matter', 'grey_matter', 'nucleus_caudatus', 'cerebellum', 'spinal_cord', 'dura_mater'],
    'nervous': ['nerve'],
    'circulatory': ['vein', 'artery'],
    'musculoskeletal': ['bone', 'muscle'],
    'digestive': ['esophagus', 'stomach', 'duodenum', 'jejunum', 'colon', 'liver', 'pancreas'],
    'respiratory': ['pleurae', 'lung'],
    'endocrine': ['thyroid', 'pituitary_gland', 'adrenal_gland'],
    'lymphatic': ['lymph_node', 'spleen'],
    'urinary': ['kidney'],
    'reproductive': ['testis', 'prostate'],
    'integumentary': ['skin', 'adipocyte'],
}

# Keep the organs that are columns of BOTH tables, in dictionary order;
# report every listed organ that is missing from either one.
ordered_columns = []
for system_organs in organ_dict.values():
    for organ in system_organs:
        in_both = organ in df_microarray_tissue.columns and organ in df_ngs_tissue.columns
        if not in_both:
            print(f"organ {organ} not found in microarray or ngs data")
            continue
        ordered_columns.append(organ)

# apply the same column order to both tables
df_ngs_tissue = df_ngs_tissue[ordered_columns]
df_microarray_tissue = df_microarray_tissue[ordered_columns]

In [40]:
# Combine the dataframes along the column axis (NGS columns first, then microarray columns)
df_combined = pd.concat([df_ngs_tissue, df_microarray_tissue], axis=1)

# Calculate the correlation
corr = df_combined.corr()

# Get number of columns in each dataframe
n_ngs = df_ngs_tissue.shape[1]
n_micro = df_microarray_tissue.shape[1]

# Get correlation between different dataframes:
# rows = NGS tissues, columns = microarray tissues (selected by position)
corr_diff = corr.iloc[:n_ngs, n_ngs:] 

# plot the heatmap
plt.clf()
fig, ax = plt.subplots(figsize=(4.5,4.5))

# squared correlation; the colour scale covers 0.3 to 0.9
heatmap = sns.heatmap(corr_diff**2, ax=ax, vmin=0.3, vmax=0.9, cmap="viridis", 
                      square=True, cbar_kws={"shrink": 0.7, "label": "r2"},
                      linewidths=.5, linecolor='grey')  # add lines between cells

# outline the diagonal cells in red (same position in both column orders)
for i in range(min(n_ngs, n_micro)):
    rect = plt.Rectangle((i, i), 1, 1, fill=False, edgecolor='red', lw=1)
    ax.add_patch(rect)

# set sns font size
sns.set(font_scale=0.5)

plt.xlabel("Microarray", fontsize=7)
plt.ylabel("NGS", fontsize=7)

plt.tight_layout()
# NOTE(review): the loop variable shadows the builtin `format`
for format in ["png", "svg"]:
    plt.savefig(os.path.join(base_plot_folder, f"15.2_merge_datasets/15.2_correlation_unprocessed.{format}"), dpi=300)

## 15.2.2 - Determine outlier miRNAs

First, we find all outliers to redo the normalization - the background miRNAs in the microarray dataset bias the normalization procedure.

In [41]:
def count_df_dict(dict1):
    """Count in how many entries of ``dict1`` each index label occurs.

    Args:
        dict1: mapping whose values expose an ``.index`` (e.g. pandas Series
            or DataFrames), typically one entry per tissue.

    Returns:
        dict mapping every index label to the number of times it was seen
        across all values, in order of first appearance.

    Note:
        The first occurrence of a label used to be stored as 0, so every count
        was one lower than the real number of occurrences and labels seen
        exactly once were indistinguishable from labels never seen. Counts now
        start at 1; thresholds applied to the result are therefore reached one
        occurrence earlier than before.
    """
    counts = {}
    for key in dict1.keys():
        for i in dict1[key].index:
            counts[i] = counts.get(i, 0) + 1
    return counts

In [42]:
# element-wise difference NGS - microarray (positive: higher in the NGS table)
df_difference = df_ngs_tissue - df_microarray_tissue

In [43]:
# For every shared tissue: the 30 miRNAs with the largest absolute
# NGS-vs-microarray difference (sorted in descending order).
most_different = {
    tissue_name: df_difference[tissue_name].abs().sort_values(ascending=False).iloc[:30]
    for tissue_name in common_columns
}

# per miRNA: count statistics over the per-tissue top-30 lists
counts_most_diff = count_df_dict(most_different)

In [44]:
# plot a histogram of the per-miRNA counts returned by count_df_dict for the top-30 lists
# reset matplotlib settings
plt.rcParams.update(plt.rcParamsDefault)
# set the font size
plt.rcParams.update({'font.size': 7})
# set Helvetica globally
plt.rcParams['font.family'] = 'Helvetica'

plt.clf()
plt.figure(figsize=(2, 1.4))
# integer bin edges 1 .. len(organs); counts of 0 fall outside the plotted range
organ_bins = np.arange(1, len(organs) + 1)
plt.hist(counts_most_diff.values(), bins=organ_bins, edgecolor="black", color="skyblue")
plt.xlabel("# in top 30 most different")
plt.ylabel("microRNAs")
plt.tight_layout()
plt.savefig(os.path.join(base_plot_folder, "15.2_merge_datasets/15.2_most_different_histogram.png"), dpi=300)

In [45]:
# miRNAs treated as outliers for the renormalization below: those whose count
# (as returned by count_df_dict for the top-30 lists) is at least 3
exclude_mirnas = [microRNA for microRNA, count in counts_most_diff.items() if count >= 3]

In [46]:
# renormalize the two tables
# copies of both tables without the outlier miRNAs
df_ngs_tissue_exclude = df_ngs_tissue[~df_ngs_tissue.index.isin(exclude_mirnas)]
df_microarray_tissue_exclude = df_microarray_tissue[~df_microarray_tissue.index.isin(exclude_mirnas)]

# The helper receives the outlier-free table together with the full table and returns both.
# NOTE(review): presumably the normalization is derived from the first (outlier-free)
# table and applied to its partner as well; the helper is defined in
# library2_utils.mirna_levels - confirm.
df_ngs_tissue_exclude, df_ngs_tissue = normalize_expr_df_to_rpm_with_partner(df_ngs_tissue_exclude, df_ngs_tissue, minimum=2)
df_microarray_tissue_exclude, df_microarray_tissue = normalize_expr_df_to_rpm_with_partner(df_microarray_tissue_exclude, df_microarray_tissue, minimum=2)

Now, we identify which microRNAs are consistently higher in one or the other dataset.

In [47]:
# recompute the difference NGS - microarray on the renormalized tables
df_difference = df_ngs_tissue - df_microarray_tissue

In [48]:
# Per tissue, the miRNAs whose difference (NGS - microarray) is above 1
# (NGS larger) or below -1 (microarray larger).
ngs_larger_micro = {
    tissue_name: df_difference.loc[df_difference[tissue_name] > 1, tissue_name]
    for tissue_name in common_columns
}
micro_larger_ngs = {
    tissue_name: df_difference.loc[df_difference[tissue_name] < -1, tissue_name]
    for tissue_name in common_columns
}

# per miRNA: count statistics over the tissues
counts_larger_ngs = count_df_dict(ngs_larger_micro)
counts_larger_micro = count_df_dict(micro_larger_ngs)

In [49]:
# Plot copies of the two count dictionaries: every miRNA of df_difference that
# has no entry yet is appended with a count of 0, and the number of appended
# miRNAs is recorded.
micro_missing = dict.fromkeys(
    (mirna_id for mirna_id in df_difference.index if mirna_id not in counts_larger_micro), 0
)
counts_larger_micro_plot = {**counts_larger_micro, **micro_missing}
micro_total_mirna_not_counted = len(micro_missing)

ngs_missing = dict.fromkeys(
    (mirna_id for mirna_id in df_difference.index if mirna_id not in counts_larger_ngs), 0
)
counts_larger_ngs_plot = {**counts_larger_ngs, **ngs_missing}
ngs_total_mirna_not_counted = len(ngs_missing)

In [None]:
# histogram of the per-miRNA counts of tissues in which the difference
# (NGS - microarray) exceeds 1
plt.clf()
# integer bin edges 1 .. len(organs); counts of 0 fall outside the plotted range
organ_bins = np.arange(1, len(organs) + 1)
plt.figure(figsize=(2, 1.4))
# number of miRNAs that had no entry in the count dictionary at all
plt.text(20, 5, f"miRNAs #0 count: {ngs_total_mirna_not_counted}", fontsize=7, ha='center', va='center')
plt.hist(counts_larger_ngs_plot.values(), bins=organ_bins, edgecolor="black", color="skyblue")
# dashed line at 5, the cut-off used below to select miRNAs
plt.axvline(x=5, color='black', linestyle='--')
plt.xlabel("# tissues NGS >> microarray")
plt.ylabel("microRNAs")
plt.tight_layout()
# NOTE(review): the loop variable shadows the builtin `format`
for format in ["png", "svg"]:
    plt.savefig(os.path.join(base_plot_folder, f"15.2_merge_datasets/15.2_larger_ngs_histogram.{format}"), dpi=300)

In [51]:
# histogram of the per-miRNA counts of tissues in which the difference
# (NGS - microarray) is below -1
plt.clf()
# integer bin edges 1 .. len(organs); counts of 0 fall outside the plotted range
organ_bins = np.arange(1, len(organs) + 1)
plt.figure(figsize=(2, 1.4))
# number of miRNAs that had no entry in the count dictionary at all
plt.text(20, 15, f"miRNAs #0 count: {micro_total_mirna_not_counted}", fontsize=7, ha='center', va='center')
plt.hist(counts_larger_micro_plot.values(), bins=organ_bins, edgecolor="black", color="skyblue")
# create a vertical line at 5, the cut-off used below to select miRNAs
plt.axvline(x=5, color='black', linestyle='--')
# fixed y range
plt.ylim(0, 22)
plt.xlabel("# tissues microarray >> NGS")
plt.ylabel("microRNAs")
plt.tight_layout()
# NOTE(review): the loop variable shadows the builtin `format`
for format in ["png", "svg"]:
    plt.savefig(os.path.join(base_plot_folder, f"15.2_merge_datasets/15.2_larger_micro_histogram.{format}"), dpi=300)

In [None]:
# miRNAs whose "NGS larger" count (from count_df_dict) is at least 5; print how many there are
ngs_larger_in_multiple = [microRNA for microRNA, count in counts_larger_ngs.items() if count >= 5]
print(len(ngs_larger_in_multiple))

In [None]:
# miRNAs whose "microarray larger" count (from count_df_dict) is at least 5; print how many there are
micro_larger_in_multiple = [microRNA for microRNA, count in counts_larger_micro.items() if count >= 5]
print(len(micro_larger_in_multiple))

In [54]:
# miRNAs that are consistently larger in either dataset (highlighted in the plots below) ...
excluded_points = micro_larger_in_multiple + ngs_larger_in_multiple
# ... and the index labels of all remaining miRNAs
non_exclude_index = df_ngs_tissue.index[~df_ngs_tissue.index.isin(excluded_points)]

In [55]:
# One microarray-vs-NGS scatter plot per shared tissue, with the consistently
# deviating miRNAs highlighted.

# Plot settings and the index of the non-highlighted miRNAs do not depend on
# the tissue, so they are set up once instead of in every iteration.
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams.update({'font.size': 7})
plt.rcParams['font.family'] = 'Helvetica'
non_exclude_index = df_ngs_tissue.index[~df_ngs_tissue.index.isin(excluded_points)]

for tissue in common_columns:
    # No plt.clf() here: after the plt.close() of the previous iteration it
    # created a blank figure that was never closed (one leaked figure per tissue).
    fig = plt.figure(figsize=(2,1.8))
    # r^2 over all miRNAs
    r2_all = stats.pearsonr(df_microarray_tissue[tissue], df_ngs_tissue[tissue])[0]**2

    plt.scatter(df_microarray_tissue.loc[ngs_larger_in_multiple, tissue],
                df_ngs_tissue.loc[ngs_larger_in_multiple, tissue], s=5, edgecolors="none", alpha=1,
                label="ngs larger", color="tab:red", marker="v", zorder=2)
    plt.scatter(df_microarray_tissue.loc[micro_larger_in_multiple, tissue],
                df_ngs_tissue.loc[micro_larger_in_multiple, tissue], s=5, edgecolors="none", alpha=1,
                label="microarray larger", color="tab:orange", marker="o", zorder=2)

    # remaining miRNAs, drawn below the highlighted ones
    plt.scatter(df_microarray_tissue.loc[non_exclude_index, tissue], df_ngs_tissue.loc[non_exclude_index, tissue],
                s=5, edgecolors="none", alpha=1, zorder=1)
    # r^2 over the remaining miRNAs only
    r2 = stats.pearsonr(df_microarray_tissue.loc[non_exclude_index, tissue], df_ngs_tissue.loc[non_exclude_index, tissue])[0]**2
    # identity line
    plt.plot([0, 6], [0, 6], color="black", linestyle="--", linewidth=1)

    plt.xlabel("Microarray")
    plt.ylabel("NGS")
    plt.xlim(1.5, 6)
    plt.ylim(1.5, 6)
    plt.title(f"{tissue}", fontsize=7)
    plt.text(1.6, 5.4, f"$r^2$ (all)={r2_all:.2f}", fontsize=7, ha="left")
    plt.text(1.6, 4.9, f"$r^2$ (excl.)={r2:.2f}", fontsize=7, ha="left")
    plt.tight_layout()
    # NOTE(review): the legend is anchored outside the axes and added after
    # tight_layout; check that it is not cut off in the saved files.
    plt.legend(loc=[1.05,0.5], fontsize=7)
    # `fmt` instead of `format`, so the builtin is not shadowed
    for fmt in ["png", "svg"]:
        plt.savefig(os.path.join(base_plot_folder, f"15.2_merge_datasets/15.2_scatter_{tissue}.{fmt}"), dpi=300)
    plt.close(fig)

In [56]:
# Create a figure with a grid of subplots (2 rows x 3 columns, one panel per chosen tissue)
fig, axes = plt.subplots(2, 3, figsize=(5, 3.2))

# Flatten the array of axes for easy iteration
axes = axes.flatten()
chosen_columns = ["adipocyte", "grey_matter", "liver", "lung", "prostate", "skin"]

for i, tissue in enumerate(chosen_columns):
    ax = axes[i]
    
    # r^2 over all miRNAs
    r2_all = stats.pearsonr(df_microarray_tissue[tissue], df_ngs_tissue[tissue])[0]**2
    
    # highlighted miRNAs (consistently larger in one dataset), drawn on top
    ax.scatter(df_microarray_tissue.loc[ngs_larger_in_multiple, tissue],
               df_ngs_tissue.loc[ngs_larger_in_multiple, tissue], s=6, edgecolors="none", alpha=1,
               label="ngs consistently larger", color="tab:red", marker="v", zorder=2, rasterized=True)
    ax.scatter(df_microarray_tissue.loc[micro_larger_in_multiple, tissue],
               df_ngs_tissue.loc[micro_larger_in_multiple, tissue], s=6, edgecolors="none", alpha=1,
               label="microarray consistently larger", color="tab:orange", marker="o", zorder=2, rasterized=True)
    
    # remaining miRNAs (recomputed in every iteration; the value does not depend on the tissue)
    non_exclude_index = df_ngs_tissue.index[~df_ngs_tissue.index.isin(excluded_points)]
    ax.scatter(df_microarray_tissue.loc[non_exclude_index, tissue], df_ngs_tissue.loc[non_exclude_index, tissue],
               s=6, edgecolors="none", alpha=1, zorder=1, rasterized=True)
    
    # r^2 over the remaining miRNAs only
    r2 = stats.pearsonr(df_microarray_tissue.loc[non_exclude_index, tissue], df_ngs_tissue.loc[non_exclude_index, tissue])[0]**2
    # identity line
    ax.plot([0, 6], [0, 6], color="black", linestyle="--", linewidth=1)
    
    ax.set_xlim(1.5, 5.8)
    ax.set_ylim(1.5, 5.8)
    ax.set_title(f"{tissue}", fontsize=8)
    ax.text(1.6, 5.4, f"$r^2$ (all)={r2_all:.2f}", fontsize=7, ha="left")
    ax.text(1.6, 4.9, f"$r^2$ (excl.)={r2:.2f}", fontsize=7, ha="left")
    
    ax.set_xticks([2, 3, 4, 5])
    ax.set_yticks([2, 3, 4, 5])
    
    # axis labels only on the left column / bottom row
    if i % 3 == 0:
        ax.set_ylabel("NGS")
    if i >= 3:
        ax.set_xlabel("Microarray")

    # Hide inner tick labels
    if i % 3 != 0:
        ax.tick_params(labelleft=False)
    if i < 3:
        ax.tick_params(labelbottom=False)
        

# Adjust layout
plt.tight_layout()
# plt.legend acts on the current axes, i.e. the last panel created
plt.legend(loc='best', fontsize=7)
# NOTE(review): the loop variable shadows the builtin `format`
for format in ["png", "svg"]:
    plt.savefig(os.path.join(base_plot_folder, f"15.2_merge_datasets/15.2_scatter_chosen_columns.{format}"), dpi=600)

In [57]:
# calculate the correlation between all columns in the NGS and all columns microarray datasets
# This is NOT supposed to compare columns with the same names only
# Combine the dataframes along the column axis, using only the non-excluded miRNAs
# (non_exclude_index as left by the preceding cells)
df_combined = pd.concat([df_ngs_tissue.loc[non_exclude_index], df_microarray_tissue.loc[non_exclude_index]], axis=1)

# Calculate the correlation
corr = df_combined.corr()

# Get number of columns in each dataframe
n_ngs = df_ngs_tissue.shape[1]
n_micro = df_microarray_tissue.shape[1]

# Get correlation between different dataframes (selected by position)
corr_diff = corr.iloc[:n_ngs, n_ngs:]  # correlations between df_ngs_tissue and df_microarray_tissue

# plot the heatmap
plt.clf()
fig, ax = plt.subplots(figsize=(4.5,4.5))

# squared correlation; the colour scale covers 0.3 to 0.9
heatmap = sns.heatmap(corr_diff**2, ax=ax, vmin=0.3, vmax=0.9, cmap="viridis", 
                      square=True, cbar_kws={"shrink": 0.7, "label": "r2"},
                      linewidths=.5, linecolor='grey')  # add lines between cells

sns.set(font_scale=0.5)

# outline the diagonal cells in red (same position in both column orders)
for i in range(min(n_ngs, n_micro)):
    rect = plt.Rectangle((i, i), 1, 1, fill=False, edgecolor='red', lw=1)
    ax.add_patch(rect)

plt.xlabel("Microarray", fontsize=7)
plt.ylabel("NGS", fontsize=7)

plt.tight_layout()
# NOTE(review): the loop variable shadows the builtin `format`
for format in ["png", "svg"]:
    plt.savefig(os.path.join(base_plot_folder, f"15.2_merge_datasets/15.2_correlation_excluded_mirnas.{format}"), dpi=300)

In [58]:
# Write the two lists of consistently deviating miRNAs, one identifier per
# line (newline-joined, so without a trailing newline) ...
for list_file_name, mirna_list in (
    ("ngs_larger_in_multiple.txt", ngs_larger_in_multiple),
    ("micro_larger_in_multiple.txt", micro_larger_in_multiple),
):
    with open(os.path.join(dataset_processed_folder, list_file_name), "w") as f:
        f.write("\n".join(mirna_list))

# ... and every miRNA of the tissue dataset, each followed by a newline.
with open(os.path.join(dataset_processed_folder, "tissue_dataset_mirnas.txt"), "w") as f:
    f.writelines(mirna + "\n" for mirna in df_microarray_tissue.index)

## 15.2.3 - Apply the same procedure to the cell line data

In [59]:
# Folder for the figures of section 15.2.3; exist_ok=True makes the call
# idempotent without a separate existence check.
os.makedirs(os.path.join(base_plot_folder, "15.2.3_cell_line_data"), exist_ok=True)

In [60]:
def sum_filtering(df):
    """
    Keep only the rows of a dataframe whose values sum to more than 3.

    The input dataframe is not modified. The previous implementation added a
    temporary "sum" column to the caller's dataframe, so a second call on the
    same object would have counted that column as data.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose rows are summed across columns.

    Returns
    -------
    pandas.DataFrame
        New dataframe with the same columns as ``df``, restricted to the rows
        with a row sum strictly greater than 3.
    """
    row_sums = df.sum(axis=1)
    return df[row_sums > 3].copy()

# Read the false-negative / false-positive tables of both cell-line datasets
# (Keller2023 = sequencing, Alles2019 = microarray) and immediately apply the
# row-sum filter to each of them.
false_negatives_ngs = sum_filtering(
    pd.read_csv("../outputs/3_fitting/Keller2023/Keller2023_false_negatives_filtered.csv", index_col=0)
)
false_negatives_microarray = sum_filtering(
    pd.read_csv("../outputs/3_fitting/Alles2019/Alles2019_false_negatives_filtered.csv", index_col=0)
)
false_positives_ngs = sum_filtering(
    pd.read_csv("../outputs/3_fitting/Keller2023/Keller2023_false_positives_filtered.csv", index_col=0)
)
false_positives_microarray = sum_filtering(
    pd.read_csv("../outputs/3_fitting/Alles2019/Alles2019_false_positives_filtered.csv", index_col=0)
)

# Parallel lists: position i of every list describes the same table.
false_positives_and_negatives = [
    false_negatives_ngs,
    false_negatives_microarray,
    false_positives_ngs,
    false_positives_microarray,
]
false_positives_and_negatives_label = [
    "false negatives sequencing",
    "false negatives microarray",
    "false positives sequencing",
    "false positives microarray",
]
false_positives_and_negatives_symbols = ["v", "v", "o", "o"]
false_positives_and_negatives_colors = ["tab:red", "tab:orange", "tab:red", "tab:orange"]

In [None]:
# show the names of the microarray false negatives as the cell output
# (no plot is created here)
false_negatives_microarray.index

In [62]:
# Cell-line datasets: read the co-normalized tables, move them to the log10
# scale and keep only the miRNAs listed in `allowed_mirnas`.
df_alles = np.log10(pd.read_csv("../microrna_data/2_output/Alles2019_conormalized.csv", index_col=0))
df_alles = df_alles.loc[df_alles.index.isin(allowed_mirnas)]

df_keller = np.log10(pd.read_csv('../microrna_data/2_output/Keller2023_conormalized.csv', index_col=0))
df_keller = df_keller.loc[df_keller.index.isin(allowed_mirnas)]

# Restrict both tables to the miRNAs (rows) and cell lines (columns) they share.
common_index = df_keller.index.intersection(df_alles.index)
common_columns = df_keller.columns.intersection(df_alles.columns)
df_alles, df_keller = (
    df_alles.loc[common_index, common_columns],
    df_keller.loc[common_index, common_columns],
)

# Row means taken before the floor below is applied, leaving out JEG3 and Tera1.
cell_lines_left_out_of_mean = ["JEG3", "Tera1"]
df_alles_original_mean = df_alles.drop(columns=cell_lines_left_out_of_mean).mean(axis=1)
df_keller_original_mean = df_keller.drop(columns=cell_lines_left_out_of_mean).mean(axis=1)

# Floor: every value below 2 (log10 scale) is raised to 2.
df_alles = df_alles.mask(df_alles < 2, 2)
df_keller = df_keller.mask(df_keller < 2, 2)

In [63]:
# element-wise difference of the two floored log10 tables: NGS (Keller) minus microarray (Alles)
df_difference = df_keller - df_alles

In [64]:
# For every shared column, rank the miRNAs by the absolute Keller-Alles
# difference (largest first) and keep the first 30 entries.
most_different = {
    tissue: df_difference[tissue].abs().sort_values(ascending=False).iloc[:30]
    for tissue in common_columns
}

# Count statistics over the per-column selections.
counts_most_diff = count_df_dict(most_different)

In [65]:
# Histogram of the number of cell lines in which each microRNA is among the
# 30 most different ones.
# reset matplotlib settings
plt.rcParams.update(plt.rcParamsDefault)
# set the font size
plt.rcParams.update({'font.size': 7})
# set Helvetica globally
plt.rcParams['font.family'] = 'Helvetica'

# No plt.clf() here: a new figure is created right away, and clearing the
# "current" figure first only produced a stray empty figure.
plt.figure(figsize=(2, 1.4))
# The counts are integers between 1 and the number of cell lines. plt.hist
# treats all bins as half-open except the last one, so one additional edge is
# needed; otherwise the two largest counts share the final bin.
organ_bins = np.arange(1, len(df_alles.columns) + 2)
plt.hist(list(counts_most_diff.values()), bins=organ_bins, edgecolor="black", color="skyblue")
plt.xlabel("# in top 30 most different")
plt.ylabel("microRNAs")
plt.tight_layout()
plt.savefig(os.path.join(base_plot_folder, "15.2.3_cell_line_data/15.2.3_most_different_histogram.png"), dpi=300)

In [66]:
# miRNAs that are among the 30 most different in at least 3 columns
exclude_mirnas = [microRNA for microRNA, count in counts_most_diff.items() if count >= 3]

# renormalize the two
# copies of both tables without the miRNAs flagged above
df_keller_exclude = df_keller[~df_keller.index.isin(exclude_mirnas)]
df_alles_exclude = df_alles[~df_alles.index.isin(exclude_mirnas)]

# presumably the filtered table defines the normalization that is then also applied
# to the full "partner" table - TODO confirm against library2_utils.mirna_levels
# NOTE(review): df_keller / df_alles are overwritten here, so re-running this cell
# feeds already-normalized data back into the normalization.
df_keller_exclude, df_keller = normalize_expr_df_to_rpm_with_partner(df_keller_exclude, df_keller, minimum=2)
df_alles_exclude, df_alles = normalize_expr_df_to_rpm_with_partner(df_alles_exclude, df_alles, minimum=2)

### Now, identify consistent outliers

In [67]:
# recompute the difference (Keller minus Alles) on the renormalized tables
df_difference = df_keller - df_alles

In [68]:
# Per column, collect the miRNAs whose Keller-Alles difference exceeds 1
# (Keller larger) or falls below -1 (Alles larger).
keller_larger_alles = {
    tissue: df_difference.loc[df_difference[tissue] > 1, tissue]
    for tissue in df_difference.columns
}
alles_larger_keller = {
    tissue: df_difference.loc[df_difference[tissue] < -1, tissue]
    for tissue in df_difference.columns
}

# Count statistics for both directions.
counts_larger_keller = count_df_dict(keller_larger_alles)
counts_larger_alles = count_df_dict(alles_larger_keller)

In [69]:
# Extend copies of both count tables with an explicit 0 for every miRNA that
# was never flagged, and remember how many entries had to be added.
counts_larger_alles_plot = counts_larger_alles.copy()
n_alles_entries_before = len(counts_larger_alles_plot)
for mirna in df_difference.index:
    counts_larger_alles_plot.setdefault(mirna, 0)
alles_total_mirna_not_counted = len(counts_larger_alles_plot) - n_alles_entries_before

counts_larger_keller_plot = counts_larger_keller.copy()
n_keller_entries_before = len(counts_larger_keller_plot)
for mirna in df_difference.index:
    counts_larger_keller_plot.setdefault(mirna, 0)
keller_total_mirna_not_counted = len(counts_larger_keller_plot) - n_keller_entries_before

In [70]:
# Histogram of the number of cell lines in which each microRNA is more than 1
# higher in the NGS (Keller) table than in the microarray (Alles) table.
# The counts are integers between 1 and the number of cell lines. plt.hist
# treats all bins as half-open except the last one, so one additional edge is
# needed; otherwise the two largest counts share the final bin.
cell_line_bins = np.arange(1, len(df_difference.columns) + 2)

# No plt.clf() here: a new figure is created right away, and clearing the
# "current" figure first only produced a stray empty figure.
plt.figure(figsize=(1.8, 1.4))
plt.hist(list(counts_larger_keller.values()), bins=cell_line_bins, edgecolor="black", color="skyblue")
# miRNAs that were never flagged are not part of the histogram; report their number as text
plt.text(7, 5, f"miRNAs #0 count: {keller_total_mirna_not_counted}", fontsize=7, ha='center', va='center')
plt.xlim(0,13)
plt.ylim(0,7)
# miRNAs flagged in >= 3 cell lines are treated as consistent outliers
plt.axvline(x=3, color='black', linestyle='--')
# NOTE(review): the columns are cell lines here although the label says "tissues"
plt.xlabel("# tissues NGS >> microarray")
plt.ylabel("microRNAs")
plt.tight_layout()
# `fmt` instead of `format` so the builtin is not shadowed by this cell
for fmt in ["png", "svg"]:
    plt.savefig(os.path.join(base_plot_folder, f"15.2.3_cell_line_data/15.2.3_larger_keller_histogram.{fmt}"), dpi=300)

In [71]:
# Histogram of the number of columns (cell lines) in which each microRNA has a
# Keller-Alles difference below -1, i.e. is clearly higher in the microarray table.
# `cell_line_bins` is defined in the cell that draws the NGS >> microarray histogram.
plt.clf()
plt.figure(figsize=(1.8, 1.4))
plt.hist(counts_larger_alles.values(), bins=cell_line_bins, edgecolor="black", color="skyblue")
# miRNAs that were never flagged are not part of the histogram; report their number as text
plt.text(7, 5, f"miRNAs #0 count: {alles_total_mirna_not_counted}", fontsize=7, ha='center', va='center')
# NOTE(review): the columns are cell lines here although the label says "tissues"
plt.xlabel("# tissues microarray >> NGS")
plt.xlim(0,13)
plt.ylim(0,7)
# miRNAs flagged in >= 3 cell lines are treated as consistent outliers
plt.axvline(x=3, color='black', linestyle='--')
plt.ylabel("microRNAs")
plt.tight_layout()
# NOTE(review): this file name starts with "15.2_" while the sibling histogram in the
# same folder uses the "15.2.3_" prefix - confirm which one is intended.
for format in ["png", "svg"]:
    plt.savefig(os.path.join(base_plot_folder, f"15.2.3_cell_line_data/15.2_larger_alles_histogram.{format}"), dpi=300)

In [None]:
# miRNAs flagged as "microarray larger" in at least 3 columns
alles_larger_in_multiple = [
    microRNA for microRNA in counts_larger_alles if counts_larger_alles[microRNA] >= 3
]
print(len(alles_larger_in_multiple))

# miRNAs flagged as "NGS larger" in at least 3 columns
keller_larger_in_multiple = [
    microRNA for microRNA in counts_larger_keller if counts_larger_keller[microRNA] >= 3
]
print(len(keller_larger_in_multiple))

In [73]:
# Scatter plots microarray (Alles) vs NGS (Keller): one per cell line plus one
# for the original per-miRNA means, with the consistent outliers highlighted.

# NOTE(review): this folder is created but nothing below writes into it; the
# figures are saved to "15.2.3_cell_line_data". Kept so the on-disk layout does
# not change.
if not os.path.exists(os.path.join(base_plot_folder, "alles_keller")):
    os.makedirs(os.path.join(base_plot_folder, "alles_keller"))

# The outlier set and its complement do not depend on the cell line, so they
# are computed once instead of twice per loop iteration.
excluded_points = alles_larger_in_multiple + keller_larger_in_multiple
non_exclude_index = df_keller.index[~df_keller.index.isin(excluded_points)]

for tissue in common_columns:
    fig = plt.figure(figsize=(2,1.6))

    # consistent outliers, drawn on top (zorder=2)
    plt.scatter(df_alles.loc[keller_larger_in_multiple, tissue],
                df_keller.loc[keller_larger_in_multiple, tissue], s=3, alpha=1,
                label="keller larger", color="tab:red", marker="v", zorder=2)
    plt.scatter(df_alles.loc[alles_larger_in_multiple, tissue],
                df_keller.loc[alles_larger_in_multiple, tissue], s=3, alpha=1,
                label="alles larger", color="tab:orange", marker="o", zorder=2)

    # all remaining miRNAs
    plt.scatter(df_alles.loc[non_exclude_index, tissue], df_keller.loc[non_exclude_index, tissue],
                s=3, alpha=1, zorder=1)
    # Pearson correlation over the non-outlier miRNAs only (reported as r^2 in the title)
    r, p = stats.pearsonr(df_alles.loc[non_exclude_index, tissue], df_keller.loc[non_exclude_index, tissue])
    # identity line
    plt.plot([0, 5.5], [0, 5.5], color="black", linestyle="--")

    plt.xlabel("Micro")
    plt.ylabel("NGS")
    plt.xlim(1.5, 6)
    plt.ylim(1.5, 6)
    plt.title(f"{tissue} (r2={r**2:.2f})", fontsize=7)
    plt.tight_layout()
    plt.legend(loc=[1.05,0.5], fontsize=7)
    plt.savefig(os.path.join(base_plot_folder, f"15.2.3_cell_line_data/15.2.3_keller_alles_scatter_most_diff_{tissue}.png"), dpi=300)
    # close explicitly so figures do not accumulate over the loop
    plt.close(fig)

# Same plot for the per-miRNA means that were taken before flooring/renormalizing.
fig = plt.figure(figsize=(2,1.6))
plt.scatter(df_alles_original_mean.loc[keller_larger_in_multiple],
            df_keller_original_mean.loc[keller_larger_in_multiple], s=3, alpha=1,
            label="keller larger", color="tab:red", marker="v", zorder=2)
plt.scatter(df_alles_original_mean.loc[alles_larger_in_multiple],
            df_keller_original_mean.loc[alles_larger_in_multiple], s=3, alpha=1,
            label="alles larger", color="tab:orange", marker="o", zorder=2)

plt.scatter(df_alles_original_mean.loc[non_exclude_index], df_keller_original_mean.loc[non_exclude_index],
            s=3, alpha=1, zorder=1)
# identity line (no correlation coefficient is computed for the mean plot)
plt.plot([0, 5.5], [0, 5.5], color="black", linestyle="--")

plt.xlabel("Micro mean")
plt.ylabel("NGS mean")
plt.xlim(1, 5)
plt.ylim(-0.5, 5)
plt.xticks(np.arange(1, 5, 1))
plt.yticks(np.arange(0, 5, 1))
plt.title(f"Cell line mean", fontsize=7)
plt.tight_layout()
plt.legend(loc=[1.05,0.5], fontsize=7)
# `fmt` instead of `format` so the builtin is not shadowed by this cell
for fmt in ["png", "svg"]:
    plt.savefig(os.path.join(base_plot_folder, f"15.2.3_cell_line_data/15.2.3_keller_alles_scatter_most_diff_mean.{fmt}"), dpi=300)
plt.close(fig)

In [74]:
# save these two to a file
# Write both cell-line outlier lists to disk, one miRNA name per line
# (joined with newlines, so there is no trailing newline).
for list_filename, mirna_names in (
    ("alles_larger_in_multiple.txt", alles_larger_in_multiple),
    ("keller_larger_in_multiple.txt", keller_larger_in_multiple),
):
    with open(os.path.join(dataset_processed_folder, list_filename), "w") as f:
        f.write("\n".join(mirna_names))

### Load the calls for the cell line data
Here, we used the collected cell line stability data to determine which dataset is wrong. The calls are made in notebook 3.

In [75]:
# Load the per-miRNA calls for the cell-line data; both tables are indexed by
# their first column.
# presumably calls_alles.csv lists miRNAs for which the microarray (Alles) value was
# judged wrong and calls_keller.csv those for which the NGS (Keller) value was judged
# wrong, as the variable names suggest - TODO confirm against notebook 3
input_folder = "../outputs/3_fitting/Alles2019/cell_line_mirna_calls"
df_call_alles_wrong = pd.read_csv(os.path.join(input_folder, "calls_alles.csv"), index_col=0)
df_call_keller_wrong = pd.read_csv(os.path.join(input_folder, "calls_keller.csv"), index_col=0)

In [76]:
# Sort the "microarray larger" outliers by the call they received: present in
# the Alles-wrong table, present in the Keller-wrong table, or in neither.
alles_larger_in_multiple_alles_wrong = []
alles_larger_in_multiple_keller_wrong = []
alles_larger_in_multiple_undecided = []
for mirna in alles_larger_in_multiple:
    called_alles_wrong = mirna in df_call_alles_wrong.index
    called_keller_wrong = mirna in df_call_keller_wrong.index
    if called_alles_wrong:
        alles_larger_in_multiple_alles_wrong.append(mirna)
    if called_keller_wrong:
        alles_larger_in_multiple_keller_wrong.append(mirna)
    if not (called_alles_wrong or called_keller_wrong):
        alles_larger_in_multiple_undecided.append(mirna)
# the three groups must partition the list (fails if a miRNA carries both calls)
assert len(alles_larger_in_multiple_alles_wrong) + len(alles_larger_in_multiple_keller_wrong) + len(alles_larger_in_multiple_undecided) == len(alles_larger_in_multiple)

# Same split for the "NGS larger" outliers.
keller_larger_in_multiple_alles_wrong = []
keller_larger_in_multiple_keller_wrong = []
keller_larger_in_multiple_undecided = []
for mirna in keller_larger_in_multiple:
    called_alles_wrong = mirna in df_call_alles_wrong.index
    called_keller_wrong = mirna in df_call_keller_wrong.index
    if called_alles_wrong:
        keller_larger_in_multiple_alles_wrong.append(mirna)
    if called_keller_wrong:
        keller_larger_in_multiple_keller_wrong.append(mirna)
    if not (called_alles_wrong or called_keller_wrong):
        keller_larger_in_multiple_undecided.append(mirna)
assert len(keller_larger_in_multiple_alles_wrong) + len(keller_larger_in_multiple_keller_wrong) + len(keller_larger_in_multiple_undecided) == len(keller_larger_in_multiple)

In [77]:
# One scatter plot per cell line (microarray vs NGS) in which the consistent
# outliers are marked by direction (colour) and by the call they received (marker).
os.makedirs(os.path.join(base_plot_folder, "15.2.3_calls"), exist_ok=True)

# The outlier set and its complement do not depend on the cell line, so they
# are computed once instead of twice per loop iteration.
excluded_points = alles_larger_in_multiple + keller_larger_in_multiple
non_exclude_index = df_keller.index[~df_keller.index.isin(excluded_points)]

for tissue in df_keller.columns:
    # No plt.clf() before creating the figure: it acted on the previously
    # current figure (creating an empty one if none was open) and left a stray
    # figure behind.
    fig = plt.figure(figsize=(2,2))

    # call set 1: NGS larger (red)
    plt.scatter(df_alles.loc[keller_larger_in_multiple_keller_wrong, tissue],
                df_keller.loc[keller_larger_in_multiple_keller_wrong, tissue], s=3, alpha=1,
                label="ngs larger | ngs wrong", color="tab:red", marker="v", zorder=2)
    plt.scatter(df_alles.loc[keller_larger_in_multiple_alles_wrong, tissue],
                df_keller.loc[keller_larger_in_multiple_alles_wrong, tissue], s=3, alpha=1,
                label="ngs larger | micro wrong", color="tab:red", marker="*", zorder=2)

    # call set 2: microarray larger (orange)
    plt.scatter(df_alles.loc[alles_larger_in_multiple_keller_wrong, tissue],
                df_keller.loc[alles_larger_in_multiple_keller_wrong, tissue], s=3, alpha=1,
                label="micro larger | ngs wrong", color="tab:orange", marker="v", zorder=2)
    plt.scatter(df_alles.loc[alles_larger_in_multiple_alles_wrong, tissue],
                df_keller.loc[alles_larger_in_multiple_alles_wrong, tissue], s=3, alpha=1,
                label="micro larger | micro wrong", color="tab:orange", marker="*", zorder=2)

    # all remaining miRNAs
    plt.scatter(df_alles.loc[non_exclude_index, tissue], df_keller.loc[non_exclude_index, tissue],
                s=3, alpha=1, zorder=1)

    # Pearson correlation over the non-outlier miRNAs only (reported as r^2 in the title)
    r, p = stats.pearsonr(df_alles.loc[non_exclude_index, tissue], df_keller.loc[non_exclude_index, tissue])
    # identity line
    plt.plot([0, 5.5], [0, 5.5], color="black", linestyle="--")

    plt.xlabel("Alles")
    plt.ylabel("Keller")
    plt.xlim(1.5, 6)
    plt.ylim(1.5, 6)
    plt.title(f"{tissue} (r2={r**2:.2f})", fontsize=7)
    plt.tight_layout()
    plt.legend(loc="upper left", fontsize=4)
    plt.savefig(os.path.join(base_plot_folder, f"15.2.3_calls/15.2.3_calls_{tissue}.png"), dpi=300)
    # close explicitly so figures do not accumulate over the loop
    plt.close(fig)

In [78]:
# Original cell-line means (microarray vs NGS) of the consistent outliers,
# coloured by the call they received.
plt.figure(figsize=(2, 1.7))

# pool both outlier directions per call
micro_wrong_mirnas = alles_larger_in_multiple_alles_wrong + keller_larger_in_multiple_alles_wrong
ngs_wrong_mirnas = keller_larger_in_multiple_keller_wrong + alles_larger_in_multiple_keller_wrong
undecided_mirnas = alles_larger_in_multiple_undecided + keller_larger_in_multiple_undecided

plt.scatter(df_alles_original_mean.loc[micro_wrong_mirnas],
            df_keller_original_mean.loc[micro_wrong_mirnas],
            label='micro wrong', color="tab:blue", s=10, edgecolors="none", zorder=3)
plt.scatter(df_alles_original_mean.loc[ngs_wrong_mirnas],
            df_keller_original_mean.loc[ngs_wrong_mirnas],
            label='NGS wrong', color="tab:red", s=10, edgecolors="none", zorder=3)
plt.scatter(df_alles_original_mean.loc[undecided_mirnas],
            df_keller_original_mean.loc[undecided_mirnas],
            label='undecided', color="tab:grey", s=10, edgecolors="none", zorder=2)

# identity line
plt.plot([0, 5.5], [0, 5.5], color="black", linestyle="--", zorder=2)
plt.ylabel('NGS cell line mean')
# The x values are the Alles cell-line means, not tissue data; the label now
# matches the y label and the false positive/negative figure of this section.
plt.xlabel('microarray cell line mean')
plt.xlim(1, 5)
plt.ylim(-0.5, 5)
plt.xticks(np.arange(1, 5, 1))
plt.yticks(np.arange(0, 5, 1))
plt.grid(True, zorder=1)
plt.tight_layout()
plt.legend(loc="lower left", fontsize=4)
# `fmt` instead of `format` so the builtin is not shadowed by this cell
for fmt in ["png", "svg"]:
    plt.savefig(os.path.join(base_plot_folder, f"15.2.3_calls/15.2.3_mean_calls.{fmt}"), dpi=300)

In [None]:
def find_common_mirnas(miRNA_list, df):
    """
    Return the entries of ``miRNA_list`` that also occur in the index of ``df``.

    The order (and any repetition) of ``miRNA_list`` is preserved.
    """
    known_mirnas = df.index
    return list(filter(lambda name: name in known_mirnas, miRNA_list))

# Positions of the sequencing (Keller) and microarray (Alles) tables inside
# `false_positives_and_negatives`.
keller_wrong_indices, alles_wrong_indices = [0, 2], [1, 3]
keller_wrong, alles_wrong = [], []

# For each false positive/negative table, print which consistent outliers of
# either direction it contains.
for index, table in enumerate(false_positives_and_negatives):
    label = false_positives_and_negatives_label[index]
    print(f"{label} in alles larger:")
    mirnas = find_common_mirnas(alles_larger_in_multiple, table)
    print(mirnas)
    print(f"{label} in keller larger:")
    print(find_common_mirnas(keller_larger_in_multiple, table))

In [80]:
# Pool the miRNA names of the microarray (Alles) and of the sequencing (Keller)
# false positive/negative tables into one flat list each.
alles_wrong = [
    mirna
    for i in alles_wrong_indices
    for mirna in false_positives_and_negatives[i].index.to_list()
]
keller_wrong = [
    mirna
    for i in keller_wrong_indices
    for mirna in false_positives_and_negatives[i].index.to_list()
]

# Consistent outliers of either direction that appear in neither pooled list.
undecided_cell_lines = [
    mirna
    for mirna in alles_larger_in_multiple + keller_larger_in_multiple
    if mirna not in alles_wrong and mirna not in keller_wrong
]

In [81]:
# Original cell-line means (microarray vs NGS) of the miRNAs listed in the false
# positive/negative tables, plus the consistent outliers found in neither table.
plt.figure(figsize=(2, 1.7))
# miRNAs from the microarray false positive/negative tables
plt.scatter(df_alles_original_mean.loc[alles_wrong],\
    df_keller_original_mean.loc[alles_wrong],
            label='micro wrong', color="tab:blue", s=10, edgecolors="none", zorder=3)
# miRNAs from the sequencing false positive/negative tables
plt.scatter(df_alles_original_mean.loc[keller_wrong],
            df_keller_original_mean.loc[keller_wrong], 
             label='NGS wrong', color="tab:red", s=10, edgecolors="none", zorder=3)
# consistent outliers that are in neither of the two lists
plt.scatter(df_alles_original_mean.loc[undecided_cell_lines],
            df_keller_original_mean.loc[undecided_cell_lines], 
             label='undecided', color="tab:grey", s=10, edgecolors="none", zorder=2)
# identity line
plt.plot([0, 5.5], [0, 5.5], color="black", linestyle="--", zorder=2)
plt.ylabel('NGS cell line mean')
plt.xlabel('microarray cell line mean')
plt.xlim(1, 5)
plt.ylim(-0.5, 5)
plt.xticks(np.arange(1, 5, 1))
plt.yticks(np.arange(0, 5, 1))
plt.grid(True, zorder=1)
plt.tight_layout()
# the legend is anchored at the right edge of the axes
plt.legend(loc=[1,0.05])
# NOTE(review): the loop variable shadows the builtin `format`
for format in ["png", "svg"]:
    plt.savefig(os.path.join(base_plot_folder, f"15.2.3_cell_line_data/15.2.3_mean_false_pos_and_neg.{format}"), dpi=300)

## 15.2.4 - Decide which miRNAs are correct
### Use the calls from the MPRA (Notebook 3 - 3.2.2)

In [82]:
# Load the per-miRNA calls for the tissue data; both tables are indexed by
# their first column.
# NOTE(review): this overwrites `input_folder`, `df_call_alles_wrong` and
# `df_call_keller_wrong`, which held the cell-line calls up to this point.
input_folder = "../outputs/3_fitting/Keller2023/tissue_mirna_calls"
df_call_alles_wrong = pd.read_csv(os.path.join(input_folder, "calls_alles.csv"), index_col=0)
df_call_keller_wrong = pd.read_csv(os.path.join(input_folder, "calls_keller.csv"), index_col=0)

In [83]:
# Sort the "microarray larger" tissue outliers by the call they received:
# present in the Alles-wrong table, present in the Keller-wrong table, or in neither.
micro_larger_in_multiple_alles_wrong = []
micro_larger_in_multiple_keller_wrong = []
micro_larger_in_multiple_undecided = []
for mirna in micro_larger_in_multiple:
    called_alles_wrong = mirna in df_call_alles_wrong.index
    called_keller_wrong = mirna in df_call_keller_wrong.index
    if called_alles_wrong:
        micro_larger_in_multiple_alles_wrong.append(mirna)
    if called_keller_wrong:
        micro_larger_in_multiple_keller_wrong.append(mirna)
    if not (called_alles_wrong or called_keller_wrong):
        micro_larger_in_multiple_undecided.append(mirna)
# the three groups must partition the list (fails if a miRNA carries both calls)
assert len(micro_larger_in_multiple_alles_wrong) + len(micro_larger_in_multiple_keller_wrong) + len(micro_larger_in_multiple_undecided) == len(micro_larger_in_multiple)

# Same split for the "NGS larger" tissue outliers.
ngs_larger_in_multiple_alles_wrong = []
ngs_larger_in_multiple_keller_wrong = []
ngs_larger_in_multiple_undecided = []
for mirna in ngs_larger_in_multiple:
    called_alles_wrong = mirna in df_call_alles_wrong.index
    called_keller_wrong = mirna in df_call_keller_wrong.index
    if called_alles_wrong:
        ngs_larger_in_multiple_alles_wrong.append(mirna)
    if called_keller_wrong:
        ngs_larger_in_multiple_keller_wrong.append(mirna)
    if not (called_alles_wrong or called_keller_wrong):
        ngs_larger_in_multiple_undecided.append(mirna)
assert len(ngs_larger_in_multiple_alles_wrong) + len(ngs_larger_in_multiple_keller_wrong) + len(ngs_larger_in_multiple_undecided) == len(ngs_larger_in_multiple)

In [84]:
# Read the stored expression tables of both tissue atlases again and convert
# them to the log10 scale in the same step.
df_ngs_tissue_orig_min = np.log10(
    pd.read_csv(os.path.join(dataset_processed_folder, "ngs_expression.csv"), index_col=0)
)
df_microarray_tissue_orig_min = np.log10(
    pd.read_csv(os.path.join(dataset_processed_folder, "microarray_expression.csv"), index_col=0)
)

In [85]:
# Per-miRNA mean and standard deviation across the columns of the log10 tables.
# (The mean of log10 values is the log10 of the geometric mean.)
df_ngs_tissue_mean, df_ngs_tissue_std = (
    df_ngs_tissue_orig_min.mean(axis=1),
    df_ngs_tissue_orig_min.std(axis=1),
)

df_microarray_tissue_mean, df_microarray_tissue_std = (
    df_microarray_tissue_orig_min.mean(axis=1),
    df_microarray_tissue_orig_min.std(axis=1),
)

In [86]:
# Tissue means (microarray vs NGS) of the "microarray larger" outliers, split
# by the call they received. (Plain scatter plot, no error bars.)

# The figures of this section go to their own folder; create it so that the
# savefig calls do not depend on the folder already existing on disk.
os.makedirs(os.path.join(base_plot_folder, "15.2.4_calls"), exist_ok=True)

plt.figure(figsize=(2, 1.7))

# Legend labels follow the variable names, as everywhere else in this notebook:
# `*_alles_wrong` = microarray judged wrong, `*_keller_wrong` = NGS judged wrong.
# The two labels were previously attached to the opposite groups.
# NOTE(review): confirm against the call files produced in notebook 3.
plt.scatter(df_microarray_tissue_mean.loc[micro_larger_in_multiple_alles_wrong], df_ngs_tissue_mean.loc[micro_larger_in_multiple_alles_wrong],
            label='micro larger|micro wrong', color="tab:blue", s=10, edgecolors="none", zorder=2)
plt.scatter(df_microarray_tissue_mean.loc[micro_larger_in_multiple_keller_wrong], df_ngs_tissue_mean.loc[micro_larger_in_multiple_keller_wrong],
            label='micro larger|NGS wrong', color="tab:red", s=10, edgecolors="none", zorder=2)
# identity line
plt.plot([0, 5.5], [0, 5.5], color="black", linestyle="--", zorder=2)
plt.ylabel(r'mean(log$_{10}$(NGS exp.))')
plt.xlabel(r'mean(log$_{10}$(micro exp.))')
plt.xlim(1, 5)
plt.ylim(-0.5, 5)
plt.xticks(np.arange(1, 5, 1))
plt.yticks(np.arange(0, 5, 1))
plt.grid(True, zorder=1)
plt.tight_layout()
plt.legend(loc=[1,0.05])
# `fmt` instead of `format` so the builtin is not shadowed by this cell
for fmt in ["png", "svg"]:
    plt.savefig(os.path.join(base_plot_folder, f"15.2.4_calls/15.2.4_mean_micro_larger.{fmt}"), dpi=300)

In [87]:
# Tissue means (microarray vs NGS) of the "NGS larger" outliers, split by the
# call they received. (Plain scatter plot, no error bars.)
plt.figure(figsize=(2, 1.7))
# Legend labels follow the variable names, as everywhere else in this notebook:
# `*_alles_wrong` = microarray judged wrong, `*_keller_wrong` = NGS judged wrong.
# The two labels were previously attached to the opposite groups.
# NOTE(review): confirm against the call files produced in notebook 3.
plt.scatter(df_microarray_tissue_mean.loc[ngs_larger_in_multiple_alles_wrong], df_ngs_tissue_mean.loc[ngs_larger_in_multiple_alles_wrong],
            label='NGS larger|micro wrong', color="tab:blue", s=10, edgecolors="none", zorder=2)
plt.scatter(df_microarray_tissue_mean.loc[ngs_larger_in_multiple_keller_wrong], df_ngs_tissue_mean.loc[ngs_larger_in_multiple_keller_wrong],
            label='NGS larger|NGS wrong', color="tab:red", s=10, edgecolors="none", zorder=2)
# identity line
plt.plot([0, 5.5], [0, 5.5], color="black", linestyle="--", zorder=2)
plt.ylabel(r'mean(log$_{10}$(NGS exp.))')
plt.xlabel(r'mean(log$_{10}$(micro exp.))')
plt.xlim(1, 5)
plt.ylim(-0.5, 5)
plt.xticks(np.arange(1, 5, 1))
plt.yticks(np.arange(0, 5, 1))
plt.grid(True, zorder=1)
plt.tight_layout()
plt.legend(loc=[1,0.05])
# `fmt` instead of `format` so the builtin is not shadowed by this cell
for fmt in ["png", "svg"]:
    plt.savefig(os.path.join(base_plot_folder, f"15.2.4_calls/15.2.4_mean_ngs_larger.{fmt}"), dpi=300)

In [88]:
# Tissue means (microarray vs NGS) of the consistent outliers that received no
# call in either table. (Plain scatter plot, no error bars are drawn.)
plt.figure(figsize=(2, 1.7))
plt.ylabel(r'mean(log$_{10}$(NGS exp.))')
plt.xlabel(r'mean(log$_{10}$(micro exp.))')
plt.xlim(1, 5)
plt.ylim(-0.5, 5)
plt.xticks(np.arange(1, 5, 1))
plt.yticks(np.arange(0, 5, 1))
plt.grid(True, zorder=1)
# undecided outliers, coloured by the direction of the discrepancy
plt.scatter(df_microarray_tissue_mean.loc[micro_larger_in_multiple_undecided], df_ngs_tissue_mean.loc[micro_larger_in_multiple_undecided],
            label='Micro larger|Undecided', color="tab:blue", s=5, edgecolors="none", zorder=2)
plt.scatter(df_microarray_tissue_mean.loc[ngs_larger_in_multiple_undecided], df_ngs_tissue_mean.loc[ngs_larger_in_multiple_undecided], 
             label='NGS larger|Undecided', color="tab:red", s=5, edgecolors="none", zorder=2)
# identity line
plt.plot([0, 5.5], [0, 5.5], color="black", linestyle="--", zorder=2)
plt.tight_layout()
plt.legend(loc=[1,0.05])
# NOTE(review): the loop variable shadows the builtin `format`
for format in ["png", "svg"]:
    plt.savefig(os.path.join(base_plot_folder, f"15.2.4_calls/15.2.4_mean_undecided.{format}"), dpi=300)

In [94]:
# Tissue means (microarray vs NGS) for three miRNA groups labelled "validated",
# "invalidated" and "other". (Plain scatter plot, no error bars are drawn.)
# NOTE(review): `set_micro_real`, `set_micro_false` and `set_all_others` are not
# defined in the cells shown before this one - make sure the defining cell runs first
# on a fresh kernel. If they are Python sets, convert them to lists for `.loc`
# (recent pandas versions reject set indexers) - TODO confirm their type.
plt.figure(figsize=(2.4, 1.7))
plt.ylabel(r'mean(log$_{10}$(NGS exp.))')
plt.xlabel(r'mean(log$_{10}$(micro exp.))')
plt.xlim(1, 5)
plt.ylim(-0.5, 5)
plt.xticks(np.arange(1, 5, 1))
plt.yticks(np.arange(0, 5, 1))
plt.grid(True, zorder=1)
plt.scatter(df_microarray_tissue_mean.loc[set_micro_real], df_ngs_tissue_mean.loc[set_micro_real],
            label='validated', color="tab:blue", s=5, edgecolors="none", zorder=3)
plt.scatter(df_microarray_tissue_mean.loc[set_micro_false], df_ngs_tissue_mean.loc[set_micro_false], 
             label='invalidated', color="tab:red", s=5, edgecolors="none", zorder=3)
# everything else as small, half-transparent grey background points
plt.scatter(df_microarray_tissue_mean.loc[set_all_others], df_ngs_tissue_mean.loc[set_all_others], 
             label='other', color="grey", s=2, edgecolors="none", zorder=2, alpha=0.5)
# identity line
plt.plot([0, 5.5], [0, 5.5], color="black", linestyle="--", zorder=2)
plt.tight_layout()
plt.legend(loc=[0.0,0.65])
# NOTE(review): the loop variable shadows the builtin `format`
for format in ["png", "svg"]:
    plt.savefig(os.path.join(base_plot_folder, f"15.2.4_calls/15.2.4_mean_keller_publication.{format}"), dpi=300)

### Look into GC content

In [95]:
# Sequences of the miRNAs that are consistently higher in the microarray than
# in the NGS tissue data.
micro_larger = mirbase.loc[micro_larger_in_multiple, "sequence_orig"].to_list()
# Comparison group: all other miRNAs of the tissue dataset. The membership test
# has to use the miRNA *names*; `micro_larger` holds sequences, so testing names
# against it never matched and the outliers ended up in the comparison group too.
micro_larger_names = set(micro_larger_in_multiple)
other_mirnas = [mirna for mirna in df_microarray_tissue.index if mirna not in micro_larger_names]
other_seqs = mirbase.loc[other_mirnas, "sequence_orig"].to_list()

In [96]:
def gc_content(seq):
    """Return the percentage of "G" and "C" characters in ``seq`` (0-100)."""
    guanine_cytosine = sum(seq.count(base) for base in ("G", "C"))
    return 100 * guanine_cytosine / len(seq)

def nuc_content(seq, nucleotide):
    """Return the percentage of ``seq`` made up of ``nucleotide`` (0-100)."""
    occurrences = seq.count(nucleotide)
    return 100 * occurrences / len(seq)

# GC content (percent) of the "microarray larger" sequences and of the comparison sequences
micro_larger_gc = [gc_content(seq) for seq in micro_larger]
other_seqs_gc = [gc_content(seq) for seq in other_seqs]

plt.figure(figsize=(2.2, 1.6))
# both groups share the same 5 %-wide bins and are normalized to densities so that
# groups of different size can be compared
plt.hist(micro_larger_gc, bins=np.arange(0, 101, 5), color="tab:blue", 
         edgecolor="black", label=f"micro >> ngs, mean: {np.mean(micro_larger_gc):.1f}",
         density=True, alpha=0.5)
plt.hist(other_seqs_gc, bins=np.arange(0, 101, 5), color="tab:red",
            edgecolor="black", label=f"other, mean: {np.mean(other_seqs_gc):.1f}",
            density=True, alpha=0.5)

plt.legend(loc="upper left")
plt.xlabel("GC content(%)")
plt.ylabel("Density")

plt.ylim(0, 0.08)
# perform a statistical test (Mann-Whitney U test)
u, p = stats.mannwhitneyu(micro_larger_gc, other_seqs_gc)

# add the p-value to the plot (the group means are shown in the legend)
plt.title(f"p={p:.2e}", fontsize=8)
plt.tight_layout()
# NOTE(review): the loop variable shadows the builtin `format`
for format in ["png", "svg"]:
    plt.savefig(os.path.join(base_plot_folder, f"15.2.4_calls/15.2.4_content_gc.{format}"), dpi=300)

In [97]:

# 2x2 panel: one density histogram per nucleotide, comparing the "microarray
# larger" sequences with the comparison sequences.
fig, axes = plt.subplots(2, 2, figsize=(4, 3))  # Adjust the figure size as needed

# assumes the sequences in "sequence_orig" use the DNA alphabet ("T", not "U") - TODO confirm
nucleotides = ["A", "C", "G", "T"]

for i, nucleotide in enumerate(nucleotides):
    micro_larger_nuc = [nuc_content(seq, nucleotide) for seq in micro_larger]
    other_seqs_nuc = [nuc_content(seq, nucleotide) for seq in other_seqs]

    # Determine the row and column index for the subplot
    row = i // 2
    col = i % 2
    ax = axes[row, col]

    # Plot histograms for the current nucleotide
    # (shared 5 %-wide bins, normalized to densities)
    ax.hist(micro_larger_nuc, bins=np.arange(0, 101, 5), color="tab:blue", 
            edgecolor="black", label=f"micro >> ngs\nmean: {np.mean(micro_larger_nuc):.1f}",
            density=True, alpha=0.5)
    # NOTE(review): this legend entry reads "other\nmiRNAs: <mean>" while the first one
    # reads "...\nmean: <mean>" - confirm the wording is intended
    ax.hist(other_seqs_nuc, bins=np.arange(0, 101, 5), color="tab:red",
            edgecolor="black", label=f"other\nmiRNAs: {np.mean(other_seqs_nuc):.1f}",
            density=True, alpha=0.5)

    # Add legends and labels
    ax.legend(loc="upper right")
    ax.set_xlabel(f"{nucleotide} content(%)")
    ax.set_ylabel("Density")
    ax.set_ylim(0, 0.08)
    
    # hide the x ticks of the top row and the y ticks/label of the right column
    if row == 0:
        ax.set_xticks([])
    if col == 1:
        ax.set_yticks([])
        ax.set_ylabel("")

    # Perform a statistical test (Mann-Whitney U test)
    u, p = stats.mannwhitneyu(micro_larger_nuc, other_seqs_nuc)

    # Add the p-value to the plot title
    ax.set_title(f"p={p:.2e}", fontsize=8)

# Adjust layout for better spacing
plt.tight_layout()
# NOTE(review): the loop variable shadows the builtin `format`
for format in ["png", "svg"]:
    plt.savefig(os.path.join(base_plot_folder, f"15.2.4_calls/15.2.4_content_all.{format}"), dpi=300)

In [98]:
# Base composition (in percent) of every miRNA in the microarray tissue table.
# The sequences are looked up once and reused for all five columns.
tissue_mirna_seqs = mirbase.loc[df_microarray_tissue.index, "sequence_orig"]
df_gc_content_all = pd.DataFrame(
    {
        "GC content": [gc_content(seq) for seq in tissue_mirna_seqs],
        "G content": [nuc_content(seq, "G") for seq in tissue_mirna_seqs],
        "C content": [nuc_content(seq, "C") for seq in tissue_mirna_seqs],
        "A content": [nuc_content(seq, "A") for seq in tissue_mirna_seqs],
        "T content": [nuc_content(seq, "T") for seq in tissue_mirna_seqs],
    },
    index=df_microarray_tissue.index,
)

In [None]:
# difference of the per-miRNA tissue means (microarray minus NGS, both on the log10
# scale) for the miRNAs of the microarray tissue table
extreme_outliers = df_microarray_tissue_mean - df_ngs_tissue_mean.loc[df_microarray_tissue.index]
# keep only the names of the miRNAs whose means differ by more than 2.5 in either direction
extreme_outliers = extreme_outliers[extreme_outliers.abs() > 2.5].index
# spot check for a single miRNA; the boolean is shown as the cell output
"hsa-miR-3940-5p" in extreme_outliers

In [None]:
# Scan G-content thresholds (0 to 97.5 % in steps of 2.5) and score how well
# "G content > threshold" singles out the extreme outliers:
#   A = share of the extreme outliers that lie above the threshold
#   B = share of the miRNAs above the threshold that are extreme outliers
# The product A*B is recorded for every threshold.
g_thresholds = np.arange(0,100,2.5)
ABs = []
for x in g_thresholds:
    high_g = df_gc_content_all[df_gc_content_all["G content"] > x].index

    # the overlap is needed for both ratios, so it is counted only once
    n_outliers_high_g = sum(outlier in high_g for outlier in extreme_outliers)
    # both ratios fall back to 0 when their denominator is empty
    A = n_outliers_high_g/len(extreme_outliers) if len(extreme_outliers) > 0 else 0
    B = n_outliers_high_g/len(high_g) if len(high_g) > 0 else 0
    ABs.append(A*B)

# find the maximum
max_index = np.argmax(ABs)
print(g_thresholds[max_index], ABs[max_index])

plt.plot(g_thresholds, ABs)
plt.show()

In [103]:
# miRNAs with more than 50 % G in their sequence, and all remaining miRNAs of the table
high_g = df_gc_content_all[df_gc_content_all["G content"] > 50].index
# NOTE(review): `other` is a very generic name that later cells also read - make sure
# it still means "not high G" wherever it is reused.
other = df_gc_content_all.index[~df_gc_content_all.index.isin(high_g)]

In [104]:
# Tissue means (microarray vs NGS) with the >50 % G miRNAs highlighted.
# (Plain scatter plot, no error bars are drawn.)
plt.figure(figsize=(2, 1.7))
# high-G miRNAs: larger markers, drawn on top
plt.scatter(df_microarray_tissue_mean.loc[high_g],\
    df_ngs_tissue_mean.loc[high_g],
            label='>50% G', color="tab:red", s=10, edgecolors="none", zorder=3)
# all remaining miRNAs
plt.scatter(df_microarray_tissue_mean.loc[other],
            df_ngs_tissue_mean.loc[other],
             label='other', color="tab:blue", s=3, edgecolors="none", zorder=2)
# identity line
plt.plot([0, 5.5], [0, 5.5], color="black", linestyle="--", zorder=2)
plt.ylabel(r'mean(log$_{10}$(NGS exp.))')
plt.xlabel(r'mean(log$_{10}$(micro exp.))')
plt.xlim(1, 5)
plt.ylim(-0.5, 5)
plt.xticks(np.arange(1, 5, 1))
plt.yticks(np.arange(0, 5, 1))
plt.grid(True, zorder=1)
plt.tight_layout()
plt.legend(loc=[0,0.75])
# NOTE(review): the loop variable shadows the builtin `format`
for format in ["png", "svg"]:
    plt.savefig(os.path.join(base_plot_folder, f"15.2.4_calls/15.2.4_mean_high_g.{format}"), dpi=300)

# 15.3 - Merge datasets

At this point, we have various sources of information about which dataset is wrong about which microRNA.  
Based on our fitting data, we infer that the following microRNAs are probably incorrect in the sequencing data:  
ngs_larger_in_multiple_keller_wrong  
micro_larger_in_multiple_keller_wrong

Based on our fitting data, we infer that the following microRNAs are probably incorrect in the microarray data:  
ngs_larger_in_multiple_alles_wrong  
micro_larger_in_multiple_alles_wrong

We additionally have information gained from our treatment of false positives and negatives in the cell line data:   
false_positives_and_negatives = [false_negatives_ngs, false_negatives_microarray, false_positives_ngs, false_positives_microarray]

Heuristically, miRNAs that are almost always very small in the sequencing data and large in the microarray data are probably false positives.  
Thus, if we don't have information otherwise, we use  
heuristic_micro_wrong   
to exclude microarray data.

Finally, all other miRNAs are merged by the geometric mean.

In [None]:
# miRNAs for which the sequencing (NGS) value is considered wrong.
# 1) tissue outliers of either direction with an "NGS wrong" call
sequencing_wrong = []
sequencing_wrong += ngs_larger_in_multiple_keller_wrong
sequencing_wrong += micro_larger_in_multiple_keller_wrong
print(len(sequencing_wrong))

# overlap of these calls with the cell-line false negatives / positives of the NGS data
print([mirna for mirna in false_negatives_ngs.index if mirna in sequencing_wrong])
print([mirna for mirna in false_positives_ngs.index if mirna in sequencing_wrong])

# 2) add the cell-line false negatives / positives of the NGS data
sequencing_wrong += list(false_negatives_ngs.index)
sequencing_wrong += list(false_positives_ngs.index)
# Drop duplicates while keeping first-seen order. list(set(...)) yields the
# same elements, but in an order that can change from one kernel run to the next.
sequencing_wrong = list(dict.fromkeys(sequencing_wrong))
print(len(sequencing_wrong))

In [None]:
# miRNAs for which the microarray value is considered wrong.
# 1) tissue outliers of either direction with a "microarray wrong" call
microarray_wrong = []
microarray_wrong += ngs_larger_in_multiple_alles_wrong
microarray_wrong += micro_larger_in_multiple_alles_wrong

# overlap of these calls with the cell-line false negatives / positives of the microarray data
print([mirna for mirna in false_negatives_microarray.index if mirna in microarray_wrong])
print([mirna for mirna in false_positives_microarray.index if mirna in microarray_wrong])

# 2) add the cell-line false negatives / positives of the microarray data
microarray_wrong += list(false_negatives_microarray.index)
microarray_wrong += list(false_positives_microarray.index)
print(len(microarray_wrong))
print(len(heuristic_micro_wrong))
# 3) add the heuristic calls, but only for miRNAs without any call so far.
# The lookup set is built once instead of concatenating both lists for every element.
already_called = set(sequencing_wrong) | set(microarray_wrong)
heuristic_micro_wrong_filter = [mirna for mirna in heuristic_micro_wrong if mirna not in already_called]
print(len(heuristic_micro_wrong_filter))
microarray_wrong += list(heuristic_micro_wrong_filter)
print(len(microarray_wrong))
# Drop duplicates while keeping first-seen order. list(set(...)) yields the
# same elements, but in an order that can change from one kernel run to the next.
microarray_wrong = list(dict.fromkeys(microarray_wrong))
print(len(microarray_wrong))

In [108]:
# Every miRNA for which one of the two platforms was called wrong; used below only for membership tests.
exclude_mirnas = [*sequencing_wrong, *microarray_wrong]

In [109]:
# Scatter of the tissue means (microarray on x, NGS on y), coloured by which platform was called wrong.
plt.figure(figsize=(2, 1.7))

# (miRNAs, legend label, colour, marker size, z-order) for each group, drawn in this order.
# NOTE(review): `other` is taken from an earlier cell - confirm it still matches the final wrong-lists.
call_groups = [
    (sequencing_wrong, 'NGS wrong', "tab:red", 10, 3),
    (microarray_wrong, 'microarray wrong', "tab:orange", 10, 3),
    (other, 'others', "tab:blue", 3, 2),
]
for group_mirnas, group_label, group_color, marker_size, layer in call_groups:
    plt.scatter(df_microarray_tissue_mean.loc[group_mirnas],
                df_ngs_tissue_mean.loc[group_mirnas],
                label=group_label, color=group_color, s=marker_size, edgecolors="none", zorder=layer)

# identity line
plt.plot([0, 5.5], [0, 5.5], color="black", linestyle="--", zorder=2)
plt.ylabel(r'mean(log$_{10}$(NGS exp.))')
plt.xlabel(r'mean(log$_{10}$(micro exp.))')
plt.xlim(1, 5)
plt.ylim(-0.5, 5)
plt.xticks(np.arange(1, 5, 1))
plt.yticks(np.arange(0, 5, 1))
plt.grid(True, zorder=1)
plt.tight_layout()
plt.legend(loc=[0,0.75])
for fmt in ["png", "svg"]:
    plt.savefig(os.path.join(base_plot_folder, f"15.2.4_calls/15.2.4_all_calls.{fmt}"), dpi=300)

In [110]:
# GET THE TISSUE DATASET
# Start again from the untouched tissue atlases.
df_ngs_tissue = df_ngs_tissue_orig.copy()
df_microarray_tissue = df_microarray_tissue_orig.copy()

# Sub-tables without any miRNA for which one of the platforms was called wrong.
keep_ngs = ~df_ngs_tissue.index.isin(exclude_mirnas)
keep_microarray = ~df_microarray_tissue.index.isin(exclude_mirnas)
df_ngs_tissue_exclude = df_ngs_tissue.loc[keep_ngs]
df_microarray_tissue_exclude = df_microarray_tissue.loc[keep_microarray]

# Renormalize each filtered table together with its full partner table.
# NOTE(review): presumably the partner is rescaled with the factors derived from the filtered table -
# confirm in library2_utils.mirna_levels.
df_ngs_tissue_exclude, df_ngs_tissue = normalize_expr_df_to_rpm_with_partner(
    df_ngs_tissue_exclude, df_ngs_tissue)
df_microarray_tissue_exclude, df_microarray_tissue = normalize_expr_df_to_rpm_with_partner(
    df_microarray_tissue_exclude, df_microarray_tissue)

In [111]:
# GET THE CELL LINE DATASET
# only miRNAs that are present in the tissue table are kept
allowed_mirnas = df_microarray_tissue.index

# reload both cell line datasets, convert to log10 and restrict them to the allowed miRNAs
df_alles = np.log10(pd.read_csv("../microrna_data/2_output/Alles2019_conormalized.csv", index_col=0))
df_alles = df_alles.loc[df_alles.index.isin(allowed_mirnas)]

df_keller = np.log10(pd.read_csv('../microrna_data/2_output/Keller2023_conormalized.csv', index_col=0))
df_keller = df_keller.loc[df_keller.index.isin(allowed_mirnas)]

# bring both tables onto the same miRNAs (rows) and cell lines (columns)
common_index = df_keller.index.intersection(df_alles.index)
common_columns = df_keller.columns.intersection(df_alles.columns)
df_alles = df_alles.loc[common_index, common_columns]
df_keller = df_keller.loc[common_index, common_columns]

# sub-tables without the miRNAs for which one of the platforms was called wrong
keep_alles = ~df_alles.index.isin(exclude_mirnas)
keep_keller = ~df_keller.index.isin(exclude_mirnas)
df_alles_exclude = df_alles.loc[keep_alles]
df_keller_exclude = df_keller.loc[keep_keller]

# renormalize each filtered table together with its full partner table
df_alles_exclude, df_alles = normalize_expr_df_to_rpm_with_partner(df_alles_exclude, df_alles)
df_keller_exclude, df_keller = normalize_expr_df_to_rpm_with_partner(df_keller_exclude, df_keller)

In [112]:
# One table per platform: tissue columns followed by cell line columns, cast to float.
sequencing_merge = pd.concat([df_ngs_tissue, df_keller], axis="columns").astype(float)
microarray_merge = pd.concat([df_microarray_tissue, df_alles], axis="columns").astype(float)

In [113]:
# Combine the two platforms per miRNA (row):
#   * miRNA listed in microarray_wrong       -> use the sequencing value
#   * else miRNA listed in sequencing_wrong  -> use the microarray value
#   * otherwise                              -> mean of the two values
#     (the tables are treated as log10 further down, see `10**total_merge`, so this is the
#      geometric mean of the linear values)
# This is a vectorised replacement for a per-cell Python loop; the precedence is unchanged
# (microarray_wrong is checked first).
use_sequencing_only = sequencing_merge.index.isin(microarray_wrong)
use_microarray_only = sequencing_merge.index.isin(sequencing_wrong) & ~use_sequencing_only

# The microarray table is only read for rows that are not sequencing-only. Fail loudly if one of
# those rows (or any column) is missing, instead of silently filling in NaN.
rows_needing_microarray = sequencing_merge.index[~use_sequencing_only]
missing_rows = rows_needing_microarray.difference(microarray_merge.index)
missing_columns = sequencing_merge.columns.difference(microarray_merge.columns)
if len(rows_needing_microarray) > 0 and (len(missing_rows) > 0 or len(missing_columns) > 0):
    raise KeyError(
        f"microarray_merge is missing rows {list(missing_rows)} and/or columns {list(missing_columns)}"
    )

sequencing_values = sequencing_merge.to_numpy(dtype=float)
microarray_values = microarray_merge.reindex(
    index=sequencing_merge.index, columns=sequencing_merge.columns
).to_numpy(dtype=float)

merged_values = np.where(
    use_sequencing_only[:, None],
    sequencing_values,
    np.where(
        use_microarray_only[:, None],
        microarray_values,
        (sequencing_values + microarray_values) / 2.0,
    ),
)
total_merge = pd.DataFrame(
    merged_values, index=sequencing_merge.index, columns=sequencing_merge.columns
).astype(float)

In [114]:
# renormalize total merge
total_merge = normalize_expr_df_to_rpm(total_merge)

# remove unwanted mirna
# The result has to be assigned back to total_merge: assigning it to a separate, unused variable
# leaves the miRNA in the merged dataset. errors="ignore" keeps this a no-op when it is absent.
total_merge = total_merge.drop("hsa-miR-3613-3p", axis=0, errors="ignore")

# merge identical mirnas
from library2_utils.crosstalk import merge_identical_mirnas

# keep a copy of the table as it was before identical miRNAs are merged
df_expression_orig = total_merge.copy()
total_merge, groups = merge_identical_mirnas(total_merge, mirbase)

## Crosstalk filtering

In [115]:
# filter crosstalk
from library2_utils.transfer_functions import transfer_function

input_folder_crosstalk = "../outputs/5_mutations"

# NOTE: pickle.load executes arbitrary code - only load files produced by this project's own notebooks.
# fitted transfer-function parameters and scale dictionary (fit without crosstalk)
with open("../outputs/3_fitting/combined_dataset/combined_dataset_popt_wo_crosstalk.pkl", "rb") as handle:
    popt = pickle.load(handle)
with open("../outputs/3_fitting/combined_dataset/combined_dataset_scale_dict_wo_crosstalk.pkl", "rb") as handle:
    scale_dict = pickle.load(handle)

# load full_crosstalk_dict
with open(f"{input_folder_crosstalk}/5.5_full_crosstalk_dict.pkl", "rb") as handle:
    full_crosstalk_dict = pickle.load(handle)

In [116]:
# Predicted log10 knockdown for every miRNA (row) and sample (column): total_merge is taken out of
# log10 space, passed column by column through transfer_function with the loaded popt, and the
# result is converted back to log10.
df_knockdown_predicted = np.log10((10**total_merge).apply(lambda x: transfer_function(x, *popt)))   

# Boolean table with the same shape as total_merge; True marks a miRNA/sample pair that is flagged
# for crosstalk. Everything starts out as False.
crosstalk_filter_df = pd.DataFrame(columns=total_merge.columns, index=total_merge.index)
crosstalk_filter_df.loc[:, :] = False

for key in full_crosstalk_dict.keys():
    # skip miRNAs that are not part of the merged dataset
    if not key in df_knockdown_predicted.index:
        continue
    
    # candidate crosstalk partners of `key`; the row for `key` itself is removed
    # NOTE(review): the no_*_impact columns are presumably mutation counts per impact class - confirm
    df = full_crosstalk_dict[key].copy()
    df = df[df.index != key]
    
    # check for total less than 5 mutations
    df = df[(df["no_total_impact"]) < 5]
    
    # check for high impact mutations
    df = df[df["no_high_impact"] < 2]
    
    # check for mid_impact mutations
    df = df[(df["no_mid_impact"]+df["no_high_impact"]) < 4]
    
    # check for all mutations
    df = df[(df["no_low_impact"]+df["no_mid_impact"]+df["no_high_impact"]) < 5]

    # keep only candidates that have a predicted knockdown, then attach their predicted knockdown
    # as one column per sample
    df = df[df.index.isin(df_knockdown_predicted.index)]
    
    if len(df) > 0:
        df.loc[:, total_merge.columns] = df_knockdown_predicted.loc[df.index, total_merge.columns]
    else:
        continue
    
    # per sample, the lowest (most negative) predicted log10 knockdown among the remaining candidates
    min_by_cell_line = df[total_merge.columns].min(axis=0)
    
    # this is the expected knockdown for the miRNA itself
    knockdown_orig = df_knockdown_predicted.loc[key, total_merge.columns]
    
    # flag a sample only if both conditions hold:
    #   1. on the linear scale, the candidate minimum is below one third of the miRNA's own predicted value
    #   2. the candidate minimum itself is below -0.5 in log10 (the threshold used in the code)
    crosstalk_filter_df.loc[key, :] = ((10**knockdown_orig)/3 > 10**min_by_cell_line) & (min_by_cell_line < -0.5)

In [117]:
# miRNAs flagged for crosstalk in at least one sample are removed from the merged dataset
flagged_in_any_sample = crosstalk_filter_df.any(axis=1)
filtered_mirnas = crosstalk_filter_df.loc[flagged_in_any_sample].index
total_merge = total_merge.drop(index=filtered_mirnas)

### We use somewhat fewer miRNAs than before - we should redo the fitting to make sure this doesn't alter the constants. Also, redoing the fitting will tell us if our merge improved the fit.
#### Load the MPRA data

In [119]:
data_dir_input = "../measured_data/2_normalized_log10"

# list everything in the input folder
reference_files = os.listdir(data_dir_input)

# load every CSV, keyed by the file name up to its first dot
reference_dict = {
    reference_file.split('.')[0]: pd.read_csv(os.path.join(data_dir_input, reference_file), index_col=0)
    for reference_file in reference_files
    if reference_file.endswith(".csv")
}

# work on copies of all tables whose key contains "single"
single_dfs = {name: table.copy() for name, table in reference_dict.items() if "single" in name}

for key in list(single_dfs):
    df = single_dfs[key].set_index("miRNA1")
    # keep only the columns whose name contains "3UTR"
    df = df.filter(regex='(3UTR)')

    # drop measurement columns whose cell line prefix is not a column of the merged expression table
    without_expression = [column for column in df.columns if column.split("_")[0] not in total_merge.columns]
    df = df.drop(columns=without_expression)

    # restrict to miRNAs that are present in the merged expression table
    df = df.loc[df.index.intersection(total_merge.index)]

    # attach the merged expression of every remaining cell line as "<cell line>_exp"
    cell_lines = [column.split("_")[0] for column in df.columns]
    for cell_line in cell_lines:
        df.loc[:, f"{cell_line}_exp"] = total_merge.loc[df.index, cell_line]

    # drop rows with any missing value
    df = df.dropna()

    single_dfs[key] = df

# high confidence miRNAs followed by the low confidence miRNAs that are in mirgenedb
measured_single = pd.concat(
    [single_dfs["1_mirna_full_single_high_conf"], single_dfs["2_mirna_full_single_low_conf_mirgenedb"]],
    axis=0,
)

# knockdown table: columns containing "_3UTR", renamed to the bare cell line name
df_knockdown = measured_single.filter(regex='_3UTR')
df_knockdown.columns = [col.split("_")[0] for col in df_knockdown.columns]

# expression table: columns containing "exp", renamed to the bare cell line name
df_expression = measured_single.filter(regex='exp')
df_expression.columns = [col.split("_")[0] for col in df_expression.columns]

In [120]:
# constrain cell_lines_measured to those in the data
cell_lines_measured = [cell_line for cell_line in cell_lines_measured if cell_line in df_knockdown.columns]

# Flatten all cell lines into one x (expression), one y (knockdown) and one dataset-index vector.
x_data = []
y_data = []
dataset_indices = []
for i, cell_line in enumerate(cell_lines_measured):
    expression_values = df_expression[cell_line].to_numpy(dtype=float)
    knockdown_values = df_knockdown[cell_line].to_numpy(dtype=float)
    # Drop a measurement only as a pair: removing NaNs from each column independently would shift
    # the x/y pairing (or produce different lengths) whenever the NaN patterns differ.
    is_complete_pair = ~(np.isnan(expression_values) | np.isnan(knockdown_values))
    x_data.append(expression_values[is_complete_pair])
    y_data.append(knockdown_values[is_complete_pair])
    dataset_indices.append(np.full(int(is_complete_pair.sum()), i))

x_data = np.concatenate(x_data)
y_data = np.concatenate(y_data)
dataset_indices = np.concatenate(dataset_indices)

In [None]:
def hill_func_log_scales(x_data, dataset_indices, c1=3, c2=5, *scales):
    """Hill-type transfer function with one additive log10 scale per dataset.

    Parameters
    ----------
    x_data : array-like
        log10 microRNA expression values of all datasets, concatenated.
    dataset_indices : array-like
        For every entry of ``x_data``, the index of the dataset it belongs to;
        entry ``k`` is shifted by ``scales[dataset_indices[k]]``.
    c1, c2 : float
        log10 of the two constants of the transfer function.
    *scales : float
        One log10 offset per dataset, added to the expression before the
        transfer function is evaluated.

    Returns
    -------
    numpy.ndarray
        ``log10((1 / (1 + x / 10**c1)) * (1 + x / 10**c2))`` with
        ``x = 10**(x_data + scale)``, in the same order as ``x_data``.

    Raises
    ------
    ValueError
        If an entry of ``dataset_indices`` has no matching scale.

    Notes
    -----
    The result keeps the order of ``x_data``. For inputs that are grouped by
    ascending dataset index this equals concatenating the per-dataset results;
    for any other ordering it keeps predictions aligned with the measurements.
    """
    x_data = np.asarray(x_data, dtype=float)
    dataset_indices = np.asarray(dataset_indices)
    c1 = 10**c1
    c2 = 10**c2

    # per-point offset, looked up through the dataset index of each point
    offsets = np.zeros(x_data.shape, dtype=float)
    has_scale = np.zeros(x_data.shape, dtype=bool)
    for i, scale in enumerate(scales):
        mask = (dataset_indices == i)
        offsets[mask] = scale
        has_scale |= mask

    # points without a scale would otherwise be dropped silently and shift the output
    if not has_scale.all():
        raise ValueError("dataset_indices contains entries without a matching scale")

    x = 10**(x_data + offsets)
    return np.log10((1 / (1 + x / c1)) * (1 + x / c2))

# initial guesses and bounds for the two non-scale parameters (c1, c2 of hill_func_log_scales);
# the second one is boxed into [9.99, 10.01]
p0 = [3, 10]
num_params = len(p0)
bounds = ([1, 9.99], [10, 10.01])

# one scale per measured cell line: start at 0 and allow [-2, 2]
# (a much tighter +-0.001 box for all scales was used as an alternative)
n_datasets = len(cell_lines_measured)
scale_guesses = [0] * n_datasets
scale_bounds_min = [-2] * n_datasets
scale_bounds_max = [2] * n_datasets

# pin the scale of the first cell line (HEK293T in the original list) to ~0
scale_bounds_min[0] = -0.001
scale_bounds_max[0] = 0.001

# full parameter vector: the Hill constants followed by the scales
p0_scale = p0 + scale_guesses
lower_bounds = bounds[0] + scale_bounds_min
upper_bounds = bounds[1] + scale_bounds_max
bounds_scale = (lower_bounds, upper_bounds)

popt_scales_filter, pcov = opt.curve_fit(
    lambda x, *params: hill_func_log_scales(x, dataset_indices, *params),
    x_data,
    y_data,
    p0=p0_scale,
    bounds=bounds_scale,
    maxfev=5000
)
# both names refer to the same fitted parameter vector
popt_scales = popt_scales_filter

# split the fitted vector back into Hill constants and per cell line scales
hill_params = popt_scales[:num_params]
scales = list(popt_scales[num_params:])
print(hill_params)

In [122]:
%%capture output
x_range_log = np.linspace(0, 5.5, 1000)
fig = plt.figure(figsize=(2.5, 1.7))
r2_dict = {}
for i, cell_line in enumerate(cell_lines_measured):
    current_scale = scales[i]
    
    df_ex = df_expression[f"{cell_line}"].dropna()+current_scale
    df_knock = df_knockdown[f"{cell_line}"].dropna()
    
    plt.scatter(df_ex,
        df_knock,
        color="tab:blue",
        s=3,
        edgecolors="none",
        rasterized=True,
        zorder = 1)
    
    r2 = stats.pearsonr(df_knock,
                        np.log10(transfer_function(10**df_ex,
                        *hill_params)))[0]**2
    r2_dict[cell_line] = r2
    
    if i == 0:
        plt.plot(x_range_log, np.log10(transfer_function(10**x_range_log,
                *hill_params)), color="black", linewidth=1, ls="--", zorder=2)
    
plt.xlabel(r"log$_{10}$"+f"(miRNA expression)")
plt.ylabel(r"log$_{10}$(stability)")

plt.xlim(0, 5.5)
plt.ylim(-1.7, 0.25)
plt.tight_layout()
plt.legend(loc="lower left", frameon=False, fontsize=6.5)
for format in ["png", "svg"]:
    plt.savefig(f"{base_plot_folder}/15.3-merged-fit-with-scaling.{format}", dpi=600, bbox_inches='tight')

In [123]:
# One-column heatmap of the per cell line r^2 values collected in r2_dict.
correlation_dataframe = pd.DataFrame.from_dict(r2_dict, orient="index")
fig, ax = plt.subplots(figsize=(0.3, 1.65))

colorbar_options = {'label': r'$r^2$', 'shrink': 1.4, 'aspect': 25, 'pad': 0.02, 'ticks': [0.2, 0.4, 0.6, 0.8]}
sns.heatmap(
    correlation_dataframe.astype('float'),
    ax=ax,
    cmap="viridis",
    vmin=0.2,
    vmax=0.8,
    annot=True,
    fmt=".2f",
    annot_kws={"size": 6},
    cbar_kws=colorbar_options,
)

# no tick labels on either axis
plt.xticks([])
plt.yticks([])
for fmt in ["png", "svg"]:
    plt.savefig(f"{base_plot_folder}/15.3-merged-r2-with-scaling.{fmt}", dpi=600, bbox_inches='tight')

In [None]:
# fitted scale of every measured cell line, keyed by cell line name
tissue_scales = {tissue: scales[i] for i, tissue in enumerate(cell_lines_measured)}
tissue_scales

In [125]:
# persist the fitted Hill parameters and the per cell line scales
for file_name, payload in (("tissue_data_popt.pkl", hill_params), ("tissue_data_scales.pkl", tissue_scales)):
    with open(os.path.join(dataset_processed_folder, file_name), "wb") as handle:
        pickle.dump(payload, handle)

In [126]:
# column groups in the order in which they should appear in the exported table
organ_dict = {
    'cell_lines': ["HEK293T", "HeLa", "SKNSH", "MCF7", "HUH7", "A549", "HaCaT", "JEG3", "Tera1", "PC3"],
    'brain': ['brain', 'thalamus', 'white_matter', 'grey_matter', 'nucleus_caudatus', 'cerebellum', 'spinal_cord', 'dura_mater'],
    'nervous': ['nerve'],
    'circulatory': ['vein', 'artery'],
    'musculoskeletal': ['bone', 'muscle'],
    'digestive': ['esophagus', 'stomach', 'duodenum', 'jejunum', 'colon', 'liver', 'pancreas'],
    'respiratory': ['pleurae', 'lung'],
    'endocrine': ['thyroid', 'pituitary_gland', 'adrenal_gland'],
    'lymphatic': ['lymph_node', 'spleen'],
    'urinary': ['kidney'],
    'reproductive': ['testis', 'prostate'],
    'integumentary': ['skin', 'adipocyte'],
}

# flatten the groups into a single column order
ordered_columns = [column for members in organ_dict.values() for column in members]

# reorder the columns; columns that are not listed above are dropped by this selection
total_merge = total_merge[ordered_columns]

In [127]:
# write the merged, filtered and reordered dataset to disk
merged_csv_path = os.path.join(dataset_processed_folder, "15.3_merged_tissue_datasets.csv")
total_merge.to_csv(merged_csv_path)