In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import scipy.optimize as opt
import itertools
import random
import os

# Cell-line name lists. `cell_lines` (main followed by other) is what later
# cells use to select the expression columns.
cell_lines_main = [
    "HEK293T",
    "HeLa",
    "SKNSH",
    "MCF7",
    "HUH-7",
    "A549",
]
cell_lines_other = [
    "HaCaT",
    "JEG-3",
    "Tera-1",
    "PC-3",
]
cell_lines_measured = [
    "HEK293T",
    "HeLa",
    "SKNSH",
    "MCF7",
]
cell_lines = [*cell_lines_main, *cell_lines_other]

# NOTE: in the part of the notebook visible here, this value is reassigned in
# the plotting cell before it is ever read.
plot_folder = "plots/2_individual_miRNAs"

Here, we pick the miRNAs that we will measure individually in the main context sequence. We split miRNAs into three categories:

1) High confidence in miRBase
2) Low confidence in miRBase, in MirGeneDB
3) Low confidence in miRBase, not in MirGeneDB

In [5]:
# Load the miRBase and MirGeneDB annotation tables (first CSV column is the index).
mirbase = pd.read_csv('../input_data/mirbase.csv', index_col=0)
mirgenedb = pd.read_csv('../input_data/mirgenedb.csv', index_col=0)

# Collect the unique 5p and 3p accessions listed in MirGeneDB, skipping the
# literal "None" placeholder.
mirgenedb_accessions = [
    accession
    for arm_column in ("5p accession", "3p accession")
    for accession in mirgenedb[arm_column].unique()
    if accession != "None"
]

# Keep the miRBase rows whose "MIMAT" value is one of those accessions.
mirgenedb_mirnas = mirbase[mirbase["MIMAT"].isin(mirgenedb_accessions)]

In [7]:
def normalize_expr_df_to_rpm_with_partner(df1, df2):
    """Scale two expression tables to reads-per-million using df1's column totals.

    Both frames are divided by the per-column sums of ``df1`` (the reference
    set), so ``df2`` is expressed relative to ``df1`` rather than to itself.
    The smallest value of the scaled ``df1`` is then subtracted from both
    frames and 1 is added, and finally both are rescaled so that each column
    of ``df1`` again sums to one million.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Reference table; its column sums define the normalisation.
    df2 : pandas.DataFrame
        Partner table with the same columns, normalised with df1's factors.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df1_normalised, df2_normalised)``.
    """
    per_million = 1000000

    # first pass: scale by the reference column totals
    reference_totals = df1.sum(axis=0)
    partner_rpm = df2.div(reference_totals, axis=1) * per_million
    reference_rpm = df1.div(reference_totals, axis=1) * per_million

    # shift so that the reference minimum becomes exactly 1
    reference_floor = reference_rpm.min().min()
    partner_shifted = partner_rpm - reference_floor + 1
    reference_shifted = reference_rpm - reference_floor + 1

    # second pass: rescale by the column totals of the shifted reference
    shifted_totals = reference_shifted.sum(axis=0)
    partner_final = partner_shifted.div(shifted_totals, axis=1) * per_million
    reference_final = reference_shifted.div(shifted_totals, axis=1) * per_million
    return reference_final, partner_final

In [8]:
# Co-normalize the expression data
df_alles = pd.read_csv('../input_data/mirna_expression_data/1_input/Alles2019_quantile_all_mirnas.csv', index_col=0)

# drop Accession from df_alles
df_alles = df_alles.drop(columns=['Accession'])

# Attach the miRBase confidence label (aligned on the index) only to select
# the high-confidence rows; rows without a miRBase entry get NaN and therefore
# do not count as "high".
df_alles["confidence"] = mirbase["confidence"]
df_alles_high_conf = df_alles[df_alles["confidence"] == "high"]

df_alles = df_alles.drop(columns=['confidence'])
df_alles_high_conf = df_alles_high_conf.drop(columns=['confidence'])

# Normalise both tables with factors derived from the high-confidence set only.
df_alles_high_conf, df_alles = normalize_expr_df_to_rpm_with_partner(df_alles_high_conf, df_alles)

# Create the output folder if it does not exist. `exist_ok=True` replaces the
# separate existence check and matches how the plot folder is created later on.
# NOTE(review): the input above is read from "mirna_expression_data" while the
# output goes to "miRNA_expression_data"; on a case-sensitive filesystem these
# are two different directories - confirm which spelling is intended.
conormalized_dir = '../input_data/miRNA_expression_data/2_output'
os.makedirs(conormalized_dir, exist_ok=True)
df_alles.to_csv(f'{conormalized_dir}/2_Alles2019_conormalized.csv')

# get low and high confidence miRNAs (from here on the tables hold log10 values)
df_alles = np.log10(df_alles)
df_alles_high_conf = np.log10(df_alles_high_conf)
df_alles_low_conf = df_alles[~df_alles.index.isin(df_alles_high_conf.index)]

In [9]:
# Partition the low-confidence miRNAs by whether their index label also
# appears in the MirGeneDB-matched miRBase table.
is_in_mirgenedb = df_alles_low_conf.index.isin(mirgenedb_mirnas.index)
df_alles_low_conf_mirgenedb = df_alles_low_conf[is_in_mirgenedb]
df_alles_low_conf_not_mirgenedb = df_alles_low_conf[~is_in_mirgenedb]

In [10]:
# Restrict every expression table to the columns named in `cell_lines`,
# in that order.
(
    df_alles,
    df_alles_high_conf,
    df_alles_low_conf_mirgenedb,
    df_alles_low_conf_not_mirgenedb,
) = (
    expression_table[cell_lines]
    for expression_table in (
        df_alles,
        df_alles_high_conf,
        df_alles_low_conf_mirgenedb,
        df_alles_low_conf_not_mirgenedb,
    )
)

In [None]:
# Report the row count of each table built in the previous steps, one per line:
# miRBase, all miRNAs, high confidence, low confidence in MirGeneDB,
# low confidence not in MirGeneDB.
tables_to_count = (
    mirbase,
    df_alles,
    df_alles_high_conf,
    df_alles_low_conf_mirgenedb,
    df_alles_low_conf_not_mirgenedb,
)
for counted_table in tables_to_count:
    print(len(counted_table))

## Pick low and high confidence miRNAs to test

We use all high confidence miRNAs.

In [12]:
# Build the design table for all high-confidence miRNAs.
mirnas_alles_high_conf = df_alles_high_conf.copy()

# Put the miRNA name (currently the index) in a leading "miRNA1" column.
mirnas_alles_high_conf.insert(0, "miRNA1", mirnas_alles_high_conf.index)

# Replace the index with sequential design identifiers named "design_id".
n_high_conf_designs = len(mirnas_alles_high_conf)
mirnas_alles_high_conf.index = pd.Index(
    [f"1_mirna_full_single_high_conf_{i+1}" for i in range(n_high_conf_designs)],
    name="design_id",
)
mirnas_alles_high_conf.to_csv('../designs/1_mirna_full_single_high_conf.csv', index=True)

We use all low-confidence miRNAs that are in MirGeneDB.

In [13]:
# Build the design table for all low-confidence miRNAs that are in MirGeneDB.
mirnas_alles_low_conf_mirgenedb = df_alles_low_conf_mirgenedb.copy()

# Put the miRNA name (currently the index) in a leading "miRNA1" column.
mirnas_alles_low_conf_mirgenedb.insert(0, "miRNA1", mirnas_alles_low_conf_mirgenedb.index)

# Replace the index with sequential design identifiers named "design_id".
n_low_conf_mirgenedb_designs = len(mirnas_alles_low_conf_mirgenedb)
mirnas_alles_low_conf_mirgenedb.index = pd.Index(
    [f"2_mirna_full_single_low_conf_mirgenedb_{i+1}" for i in range(n_low_conf_mirgenedb_designs)],
    name="design_id",
)
mirnas_alles_low_conf_mirgenedb.to_csv('../designs/2_mirna_full_single_low_conf_mirgenedb.csv', index=True)

## Sample from the low-confidence miRNAs not in MirGeneDB

In [16]:
# Exclude miRNAs whose target sequence contains an unwanted motif.
# BsaI restriction sites
restriction_sites = ["GAGACC", "GGTCTC"]
polyA_signals = ["AATAAA", "ATTAAA", "AGTAAA", "TATAAA", "ACTAAA"]
filter_motifs = restriction_sites + polyA_signals

# Table holding the target sequences that will be inserted.
mirbase_sequences = pd.read_csv('../input_data/mirbase_with_families_and_targets.csv', index_col=0)


def _contains_filter_motif(target_sequence):
    """Return True if any motif in `filter_motifs` is a substring of the sequence."""
    for unwanted_motif in filter_motifs:
        if unwanted_motif in target_sequence:
            return True
    return False


# Keep only the rows whose "target" sequence is free of all filter motifs.
target_has_motif = mirbase_sequences['target'].apply(_contains_filter_motif)
mirbase_sequences_filter = mirbase_sequences[~target_has_motif]

# Restrict the low-confidence, non-MirGeneDB table to the miRNAs that passed.
passed_motif_filter = df_alles_low_conf_not_mirgenedb.index.isin(mirbase_sequences_filter.index)
df_alles_low_conf_not_mirgenedb = df_alles_low_conf_not_mirgenedb[passed_motif_filter]

In [None]:
# Bin the miRNAs by their maximum log10 expression across the selected cell lines.
# Lower bounds are inclusive so that a miRNA whose maximum is exactly 2, 3 or 4
# lands in a bin instead of being silently dropped.
max_log_expression = df_alles_low_conf_not_mirgenedb.max(axis=1)
less_than_two = df_alles_low_conf_not_mirgenedb[max_log_expression < 2]
less_than_three = df_alles_low_conf_not_mirgenedb[(max_log_expression >= 2)
                                    & (max_log_expression < 3)]
less_than_four = df_alles_low_conf_not_mirgenedb[(max_log_expression >= 3)
                                   & (max_log_expression < 4)]
more_than_four = df_alles_low_conf_not_mirgenedb[max_log_expression >= 4]

# sample from the first two dataframes
# we use all mirnas from the second two dataframes
# A fixed seed makes the selection (and the design file written from it)
# reproducible across kernel restarts.
sampling_seed = 0
less_than_two = less_than_two.sample(n=76, random_state=sampling_seed)
less_than_three = less_than_three.sample(n=100, random_state=sampling_seed)

low_confidence_choice = pd.concat([less_than_two, less_than_three, less_than_four, more_than_four])
len(low_confidence_choice)

In [18]:
# Write the selected low-confidence miRNAs out as a design table.
# Copy the current index into a "miRNA1" column ...
low_confidence_choice = low_confidence_choice.assign(miRNA1=low_confidence_choice.index)

# ... and move that column to the front.
remaining_columns = [col for col in low_confidence_choice.columns if col != "miRNA1"]
low_confidence_choice = low_confidence_choice[["miRNA1", *remaining_columns]]

# Replace the index with sequential design identifiers named "design_id".
n_low_conf_designs = len(low_confidence_choice)
low_confidence_choice.index = pd.Index(
    [f"3_mirna_full_single_low_conf_not_mirgenedb_{i+1}" for i in range(n_low_conf_designs)],
    name="design_id",
)
low_confidence_choice.to_csv('../designs/3_mirna_full_single_low_conf_not_mirgenedb.csv', index=True)

## Plot the expected distributions of stabilities

In [19]:
def hill_func_log(x, c1=3.6, c2=10, n=1):
    """Evaluate a Hill-type response on log10 input and return log10 output.

    Parameters
    ----------
    x : float or array-like
        log10 microRNA expression.
    c1 : float
        log10 constant of the repression term ``1 / (1 + x / c1)``.
    c2 : float
        log10 constant of the ``1 + x / c2`` term.
    n : float
        Exponent applied to the linear-scale ``x`` and ``c1``.

    Returns
    -------
    float or array-like
        log10 of the response; the response itself approaches 1 for very small
        linear-scale ``x``.

    NOTE(review): ``c2`` is not raised to ``n``, unlike ``x`` and ``c1`` -
    confirm this is intended.
    """
    # convert the log10 inputs back to linear scale
    expression_lin = (10**x)**n
    repression_const = (10**c1)**n
    second_const = 10**c2

    repression_term = 1 / (1 + expression_lin / repression_const)
    second_term = 1 + expression_lin / second_const
    return np.log10(repression_term * second_term)

In [20]:
# Apply the Hill model column by column to the log10 expression of the
# selected miRNAs, indexed by miRNA name.
selected_expression = low_confidence_choice.set_index("miRNA1")
df_knockdown = selected_expression.apply(hill_func_log)

In [22]:
%%capture output
sample = "low_conf_not_mirgenedb"
plot_folder = f"../plots/2_pick_single_mirnas/{sample}/"
# create the output folder if it does not exist
os.makedirs(plot_folder, exist_ok=True)

for cell_line in cell_lines:
    plt.figure(figsize=(4,3))
    plt.title(f"{cell_line}, {sample}")
    plt.scatter(low_confidence_choice.set_index("miRNA1")[cell_line], df_knockdown[cell_line], s=5, alpha=1)
    plt.ylim(-1.35, 0.1)
    plt.xlim(0, 5.5)
    plt.xlabel("log10(expression)")
    plt.ylabel("log10(stability)")
    plt.tight_layout()
    plt.savefig(plot_folder + f"{cell_line}_{sample}.png", dpi=300)