In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
import random
import os
from lib.transfer_functions import transfer_function
from lib.additive_model import add_mirna_combs, max_mirna_combs, add_mirna_expression, max_mirna_expression

# Main panel of cell lines, listed first in every combined table.
cell_lines_main = [
    "HEK293T",
    "HeLa",
    "SKNSH",
    "MCF7",
    "HUH-7",
    "A549",
]
# Additional cell lines appended after the main panel.
cell_lines_other = [
    "HaCaT",
    "JEG-3",
    "Tera-1",
    "PC-3",
]
# The measured cell lines are exactly the first four entries of the main panel.
cell_lines_measured = cell_lines_main[:4]
# All cell lines: main panel followed by the additional ones.
cell_lines = [*cell_lines_main, *cell_lines_other]

# Folder that receives the figures written by this notebook.
plot_folder = "../plots/7_miRNA_combinations/"
# Create the folder (including missing parents). exist_ok=True turns the call into a
# no-op when the folder is already there and avoids the race between a separate
# os.path.exists() check and the creation.
os.makedirs(plot_folder, exist_ok=True)

### Here, we pick combinations of different full and mutated targets.

# 7.1 - Full target combinations

### Set miRNA expression buckets

In [2]:
# Load the filtered miRNA expression table (first CSV column becomes the index) and
# keep only the columns of the cell lines used here, in the order of `cell_lines`.
# NOTE(review): values are presumably log10 expression, since later cells apply 10** — confirm.
mirna_data_filter = pd.read_csv(
    "../input_data/miRNA_expression_data/1_output/1.10_alles_quantile_crosstalk_filter.csv",
    index_col=0,
).loc[:, cell_lines]

# Load the miRBase annotation table; its "target" column is used for motif filtering.
mirbase_df = pd.read_csv(
    "../input_data/mirbase_with_families_and_targets.csv",
    index_col=0,
)

In [3]:
# filter out mirnas that contain unwanted sequence motifs
# restriction sites for BsaI (the two entries are reverse complements of each other)
restriction_sites = ["GAGACC", "GGTCTC"]
polyA_signals = ["AATAAA", "ATTAAA", "AGTAAA", "TATAAA", "ACTAAA"]
filter_motifs = restriction_sites + polyA_signals

# Flag every miRNA whose target sequence contains any of the motifs (restriction sites
# or polyA signals). The motifs are joined into one regular-expression alternation.
# na=False treats a missing target as "no motif", which is what the former
# `== True` comparison did implicitly.
motif_pattern = "|".join(filter_motifs)
has_motif = mirbase_df["target"].str.contains(motif_pattern, na=False)
forbidden = mirbase_df[has_motif].index

# Drop the flagged miRNAs from the expression table.
mirna_data_filter = mirna_data_filter[~mirna_data_filter.index.isin(forbidden)]

In [4]:
# Apply the transfer function column by column to the exponentiated expression table.
stability = mirna_data_filter.rpow(10).apply(transfer_function)

# Copy of the expression table with one extra column: the row-wise mean over all
# cell-line columns.
mirna_data = mirna_data_filter.assign(mean=mirna_data_filter.mean(axis=1))

In [None]:
# Half-open intervals [lower, upper) on the "mean" column that define the buckets.
buckets = [(2.5,3),(3,3.5),(3.5,6)]

# One sub-table of mirna_data per interval.
data_buckets = [
    mirna_data[(mirna_data["mean"] >= lower) & (mirna_data["mean"] < upper)]
    for lower, upper in buckets
]

# The miRNA names (index labels) of each bucket as plain lists.
data_bucket_mirnas = [list(frame.index.values) for frame in data_buckets]

# print the number of miRNAs in each bucket
for frame in data_buckets:
    print(len(frame))

## Create constructs with 2 miRNA targets

In [6]:
def get_sorted_mirnas(row, n):
    """Return the entries ``miRNA1`` .. ``miRNA{n}`` of ``row`` as a sorted tuple.

    ``row`` can be anything that supports lookup by these keys (e.g. a DataFrame row
    or a dict). The sorted tuple serves as an order-independent key for a combination.
    """
    names = [row[f"miRNA{position}"] for position in range(1, n + 1)]
    names.sort()
    return tuple(names)

def get_random_mirna():
    """Draw one miRNA name: first a random bucket, then a random member of that bucket.

    Reads the module-level ``data_bucket_mirnas``. Because the bucket is chosen first,
    every bucket is equally likely regardless of how many miRNAs it holds.
    """
    return random.choice(random.choice(data_bucket_mirnas))

def get_random_mirna_comb(n_mirnas, n_combinations):
    """Return ``n_combinations`` sorted tuples of ``n_mirnas`` randomly drawn miRNAs.

    Each member is drawn independently with ``get_random_mirna``, so a miRNA can occur
    more than once within a tuple and the same tuple can be produced more than once.
    """
    return [
        tuple(sorted(get_random_mirna() for _ in range(n_mirnas)))
        for _ in range(n_combinations)
    ]

def get_random_mirna_df(n_mirnas, n_combinations, index_start=0, label_start=0):
    """Build a table of random miRNA combinations labelled with category "random".

    Columns are ``miRNA1`` .. ``miRNA{n_mirnas}``, ``category`` and ``sorted_mirnas``.
    Row labels are ``{label_start}_miRNA_full_combination_probe_{k}`` with ``k``
    counting up from ``index_start + 1``.
    """
    mirna_columns = [f"miRNA{k+1}" for k in range(n_mirnas)]
    table = pd.DataFrame(get_random_mirna_comb(n_mirnas, n_combinations), columns=mirna_columns)
    table["category"] = "random"
    # order-independent key of each combination
    table["sorted_mirnas"] = table.apply(lambda row: get_sorted_mirnas(row, n_mirnas), axis=1)
    table = table[mirna_columns + ["category", "sorted_mirnas"]]
    table.index = [
        f"{label_start}_miRNA_full_combination_probe_{index_start+k+1}"
        for k in range(n_combinations)
    ]
    return table

def save_flattened_df(AND_df, n_mirnas, label_start, mut="full"):
    """Concatenate combination tables, drop duplicates, relabel, and save as CSV.

    Parameters
    ----------
    AND_df : list of DataFrame
        Tables to stack row-wise; each needs a ``sorted_mirnas`` column.
    n_mirnas : int
        Number of miRNAs per combination; only used in the row labels and file name.
    label_start : int or str
        Prefix of the row labels and of the file name.
    mut : str, default "full"
        Tag inserted into the row labels and the file name.

    Returns
    -------
    DataFrame
        The stacked table without repeated ``sorted_mirnas`` values (first occurrence
        kept), indexed ``{label_start}_miRNA_{mut}_combination_probe_x{n_mirnas}_{k}``
        with ``k`` starting at 1. The same table is written to
        ``../designs/{label_start}_miRNA_{mut}_combination_probe_x{n_mirnas}.csv``.
    """
    AND_df_flat = pd.concat(AND_df, axis=0)
    # are there duplicates? (computed once, used for both reporting and filtering)
    is_duplicate = AND_df_flat.duplicated(subset="sorted_mirnas")
    print(AND_df_flat[is_duplicate])
    # if so, keep only the first; copy so the relabelling acts on an independent frame
    AND_df_flat = AND_df_flat[~is_duplicate].copy()
    # reindex
    design_name = f"{label_start}_miRNA_{mut}_combination_probe_x{n_mirnas}"
    AND_df_flat.index = [f"{design_name}_{i+1}" for i in range(len(AND_df_flat))]
    # save to csv; create the output folder first so a fresh checkout does not fail here
    output_path = f"../designs/{design_name}.csv"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    AND_df_flat.to_csv(output_path)
    # return the df
    return AND_df_flat

In [None]:
# Number of same-bucket pairs to sample for the low, mid and high bucket.
no_combinations = [30, 30, 20]

# get combinations from the same category
AND2 = []
categories = ["low", "mid", "high"]
for category, mirna_bucket, n_pairs in zip(categories, data_bucket_mirnas, no_combinations):
    # enumerate every unordered pair within the bucket, then sample without replacement
    all_pairs = list(itertools.combinations(mirna_bucket, 2))
    sampled_pairs = random.sample(all_pairs, n_pairs)
    pair_df = pd.DataFrame(sampled_pairs, columns=["miRNA1", "miRNA2"])
    pair_df["category"] = category
    pair_df["sorted_mirnas"] = pair_df.apply(lambda row: get_sorted_mirnas(row, 2), axis=1)
    AND2.append(pair_df[["miRNA1", "miRNA2", "category", "sorted_mirnas"]])

# Random pairs drawn across buckets are appended as a fourth table.
no_rand_comb = 20
index_start = sum(no_combinations)
label_start = 11
rand_comb_2 = get_random_mirna_df(2, no_rand_comb, index_start, label_start)
AND2.append(rand_comb_2)

# Stack, de-duplicate, relabel and write the 2-target designs.
AND2_flat = save_flattened_df(AND2, 2, label_start)

## Create combinations of 3 to 6 targets

In [8]:
def get_mirna_buckets(n_mirnas, basis_df, buckets):
    """Extend each per-bucket combination table by one randomly drawn miRNA.

    For bucket number ``k``, a copy of ``basis_df[k]`` gets a new column
    ``miRNA{n_mirnas}`` whose entries are drawn independently from ``buckets[k]``.
    ``category`` is set from the module-level ``categories`` list and
    ``sorted_mirnas`` is recomputed over all ``n_mirnas`` columns.

    Only the first ``len(buckets)`` entries of ``basis_df`` are used; the input tables
    are not modified. Returns the list of extended tables.
    """
    new_column = f"miRNA{n_mirnas}"
    ordered_columns = [f"miRNA{k+1}" for k in range(n_mirnas)] + ["category", "sorted_mirnas"]
    extended = []
    for position, candidates in enumerate(buckets):
        table = basis_df[position].copy()
        # one independent draw per existing combination
        table[new_column] = [random.choice(candidates) for _ in range(len(table))]
        table["category"] = categories[position]
        table["sorted_mirnas"] = table.apply(lambda row: get_sorted_mirnas(row, n_mirnas), axis=1)
        extended.append(table[ordered_columns])
    return extended

In [None]:
label_start, n_mirnas = 12, 3
# Extend the three per-bucket pair tables by a third miRNA. The random table at the
# end of AND2 is not extended because get_mirna_buckets only walks over the buckets.
AND3 = get_mirna_buckets(n_mirnas=n_mirnas, basis_df=AND2, buckets=data_bucket_mirnas)
# A fresh set of random combinations takes the place of the random table.
rand_comb_3 = get_random_mirna_df(
    n_mirnas=n_mirnas,
    n_combinations=no_rand_comb,
    index_start=index_start,
    label_start=label_start,
)
AND3 += [rand_comb_3]
AND3_flat = save_flattened_df(AND_df=AND3, n_mirnas=n_mirnas, label_start=label_start)

In [None]:
label_start, n_mirnas = 13, 4
# Extend the three per-bucket triple tables by a fourth miRNA. The random table at
# the end of AND3 is not extended because get_mirna_buckets only walks over the buckets.
AND4 = get_mirna_buckets(n_mirnas=n_mirnas, basis_df=AND3, buckets=data_bucket_mirnas)
# A fresh set of random combinations takes the place of the random table.
rand_comb_4 = get_random_mirna_df(
    n_mirnas=n_mirnas,
    n_combinations=no_rand_comb,
    index_start=index_start,
    label_start=label_start,
)
AND4 += [rand_comb_4]
AND4_flat = save_flattened_df(AND_df=AND4, n_mirnas=n_mirnas, label_start=label_start)

In [None]:
label_start, n_mirnas = 14, 5
# Extend the three per-bucket 4-target tables by a fifth miRNA. The random table at
# the end of AND4 is not extended because get_mirna_buckets only walks over the buckets.
AND5 = get_mirna_buckets(n_mirnas=n_mirnas, basis_df=AND4, buckets=data_bucket_mirnas)
# A fresh set of random combinations takes the place of the random table.
rand_comb_5 = get_random_mirna_df(
    n_mirnas=n_mirnas,
    n_combinations=no_rand_comb,
    index_start=index_start,
    label_start=label_start,
)
AND5 += [rand_comb_5]
AND5_flat = save_flattened_df(AND_df=AND5, n_mirnas=n_mirnas, label_start=label_start)

In [None]:
label_start, n_mirnas = 15, 6
# Extend the three per-bucket 5-target tables by a sixth miRNA. The random table at
# the end of AND5 is not extended because get_mirna_buckets only walks over the buckets.
AND6 = get_mirna_buckets(n_mirnas=n_mirnas, basis_df=AND5, buckets=data_bucket_mirnas)
# A fresh set of random combinations takes the place of the random table.
rand_comb_6 = get_random_mirna_df(
    n_mirnas=n_mirnas,
    n_combinations=no_rand_comb,
    index_start=index_start,
    label_start=label_start,
)
AND6 += [rand_comb_6]
AND6_flat = save_flattened_df(AND_df=AND6, n_mirnas=n_mirnas, label_start=label_start)

#### Add some of the previous designs (never ended up doing anything with these)

In [14]:
# Load the earlier 5-target designs (first CSV column becomes the index).
previous_designs = pd.read_csv("../input_data/measurements_lib1/7_full_AND5_context1.csv", index_col=0)
# Keep every second row and only the five miRNA columns.
# NOTE(review): presumably each design occupies two consecutive rows in the file — confirm.
# .copy() makes the result an independent frame, so the column assignments below do
# not trigger pandas' SettingWithCopyWarning.
previous_designs = previous_designs.iloc[::2].loc[:, ["miRNA1", "miRNA2", "miRNA3", "miRNA4", "miRNA5"]].copy()
previous_designs["sorted_mirnas"] = previous_designs.apply(lambda row: get_sorted_mirnas(row, 5), axis=1)
previous_designs["category"] = "previous"
# New label: fixed prefix plus the old label without its first two and its last
# underscore-separated fields.
previous_designs.index = [
    "16_miRNA_previous_" + "_".join(old_label.split("_")[2:-1])
    for old_label in previous_designs.index
]

In [15]:
# Write the relabelled previous designs to the same folder as the other design tables.
previous_designs.to_csv("../designs/16_miRNA_previous_AND5.csv")

## Explore expected stability patterns
Here, we want to make sure that we can distinguish the combined effect of multiple miRNAs from the effect of the strongest miRNA alone.

In [16]:
# Design table whose combinations are evaluated below.
used_df = AND5_flat

# Per-design expression under the two models, computed from the exponentiated table.
# NOTE(review): judging by the helper names, the first sums the member miRNAs and the
# second takes their maximum — confirm in lib.additive_model.
AND_expression = add_mirna_expression(10**mirna_data_filter, used_df)
AND_max = max_mirna_expression(10**mirna_data_filter, used_df)

# Pass both through the transfer function, column by column.
AND_knockdown = AND_expression.apply(transfer_function).astype(float)
AND_max_knockdown = AND_max.apply(transfer_function).astype(float)

# ratio add vs max
AND_expression_ratio = AND_expression.div(AND_max)

# One scatter plot per cell line: additive model (x) against max model (y), log10 scale.
for cell_line in cell_lines:
    additive_log = np.log10(AND_knockdown.loc[:, cell_line])
    max_log = np.log10(AND_max_knockdown.loc[:, cell_line])

    fig, ax = plt.subplots(figsize=(3, 2.5))
    # identity line as reference: points on it have equal values under both models
    ax.plot(additive_log, additive_log, color="black")
    ax.scatter(additive_log, max_log, s=8, edgecolors='none', color="tab:blue")
    ax.set_xlabel("Additive model")
    ax.set_ylabel("Max model")
    ax.set_title(cell_line)
    fig.tight_layout()
    fig.savefig(os.path.join(plot_folder, f"fill_{cell_line}_AND_expression_vs_knockdown.png"))
    # close the figure so figures do not pile up in memory across iterations
    plt.close(fig)

# 7.2 - Mutated target combinations

In [17]:
# read mutation data
mutation_df = pd.read_csv("../designs/5_miRNA_single_mut.csv", index_col=0)

chosen_mirnas = list(mutation_df["orig_mi"].unique())

In [None]:
# filter for unwanted motifs
print(mutation_df[mutation_df["target"].str.contains("|".join(filter_motifs)) == True].index)
mutation_df = mutation_df[mutation_df["target"].str.contains("|".join(filter_motifs)) == False]

In [None]:
# Heuristically divide mutations according to their expected impact on miRNA function.
# Designs are selected by regular expressions on their index labels. The patterns use
# non-capturing groups (?:...) so that pandas does not warn about match groups, and
# every literal dot is escaped so that "5.1_" cannot match an arbitrary character.
# --------------------------------------
# get single mutations
single_mutations = mutation_df[mutation_df.index.str.contains(r'5\.1_')]
# positions 9-20 (mut or wob) count as weak, positions 1-8 (mut only) as medium
weak_single = single_mutations[single_mutations.index.str.contains(r'_(?:mut|wob)(?:9|1[0-9]|20)\b')]
medium_single = single_mutations[single_mutations.index.str.contains(r'_mut[1-8]\b')]

# --------------------------------------
# get double mutations
weak_double = mutation_df[mutation_df.index.str.contains(r'5\.(?:7|8|9|10)_miRNA')]
medium_double = mutation_df[mutation_df.index.str.contains(r'5\.6_miRNA')]

# --------------------------------------
# get triple mutations
medium_triple = mutation_df[mutation_df.index.str.contains(r'5\.1[4-7]_miRNA')]
strong_triple = mutation_df[mutation_df.index.str.contains(r'5\.(?:12|13)_miRNA')]

# --------------------------------------
# get quadruple mutations
medium_quadruple = mutation_df[mutation_df.index.str.contains(r'5\.2[1-4]_miRNA')]
strong_quadruple = mutation_df[mutation_df.index.str.contains(r'5\.(?:18|19|20)_miRNA')]

# --------------------------------------
# pool the groups by expected strength
weak_mutations = pd.concat([weak_single, weak_double], axis=0)
medium_mutations = pd.concat([medium_single, medium_double, medium_triple, medium_quadruple], axis=0)
strong_mutations = pd.concat([strong_triple, strong_quadruple], axis=0)

In [None]:
# create the mutation buckets
# add non-mutated miRNAs to the first bucket
mid_mirnas = data_bucket_mirnas[1]
# The first ("mixed") bucket holds the mid-expression miRNA names repeated ten times
# plus the weak mutations; the other three hold the weak, medium and strong mutations.
# NOTE(review): the tenfold repetition presumably weights unmutated targets more
# heavily when sampling — confirm.
mutation_buckets = [
    mid_mirnas*10+weak_mutations.index.to_list(),
    weak_mutations.index.to_list(),
    medium_mutations.index.to_list(),
    strong_mutations.index.to_list(),
]

# Number of pairs to sample per bucket, in bucket order.
no_combinations = [20, 35, 35, 25]
label_start = 17

# get combinations from the same category
AND2_mut = []
categories = ["mixed", "weak", "medium", "strong"]
for category, mutation_bucket, n_pairs in zip(categories, mutation_buckets, no_combinations):
    # enumerate every unordered pair within the bucket, then sample without replacement
    all_pairs = list(itertools.combinations(mutation_bucket, 2))
    sampled_pairs = random.sample(all_pairs, n_pairs)
    pair_df = pd.DataFrame(sampled_pairs, columns=["miRNA1", "miRNA2"])
    pair_df["category"] = category
    pair_df["sorted_mirnas"] = pair_df.apply(lambda row: get_sorted_mirnas(row, 2), axis=1)
    AND2_mut.append(pair_df[["miRNA1", "miRNA2", "category", "sorted_mirnas"]])

# Stack, de-duplicate, relabel and write the 2-target mutated designs.
AND2_mut_flat = save_flattened_df(AND2_mut, 2, label_start, mut="mut")

In [None]:
label_start, n_mirnas = 18, 3
# Extend every per-bucket pair table by a third entry drawn from the same mutation bucket.
AND3_mut = get_mirna_buckets(n_mirnas=n_mirnas, basis_df=AND2_mut, buckets=mutation_buckets)
AND3_mut_flat = save_flattened_df(
    AND_df=AND3_mut,
    n_mirnas=n_mirnas,
    label_start=label_start,
    mut="mut",
)

In [None]:
label_start, n_mirnas = 19, 4
# Extend every per-bucket triple table by a fourth entry drawn from the same mutation bucket.
AND4_mut = get_mirna_buckets(n_mirnas=n_mirnas, basis_df=AND3_mut, buckets=mutation_buckets)
AND4_mut_flat = save_flattened_df(
    AND_df=AND4_mut,
    n_mirnas=n_mirnas,
    label_start=label_start,
    mut="mut",
)

In [None]:
label_start, n_mirnas = 20, 5
# Extend every per-bucket 4-target table by a fifth entry drawn from the same mutation bucket.
AND5_mut = get_mirna_buckets(n_mirnas=n_mirnas, basis_df=AND4_mut, buckets=mutation_buckets)
AND5_mut_flat = save_flattened_df(
    AND_df=AND5_mut,
    n_mirnas=n_mirnas,
    label_start=label_start,
    mut="mut",
)

In [None]:
label_start, n_mirnas = 21, 6
# Extend every per-bucket 5-target table by a sixth entry drawn from the same mutation bucket.
AND6_mut = get_mirna_buckets(n_mirnas=n_mirnas, basis_df=AND5_mut, buckets=mutation_buckets)
AND6_mut_flat = save_flattened_df(
    AND_df=AND6_mut,
    n_mirnas=n_mirnas,
    label_start=label_start,
    mut="mut",
)