In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from lib.transfer_functions import transfer_function

plot_folder = "../plots/6_pick_repeats/"
# Create the folder for this notebook's figures. `exist_ok=True` makes the call
# a no-op when the folder is already present, so no separate existence check is
# needed and there is no window between "check" and "create".
os.makedirs(plot_folder, exist_ok=True)

# Cell-line panels referenced by the cells below.
cell_lines_main = [
    "HEK293T",
    "HeLa",
    "SKNSH",
    "MCF7",
    "HUH-7",
    "A549",
]
cell_lines_other = [
    "HaCaT",
    "JEG-3",
    "Tera-1",
    "PC-3",
]
cell_lines_measured = [
    "HEK293T",
    "HeLa",
    "SKNSH",
    "MCF7",
]
# Complete panel: the main lines first, followed by the additional ones.
cell_lines = [*cell_lines_main, *cell_lines_other]

### Here, we pick both full targets and mutated targets to repeat two to six times.

# 6.1 - Full Targets

In [3]:
# load microRNA data
# Both tables are handled identically: read the CSV (first column becomes the
# index), keep only the cell-line columns of the panel, and raise 10 to the
# stored values (presumably the files hold log10 expression -- TODO confirm).
mirna_data_filter = 10 ** pd.read_csv(
    "../input_data/miRNA_expression_data/1_output/1.10_alles_quantile_crosstalk_filter.csv",
    index_col=0,
).loc[:, cell_lines]

mirna_data_no_filter = 10 ** pd.read_csv(
    "../input_data/miRNA_expression_data/1_output/1.10_alles_quantile_no_crosstalk_filter.csv",
    index_col=0,
).loc[:, cell_lines]

# The crosstalk-filtered table is the working data set from here on.
mirna_data = mirna_data_filter

In [4]:
# get mirbase data
mirbase_df = pd.read_csv('../input_data/mirbase_with_families_and_targets.csv', index_col=0)

# Keep the entries that are annotated with confidence 'high' AND that also
# occur in the index of the expression table.
is_high_confidence = mirbase_df['confidence'] == 'high'
has_expression = mirbase_df.index.isin(mirna_data.index)
high_confidence = mirbase_df[is_high_confidence & has_expression]

### Set miRNAs that I definitely want to test as repeats

In [5]:
# miRNAs that are always part of the repeat designs, collected from three
# hand-curated lists (the lists overlap, e.g. hsa-let-7a-5p and hsa-miR-21-5p).
mirnas_mutation_test = [
    "hsa-let-7a-5p",
    "hsa-miR-19b-3p",
    'hsa-miR-365a-3p',
    'hsa-miR-23a-3p',
    'hsa-miR-21-5p',
    'hsa-miR-31-5p',
    'hsa-miR-22-3p',
    'hsa-miR-16-5p',
    'hsa-miR-31-3p',
    'hsa-miR-107',
    'hsa-miR-24-3p'
]

mirnas_flow = [
    "hsa-miR-122-5p",
    "hsa-miR-18a-5p",
    "hsa-miR-20a-5p",
    "hsa-miR-25-3p",
    "hsa-miR-7-5p",
    "hsa-miR-21-5p",
    "hsa-miR-141-3p",
    "hsa-miR-10a-5p",
    "hsa-miR-31-5p",
    "hsa-let-7a-5p",
]

additional_let7 = [
    "hsa-let-7b-5p",
    "hsa-let-7c-5p",
    "hsa-let-7d-5p",
    "hsa-let-7e-5p",
    "hsa-let-7f-5p",
    "hsa-let-7g-5p",
    "hsa-let-7i-5p",
]

# merge them and drop duplicates
# `dict.fromkeys` keeps the first occurrence of every name in list order.
# A `set` would also de-duplicate, but its iteration order for strings changes
# between kernel starts (hash randomisation), which would shuffle the rows of
# `mirnas_preset` and therefore the position-based row ids of the design files.
preset_names = list(dict.fromkeys(mirnas_mutation_test + mirnas_flow + additional_let7))

# Look the presets up in the table WITHOUT crosstalk filter; `.loc` raises a
# KeyError if any of the names is missing from that table.
mirnas_preset = mirna_data_no_filter.loc[preset_names, :]

### Choose other miRNAs based on their expression

In [None]:
# Seeded random state shared by all draws in this cell, so that re-running the
# notebook reproduces the same picks. It is re-created on every run of the cell.
SAMPLING_SEED = 42
sampling_rng = np.random.RandomState(SAMPLING_SEED)

# exclude the miRNAs I already have
mirna_data = mirna_data[~mirna_data.index.isin(mirnas_preset.index)]

# divide miRNAs into groups based on their maximum expression across the cell lines
# The intervals are half-open, [lower, upper): a maximum that lies exactly on a
# boundary (10^2, 10^3 or 10^4) is assigned to the upper group instead of
# falling between two strict inequalities and being dropped.
max_expression = mirna_data.max(axis=1)
less_than_two = mirna_data[max_expression < 10**2]
less_than_three = mirna_data[(max_expression >= 10**2) & (max_expression < 10**3)]
less_than_four = mirna_data[(max_expression >= 10**3) & (max_expression < 10**4)]
more_than_four = mirna_data[max_expression >= 10**4]

# sample from the dataframes (raises ValueError if a group has fewer rows than n)
less_than_two = less_than_two.sample(n=5, random_state=sampling_rng)
less_than_three = less_than_three.sample(n=10, random_state=sampling_rng)
less_than_four = less_than_four.sample(n=46, random_state=sampling_rng)
# I use all miRNAs with max expression of at least 10^4

print("miRNAs per group:")
print(len(less_than_two), len(less_than_three), len(less_than_four), len(more_than_four))

# append the dataframes: presets first, then the groups by increasing expression
picked_mirnas = pd.concat([mirnas_preset, less_than_two, less_than_three, less_than_four, more_than_four])
print("total miRNAs picked:", len(picked_mirnas))

In [7]:
%%capture output
for cell_line in cell_lines:
    cur_mirna = picked_mirnas[cell_line]
    plt.clf()
    plt.figure(figsize=(3, 2))
    plt.xlim(0, 5.5)
    plt.ylim(-0.05, 1.05)
    plt.scatter(np.log10(cur_mirna), cur_mirna.apply(transfer_function))
    plt.xlabel("log10(miRNA expression)")
    plt.ylabel("Stability")
    plt.tight_layout()
    plt.savefig(os.path.join(plot_folder, f"full_{cell_line}_picked_mirnas.png"), dpi=300)

### Generate the design files

In [8]:
# Write one design file per repeat count (2 to 6 copies of the same miRNA).
for n_repeats in range(2, 7):
    mirna_columns = [f"miRNA{slot}" for slot in range(1, n_repeats + 1)]
    design_name = f"{n_repeats + 4}_miRNA_full_repeat_x{n_repeats}"

    design_df = picked_mirnas.copy()

    # Expected knockdown: multiply the expression by the number of repeats and
    # pass the result through `transfer_function`, column by column.
    scaled_expression = design_df.loc[:, cell_lines] * n_repeats
    design_df.loc[:, cell_lines] = scaled_expression.apply(transfer_function)

    # Every miRNA slot of a row holds the same miRNA name (the row label).
    for column in mirna_columns:
        design_df.loc[:, column] = design_df.index

    # miRNA columns first, then the per-cell-line values.
    design_df = design_df[mirna_columns + cell_lines]

    # Replace the miRNA names in the index by running design ids (1-based).
    design_df.index = [f"{design_name}_{row}" for row in range(1, len(design_df) + 1)]

    # save the design
    design_df.to_csv(f"../designs/{design_name}.csv")

# 6.2 - Mutated Targets

In [19]:
# read single mutation data (first CSV column becomes the index)
mutation_df = pd.read_csv("../designs/5_miRNA_single_mut.csv", index_col=0)

# distinct values of the "orig_mi" column, in order of first appearance
chosen_mirnas = mutation_df["orig_mi"].drop_duplicates().tolist()

In [20]:
# Sequence motifs that must not occur in a target; entries containing any of
# them are filtered out.
# restriction sites
restriction_sites = [
    "GAGACC",
    "GGTCTC",
]
# polyA signals
polyA_signals = [
    "AATAAA",
    "ATTAAA",
    "AGTAAA",
    "TATAAA",
    "ACTAAA",
]
# combined list: restriction sites first, then the polyA signals
filter_motifs = [*restriction_sites, *polyA_signals]

In [None]:
# filter for unwanted motifs
# The motifs are joined into one alternation pattern and matched once.
motif_pattern = "|".join(filter_motifs)
has_motif = mutation_df["target"].str.contains(motif_pattern)

# Report the ids that are about to be removed, then keep only the clean rows.
# A row with a missing target compares unequal to both True and False, so it
# is neither reported nor kept.
print(mutation_df[has_motif.eq(True)].index)
mutation_df = mutation_df[has_motif.eq(False)]

In [None]:
# Heuristically divide mutations according to their expected impact on miRNA function.
# Rows are selected by regular expressions on the design id stored in the index.
# All groups in the patterns are non-capturing `(?:...)`: `str.contains` only
# needs a yes/no answer, and capture groups make pandas emit a UserWarning
# about match groups.
# --------------------------------------
# get single mutations
# The dot is escaped like in the patterns below; an unescaped "5.1_" would
# also match ids containing e.g. "501_" or "5_1_".
single_mutations = mutation_df[mutation_df.index.str.contains(r'5\.1_')]
# weak: "mut" or "wob" followed by a number from 9 to 20
# (presumably the mutated position -- TODO confirm against the id scheme)
weak_single = single_mutations[single_mutations.index.str.contains(r'_(?:mut|wob)(?:9|1[0-9]|20)\b')]
# medium: "mut" followed by a number from 1 to 8 ("wob" ids are not included here)
medium_single = single_mutations[single_mutations.index.str.contains(r'_mut[1-8]\b')]

# --------------------------------------
# get double mutations (design ids 5.6 to 5.10)
weak_double = mutation_df[mutation_df.index.str.contains(r'5\.(?:7|8|9|10)_miRNA')]
medium_double = mutation_df[mutation_df.index.str.contains(r'5\.6_miRNA')]

# --------------------------------------
# get triple mutations (design ids 5.12 to 5.17)
medium_triple = mutation_df[mutation_df.index.str.contains(r'5\.1[4-7]_miRNA')]
strong_triple = mutation_df[mutation_df.index.str.contains(r'5\.(?:12|13)_miRNA')]

# --------------------------------------
# get quadruple mutations (design ids 5.18 to 5.24)
medium_quadruple = mutation_df[mutation_df.index.str.contains(r'5\.2[1-4]_miRNA')]
strong_quadruple = mutation_df[mutation_df.index.str.contains(r'5\.(?:18|19|20)_miRNA')]

# --------------------------------------
# pool the groups by expected strength
weak_mutations = pd.concat([weak_single, weak_double], axis=0)
medium_mutations = pd.concat([medium_single, medium_double, medium_triple, medium_quadruple], axis=0)
strong_mutations = pd.concat([strong_triple, strong_quadruple], axis=0)

In [None]:
# pick 3 weak, 4 medium and 3 strong mutations per chosen microRNA
# One seeded random state is shared by all draws: the picks are reproducible
# across notebook runs, while different miRNAs still receive different draws.
MUTATION_SAMPLING_SEED = 42
mutation_rng = np.random.RandomState(MUTATION_SAMPLING_SEED)

picks_per_mirna = []
for mirna in chosen_mirnas:
    # hsa-let-7i-5p is deliberately left out
    # (the reason is not recorded in this notebook -- TODO document it)
    if mirna == "hsa-let-7i-5p":
        continue

    # get the mutations for the current mirna; `.sample` raises a ValueError
    # if a group holds fewer candidates than requested
    cur_weak = weak_mutations[weak_mutations["orig_mi"] == mirna].sample(n=3, random_state=mutation_rng)
    cur_medium = medium_mutations[medium_mutations["orig_mi"] == mirna].sample(n=4, random_state=mutation_rng)
    cur_strong = strong_mutations[strong_mutations["orig_mi"] == mirna].sample(n=3, random_state=mutation_rng)

    # collect the ten picks of this miRNA: weak, then medium, then strong
    picks_per_mirna.append(pd.concat([cur_weak, cur_medium, cur_strong], axis=0))

# one table with all picks, in the order of `chosen_mirnas`
chosen_mutations = pd.concat(picks_per_mirna, axis=0)
print(len(chosen_mutations))

In [28]:
# repeat these 2 to 6 times: one design file per repeat count
for n_repeats in range(2, 7):
    design_name = f"{n_repeats + 34}_miRNA_mut_repeat_x{n_repeats}"
    design_df = chosen_mutations.copy()

    # "miRNA1" is already part of the table; copy it into the remaining slots
    # (miRNA2 up to miRNA<n_repeats>)
    for slot in range(2, n_repeats + 1):
        design_df.loc[:, f"miRNA{slot}"] = design_df["miRNA1"]

    # replace the index by running design ids (1-based)
    design_df.index = [f"{design_name}_{row}" for row in range(1, len(design_df) + 1)]

    # save the design
    design_df.to_csv(f"../designs/{design_name}.csv")