In [9]:
import math
import random

import pandas as pd

from lib.transfer_functions import inverse_transfer

# Cell-line panels referenced by the cells below.
cell_lines_main = [
    "HEK293T",
    "HeLa",
    "SKNSH",
    "MCF7",
    "HUH-7",
    "A549",
]
cell_lines_other = [
    "HaCaT",
    "JEG-3",
    "Tera-1",
    "PC-3",
]
# Cell lines used to select the columns of the measured single-site data.
cell_lines_measured = ["HEK293T", "HeLa", "SKNSH", "MCF7"]
# Combined panel: main cell lines first, followed by the others.
cell_lines = [*cell_lines_main, *cell_lines_other]

### Here, we shuffle the order of the miRNAs within each combination to check whether their position in the UTR matters

In [2]:
def determine_mirna_usage(df):
    """Count how often each microRNA occurs across all 'miRNA' columns of ``df``.

    Every column whose name contains the substring 'miRNA' is treated as a
    microRNA column. Returns a dict mapping each microRNA to its number of
    occurrences, ordered from most to least used (ties keep first-seen order).
    """
    mirna_columns = [name for name in df.columns if "miRNA" in name]

    counts = {}
    for design in df[mirna_columns].values.tolist():
        for mirna in design:
            counts[mirna] = counts.get(mirna, 0) + 1

    # rank by occurrence count, highest first (stable sort keeps tie order)
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

def create_unique_shuffles(df, n_shuffles=10):
    """Create ``n_shuffles`` distinct re-orderings of the microRNAs of every design.

    Every column whose name contains the substring 'miRNA' is treated as a
    microRNA column. For each row of ``df`` the values of these columns are
    shuffled with the global ``random`` module until ``n_shuffles`` distinct
    orderings, all different from the original ordering, have been drawn.
    All other columns are copied unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        One design per row.
    n_shuffles : int, default 10
        Number of distinct shuffled orderings to create per design.

    Returns
    -------
    pandas.DataFrame
        Same columns as ``df``, ``n_shuffles`` rows per input row, with a
        fresh 0..N-1 index. The rows of one design appear in the order in
        which the orderings were drawn, so the result is repeatable once
        ``random.seed`` has been called.

    Raises
    ------
    ValueError
        If a design does not have ``n_shuffles`` distinct orderings besides
        the original one (too few microRNA columns or repeated microRNAs).
        Without this check the sampling loop would never terminate.
    """
    mirna_columns = [column for column in df.columns if "miRNA" in column]
    records = []
    for index, row in df.iterrows():
        original_design = tuple(row[mirna_columns].values.tolist())

        # Number of distinct orderings of a multiset: n! / prod(count_i!)
        n_distinct = math.factorial(len(original_design))
        for value in set(original_design):
            n_distinct //= math.factorial(original_design.count(value))
        if n_shuffles > n_distinct - 1:
            raise ValueError(
                f"Design {index!r} has only {n_distinct - 1} distinct orderings "
                f"besides the original, cannot create {n_shuffles} unique shuffles."
            )

        mirna_design = list(original_design)
        seen = {original_design}  # the original ordering must never be emitted
        shuffled_designs = []  # keeps draw order, unlike iterating over a set
        while len(shuffled_designs) < n_shuffles:
            random.shuffle(mirna_design)
            candidate = tuple(mirna_design)
            if candidate not in seen:
                seen.add(candidate)
                shuffled_designs.append(candidate)

        for shuffled_design in shuffled_designs:
            new_row = row.copy()
            new_row[mirna_columns] = list(shuffled_design)
            records.append(new_row.tolist())

    # Build the frame once instead of growing it row by row
    return pd.DataFrame(records, columns=df.columns)

In [10]:
# microRNA expression table: keep only the columns listed in cell_lines and
# raise 10 to the stored values (i.e. the file content is treated as log10)
mirna_data_filter = (
    pd.read_csv(
        "../input_data/miRNA_expression_data/1_output/1.10_alles_quantile_crosstalk_filter.csv",
        index_col=0,
    )
    .loc[:, cell_lines]
    .pipe(lambda frame: 10**frame)
)

# cell lines whose columns are kept from the measured data
cell_lines_measured = ["HEK293T", "HeLa", "SKNSH", "MCF7"]

# measured single-site data:
#   - index by miRNA1
#   - keep only the columns matching _3UTR_log10 and strip that tag from their names
#   - make it linear (10**x) and cap every value above 1 at 1
measured_single = (
    pd.read_csv('../input_data/measurements_lib1/1_full_single_context1.csv', index_col=0)
    .set_index('miRNA1')
    .filter(regex='_3UTR_log10')
    .rename(columns=lambda name: name.replace('_3UTR_log10', ''))
    .pipe(lambda frame: 10**frame)
    .clip(upper=1)
)

# pass the capped values through inverse_transfer, then sort the index
# (for faster lookups) and keep only the measured cell lines
mirna_expr_fr_knockdown = (
    inverse_transfer(measured_single)
    .sort_index()
    .loc[:, cell_lines_measured]
)

# 10.1 - Shuffle full target sites

In [11]:
# Load the designs that combine five full target sites (first CSV column becomes the index)
full_AND5_df = pd.read_csv(
    "../designs/14_miRNA_full_combination_probe_x5.csv",
    index_col=0,
)

In [12]:
# keep only the designs whose five miRNA slots hold five different mirnas
# (a boolean row mask is built directly, so no temporary column is needed)
full_AND5_df = full_AND5_df.loc[
    [
        len(set(design)) == 5
        for design in full_AND5_df[
            ["miRNA1", "miRNA2", "miRNA3", "miRNA4", "miRNA5"]
        ].values.tolist()
    ]
]

In [13]:
# split the designs by their "category" label
low, mid, high = (
    full_AND5_df[full_AND5_df["category"] == label]
    for label in ("low", "mid", "high")
)

In [None]:
# sample 10 designs from each category
# A fixed random_state makes the draw repeatable, so re-running the notebook
# selects the same designs instead of silently replacing the saved design file
# with a different random selection.
low = low.sample(10, random_state=42)
mid = mid.sample(10, random_state=42)
high = high.sample(10, random_state=42)

In [14]:
# shuffle each one 15 times
# create_unique_shuffles draws from the global `random` generator; seed it here
# so the same shuffled designs are generated on every run of this cell.
random.seed(42)
low_shuffles = create_unique_shuffles(low, 15)
mid_shuffles = create_unique_shuffles(mid, 15)
high_shuffles = create_unique_shuffles(high, 15)

In [15]:
# stack the three categories (low, mid, high) into one frame
all_shuffles = pd.concat([low_shuffles, mid_shuffles, high_shuffles])
# label the designs consecutively, starting at 1
all_shuffles = all_shuffles.set_axis(
    [f"22_miRNA_full_combination_shuffle_x5_{i}" for i in range(1, len(all_shuffles)+1)],
    axis=0,
)
# write the designs to disk
all_shuffles.to_csv("../designs/22_miRNA_full_combination_shuffle_x5.csv")

# 10.2 - Shuffle mutated sites

In [21]:
# Load the designs that combine five mutated target sites (first CSV column becomes the index)
mut_AND5_df = pd.read_csv(
    "../designs/20_miRNA_mut_combination_probe_x5.csv",
    index_col=0,
)

In [22]:
# keep only the designs whose five miRNA slots hold five different mirnas
mut_AND5_df = mut_AND5_df.loc[
    [
        len(set(design)) == 5
        for design in mut_AND5_df[
            ["miRNA1", "miRNA2", "miRNA3", "miRNA4", "miRNA5"]
        ].values.tolist()
    ]
]

# of those, keep only the designs with 2 or more mutated mirnas
# (a mirna counts as mutated when its name contains "mut" or "Mut")
mut_AND5_df = mut_AND5_df.loc[
    [
        sum(("mut" in mirna) or ("Mut" in mirna) for mirna in design) >= 2
        for design in mut_AND5_df[
            ["miRNA1", "miRNA2", "miRNA3", "miRNA4", "miRNA5"]
        ].values.tolist()
    ]
]

In [23]:
# split the mutated designs by their "category" label
mut_mixed = mut_AND5_df[mut_AND5_df["category"] == "mixed"]
mut_weak = mut_AND5_df[mut_AND5_df["category"] == "weak"]
mut_medium = mut_AND5_df[mut_AND5_df["category"] == "medium"]

# sample 10 mixed, 15 weak and 15 medium designs
# A fixed random_state makes the draw repeatable, so re-running the notebook
# selects the same designs instead of silently replacing the saved design file
# with a different random selection.
mut_mixed = mut_mixed.sample(10, random_state=42)
mut_weak = mut_weak.sample(15, random_state=42)
mut_medium = mut_medium.sample(15, random_state=42)

In [24]:
# shuffle each sampled design 15 times
# create_unique_shuffles draws from the global `random` generator; seed it here
# so the same shuffled designs are generated on every run of this cell.
random.seed(42)
mut_mixed_shuffles = create_unique_shuffles(mut_mixed, 15)
mut_weak_shuffles = create_unique_shuffles(mut_weak, 15)
mut_medium_shuffles = create_unique_shuffles(mut_medium, 15)

In [25]:
# stack the three categories (mixed, weak, medium) into one frame
all_mut_shuffles = pd.concat([mut_mixed_shuffles, mut_weak_shuffles, mut_medium_shuffles])
# label the designs consecutively, starting at 1
all_mut_shuffles = all_mut_shuffles.set_axis(
    [f"23_miRNA_mut_combination_shuffle_x5_{i}" for i in range(1, len(all_mut_shuffles)+1)],
    axis=0,
)
# write the designs to disk
all_mut_shuffles.to_csv("../designs/23_miRNA_mut_combination_shuffle_x5.csv")