In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import os
from lib.count_mismatches import count_mismatches_in_region, get_mismatches_with_reverse_complement
from lib.design_utilities import tsi
from lib.general_utility import intersection_of_lists
from lib.transfer_functions import transfer_function
from lib.NA_sequence_utilities import complement, reverse_complement

# Cell line name lists; `cell_lines` is the concatenation of the main and the other lines.
cell_lines_main = ["HEK293T", "HeLa", "SKNSH", "MCF7", "HUH-7", "A549"]
cell_lines_other = ["HaCaT", "JEG-3", "Tera-1", "PC-3"]
cell_lines_measured = ["HEK293T", "HeLa", "SKNSH", "MCF7"]
cell_lines = cell_lines_main + cell_lines_other

# The design generation below draws from both `random` and `np.random`.
# Seed both generators so that "Restart Kernel -> Run All" reproduces the same designs.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

plot_folder = "plots/5_mutated_targets/"
# create the folder if it does not exist (no error if it already exists)
os.makedirs(plot_folder, exist_ok=True)

# Load the miRBase annotation table; the first CSV column becomes the index.
mirbase_csv = '../input_data/mirbase_with_families_and_targets.csv'
mirbase_df = pd.read_csv(mirbase_csv, index_col=0)

In [2]:
# Load the microRNA expression tables (with and without the crosstalk filter),
# keep only the columns listed in `cell_lines`, and raise 10 to the stored values
# (presumably the files hold log10 expression - confirm against the upstream notebook).
mirna_data_filter = 10 ** (
    pd.read_csv(
        "../input_data/miRNA_expression_data/1_output/1.10_alles_quantile_crosstalk_filter.csv",
        index_col=0,
    ).loc[:, cell_lines]
)

mirna_data_no_filter = 10 ** (
    pd.read_csv(
        "../input_data/miRNA_expression_data/1_output/1.10_alles_quantile_no_crosstalk_filter.csv",
        index_col=0,
    ).loc[:, cell_lines]
)

# Find microRNAs with little expected crosstalk to mutate

In [3]:
# Restrict miRBase to the entries whose "confidence" column equals "high".
high_confidence = mirbase_df[mirbase_df["confidence"] == "high"].copy()

# All-vs-all comparison: for each target miRNA, store one table holding the
# (mismatch, wobble) result of count_mismatches_in_region for every query miRNA.
# Note: this is quadratic in the number of high-confidence miRNAs.
mismatch_dict = {}
hc_sequences = high_confidence["sequence"]

for mirna_target in high_confidence.index:
    target = hc_sequences.loc[mirna_target]
    query_df = pd.DataFrame(index=high_confidence.index, columns=["mismatch", "wobble"])
    for mirna_query in high_confidence.index:
        mismatch, wobble = count_mismatches_in_region(target, hc_sequences.loc[mirna_query])
        query_df.loc[mirna_query, "mismatch"] = mismatch
        query_df.loc[mirna_query, "wobble"] = wobble

    mismatch_dict[mirna_target] = query_df

In [4]:
# Add two summary columns to every per-target table (modified in place):
#   mismatch_seed - element 1 of the "mismatch" entry
#   mismatch_four - sum of elements 0..4 of the "mismatch" entry
# (presumably one mismatch count per region - confirm in lib.count_mismatches)
for table in mismatch_dict.values():
    table["mismatch_seed"] = table["mismatch"].apply(lambda counts: counts[1])
    table["mismatch_four"] = table["mismatch"].apply(lambda counts: sum(counts[0:5]))

# Keep only the query miRNAs that stay below both thresholds
# (mismatch_four < 6 and mismatch_seed < 2).
mismatch_dict_filter = {}
for mirna, table in mismatch_dict.items():
    below_thresholds = (table["mismatch_four"] < 6) & (table["mismatch_seed"] < 2)
    mismatch_dict_filter[mirna] = table[below_thresholds]

# For each cell line and each microRNA, collect the crosstalk partners: similar
# microRNAs (from the filtered mismatch tables) whose expression exceeds both
# half of the microRNA's own expression and the absolute value 300.
crosstalk_dict = {}
for cell_line in cell_lines:
    expression = mirna_data_no_filter[cell_line]
    partners_by_mirna = {}
    for mirna, own_level in expression.items():
        similar = mismatch_dict_filter[mirna].index
        # only microRNAs that are present in the expression table can be looked up
        similar = similar[similar.isin(expression.index)]
        similar_levels = expression[similar]
        if own_level > 0:
            is_competing = (similar_levels > 0.5*own_level) & (similar_levels > 300)
            competing = similar_levels[is_competing].index
            # a microRNA is not its own crosstalk partner
            partners_by_mirna[mirna] = competing[competing != mirna].to_list()
        else:
            partners_by_mirna[mirna] = []
    crosstalk_dict[cell_line] = partners_by_mirna

# microRNAs with at least one crosstalk partner in at least one cell line (deduplicated)
crosstalking_mirnas = list({
    mirna
    for cell_line in cell_lines
    for mirna, partners in crosstalk_dict[cell_line].items()
    if len(partners) > 0
})

# per cell line: the microRNAs that have no crosstalk partner at all
allowed_mirnas_all = {
    cell_line: [
        mirna
        for mirna, partners in crosstalk_dict[cell_line].items()
        if len(partners) == 0
    ]
    for cell_line in cell_lines
}

In [5]:
# Get the intersection of all allowed miRNAs, i.e. those without crosstalk partners
# in every cell line (assumes intersection_of_lists intersects the per-cell-line
# lists of the dict - confirm in lib.general_utility).
allowed_mirnas_all_intersection = intersection_of_lists(allowed_mirnas_all)
# Expression values (all cell-line columns) of those miRNAs.
non_crosstalking_mirnas = mirna_data_no_filter.loc[allowed_mirnas_all_intersection,:]

In [6]:
# Keep miRNAs whose highest expression across the cell lines exceeds 5000.
# `.copy()` makes the filtered frame independent of its parent, so adding the
# "tsi" column below does not trigger pandas' SettingWithCopyWarning.
highly_expressed_mirnas = non_crosstalking_mirnas[non_crosstalking_mirnas.max(axis=1) > 5000].copy()
highly_expressed_mirnas["tsi"] = tsi(highly_expressed_mirnas.to_numpy())

# select somewhat unspecific microRNAs (tsi < 0.8) to get more data points,
# and drop the helper column again by keeping only the cell-line columns
non_crosstalking_mirnas = highly_expressed_mirnas.loc[highly_expressed_mirnas["tsi"] < 0.8, cell_lines]

In [None]:
# Apply the transfer function to every column, then list the candidate miRNAs
# (names first, then how many there are).
non_crosstalking_mirnas_ex = non_crosstalking_mirnas.apply(transfer_function)
non_crosstalk_mirna_list = list(non_crosstalking_mirnas_ex.index)
print(non_crosstalk_mirna_list, len(non_crosstalk_mirna_list), sep="\n")

In [8]:
# Hand-picked miRNAs whose targets are mutated in the rest of the notebook.
chosen_mirnas = [
    "hsa-let-7a-5p",
    "hsa-miR-19b-3p",
    "hsa-miR-365a-3p",
    "hsa-miR-23a-3p",
    "hsa-miR-21-5p",
    "hsa-miR-31-5p",
    "hsa-miR-22-3p",
    "hsa-miR-16-5p",
    "hsa-miR-31-3p",
    "hsa-miR-107",
    "hsa-miR-24-3p",
]

In [None]:
# Show the unfiltered expression of the chosen miRNAs in every cell line.
mirna_data_no_filter.loc[chosen_mirnas]

# Generate mutated constructs

## Single mutations

In [10]:
# Replacement bases used for a mismatch mutation of each target base.
# For A and C, the base listed in `wobble_alphabets` is left out here.
mutation_alphabets = dict(
    A=["C", "T"],
    C=["A", "G"],
    G=["A", "C", "T"],
    T=["A", "C", "G"],
)
# Replacement base used for a wobble substitution; G and T have none.
wobble_alphabets = dict(
    A=["G"],
    C=["T"],
    G=[],
    T=[],
)

# set regions: consecutive half-open (start, end) index intervals along the target
region_bounds = [0, 1, 7, 8, 11, 14, 17, 21]
regions = list(zip(region_bounds[:-1], region_bounds[1:]))

# the first three regions together form the seed range ...
index_left_seed, index_right_seed = regions[0][0], regions[2][1]
range_seed = np.arange(index_left_seed, index_right_seed)

# ... and the remaining regions form the range outside of the seed
index_left_outside, index_right_outside = regions[3][0], regions[-1][1]
range_outside = np.arange(index_left_outside, index_right_outside)

In [11]:
def abbrev_mirna_name(name):
    """Return `name` without its first dash-separated field (e.g. the "hsa" prefix).

    A name that contains no dash yields an empty string.
    """
    _prefix, _dash, remainder = name.partition("-")
    return remainder

def single_mutation(sequence, name):
    """Create one randomly mutated target per position of the sequence.

    The target is `complement(sequence, "DNA")`. For every position, the base is
    replaced by a random entry of `mutation_alphabets` for that base.

    Returns a dict mapping "5.1_miRNA_<abbreviated name>_mut<1-based position>"
    to the mutated target, reversed.
    """
    target = complement(sequence, "DNA")
    short_name = abbrev_mirna_name(name)
    variants = {}
    for pos, base in enumerate(target):
        replacement = random.choice(mutation_alphabets[base])
        mutated = "".join([target[:pos], replacement, target[pos+1:]])
        variants[f"5.1_miRNA_{short_name}_mut{pos+1}"] = mutated[::-1]
    return variants

def single_wobble(sequence, name):
    """Create one wobble variant for every position that has a wobble substitution.

    The target is `complement(sequence, "DNA")`. Positions whose base has an empty
    `wobble_alphabets` entry are skipped; otherwise the first entry is used.

    Returns a dict mapping "5.2_miRNA_<abbreviated name>_wob<1-based position>"
    to the modified target, reversed.
    """
    target = complement(sequence, "DNA")
    short_name = abbrev_mirna_name(name)
    variants = {}
    for pos, base in enumerate(target):
        options = wobble_alphabets[base]
        if options:
            wobbled = "".join([target[:pos], options[0], target[pos+1:]])
            variants[f"5.2_miRNA_{short_name}_wob{pos+1}"] = wobbled[::-1]
    return variants

def insert_A_at_beginning(sequence, name):
    """Test the impact of an A at seed position 1.

    Replaces (does not insert) the first base of `complement(sequence, "DNA")`
    with "A". Returns an empty dict when that base already is "A"; otherwise a
    one-entry dict mapping "5.3_miRNA_<abbreviated name>_insSeedA" to the
    modified target, reversed.
    """
    target = complement(sequence, "DNA")
    if target[0] != "A":
        short_name = abbrev_mirna_name(name)
        with_leading_a = "A" + target[1:]
        return {f"5.3_miRNA_{short_name}_insSeedA": with_leading_a[::-1]}
    return {}

In [12]:
# Collect the three kinds of single-edit constructs for every chosen miRNA.
all_single_mutations, all_single_wobbles, all_seed_A_insertions = {}, {}, {}
single_edit_generators = (
    (all_single_mutations, single_mutation),
    (all_single_wobbles, single_wobble),
    (all_seed_A_insertions, insert_A_at_beginning),
)
for mirna in chosen_mirnas:
    sequence = high_confidence.loc[mirna, "sequence"]
    for collection, make_variants in single_edit_generators:
        collection.update(make_variants(sequence, mirna))

In [None]:
# Number of constructs per category: single mutations, single wobbles, seed-A replacements.
for construct_set in (all_single_mutations, all_single_wobbles, all_seed_A_insertions):
    print(len(construct_set))

## Add extra sequences for let-7i-5p for single mutations only.

Because the rest of the sequence is different, let-7i gives sequences that have not been included by mutating let-7a.

In [14]:
# These three containers are created but not filled in this cell;
# the let-7i constructs go straight into the shared collections below.
let7i_mut, let7i_wobble, let7i_seedA = {}, {}, {}

mirna = "hsa-let-7i-5p"
let7i_sequence = high_confidence.loc[mirna, "sequence"]

# Same three single-edit generators as for the chosen miRNAs, in the same order.
for collection, make_variants in (
    (all_single_mutations, single_mutation),
    (all_single_wobbles, single_wobble),
    (all_seed_A_insertions, insert_A_at_beginning),
):
    collection.update(make_variants(let7i_sequence, mirna))

## Double mutations (two adjacent mutations)

In [15]:
def insert_double_mutation(target, region, name):
    """Insert a double mutation (two adjacent mutations) into `target` within `region`.

    Parameters
    ----------
    target : str
        Target sequence to mutate.
    region : tuple
        Half-open index interval (start, end). The end is clipped to the length
        of `target` so that short sequences are never indexed out of range.
    name : str
        Prefix of the construct name.

    Returns
    -------
    dict
        One entry mapping `name + "_doubleMut" + "<p1>.<p2>"` (1-based positions)
        to the mutated target, reversed.

    Raises
    ------
    ValueError
        If the (clipped) region contains fewer than two positions.
    """
    start = region[0]
    end = min(region[1], len(target))
    if end - start < 2:
        raise ValueError(
            f"Region {region} holds fewer than two positions of a target of length {len(target)}"
        )

    # Pick the left position of the pair; its partner is the right neighbour.
    # (Drawing two independent positions, as done previously, could return a
    # non-adjacent pair and never reached the last position of the region.)
    first = int(np.random.choice(np.arange(start, end - 1)))
    positions = [first, first + 1]

    # insert the double mutation
    for position in positions:
        mutation_alphabet = mutation_alphabets[target[position]]
        target = target[:position] + random.choice(mutation_alphabet) + target[position+1:]

    dict_name = name + "_doubleMut" + ".".join([str(pos+1) for pos in positions])
    return {dict_name: target[::-1]}

def insert_double_mutations_outside_of_seed(sequence, name):
    """Create one double-mutation construct for every region outside of the seed.

    The target is `complement(sequence, "DNA")`; the regions are `regions[3:]`.
    Each region is handled by `insert_double_mutation` on the unmutated target,
    and the resulting one-entry dicts are merged into the returned dict.
    """
    unmutated_target = complement(sequence, "DNA")
    constructs = {}
    for outside_region in regions[3:]:
        construct = insert_double_mutation(unmutated_target, outside_region, name)
        constructs.update(construct)
    return constructs

def insert_n_mutations_outside_of_seed(sequence, name, n):
    """Insert n single-base mutations at distinct random positions outside of the seed region.

    Parameters
    ----------
    sequence : str
        Sequence passed to `complement(sequence, "DNA")` to obtain the target.
    name : str
        Prefix of the construct name.
    n : int
        Number of distinct positions to mutate.

    Returns
    -------
    dict
        One entry mapping `name` followed by the dot-joined 1-based positions
        to the mutated target, reversed.

    Raises
    ------
    ValueError
        Raised by `np.random.choice` when n exceeds the number of available positions.
    """
    target = complement(sequence, "DNA")
    index_left = regions[3][0]
    # never sample a position beyond the end of a short sequence
    index_right = min(regions[-1][1], len(target))
    positions = np.random.choice(np.arange(index_left, index_right), n, replace=False)
    positions.sort()

    # insert the mutations
    for position in positions:
        mutation_alphabet = mutation_alphabets[target[position]]
        target = target[:position] + random.choice(mutation_alphabet) + target[position+1:]

    # NOTE(review): the positions are appended to `name` without a separator,
    # unlike the "_doubleMut" naming used by insert_double_mutation - confirm
    # this is intended before using the function.
    dict_name = name + ".".join([str(pos+1) for pos in positions])

    return {dict_name: target[::-1]}

## Arbitrary patterns

In [16]:
def get_possible_wobble_positions(target):
    """Return the 0-based indices of `target` whose base has a non-empty `wobble_alphabets` entry."""
    return [
        index
        for index, base in enumerate(target)
        if len(wobble_alphabets[base]) > 0
    ]

def _substitute_at_random_positions(target, n, candidates, alphabets, used_positions, restrict_to=None):
    """Apply up to `n` single-base substitutions at random, not yet used positions.

    Parameters
    ----------
    target : str
        Sequence to modify.
    n : int
        Number of substitutions requested.
    candidates : iterable of int
        Positions that may be chosen.
    alphabets : dict
        Maps the current base at a position to the list of replacement bases.
    used_positions : list
        Positions already modified; extended in place with the new positions.
    restrict_to : list, optional
        If given, only positions in this list are eligible, and the loop stops
        early (best effort) once no eligible position is left.

    Returns
    -------
    tuple
        (modified target, sorted list of the positions modified by this call)
    """
    chosen = []
    for _ in range(n):
        possible_positions = [
            pos for pos in candidates
            # skip used positions and positions beyond the end of a short target
            if pos not in used_positions and pos < len(target)
        ]
        if restrict_to is not None:
            possible_positions = [pos for pos in possible_positions if pos in restrict_to]
            if len(possible_positions) == 0:
                break
        position = random.choice(possible_positions)
        replacement = random.choice(alphabets[target[position]])
        target = target[:position] + replacement + target[position+1:]
        used_positions.append(position)
        chosen.append(position)
    chosen.sort()
    return target, chosen


def general_mutation_insert(sequence, n_mut_seed, n_wobble_seed, n_mut_outside, n_wobble_outside):
    """Insert a given number of mutations and wobbles inside and outside of the seed.

    The target is `complement(sequence, "DNA")`. Edits are applied in this order,
    each at a position not used by an earlier edit: seed mutations, seed wobbles,
    outside mutations, outside wobbles. Seed positions come from `range_seed`,
    outside positions from `range_outside`.

    Wobbles are best effort: when no wobble-capable position is left, fewer
    wobbles than requested are inserted and the label lists only those actually
    made. Requesting more mutations than there are free positions raises the
    IndexError of `random.choice` on an empty list.

    Parameters
    ----------
    sequence : str
        Sequence whose complement is mutated.
    n_mut_seed, n_wobble_seed, n_mut_outside, n_wobble_outside : int
        Number of mutations / wobbles requested in the seed / outside of it.

    Returns
    -------
    tuple
        (label, target): label is
        "smut<..>_swob<..>_omut<..>_owob<..>" with dot-joined 1-based positions,
        target is the modified target, reversed.
    """
    target = complement(sequence, "DNA")
    # wobble-capable positions are determined once, on the unmodified target
    wobble_positions = get_possible_wobble_positions(target)

    used_positions = []

    # insert mutations in the seed region
    target, seed_mut = _substitute_at_random_positions(
        target, n_mut_seed, range_seed, mutation_alphabets, used_positions)

    # insert wobbles in the seed region
    target, seed_wob = _substitute_at_random_positions(
        target, n_wobble_seed, range_seed, wobble_alphabets, used_positions,
        restrict_to=wobble_positions)

    # insert mutations outside of the seed region
    target, outside_mut = _substitute_at_random_positions(
        target, n_mut_outside, range_outside, mutation_alphabets, used_positions)

    # insert wobbles outside of the seed region
    target, outside_wob = _substitute_at_random_positions(
        target, n_wobble_outside, range_outside, wobble_alphabets, used_positions,
        restrict_to=wobble_positions)

    # make the target the reverse complement
    # this is what is actually inserted into the UTR
    target = target[::-1]

    # generate the label
    seed_mut_names = "smut" + ".".join([str(pos+1) for pos in seed_mut])
    seed_wob_names = "swob" + ".".join([str(pos+1) for pos in seed_wob])
    outside_mut_names = "omut" + ".".join([str(pos+1) for pos in outside_mut])
    outside_wob_names = "owob" + ".".join([str(pos+1) for pos in outside_wob])
    label = f"{seed_mut_names}_{seed_wob_names}_{outside_mut_names}_{outside_wob_names}"

    return label, target

## Set the desired mutation patterns for multiple mutations

In [17]:
def _design(n_mut_seed, n_wobble_seed, n_mut_outside, n_wobble_outside, n_designs):
    """Build one design entry: [keyword arguments for general_mutation_insert, designs per miRNA]."""
    return [
        {
            "n_mut_seed": n_mut_seed,
            "n_wobble_seed": n_wobble_seed,
            "n_mut_outside": n_mut_outside,
            "n_wobble_outside": n_wobble_outside,
        },
        n_designs,
    ]


# Argument order below:
# seed mutations, seed wobbles, outside mutations, outside wobbles, designs per miRNA
two_mutation_design_dict = {
    "5.4": _design(2, 0, 0, 0, 2),
    "5.5": _design(0, 2, 0, 0, 1),
    "5.6": _design(1, 0, 1, 0, 2),
    "5.7": _design(0, 1, 1, 0, 2),
    "5.8": _design(0, 0, 2, 0, 4),
    "5.9": _design(0, 0, 1, 1, 4),
    "5.10": _design(0, 0, 0, 2, 2),
}

three_mutation_design_dict = {
    "5.11": _design(2, 0, 1, 0, 1),
    "5.12": _design(1, 0, 2, 0, 3),
    "5.13": _design(1, 0, 1, 1, 3),
    "5.14": _design(0, 0, 3, 0, 3),
    "5.15": _design(0, 0, 2, 1, 3),
    "5.16": _design(0, 0, 1, 2, 3),
    "5.17": _design(0, 0, 0, 3, 1),
}

four_mutation_design_dict = {
    "5.18": _design(1, 0, 3, 0, 2),
    "5.19": _design(1, 0, 2, 1, 2),
    "5.20": _design(1, 0, 1, 2, 1),
    "5.21": _design(0, 0, 4, 0, 2),
    "5.22": _design(0, 0, 3, 1, 2),
    "5.23": _design(0, 0, 2, 2, 2),
    "5.24": _design(0, 0, 1, 3, 1),
}

five_mutation_design_dict = {
    "5.25": _design(1, 0, 4, 0, 2),
    "5.26": _design(1, 0, 3, 1, 2),
    "5.27": _design(1, 0, 2, 2, 1),
    "5.28": _design(0, 0, 5, 0, 2),
    "5.29": _design(0, 0, 3, 2, 2),
}

six_mutation_design_dict = {
    "5.30": _design(1, 0, 5, 0, 1),
    # NOTE(review): 6 mutations + 1 wobble is seven edits, although this is the
    # six-mutation set - confirm whether (0, 0, 5, 1) or (0, 0, 6, 0) was intended.
    "5.31": _design(0, 0, 6, 1, 2),
    "5.32": _design(0, 0, 4, 2, 2),
}

In [18]:
def generate_designs(mutation_design_dict):
    """Generate uniquely named mutated targets for every design and every chosen miRNA.

    `mutation_design_dict` maps a design key to
    [keyword arguments for general_mutation_insert, number of designs per miRNA].
    For each key and each miRNA in `chosen_mirnas`, designs are drawn until the
    requested number of new construct names is reached. After 100 draws, the
    next draw is discarded, a message is printed, and the miRNA is left with
    fewer designs.

    Returns a dict mapping "<key>_miRNA_<abbreviated name>_<label>" to the target.
    """
    mutation_designs = {}

    for key, design in mutation_design_dict.items():
        params, n = design[0], design[1]
        for mirna in chosen_mirnas:
            sequence = high_confidence.loc[mirna, "sequence"]
            abbrev_name = abbrev_mirna_name(mirna)
            accepted = 0
            tries = 0
            while accepted < n:
                label, target = general_mutation_insert(sequence, **params)
                tries += 1
                if tries > 100:
                    print(f"Could not find {n} mutations for {mirna}, {key}")
                    break
                construct_name = f"{key}_miRNA_{abbrev_name}_{label}"
                # a name seen before means the same positions were drawn again
                if construct_name not in mutation_designs:
                    mutation_designs[construct_name] = target
                    accepted += 1

    return mutation_designs

### Generate the actual mutation patterns

In [None]:
# Generate every multi-mutation set in turn and print its size right away.
generated_sets = {}
for set_name, design_dict in (
    ("two", two_mutation_design_dict),
    ("three", three_mutation_design_dict),
    ("four", four_mutation_design_dict),
    ("five", five_mutation_design_dict),
    ("six", six_mutation_design_dict),
):
    generated_sets[set_name] = generate_designs(design_dict)
    print(len(generated_sets[set_name]))

two_mutations = generated_sets["two"]
three_mutations = generated_sets["three"]
four_mutations = generated_sets["four"]
five_mutations = generated_sets["five"]
six_mutations = generated_sets["six"]

#### Also add double mutations

In [20]:
# Add one double-mutation construct per outside region for every chosen miRNA,
# named under the "5.8" design key, and merge them into `two_mutations`.
double_muts = {}
for mirna in chosen_mirnas:
    sequence = high_confidence.loc[mirna, "sequence"]
    abbrev_name = "5.8_miRNA_" + abbrev_mirna_name(mirna)

    # make sure not to produce duplicates: redraw the whole set while any of
    # its targets already exists among the two-mutation targets
    new_muts = insert_double_mutations_outside_of_seed(sequence, abbrev_name)
    while any(val in two_mutations.values() for val in new_muts.values()):
        new_muts = insert_double_mutations_outside_of_seed(sequence, abbrev_name)
    double_muts.update(new_muts)

two_mutations.update(double_muts)

In [21]:
# merge them all into one name -> target dict (later groups win on equal names)
all_mutations = {}
for construct_group in (
    all_single_mutations,
    all_single_wobbles,
    all_seed_A_insertions,
    two_mutations,
    three_mutations,
    four_mutations,
    five_mutations,
    six_mutations,
):
    all_mutations.update(construct_group)

# Save the mutated targets

In [22]:
# One row per construct: the construct name is the index and is repeated in
# "miRNA1"; "orig_mi" is "hsa-" plus the third underscore-separated field of the name.
df_all_mutations = (
    pd.DataFrame.from_dict(all_mutations, orient="index", columns=["target"])
    .assign(
        orig_mi=lambda frame: frame.index.map(lambda construct: "hsa-"+ construct.split("_")[2]),
        miRNA1=lambda frame: frame.index,
    )
    .loc[:, ["orig_mi", "miRNA1", "target"]]
)

In [23]:
# Write the design table to disk. The output folder is created first so that
# the export does not fail at the very end on a checkout without "../designs".
# NOTE(review): the file name says "single_mut" although the table holds all
# mutation classes - kept unchanged because other code may read this path.
designs_csv = "../designs/5_miRNA_single_mut.csv"
os.makedirs(os.path.dirname(designs_csv), exist_ok=True)
df_all_mutations.to_csv(designs_csv)