In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import itertools
import os
import pickle
from library2_utils.color_scheme import cell_line_colors, cell_line_symbols
from library2_utils.transfer_functions import transfer_function
from typing import Union
import ast

# Global figure styling: 7 pt Helvetica for every plot in this notebook.
plt.rcParams.update({'font.size': 7})
plt.rcParams['font.family'] = 'Helvetica'

# Suppress pandas' SettingWithCopyWarning: later cells add columns to
# filtered DataFrames on purpose.
pd.options.mode.chained_assignment = None

# Column names of the measured cell lines in the data tables.
cell_lines_measured = ["HEK293T", "HeLa", "SKNSH", "MCF7", "HUH7", "A549", "HaCaT", "JEG3", "Tera1", "PC3"]
base_plot_folder = "../plots/5_mutational_data_single/"

# exist_ok=True turns this into a no-op when the folder is already there,
# so no separate existence check is needed.
os.makedirs(base_plot_folder, exist_ok=True)

In [3]:
data_dir_input = "../measured_data/2_normalized_log10"

# every file name found in the input folder
reference_files = os.listdir(data_dir_input)

# one DataFrame per CSV file, keyed by the file name up to its first "."
reference_dict = {
    file_name.split('.')[0]: pd.read_csv(os.path.join(data_dir_input, file_name), index_col=0)
    for file_name in reference_files
    if file_name.endswith(".csv")
}

# 5.1 - Get the relevant data

In [None]:
"""
### What is the organization of the data?
5.0 will be added as non-mutated control data [from 1_]
5.1 contains single mutations along the microRNA sequence
5.2 contains single wobble mutations along the microRNA sequence (where possible)
5.3 contains A mutations inserted at position 1 where possible
5.8 contains double mutations (amongst other things)

The entire rest contains various mutation patterns with multiple mutations or wobbles.
"""

In [5]:
# Non-mutated single-target designs; the cells below turn them into the "5.0" controls.
single_data = reference_dict["1_mirna_full_single_high_conf"].copy()
# Designs of library part 5 (mutated target sites).
mut_data = reference_dict["5_miRNA_single_mut"].copy()

In [6]:
# keep only the controls whose miRNA also occurs in the mutated designs
mut_mirs = mut_data["orig_mi"].unique()
single_data = single_data.loc[single_data["miRNA1"].isin(mut_mirs)]

# the predicted_ columns are not needed here
predicted_columns = [name for name in single_data.columns if "predicted_" in name]
single_data = single_data.drop(columns=predicted_columns)
single_data["orig_mi"] = single_data["miRNA1"]

# controls first, mutated designs after them
mut_data = pd.concat([single_data, mut_data], axis=0)

# strip "_3UTR" from every column name
mut_data = mut_data.rename(columns=lambda name: name.replace("_3UTR", ""))

In [7]:
# Rename the added non-mutated rows from "1_mirna_full_single..." to
# "5.0_miRNA_<name without the hsa- prefix>"; all other rows keep their label.
new_index = []
for idx, row in mut_data.iterrows():
    if idx.startswith('1_mirna_full_single'):
        # Look the miRNA name up by label. `orig_mi` was copied from `miRNA1`
        # for these rows and is what the later "5.0_miRNA_..." lookups are
        # built from. The former positional access (`value[0]`) depended on
        # column order and is deprecated for label-indexed Series.
        mirna_name = '-'.join(row["orig_mi"].split('-')[1:])
        new_index.append(f'5.0_miRNA_{mirna_name}')
    else:
        new_index.append(idx)

# Update the DataFrame's index
mut_data.index = new_index

## Get the sequence information from miRbase

In [8]:
# miRBase table, indexed by its first column
mirbase = pd.read_csv("../microrna_data/mirbase_extended.csv", index_col=0)

# work on the RNA level: T -> U in both sequence columns
for sequence_column in ("sequence_norm", "target"):
    mirbase[sequence_column] = mirbase[sequence_column].str.replace("T", "U")

# subset of entries flagged as high confidence
is_high_confidence = mirbase["confidence"] == "high"
mirbase_high_confidence = mirbase[is_high_confidence]

In [9]:
# add the original microRNA sequences
mut_data["mirna_sequence"] = mut_data["orig_mi"].map(mirbase["sequence_norm"])
mut_data["mirna_sequence_orig"] = mut_data["orig_mi"].map(mirbase["sequence_orig"])

In [None]:
# what is the distribution of lengths in the original sequence?
mut_data["mirna_sequence_orig"].str.len().value_counts()

In [11]:
# none of them are shorter than 21 nt (the target length)
# drop the original sequence column
mut_data = mut_data.drop("mirna_sequence_orig", axis=1)

In [12]:
# The 5.0 control rows come without a target sequence;
# take it from the miRBase entry of their miRNA.
for design_name, design_row in mut_data.iterrows():
    if not design_name.startswith("5.0_miRNA"):
        continue
    mut_data.loc[design_name, "target"] = mirbase.loc[design_row["orig_mi"], "target"]

# 5.2 - Manually investigate the data

In [13]:
# Folder for all plots of section 5.2.
plot_folder = os.path.join(base_plot_folder, "5.2_manual_plotting")

# exist_ok=True: nothing happens if the folder already exists
os.makedirs(plot_folder, exist_ok=True)

## 5.2.1 - Look at designs with an inserted A at position 1

In [30]:
# Get all designs that contain insSeedA in their index
insSeedA_designs = [design for design in mut_data.index if "insSeedA" in design]
mut_data_seedA = mut_data.loc[insSeedA_designs, :]

# Get the non-mutated (5.0) designs that belong to the designs with insSeedA
mirnas_seedA = mut_data.loc[insSeedA_designs, "orig_mi"]
mut_data_nonmut_seedA = mut_data[mut_data["orig_mi"].isin(mirnas_seedA)]
mut_data_nonmut_seedA = mut_data_nonmut_seedA[mut_data_nonmut_seedA.index.str.startswith("5.0_miRNA")]

# Bring the controls into the same "orig_mi" order as the insSeedA designs.
# Map each miRNA to its position among the insSeedA designs ...
orig_mi_order = {mi: i for i, mi in enumerate(mut_data_seedA['orig_mi'])}
# ... attach that position to the controls ...
mut_data_nonmut_seedA['order'] = mut_data_nonmut_seedA['orig_mi'].map(orig_mi_order)
# ... sort by it and remove the helper column again
mut_data_nonmut_seedA_sorted = mut_data_nonmut_seedA.sort_values(by='order')
mut_data_nonmut_seedA_sorted = mut_data_nonmut_seedA_sorted.drop(columns=['order'])

# The scatter plot pairs both frames row by row. Fail loudly if a control is
# missing or a miRNA occurs more than once, instead of plotting misaligned pairs.
assert mut_data_nonmut_seedA_sorted["orig_mi"].tolist() == mut_data_seedA["orig_mi"].tolist(), \
    "insSeedA designs and their non-mutated controls are not aligned one-to-one"

In [None]:
plt.figure(figsize=(2.4, 1.8))

# flatten the data (unroll along cell lines); both frames list the miRNAs in the same order
mut_data_seedA_flatten = mut_data_seedA[cell_lines_measured].values.flatten()
mut_data_nonmut_seedA_flatten = mut_data_nonmut_seedA_sorted[cell_lines_measured].values.flatten()

# x: non-mutated (full) target, y: design with the inserted A.
# The arguments used to be the other way round, which contradicted the axis labels.
plt.scatter(mut_data_nonmut_seedA_flatten, mut_data_seedA_flatten, color="tab:blue", s=6, rasterized=True)
# the squared Pearson correlation does not depend on the argument order
r2 = stats.pearsonr(mut_data_seedA_flatten, mut_data_nonmut_seedA_flatten)[0]**2
# identity line
plt.plot([-1.2, 0.1], [-1.2, 0.1], color="black", linestyle="--")
plt.xlabel(r"log$_{10}$(stability full target)")
plt.ylabel(r"log$_{10}$(stability A at pos 1)")
plt.title(r"r$^2$: "+f"{r2:.2f}", fontsize=7.5)
plt.tight_layout()
# `extension` instead of `format`, so the builtin is not shadowed
for extension in [".png", ".svg"]:
    plt.savefig(os.path.join(plot_folder, f"5.2.1_scatterplot_seedA{extension}"), dpi=300)

## 5.2.2 - Are the chosen microRNAs likely to have crosstalk?

In [15]:
from library2_utils.crosstalk import count_mismatches_in_region, region_split
from library2_utils.NA_sequence_utilities import reverse_complement

In [16]:
def region_split(sequence, regions=[(0, 8), (8, 17), (17 ,21)]):
    """Cut `sequence` into one slice per (start, stop) pair in `regions`.

    The default regions are written for a miRNA sequence. The slices are
    returned as a list, in the order of `regions`.
    """
    pieces = []
    for bounds in regions:
        pieces.append(sequence[bounds[0]:bounds[1]])
    return pieces

In [None]:
let7 = mirbase_high_confidence.loc["hsa-let-7a-5p", "sequence_norm"]
print(let7)
print(region_split(let7))

In [18]:
# For every high-confidence miRNA: a table with the mismatch and wobble result
# of comparing it against every high-confidence miRNA (itself included).
mismatch_dict = {}
high_conf_sequences = mirbase_high_confidence["sequence_norm"]

for mirna_target, intended_target in high_conf_sequences.items():
    query_df = pd.DataFrame(index = mirbase_high_confidence.index, columns = ["mismatch", "wobble"])
    for mirna_query, query in high_conf_sequences.items():
        # the two return values are stored as-is, one table cell each
        mismatch, wobble = count_mismatches_in_region(intended_target, query, regions=[(0, 8), (8, 17), (17, 21)])
        query_df.loc[mirna_query, "mismatch"] = mismatch
        query_df.loc[mirna_query, "wobble"] = wobble

    mismatch_dict[mirna_target] = query_df

In [19]:
# add columns that further summarize the mismatch data
for key, df in mismatch_dict.items():
    # entry 0 of "mismatch": first region handed to count_mismatches_in_region, (0, 8) = seed
    df["mismatch_seed"] = df["mismatch"].apply(lambda x: x[0])

    # entry 1 of "mismatch": second region, (8, 17)
    df["mismatch_17nt"] = df["mismatch"].apply(lambda x: x[1])

    mismatch_dict[key] = df

# Exclusive upper bounds for a miRNA to count as a potential crosstalk partner.
# The earlier comment ("at most 4" / "0 or 1") did not match the code. The
# thresholds that were actually applied are kept and only given names.
max_mismatch_17nt_exclusive = 6   # i.e. at most 5 in region (8, 17)
max_mismatch_seed_exclusive = 3   # i.e. at most 2 in the seed region

mismatch_dict_filter = {}
for key, df in mismatch_dict.items():
    keep = (df["mismatch_17nt"] < max_mismatch_17nt_exclusive) & \
           (df["mismatch_seed"] < max_mismatch_seed_exclusive)
    mismatch_dict_filter[key] = df[keep].copy()

#### Get miRNA expression data

In [14]:
# miRNA expression table; its index is matched against miRNA names further down.
# NOTE(review): the name `df_combined` is reassigned later in this notebook to the
# merged mismatch/wobble table; `used_mirna_data` keeps pointing at the expression data.
df_combined = pd.read_csv('../microrna_data/3_output/Alles_Keller_combined_expression_with_crosstalk.csv', index_col=0)
used_mirna_data = df_combined
used_mirna_name = "combined_dataset"

In [None]:
unique_mirs_mut = mut_data["orig_mi"].unique()
unique_mirs_mut

In [None]:
# NOTE(review): `used_mirna_lin` is not used in the visible part of the notebook;
# the 10** suggests the expression table is log10-scaled - TODO confirm.
used_mirna_lin = 10**used_mirna_data

# For every miRNA with mutated designs: print the candidates that passed the
# mismatch filter, then the expression row of each candidate that is present
# in the expression table (values printed as stored in `used_mirna_data`).
for microrna in unique_mirs_mut:
    print("----------------- NEW MIRNA ----------")
    print(f"microRNA: {microrna}")
    #print(mismatch_dict_filter[microrna])
    print("potentially crosstalking microRNAs:")
    indices = mismatch_dict_filter[microrna].index
    for i in indices:
        print(i, mirbase_high_confidence.loc[i, "sequence_orig"])
    # only candidates with expression data can be printed below
    indices = [index for index in indices if index in used_mirna_data.index]
    for index in indices:
        print("---------------------------")
        print(f"expression for miRNA {index}")
        print(used_mirna_data.loc[index, :])
        print("\n")

#### Which microRNAs could have crosstalk based on this information?

In [None]:
"""
##### let-7a-5p and let-7i-5p
These have massive crosstalk, as expected.

##### miR-107
hsa-miR-103a-3p AGCAGCATTGTACAGGGCTATGA
hsa-miR-107     AGCAGCATTGTACAGGGCTATCA

The expression for 103a-3p is almost always higher.
However, these are the same miRNA for the first 21 nts (which is 100% of the bases I use).

#### miR-16-5p
hsa-miR-15a-5p TAGCAGCACATAATGGTTTGTG
hsa-miR-16-5p  TAGCAGCACGTAAATATTGGCG
hsa-miR-15b-5p TAGCAGCACATCATGGTTTACA
hsa-miR-195-5p TAGCAGCACAGAAATATTGGC

Expression is highly correlated. 16-5p has the highest expression, which probably means crosstalk doesn't matter all that much.
The sequences match up until base 11 (with one wobble base pair) 

##### miR-19b-3p
hsa-miR-19a-3p only has one mismatch: a wobble base pair at base 11.
hsa-miR-19a-3p TGTGCAAATCTATGCAAAACTGA
hsa-miR-19b-3p TGTGCAAATCCATGCAAAACTGA

The expression levels are similar. The concrete mutation I used also has a mismatch with 19a-3p:
5.1_miRNA_miR-19b-3p_mut11 AGTTTTGCATTGATTTGCACA

##### mir-21-5p
There is no predicted crosstalk at all.

##### miR-22-3p
Only predicted crosstalk is with miR-656-5p, which has tiny expression levels (<100 tpm) in all cell lines

##### mir-23a-3p
There is no predicted crosstalk at all.

Very much just maybe:
hsa-miR-23a-3p ATCACATTGCCAGGGATTTCC
hsa-miR-27a-3p TTCACAGTGGCTAAGTTCCGC
hsa-miR-27b-3p TTCACAGTGGCTAAGTTCTGC
These are highly expressed - maybe this explains somewhat weird deviant behavior?

The target for mut7 is "GGAAATCCCTGGCATTGTGAT" with the reverse complement
target ATCACAATGCCAGGGATTTCC
23a-3p ATCACATTGCCAGGGATTTCC
27a-3p TTCACAGTGGCTAAGTTCCGC
27b-3p TTCACAGTGGCTAAGTTCTGC
This turns a mismatch into a wobble base pair - not sure this is convincing enough to exclude it.

##### miR-24-3p
There is no predicted crosstalk at all. This holds even with strongly relaxed criteria.
The outliers are probably not due to crosstalk.

##### miR-31-3p
There is no predicted crosstalk at all.

##### miR-31-5p
hsa-miR-31-5p  AGGCAAGATGCTGGCATAGCT
hsa-miR-885-3p AGGCAGCGGGGTGTAGTGGATA

Only predicted crosstalk is with miR-885-3p, which has tiny expression levels (<100 tpm) in all cell lines.
The mismatches are probably significant enough that this should not confound the analysis.
I also checked the sequencing data from the Keller lab - the expression of miR-885-3p is < 1000 tpm in all cell lines.

##### miR-365a-3p
hsa-miR-365a-3p TAATGCCCCTAAAAATCCTTAT
hsa-miR-365b-3p TAATGCCCCTAAAAATCCTTAT
Predicted crosstalk is with miR-365b-3p. This is not even in any of the expression data.
Also, the sequence is absolutely identical.

# ------------------------------------------------------------------------------------------------
Conclusion: let-7a-5p and let-7i-5p need to be filtered to build the initial model. miR-16-5p might be an issue.
The others can be included without confounding the results. 
"""

## 5.2.3 Add the mutation pattern as columns

In [22]:
mirna_length = 21

def one_hot_encoding(sequences: Union[list, str], alph: str = "DNA"):
    """One-hot encode one sequence or a list of sequences.

    Args:
        sequences: a list of sequences, or a single sequence. Anything that is
            not a list is treated as one sequence.
        alph: alphabet name passed on to `get_alphabet`.

    Returns:
        numpy array of shape (n_sequences, len(first sequence), alphabet size);
        for a single (non-list) input the leading axis is removed.
    """
    alphabet = get_alphabet(alph)
    n_letters = len(alphabet)

    # letter -> column of the one-hot vector
    letter_index = dict(zip(alphabet, range(n_letters)))

    single_input = not isinstance(sequences, list)
    if single_input:
        sequences = [sequences]

    # the length of the first sequence fixes the middle axis
    encoded = np.zeros((len(sequences), len(sequences[0]), n_letters))
    identity = np.eye(n_letters)
    for row, sequence in enumerate(sequences):
        columns = [letter_index[letter] for letter in sequence]
        # picking rows of the identity matrix yields the one-hot vectors
        encoded[row, :, :] = identity[columns]

    return encoded.squeeze(axis=0) if single_input else encoded

def get_alphabet(alph: str = "DNA"):
    """Return the four letters of the requested alphabet as a new list.

    Raises:
        ValueError: if `alph` is neither "DNA" nor "RNA".
    """
    if alph not in ("DNA", "RNA"):
        raise ValueError("Invalid alphabet. Please choose 'DNA' or 'RNA'.")

    # the two alphabets differ only in their last letter
    last_letter = "T" if alph == "DNA" else "U"
    return ["A", "C", "G", last_letter]

def determine_mismatch_type(target_letter, mirna_letter):
    """Classify how a target letter pairs with a miRNA letter (RNA alphabet).

    Returns:
        0 if the target letter is the Watson-Crick partner of the miRNA letter,
        2 if it is the wobble partner (G opposite U, or U opposite G),
        1 otherwise.

    Raises:
        KeyError: if `mirna_letter` is not one of A, C, G, U.
    """
    watson_crick_partner = {"A": "U", "U": "A", "C": "G", "G": "C"}
    # A and C have no wobble partner, hence the empty strings
    wobble_partner = {"A": "", "U": "G", "C": "", "G": "U"}

    perfect = watson_crick_partner[mirna_letter]
    wobble = wobble_partner[mirna_letter]

    if target_letter == perfect:
        return 0
    return 2 if target_letter == wobble else 1

def extract_pattern_from_seq(target, mirna):
    """Build the per-position pairing pattern of a target site against a miRNA.

    The target is read back to front, so that entry i of the result compares
    the i-th letter from the end of `target` with `mirna[i]`.

    Returns:
        list with one code per target letter, as produced by
        `determine_mismatch_type` (0 match, 1 mismatch, 2 wobble). An "A" at
        the first compared target position is always recorded as 0.
    """
    reversed_target = target[::-1]
    codes = []

    for position, target_letter in enumerate(reversed_target):
        # seed A: an A opposite the first miRNA letter is not counted as a mutation
        is_seed_a = position == 0 and target_letter == "A" and mirna[0] != "U"
        if is_seed_a:
            codes.append(0)
        else:
            codes.append(determine_mismatch_type(target_letter, mirna[position]))

    return codes

In [None]:
# Compute the pairing pattern of every design and store it as the string form
# of the list (later cells parse it back with ast.literal_eval).
# A single column assignment replaces the former row-by-row `.loc` writes,
# which were slow and would have clobbered rows sharing an index label.
mut_data["pattern"] = [
    str(extract_pattern_from_seq(target=target, mirna=mirna))
    for target, mirna in zip(mut_data["target"], mut_data["mirna_sequence"])
]

In [None]:
# parse each stored pattern once, then count the 1s (mismatches) and 2s (wobbles)
parsed_patterns = mut_data["pattern"].apply(ast.literal_eval)
mut_data["mismatches"] = parsed_patterns.apply(lambda codes: codes.count(1))
mut_data["wobbles"] = parsed_patterns.apply(lambda codes: codes.count(2))

In [None]:
# designs with exactly one mismatch and no wobble / exactly one wobble and no mismatch
has_one_mismatch = mut_data["mismatches"].eq(1) & mut_data["wobbles"].eq(0)
has_one_wobble = mut_data["wobbles"].eq(1) & mut_data["mismatches"].eq(0)
single_mm = mut_data[has_one_mismatch]
single_wobble = mut_data[has_one_wobble]

### Add the pattern info

Here, we create a "training_df" which a) contains the pattern info of the mutations and b) is unrolled along the cell lines.

In [None]:
# pos_1 ... pos_<mirna_length>: one column per pattern position
position_columns = [f"pos_{position}" for position in range(1, mirna_length + 1)]

# pattern columns first, then the measurements and annotations
training_df_columns = position_columns + [
    "knockdown_orig",
    "knockdown_mut",
    "orig_mi",
    "cell_line",
    "mismatches",
    "wobbles",
]

# create a new empty dataframe
training_df = pd.DataFrame(columns=training_df_columns)

In [None]:
# Unroll mut_data along the cell lines: one training row per (design, cell line).
# Even though the columns are called knockdown, they currently hold the log10
# of the stability; the conversion to knockdown happens later.
#
# Rows are collected in a dict and the frame is built once at the end. Growing
# a DataFrame row by row with `.loc` re-allocates it on every insert and leaves
# every column with object dtype.
training_rows = {}
for design_name, row in mut_data.iterrows():
    # get the original miRNA name
    orig_mi = row["orig_mi"]

    # get rid of the "hsa-" prefix to find the non-mutated (5.0) design
    orig_mi_short = "-".join(orig_mi.split("-")[1:])
    non_mut_index = f"5.0_miRNA_{orig_mi_short}"

    # identical for all cell lines of this design, so computed once
    pattern = ast.literal_eval(row["pattern"])
    mismatches = row["mismatches"]
    wobbles = row["wobbles"]

    for cell_line in cell_lines_measured:
        # value of the non-mutated design in the same cell line
        # (Alternative considered earlier: use the predicted knockdown instead.
        #  Probably not a good idea, since, e.g., miR-21 is off in some cell lines.)
        knockdown_orig = mut_data.loc[non_mut_index, cell_line]
        knockdown_mut = row[cell_line]

        training_rows[f"{design_name}_{cell_line}"] = pattern + [
            knockdown_orig, knockdown_mut, orig_mi, cell_line, mismatches, wobbles
        ]

training_df = pd.DataFrame.from_dict(training_rows, orient="index", columns=training_df_columns)

### Filter out let-7 microRNAs (these are harder to interpret due to crosstalk)

In [None]:
confounding_microRNAs = ["hsa-let-7a-5p", "hsa-let-7i-5p"]

# the let-7 rows go into their own frame; training_df keeps everything else
is_confounded = training_df["orig_mi"].isin(confounding_microRNAs)
training_df_confound = training_df[is_confounded]
training_df = training_df[~is_confounded]

In [None]:
training_df_confound["orig_mi"].unique()

## 5.2.4 - Single-mutation analysis

In [None]:
# rows with exactly one mismatch and no wobble / exactly one wobble and no mismatch
only_one_mismatch = training_df["mismatches"].eq(1) & training_df["wobbles"].eq(0)
only_one_wobble = training_df["mismatches"].eq(0) & training_df["wobbles"].eq(1)
training_df_filter_mismatch = training_df[only_one_mismatch]
training_df_filter_wobble = training_df[only_one_wobble]

In [None]:
# Add a "region" column holding the position (as a string, e.g. "7") of the
# single mismatch (code 1) or the single wobble (code 2) of each row.
# Both frames were filtered to exactly one occurrence of their code, so the
# first hit per row is the only one. One vectorised pass per frame replaces
# the two copy-pasted row loops.
for single_df, code in ((training_df_filter_mismatch, 1), (training_df_filter_wobble, 2)):
    hit_matrix = single_df[position_columns].eq(code).to_numpy()
    # argmax over a boolean row returns the index of its first True
    first_hit = hit_matrix.argmax(axis=1)
    single_df["region"] = [position_columns[column_idx].split("_")[1] for column_idx in first_hit]

#### convert to linear-fold change data (knockdown)

In [None]:
# log10(stability) -> knockdown, i.e. 1/stability, for both value columns of both frames
for single_df in (training_df_filter_mismatch, training_df_filter_wobble):
    for value_column in ("knockdown_orig", "knockdown_mut"):
        single_df[value_column] = 1/10**single_df[value_column]

In [None]:
# knockdown values below 1 are raised to 1 (both columns, both frames)
for single_df in (training_df_filter_mismatch, training_df_filter_wobble):
    for value_column in ("knockdown_orig", "knockdown_mut"):
        below_one = single_df[value_column] < 1
        single_df[value_column] = single_df[value_column].mask(below_one, 1)

In [None]:
# keep rows whose non-mutated knockdown lies strictly between the two limits
upper_limit = 100
lower_limit = 3

orig_knockdown_mm = training_df_filter_mismatch["knockdown_orig"]
orig_knockdown_wobble = training_df_filter_wobble["knockdown_orig"]

df_filter_plot_mm = training_df_filter_mismatch[
    (orig_knockdown_mm > lower_limit) & (orig_knockdown_mm < upper_limit)
]
df_filter_plot_wobble = training_df_filter_wobble[
    (orig_knockdown_wobble > lower_limit) & (orig_knockdown_wobble < upper_limit)
]

#### Convert to relative fold-change data and merge the dataframes

In [None]:
# Relative knockdown: (mutated - 1) / (non-mutated - 1).
# Subtracting one from both makes "no knockdown" correspond to 0.
# Each frame also gets a 'type' label so the two can be told apart after merging.
for plot_df, mutation_type in ((df_filter_plot_mm, 'Mismatch'), (df_filter_plot_wobble, 'Wobble')):
    plot_df["knockdown_mut"] = (plot_df["knockdown_mut"]-1)/(plot_df["knockdown_orig"]-1)
    plot_df['type'] = mutation_type

# mismatch rows first, wobble rows after them
df_combined = pd.concat([df_filter_plot_mm, df_filter_plot_wobble])

### Define limits to median mutation impacts

In [None]:
high_impact_limit = 0.08
medium_impact_limit = 0.3
low_impact_limit = 0.65


def _impact_class(median_ratio):
    """Map a median relative knockdown onto "high" / "mid" / "low" / "no".

    A NaN median (no data for a position) fails every comparison and
    therefore ends up as "no".
    """
    if median_ratio < high_impact_limit:
        return "high"
    if median_ratio < medium_impact_limit:
        return "mid"
    if median_ratio < low_impact_limit:
        return "low"
    return "no"


impact_dict_mm = {}
impact_dict_wobble = {}
median_list_mm = []
median_list_wobble = []
for column in position_columns:
    # "pos_7" -> "7", the form stored in the "region" column
    region_label = column.split("_")[-1]
    median_mm = df_filter_plot_mm.loc[df_filter_plot_mm["region"] == region_label, "knockdown_mut"].median()
    median_wobble = df_filter_plot_wobble.loc[df_filter_plot_wobble["region"] == region_label, "knockdown_mut"].median()
    median_list_mm.append(median_mm)
    median_list_wobble.append(median_wobble)

    impact_dict_mm[column] = _impact_class(median_mm)
    impact_dict_wobble[column] = _impact_class(median_wobble)

In [None]:
# Save the impact classes of all positions as one table (rows: pos_1 ... pos_21).
output_folder = "../outputs/5_mutations"

# exist_ok=True: nothing happens if the folder already exists
os.makedirs(output_folder, exist_ok=True)

# mismatch classes first ...
mutation_impact_df = pd.DataFrame.from_dict(impact_dict_mm, orient="index", columns=["mismatch"])
# ... then the wobble class of the same position
mutation_impact_df["wobble"] = mutation_impact_df.index.to_series().apply(lambda x: impact_dict_wobble[x])

# Manual overrides of the wobble class:
# wherever wobble is "high" and mismatch is "mid", make wobble "mid"
mutation_impact_df["wobble"] = np.where((mutation_impact_df["wobble"] == "high") & (mutation_impact_df["mismatch"] == "mid"), "mid", mutation_impact_df["wobble"])
# wherever mismatch is "no" and wobble is "low", make wobble "no"
mutation_impact_df["wobble"] = np.where((mutation_impact_df["wobble"] == "low") & (mutation_impact_df["mismatch"] == "no"), "no", mutation_impact_df["wobble"])

mutation_impact_df.to_csv(os.path.join(output_folder, "mutation_impact.csv"))

In [None]:
mutation_impact_df

#### Create boxplots without p-values

In [None]:
# x positions "1" ... "21", as strings like the "region" column
order = [str(position) for position in range(1, 22)]

plt.figure(figsize=(3, 1.6))

# black outlines everywhere, small black dots for the outliers
box_style = dict(
    boxprops={'edgecolor': 'black', },
    whiskerprops={'color': 'black'},
    capprops={'color': 'black'},
    medianprops={'color': 'black'},
    flierprops=dict(marker='o', markersize=3, markerfacecolor='black', markeredgewidth=0, linestyle='none'),
)
sns.boxplot(x="region", y="knockdown_mut", data=df_filter_plot_mm, order=order, dodge=True, color="skyblue",
            linewidth=1, showfliers=True, zorder=2, **box_style)

# dashed guides at the three impact limits
for impact_limit in (high_impact_limit, medium_impact_limit, low_impact_limit):
    plt.axhline(y=impact_limit, color='red', linestyle='--', alpha=0.5, linewidth=1)

# rotate xticks by 90 degrees
plt.xticks(rotation=90)
plt.xlabel("position along microRNA")
plt.ylabel("ratio of knockdown\n( mutated / non-mutated )")
plt.title("Effect of a single target site mutation", fontsize=7)

# one letter per impact band, right of the last box
for y_text, letter in ((-0.05, "h"), (0.15, "m"), (0.45, "l"), (0.85, "n")):
    plt.text(21, y_text, letter, color="black", fontsize=7, rotation=0)

plt.ylim(0, 1.3)

plt.tight_layout()
for extension in [".png", ".svg"]:
    plt.savefig(f"{plot_folder}/5.2.4_single_mut_boxplot_no_wobble_{lower_limit}to{upper_limit}{extension}", dpi=300)

In [None]:
# x positions "1" ... "21", as strings like the "region" column
order = [str(position) for position in range(1, 22)]

plt.figure(figsize=(6, 3))
outlier_style = dict(marker='o', markersize=3, markerfacecolor='black', linestyle='none')
sns.boxplot(x="region", y="knockdown_mut", hue="type", data=df_combined, order=order, dodge=True,
            showfliers=True, flierprops=outlier_style)

# dashed guides at the three impact limits
for impact_limit in (high_impact_limit, medium_impact_limit, low_impact_limit):
    plt.axhline(y=impact_limit, color='grey', linestyle='--', alpha=0.5)

plt.xlabel("Position along microRNA")
plt.ylabel("Ratio of knockdown (mut/non-mut)")
plt.title(f"Effect of a single mutation\n{lower_limit} < original knockdown < {upper_limit}")
plt.legend()

plt.tight_layout()
for extension in [".png", ".svg"]:
    plt.savefig(f"{plot_folder}/5.2.4_single_mut_boxplot_{lower_limit}to{upper_limit}{extension}", dpi=300)

In [None]:
%%capture output
# For each miRNA: histogram of the relative knockdown of its single-mismatch
# designs, saved as .png and .svg.
for mirna in df_filter_plot_mm["orig_mi"].unique():
    df_mirna = df_filter_plot_mm[df_filter_plot_mm["orig_mi"] == mirna]
    plt.figure(figsize=(2.4, 1.8))
    sns.histplot(data=df_mirna, x="knockdown_mut", bins=20)
    plt.title(f"{mirna}")
    plt.xlabel("Ratio of knockdown (mut/non-mut)")
    plt.ylabel("Frequency")
    plt.tight_layout()
    for extension in [".png", ".svg"]:
        plt.savefig(f"{plot_folder}/5.2.5_single_mut_hist_{mirna}{extension}", dpi=300)
    # The figure is only needed on disk. Closing it keeps the loop from
    # accumulating one open figure per miRNA.
    plt.close()

In [None]:
# miR-23a-3p rows, strongest remaining knockdown first; show those mutated at position 7
is_mir23a = df_filter_plot_mm["orig_mi"] == "hsa-miR-23a-3p"
df_mirna = df_filter_plot_mm[is_mir23a].sort_values(by="knockdown_mut", ascending=False)
df_mirna[df_mirna["region"] == "7"]

#### Plot with p-values

In [None]:
from scipy.stats import mannwhitneyu

# Mann-Whitney U test per position: mismatch rows against wobble rows.
# NOTE(review): the p-values are not corrected for multiple testing.
results = []
for region in df_combined['region'].unique():
    in_region = df_combined[df_combined['region'] == region]
    group1 = in_region.loc[in_region['type'] == 'Mismatch', 'knockdown_mut']
    group2 = in_region.loc[in_region['type'] == 'Wobble', 'knockdown_mut']
    # a test needs data on both sides
    if len(group1) == 0 or len(group2) == 0:
        continue
    stat, p_value = mannwhitneyu(group1, group2)
    results.append((region, p_value))

# one row per tested region
df_p_values = pd.DataFrame(results, columns=['region', 'p_value'])

# sort by the position of each region in the plotting order
df_p_values['order'] = df_p_values['region'].map(order.index)
df_p_values_sorted = df_p_values.sort_values('order').drop('order', axis=1)

In [None]:
def annotate_significance(ax, df_p_values, df_combined, order, height_factor=1.05, height=False):
    """Write a significance label with a short bar above every x position.

    Args:
        ax: axes to draw on; its `text` and `hlines` methods are used.
        df_p_values: DataFrame with the columns 'region' and 'p_value'.
        df_combined: plotted data; the maximum of its 'knockdown_mut' column is
            the reference height when `height` is not given.
        order: region labels in plotting order; the list position of a label
            is its x coordinate.
        height_factor: the label is placed at reference height * height_factor.
        height: reference height; any falsy value (the default) means "use the
            data maximum".

    Labels: '***' for p < 0.001, '**' for p < 0.01, '*' for p < 0.05 and 'ns'
    otherwise, including regions without a p-value.
    """
    y_max = height if height else df_combined['knockdown_mut'].max()
    line_offset = y_max*0.02

    for region in order:  # Iterate based on the specified order
        # Look the region up in the frame that was passed in. The mask used to
        # be built from the global `df_p_values_sorted`, which only worked when
        # the caller happened to pass exactly that frame.
        p_value = df_p_values.loc[df_p_values['region'] == region, 'p_value'].values
        # if non-empty, take the first match
        p_value = p_value[0] if len(p_value) > 0 else None

        # Annotation text based on p-value
        if p_value is None:
            text = 'ns'
        elif p_value < 0.001:
            text = '***'
        elif p_value < 0.01:
            text = '**'
        elif p_value < 0.05:
            text = '*'
        else:
            text = 'ns'

        x_loc = order.index(region)  # x location is based directly on the order list
        ax.text(x_loc, y_max * height_factor, text, ha='center', va='bottom')

        # short horizontal bar slightly below the label
        line_y = y_max * (height_factor - 0.5 * line_offset)
        ax.hlines(line_y, x_loc - 0.2, x_loc + 0.2, color="black", linewidth=2)

In [None]:
# Grouped boxplot (mismatch vs. wobble per position) with significance labels
plt.figure(figsize=(6, 2.4))
outlier_style = dict(marker='o', markersize=3, markerfacecolor='black', linestyle='none')
ax = sns.boxplot(x="region", y="knockdown_mut", hue="type", data=df_combined, order=order, dodge=True,
                 showfliers=True, flierprops=outlier_style)
annotate_significance(ax, df_p_values_sorted, df_combined, order, height=1.55)

# dashed guides at the three impact limits
for impact_limit in (high_impact_limit, medium_impact_limit, low_impact_limit):
    plt.axhline(y=impact_limit, color='grey', linestyle='--', alpha=0.7)

plt.xlabel("Position along the microRNA")
plt.ylabel("Ratio of knockdown (mut/non-mut)")

plt.xlim(-1, 21)
plt.ylim(0, 1.8)
plt.title(f"Effect of a single mismatch or wobble on the relative knockdown [knockdown = (1/stability) - 1]\nrestricted to {lower_limit} < non-mutated knockdown < {upper_limit} to reduce noise", fontsize=7.5)
plt.legend(loc = [0.15, 0.65])

# names of the impact bands, right of the axes: (x, y, text, font size)
band_labels = (
    (21.55, -0.05, "high", 7),
    (21.5, 0.15, "medium", 7),
    (21.5, 0.45, "low", 7),
    (21.5, 0.85, "none", 7),
    (21.5, 1.2, "impact:", 8),
)
for x_text, y_text, label, size in band_labels:
    plt.text(x_text, y_text, label, color="black", fontsize=size, rotation=0)

plt.tight_layout()
for extension in [".png", ".svg"]:
    plt.savefig(f"{plot_folder}/5.2.4_single_mut_boxplot_{lower_limit}to{upper_limit}_with_significance{extension}", dpi=300)

## 5.2.6 - Boxplots by the pre-defined prefix (essentially, location in or outside the seed)

In [None]:
# design prefix = first four characters of the row label without "_"
# (e.g. "5.1_" -> "5.1", "5.10" stays "5.10")
for frame in (training_df, training_df_confound):
    frame["prefix"] = frame.index.str[:4].str.replace("_", "")

In [None]:
# Split the prefixes 5.1 (single mismatch) and 5.2 (single wobble) by location:
# suffix "a" if the mutation is among the first 8 pattern positions, "b" otherwise.
# Rows of these prefixes whose pattern holds no such mutation become "5.0".
def _refined_prefix(row):
    """Return the refined prefix of one training row (see the comment above)."""
    code_by_prefix = {"5.1": 1, "5.2": 2}
    prefix = row["prefix"]
    if prefix not in code_by_prefix:
        return prefix

    pattern = row[position_columns].to_list()
    try:
        pos = pattern.index(code_by_prefix[prefix])
    except ValueError:
        # no mismatch / wobble left in the pattern
        return "5.0"
    return f"{prefix}a" if pos < 8 else f"{prefix}b"


for frame in (training_df, training_df_confound):
    frame["prefix"] = [_refined_prefix(row) for _, row in frame.iterrows()]

In [None]:
# prefix 5.3 (A in seed) does not act as a mutation, so remove it from both frames
training_df = training_df[training_df["prefix"].ne("5.3")]
training_df_confound = training_df_confound[training_df_confound["prefix"].ne("5.3")]

In [None]:
prefix_dict = {
 '5.0': {},
 '5.1a': {'n_mut_seed': 1},
 '5.1b': {'n_mut_outside': 1},
 '5.2a': {'n_wobble_seed': 1},
 '5.2b': {'n_wobble_outside': 1},
 '5.4': {'n_mut_seed': 2},
 '5.5': {'n_wobble_seed': 2},
 '5.6': {'n_mut_seed': 1, 'n_mut_outside': 1},
 '5.7': {'n_wobble_seed': 1, 'n_mut_outside': 1},
 '5.8': {'n_mut_outside': 2},
 '5.9': {'n_mut_outside': 1, 'n_wobble_outside': 1},
 '5.10': {'n_wobble_outside': 2},
 '5.11': {'n_mut_seed': 2, 'n_mut_outside': 1},
 '5.12': {'n_mut_seed': 1, 'n_mut_outside': 2},
 '5.13': {'n_mut_seed': 1, 'n_mut_outside': 1, 'n_wobble_outside': 1},
 '5.14': {'n_mut_outside': 3},
 '5.15': {'n_mut_outside': 2, 'n_wobble_outside': 1},
 '5.16': {'n_mut_outside': 1, 'n_wobble_outside': 2},
 '5.17': {'n_wobble_outside': 3},
 '5.18': {'n_mut_seed': 1, 'n_mut_outside': 3},
 '5.19': {'n_mut_seed': 1, 'n_mut_outside': 2, 'n_wobble_outside': 1},
 '5.20': {'n_mut_seed': 1, 'n_mut_outside': 1, 'n_wobble_outside': 2},
 '5.21': {'n_mut_outside': 4},
 '5.22': {'n_mut_outside': 3, 'n_wobble_outside': 1},
 '5.23': {'n_mut_outside': 2, 'n_wobble_outside': 2},
 '5.24': {'n_mut_outside': 1, 'n_wobble_outside': 3},
 '5.25': {'n_mut_seed': 1, 'n_mut_outside': 4},
 '5.26': {'n_mut_seed': 1, 'n_mut_outside': 3, 'n_wobble_outside': 1},
 '5.27': {'n_mut_seed': 1, 'n_mut_outside': 2, 'n_wobble_outside': 2},
 '5.28': {'n_mut_outside': 5},
 '5.29': {'n_mut_outside': 3, 'n_wobble_outside': 2},
 '5.30': {'n_mut_seed': 1, 'n_mut_outside': 5},
 '5.31': {'n_mut_outside': 6},
 '5.32': {'n_mut_outside': 4, 'n_wobble_outside': 2}
}

# One four-line tick label per prefix: the counts for check_keys from top to
# bottom, with 0 for every key a prefix does not define.
xticklabels = []
median_classification_dict = {}
check_keys = ["n_mut_seed", "n_mut_outside", "n_wobble_seed", "n_wobble_outside"]
for key, value in prefix_dict.items():
    key_numbers = [value.get(check_key, 0) for check_key in check_keys]
    xticklabels.append("\n".join(str(number) for number in key_numbers))

In [None]:
# log10(stability) of all remaining designs, grouped by design prefix
plt.figure(figsize=(6, 2.4))

sns.boxplot(x="prefix", y="knockdown_mut", data=training_df, showfliers=True, order=list(prefix_dict), color="tab:blue",
            flierprops=dict(marker='o', markersize=1, markerfacecolor='black', linestyle='none'))
plt.xticks(ticks=np.arange(len(prefix_dict)), labels=xticklabels)

plt.ylabel(r"log$_{10}$(stability)")
# legend for the four lines of each tick label
plt.xlabel("   # mutations in seed\n" +
            "            # mutations outside seed\n" +
            "# wobbles in seed\n"+
            "        # wobbles outside seed")
# run after the four-line xlabel is set, so the layout leaves room for it
plt.tight_layout()
for extension in [".png", ".svg"]:
    # the extension already carries its dot; the old pattern produced "boxplot..png"
    plt.savefig(f"{plot_folder}/5.2.6_training_df_boxplot{extension}", dpi=300)

In [None]:
# Same plot for the let-7 designs that were set aside because of crosstalk
plt.figure(figsize=(6, 2.4))

sns.boxplot(x="prefix", y="knockdown_mut", data=training_df_confound, showfliers=True, order=list(prefix_dict), color="tab:blue",
            flierprops=dict(marker='o', markersize=1, markerfacecolor='black', linestyle='none'))
plt.xticks(ticks=np.arange(len(prefix_dict)), labels=xticklabels)

plt.ylabel(r"log$_{10}$(stability)")
# legend for the four lines of each tick label
plt.xlabel("   # mutations in seed\n" +
            "            # mutations outside seed\n" +
            "# wobbles in seed\n"+
            "        # wobbles outside seed")
# run after the four-line xlabel is set, so the layout leaves room for it
plt.tight_layout()
for extension in [".png", ".svg"]:
    # the extension already carries its dot; the old pattern produced "boxplot..png"
    plt.savefig(f"{plot_folder}/5.2.6_training_df_confound_boxplot{extension}", dpi=300)

## 5.2.7 - Boxplot by the classified mutation impact

In [None]:
def get_median_mut_classification(input_df, impact_df):
    """Count, per row, how many mutated / wobble positions fall into each impact class.

    Parameters
    ----------
    input_df : pandas.DataFrame
        One row per design. Every column whose name contains "pos_" is treated as a
        position column: a value of 1 marks a mismatch, a value of 2 marks a wobble,
        any other value is ignored.
    impact_df : pandas.DataFrame
        Indexed by the position column names, with the columns "mismatch" and
        "wobble" holding the impact class ("high", "mid", "low" or "no") of a
        mismatch / wobble at that position. Any other class value is not counted.

    Returns
    -------
    pandas.DataFrame
        A copy of ``input_df`` extended by
        ``no_high_impact``, ``no_mid_impact``, ``no_low_impact``, ``no_no_impact``
        (counts per class), ``no_total_impact`` (their sum),
        ``classification`` (the five counts joined by newlines, or "\\n\\n\\n\\n>4"
        when the total exceeds 4), ``summary_*_impact`` (counts capped at a
        per-class threshold and written as ">threshold") and
        ``classification_summary`` (the four capped counts joined by newlines).

    Raises
    ------
    KeyError
        If a position that carries a mismatch or wobble in at least one row is
        missing from ``impact_df``.

    Notes
    -----
    The counts are accumulated column by column instead of writing single cells
    with ``df.loc[index, ...]`` inside ``iterrows``. This is much faster on large
    frames and gives every row its own counts even when index labels repeat
    (label-based cell writes would overwrite all rows sharing a label).
    """
    df = input_df.copy()
    position_columns = [col for col in df.columns if "pos_" in col]
    impact_levels = ["high", "mid", "low", "no"]

    # per-class counters, one entry per row (positional, independent of the index)
    counts = {level: np.zeros(len(df), dtype=int) for level in impact_levels}
    for col in position_columns:
        for code, impact_column in ((1, "mismatch"), (2, "wobble")):
            is_hit = (df[col] == code).to_numpy(dtype=bool)
            # only look the position up when it is actually used, so positions
            # that never occur do not have to be present in impact_df
            if not is_hit.any():
                continue
            level = impact_df.loc[col, impact_column]
            if level in counts:
                counts[level] += is_hit

    # stored as float, the dtype the cell-wise implementation produced
    for level in impact_levels:
        df[f"no_{level}_impact"] = counts[level].astype(float)
    df["no_total_impact"] = sum(counts.values()).astype(float)

    df["classification"] = df["no_high_impact"].astype(int).astype(str) + "\n" +\
                            df["no_mid_impact"].astype(int).astype(str) + "\n" + \
                            df["no_low_impact"].astype(int).astype(str) + "\n" + \
                            df["no_no_impact"].astype(int).astype(str) + "\n" + \
                            df["no_total_impact"].astype(int).astype(str)

    # for those with a total larger than 4, set the classification to >4
    df.loc[df["no_total_impact"] > 4, "classification"] = "\n\n\n\n>4"

    # Summarize classification with threshold
    def summarize_impact(x, threshold=1):
        return '>{}'.format(threshold) if x > threshold else str(int(x))

    df["summary_high_impact"] = df["no_high_impact"].apply(lambda x: summarize_impact(x, 1))
    df["summary_mid_impact"] = df["no_mid_impact"].apply(lambda x: summarize_impact(x, 2))
    df["summary_low_impact"] = df["no_low_impact"].apply(lambda x: summarize_impact(x, 3))
    df["summary_no_impact"] = df["no_no_impact"].apply(lambda x: summarize_impact(x, 3))

    df["classification_summary"] = df["summary_high_impact"] + "\n" + df["summary_mid_impact"] + "\n" + \
                                    df["summary_low_impact"] + "\n" + df["summary_no_impact"]

    return df

In [None]:
# add the per-row impact counts and classification labels to both data sets
# (both names are rebound to the extended copies returned by the function)
training_df  = get_median_mut_classification(training_df, mutation_impact_df)
training_df_confound = get_median_mut_classification(training_df_confound, mutation_impact_df)

In [None]:
plt.figure(figsize=(6.5, 2.4))

# one box per distinct "classification" label (five newline-separated counts per label)
sns.boxplot(x="classification", y="knockdown_mut", data=training_df, showfliers=True, color="tab:blue",
            flierprops=dict(marker='o', markersize=1, markerfacecolor='black', linestyle='none'))
plt.ylabel(r"log$_{10}$(stability)")

plt.xticks(fontsize = 6)
plt.tight_layout()
# the x label acts as a row legend for the five-line tick labels; it is set after
# tight_layout, so it is not taken into account by the layout
plt.xlabel("          # mutations high impact\n" +
        "                # mutations medium impact\n" +
        "         # mutations low impact\n" +
        "        # mutations no impact\n" +
        "# mutations total")

# the extensions carry no leading dot because the f-string already inserts one
# (the previous list produced file names ending in "..png" / "..svg");
# the loop variable is not called `format` so the builtin is not shadowed
for file_format in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/5.2.7_training_df_boxplot_impact.{file_format}", dpi=300)

In [None]:
plt.figure(figsize=(6.5, 2.4))

# same plot as for training_df, but for the confound data set
sns.boxplot(x="classification", y="knockdown_mut", data=training_df_confound, showfliers=True, color="tab:blue",
            flierprops=dict(marker='o', markersize=1, markerfacecolor='black', linestyle='none'))
plt.ylabel(r"log$_{10}$(stability)")

plt.xticks(fontsize = 6)
plt.tight_layout()
# the x label acts as a row legend for the five-line tick labels; it is set after
# tight_layout, so it is not taken into account by the layout
plt.xlabel("          # mutations high impact\n" +
        "                # mutations medium impact\n" +
        "         # mutations low impact\n" +
        "        # mutations no impact\n" +
        "# mutations total")

# the extensions carry no leading dot because the f-string already inserts one
# (the previous list produced file names ending in "..png" / "..svg");
# the loop variable is not called `format` so the builtin is not shadowed
for file_format in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/5.2.7_training_df_confound_boxplot_impact.{file_format}", dpi=300)

# 5.3 - Can we predict mutation impact using a heuristic model?

Here, we try to predict the impact of multiple mutations by looking up the measured impact of each individual mutation for the same miRNA in the same cell line. The single-mutation effects are then combined in one of three ways: a) take the single mutation with the maximum impact, b) sum the impacts of the individual mutations, or c) multiply the impacts of the individual mutations.

In [None]:
# work on a copy so that the columns added for the heuristic model do not end up in training_df
all_mutations = training_df.copy()

In [None]:
# split the designs by the prefix contained in their index label:
# 5.0 = non-mutated, 5.1 = single mutations, 5.2 = single wobbles.
# The single-site rows are kept as separate frames; all three groups are removed
# from all_mutations so that only the remaining designs are predicted.

design_names = list(all_mutations.index)
no_mutations_index = [name for name in design_names if "5.0_miRNA" in name]
single_mutations_index = [name for name in design_names if "5.1_miRNA" in name]
single_wobble_index = [name for name in design_names if "5.2_miRNA" in name]

single_mutations = all_mutations.loc[single_mutations_index, :]
single_wobbles = all_mutations.loc[single_wobble_index, :]

all_mutations = (
    all_mutations
    .drop(single_mutations_index)
    .drop(single_wobble_index)
    .drop(no_mutations_index)
)

In [None]:
def _first_site_lookup(single_site_rows, site_code):
    """Map "<cell_line>_<miRNA>_<position>" to knockdown_mut for single-site designs.

    For every row, the first position column (in position_columns order) whose
    value equals site_code determines the position; rows without such a column
    are skipped. A later row with the same key overwrites an earlier one.
    """
    lookup = {}
    for _, record in single_site_rows.iterrows():
        hits = [int(col.split("_")[-1]) for col in position_columns if record[col] == site_code]
        if len(hits) == 0:
            continue
        lookup[f"{record['cell_line']}_{record['orig_mi']}_{hits[0]}"] = record["knockdown_mut"]
    return lookup


# value 1 in a position column marks a mismatch, value 2 a wobble
single_mutation_dict = _first_site_lookup(single_mutations, 1)
single_wobble_dict = _first_site_lookup(single_wobbles, 2)

In [None]:
# Heuristic prediction for every multi-site design: look up the measured single-site
# values for the same cell line and miRNA and combine them in three ways
# (strongest single effect, sum of effects, product of effects).
for index, row in all_mutations.iterrows():
    # positions (number part of the "pos_<n>" column name, as a string) that carry
    # a mismatch (value 1) or a wobble (value 2)
    positions_mut = [col.split("_")[-1] for col in position_columns if row[col] == 1]
    positions_wobble = [col.split("_")[-1] for col in position_columns if row[col] == 2]

    # get the cell line and microRNA
    cell_line = row["cell_line"]
    miRNA = row["orig_mi"]

    # --------------------------------------------------------------------------------------------
    # collect the single-site measurement for every mismatch and wobble;
    # sites without a measurement for this cell line / miRNA are skipped
    knockdown_single = []
    for position in positions_mut:
        lookup_key = f"{cell_line}_{miRNA}_{position}"
        if lookup_key in single_mutation_dict:
            knockdown_single.append(single_mutation_dict[lookup_key])
    for position in positions_wobble:
        lookup_key = f"{cell_line}_{miRNA}_{position}"
        if lookup_key in single_wobble_dict:
            knockdown_single.append(single_wobble_dict[lookup_key])

    # --------------------------------------------------------------------------------------------
    # MAXIMUM EFFECT
    # the data should be log10(stability) at this point, so the maximum is the
    # single site with the highest stability, i.e. the largest impact
    if len(knockdown_single) == 0:
        # nothing to combine: mark the row so that it is dropped below
        all_mutations.loc[index, "knockdown_strongest"] = np.nan
        continue
    all_mutations.loc[index, "knockdown_strongest"] = max(knockdown_single)

    # for testing purposes: trace the intermediate values of one specific design
    print_flag = False
    if index == "5.32_miRNA_miR-31-3p_smut_swob_omut16.18.19.21_owob14.17_JEG3":
        print_flag = True

    if print_flag:
        print(row["knockdown_orig"])

    # --------------------------------------------------------------------------------------------
    # convert the original stability to a linear value
    knockdown_orig = 10**row["knockdown_orig"]
    # cap the stability at 0.99 so that the knockdown below stays strictly positive
    knockdown_orig = knockdown_orig if knockdown_orig < 0.99 else 0.99
    # convert to an actual knockdown value (minus one so that no change is 0)
    knockdown_orig = (1/knockdown_orig)-1
    if print_flag:
        print(knockdown_orig)

    # same conversion for the single-site values
    knockdown_single = [10**x for x in knockdown_single]
    knockdown_single = [x if x < 0.99 else 0.99 for x in knockdown_single]
    knockdown_single = [(1/x)-1 for x in knockdown_single]
    # --------------------------------------------------------------------------------------------

    # at this point knockdown_single tells you how much the expression changes for each mutation
    # we convert this to a relative fold change compared to the original knockdown
    knockdown_single = [x/knockdown_orig for x in knockdown_single]

    # if any of the resultant values is larger than 1, set it to 1
    # the assumption is that the non-mutated microRNA is the most potent
    # doing this can reduce noise and weird behavior in the following
    knockdown_single = [x if x < 1 else 1 for x in knockdown_single]

    # invert so that each value is a loss factor >= 1 that can be summed or multiplied:
    # multiple mutations increase the loss of activity
    knockdown_single = [1/x for x in knockdown_single]
    if print_flag:
        print(knockdown_single)

    # calculate the aggregate using sum or product
    # for the sum, if all values are 1, we want a sum of one
    aggregate_knockdown_sum = np.sum(knockdown_single) - len(knockdown_single) + 1
    # np.prod replaces np.product, which was removed in NumPy 2.0
    aggregate_knockdown_product = np.prod(knockdown_single)
    if print_flag:
        print(aggregate_knockdown_sum)
    # invert the two again and multiply them by the original knockdown
    aggregate_knockdown_sum = knockdown_orig/aggregate_knockdown_sum
    aggregate_knockdown_product = knockdown_orig/aggregate_knockdown_product

    # convert the two back to log10 expression data
    aggregate_knockdown_sum = np.log10(1/(aggregate_knockdown_sum+1))
    aggregate_knockdown_product = np.log10(1/(aggregate_knockdown_product+1))

    all_mutations.loc[index, "knockdown_aggregate_sum"] = aggregate_knockdown_sum
    all_mutations.loc[index, "knockdown_aggregate_product"] = aggregate_knockdown_product

# drop the designs for which no prediction could be made
all_mutations = all_mutations.dropna(
    subset=["knockdown_strongest", "knockdown_aggregate_sum", "knockdown_aggregate_product"]
)

In [None]:
plt.figure(figsize=(2.4, 1.8))

# squared Pearson correlation between each heuristic prediction and the measurement
r2_single = stats.pearsonr(all_mutations["knockdown_strongest"], all_mutations["knockdown_mut"])[0]**2
r2_agg_sum = stats.pearsonr(all_mutations["knockdown_aggregate_sum"], all_mutations["knockdown_mut"])[0]**2
r2_agg_prod = stats.pearsonr(all_mutations["knockdown_aggregate_product"], all_mutations["knockdown_mut"])[0]**2

# predicted (x) against measured (y), one colour per heuristic
plt.scatter(all_mutations["knockdown_strongest"], all_mutations["knockdown_mut"], s=1, color="blue", label="maximum effect, r$^2$: " + f"{r2_single:.2f}", rasterized=True)
plt.scatter(all_mutations["knockdown_aggregate_sum"], all_mutations["knockdown_mut"], s=1, color="red", label="sum of effects, r$^2$: " + f"{r2_agg_sum:.2f}", rasterized=True)
plt.scatter(all_mutations["knockdown_aggregate_product"], all_mutations["knockdown_mut"], s=1, color="green", label="product of effects, r$^2$: " + f"{r2_agg_prod:.2f}", rasterized=True)

# identity line: points on it are predicted perfectly
plt.plot([-1.5, 0], [-1.5, 0], linestyle="--", color="black")
plt.xlabel(r"log$_{10}$(stability predicted)")
plt.ylabel(r"log$_{10}$(stability measured)")

plt.tight_layout()
# the legend is placed after tight_layout, in axes coordinates
plt.legend(loc = [0.7, 0.0])
# NOTE(review): `format` shadows the builtin of the same name for the rest of the session
for format in ["png", "svg"]:
    plt.savefig(f"{base_plot_folder}/5.3-heuristic_model.{format}", dpi=600)

In [None]:
# rank the designs by how far the product model is off from the measurement
# and display the ten with the largest absolute error
prediction_error = all_mutations["knockdown_aggregate_product"] - all_mutations["knockdown_mut"]
all_mutations["difference"] = prediction_error
all_mutations["abs_difference"] = prediction_error.abs()
all_mutations_sorted = all_mutations.sort_values("abs_difference", ascending=False)
all_mutations_sorted[["knockdown_mut", "knockdown_orig", "knockdown_aggregate_product"]].head(10)

# 5.4 - Build a tree model to predict mutation impact

In [None]:
# folder for the tables and models written in section 5.4
output_folder = f"../outputs/5_mutations"

# create it if it does not exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

In [None]:
# save both data sets (including the impact count and classification columns
# added in 5.2.7) to the output_folder
training_df.to_csv(f"{output_folder}/5.4_training_df.csv")
training_df_confound.to_csv(f"{output_folder}/5.4_training_df_confound.csv")

In [None]:
# reload both data sets from the CSV files written above, so that section 5.4 can
# start from here; the first CSV column is used as the index again
training_df = pd.read_csv(f"{output_folder}/5.4_training_df.csv", index_col=0)
training_df_confound = pd.read_csv(f"{output_folder}/5.4_training_df_confound.csv", index_col=0)

In [None]:
# plot folder for section 5.4 (rebinds the notebook-wide plot_folder)
plot_folder = os.path.join(base_plot_folder, "5.4_tree_models")

# create it if it does not exist
if not os.path.exists(plot_folder):
    os.makedirs(plot_folder)

## 5.4.1 - Data preprocessing

### Convert the impact mutation count columns to integers

In [None]:
# the four per-class mutation counts that are used as model features
impact_columns = ["no_high_impact", "no_mid_impact", "no_low_impact", "no_no_impact"]

# cast the impact count columns to integers
# NOTE(review): only training_df is cast here; training_df_confound keeps its dtypes
for col in impact_columns:
    training_df[col] = training_df[col].astype("int")

### Split off a single mirna for testing purposes

In [None]:
# list the miRNAs in the data set (one of them is held out as test data below)
training_df["orig_mi"].unique()

In [None]:
def process_training_df(df):
    """Turn log10 stability columns into a ratio target for model training.

    A copy of ``df`` is returned in which

    * ``knockdown_orig_archive`` / ``knockdown_mut_archive`` hold the untouched
      input values of ``knockdown_orig`` / ``knockdown_mut``,
    * ``knockdown_orig`` is ``1 / 10**knockdown_orig``,
    * ``knockdown_mut`` is ``(1 / 10**knockdown_mut - 1) / (knockdown_orig - 1)``
      with the new ``knockdown_orig``, limited to the range 0..1 (values that are
      not greater than 0 become 0, values that are not smaller than 1 become 1).

    The input frame is not modified.
    """
    def _floor_at_zero(value):
        # anything that is not strictly positive (including NaN) becomes 0
        return value if value > 0 else 0

    def _cap_at_one(value):
        # anything that is not strictly below 1 becomes 1
        return value if value < 1 else 1

    processed = df.copy()

    # keep the original log10 values (needed later to compare on the log scale)
    processed["knockdown_orig_archive"] = processed["knockdown_orig"]
    processed["knockdown_mut_archive"] = processed["knockdown_mut"]

    # log10 -> linear -> inverted
    inverted_orig = 1 / (10 ** processed["knockdown_orig"])
    inverted_mut = 1 / (10 ** processed["knockdown_mut"])

    # mutant value relative to the original one, both shifted so that "no change" is 0
    relative_mut = (inverted_mut - 1) / (inverted_orig - 1)

    processed["knockdown_orig"] = inverted_orig
    processed["knockdown_mut"] = relative_mut.apply(_floor_at_zero).apply(_cap_at_one)

    return processed

In [None]:
# convert both data sets to the ratio representation
# NOTE(review): this rebinds training_df / training_df_confound, so running the cell a
# second time would transform already-transformed values; reload the CSVs first
training_df = process_training_df(training_df)
training_df_confound = process_training_df(training_df_confound)

In [None]:
# here, we exclude one of the miRNAs to later use it as test data
excluded_mirna = "hsa-miR-31-5p"
# rows of the held-out miRNA ...
training_df_excluded = training_df[training_df["orig_mi"] == excluded_mirna]
# ... and all remaining rows (training_df is rebound to the reduced set)
training_df = training_df[training_df["orig_mi"] != excluded_mirna]

### Filter the data such that the original knockdown is reasonably high
#### This is necessary for the ratio to be meaningful

In [None]:
# print the length of both
print(len(training_df))
print(len(training_df_excluded))

# filter to those with "knockdown_orig" > 3 (same as we did above)
# this is necessary to reduce noise
# (knockdown_orig is the inverted linear value written by process_training_df at this
# point, not the log10 value; the log10 value is kept in knockdown_orig_archive)
training_df_filter = training_df[training_df["knockdown_orig"] > 3]
training_df_excluded_filter = training_df_excluded[training_df_excluded["knockdown_orig"] > 3]

# lengths after filtering
print(len(training_df_filter))
print(len(training_df_excluded_filter))

### Generate X and y

In [None]:
# Here, we define what information we actually want to use for the training
# The simplest way is to use the impact columns only and give no information about the original knockdown
# (training_columns is the same list object as impact_columns, not a copy)
training_columns = impact_columns

# define X and y columns
# y is the ratio written to "knockdown_mut" by process_training_df (limited to 0..1)
X = training_df_filter[training_columns]
y = training_df_filter["knockdown_mut"]
X_excluded = training_df_excluded_filter[training_columns]
y_excluded = training_df_excluded_filter["knockdown_mut"]

# also save the original knockdown so that the data can be converted back to expression data later
y_restore = training_df_filter["knockdown_orig"]
y_restore_excluded = training_df_excluded_filter["knockdown_orig"]

In [None]:
# alias them as X_train, y_train, X_test, y_test
# train = all miRNAs except excluded_mirna, test = the held-out excluded_mirna
# (X, y, X_excluded and y_excluded are rebound again further down; these aliases are not)
X_train = X
y_train = y

X_test = X_excluded
y_test = y_excluded

## 5.4.2 - Train an XGBoost tree

In [None]:
import xgboost as xgb

In [None]:
# hyperparameter grid for the XGBoost screen (every combination is fitted once)
estimators = [50, 100, 200, 500, 1000, 1500]
max_depths = [5, 6, 7, 8, 9]
learning_rates = [0.1, 0.2, 0.3]
reg_lambdas = [1, 2, 3]

# number of combinations, used for the progress output only
total_combs = len(estimators)*len(max_depths)*len(learning_rates)*len(reg_lambdas)

In [None]:
# result table of the screen; the columns "reg_lambda", "mse_train" and "mse_test" are not
# listed here and are added by the first concatenated row
screen_params = pd.DataFrame(columns=["n_estimators", "max_depth", "learning_rate", "training_r2", "test_r2"])

# NOTE(review): the best combination is later chosen by "mse_test", i.e. on the held-out
# miRNA that is also reported as test data, so the test score is not an independent estimate
i = 0
for n_estimators, max_depth, learning_rate, reg_lambda in itertools.product(estimators, max_depths, learning_rates, reg_lambdas):
    # create model instance
    bst = xgb.XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, enable_categorical=False, reg_lambda=reg_lambda)
    # fit model
    bst.fit(X_train, y_train)
    # make predictions
    preds_train = bst.predict(X_train)
    preds_test = bst.predict(X_test)
    
    # restore the values from ratio to regular log10 expression
    # (inverse of the transformation in process_training_df; y_restore is the inverted linear original value)
    preds_train = np.log10(1/(((y_restore - 1) * preds_train) + 1))
    preds_test = np.log10(1/(((y_restore_excluded - 1) * preds_test) + 1))
    
    # same back-transformation for the targets, so that both are compared on the log10 scale
    y_train_restored = np.log10(1/(((y_restore - 1) * y_train) + 1))
    y_test_restored = np.log10(1/(((y_restore_excluded - 1) * y_test) + 1))
    
    # calculate r2 and the mse
    # (r2 here is the squared Pearson correlation, not the coefficient of determination)
    r2_train = stats.pearsonr(preds_train, y_train_restored)[0]**2
    r2_test = stats.pearsonr(preds_test, y_test_restored)[0]**2
    
    mse_train = np.mean((preds_train - y_train_restored)**2)
    mse_test = np.mean((preds_test - y_test_restored)**2)
    
    # add the new row to the screen_params dataframe
    # NOTE(review): the index is not reset, so every row keeps the index label 0
    new_row = {"n_estimators": n_estimators, "max_depth": max_depth, "learning_rate": learning_rate,
               "reg_lambda": reg_lambda, "training_r2": r2_train, "test_r2": r2_test,
               "mse_train": mse_train, "mse_test": mse_test}
    screen_params = pd.concat([screen_params, pd.DataFrame([new_row])])
    
    # progress output every ten combinations
    i += 1
    if i % 10 == 0:
        print(f"Progress: {i}/{total_combs}")

In [None]:
parameters = ["n_estimators", "max_depth", "learning_rate", "reg_lambda"]
# pick the combination with the lowest MSE on the held-out miRNA
optimal_screen_params = screen_params.sort_values("mse_test", ascending=True).head(1)
print(optimal_screen_params)

# Selecting a single row across several numeric columns yields one Series with a
# common dtype, which can turn the integer parameters into floats. The values are
# therefore cast back explicitly before they are handed to the estimator.
best_row = optimal_screen_params[parameters].iloc[0]
optimal_screen_params = {
    "n_estimators": int(best_row["n_estimators"]),
    "max_depth": int(best_row["max_depth"]),
    "learning_rate": float(best_row["learning_rate"]),
    "reg_lambda": float(best_row["reg_lambda"]),
}

bst = xgb.XGBRegressor(**optimal_screen_params)
bst.fit(X_train, y_train)

# make predictions
preds_train = bst.predict(X_train)
preds_test = bst.predict(X_test)

# restore the values from ratio to regular log10 expression
preds_train = np.log10(1/(((y_restore - 1) * preds_train) + 1))
preds_test = np.log10(1/(((y_restore_excluded - 1) * preds_test) + 1))

# same back-transformation for the targets
y_train_restored = np.log10(1/(((y_restore - 1) * y_train) + 1))
y_test_restored = np.log10(1/(((y_restore_excluded - 1) * y_test) + 1))

### create a log plot of the results (XGBoost)

In [None]:
plt.figure(figsize=(2.4, 1.8))

# calculate r2 (squared Pearson correlation on the restored log10 scale)
r2_test = stats.pearsonr(y_test_restored, preds_test)[0]**2
r2_train = stats.pearsonr(y_train_restored, preds_train)[0]**2

# plot predictions vs true values
plt.scatter(y_train_restored, preds_train, color = "blue", s=2, label="train, r$^2$: " + f"{r2_train:.2f}")
plt.scatter(y_test_restored, preds_test, color = "red", s=2, label=f"test, r$^2$: " + f"{r2_test:.2f}\n({excluded_mirna})")

# plot a line for x=y
plt.plot([-1.8, 0], [-1.8, 0], color="black", linestyle="--", linewidth=2)

plt.xlabel(r"log$_{10}$(measured stability)")
plt.ylabel(r"log$_{10}$(predicted stability)")

plt.tight_layout()
# the legend is placed to the right of the axes (axes coordinates) after tight_layout
plt.legend(loc = [1,0.5], fontsize=7)
plt.savefig(f"{plot_folder}/5.4.2-XGBoost_{excluded_mirna}.png", dpi=300)

### predict on the entire data (minus the non-mutated miRNAs!)

In [None]:
# unfiltered data (no knockdown_orig threshold) for the train miRNAs and the held-out miRNA
training_df_wo_nonmut = training_df.copy()
training_df_wo_nonmut_excluded = training_df_excluded.copy()

# exclude original mirnas (designs whose index label contains the literal text "5.0_");
# regex=False is required because "." would otherwise match any character
training_df_wo_nonmut = training_df_wo_nonmut[~training_df_wo_nonmut.index.str.contains("5.0_", regex=False)]
training_df_wo_nonmut_excluded = training_df_wo_nonmut_excluded[~training_df_wo_nonmut_excluded.index.str.contains("5.0_", regex=False)]

In [None]:
# define X and y columns
# y is taken from "knockdown_mut_archive", i.e. the log10 value saved by
# process_training_df before the ratio transformation, so no restoring is needed for y
X = training_df_wo_nonmut[training_columns]
y = training_df_wo_nonmut["knockdown_mut_archive"]
X_excluded = training_df_wo_nonmut_excluded[training_columns]
y_excluded = training_df_wo_nonmut_excluded["knockdown_mut_archive"]

# also save the original knockdown so that the data can be converted back to expression data later
y_restore_full = training_df_wo_nonmut["knockdown_orig"]
y_restore_excluded_full = training_df_wo_nonmut_excluded["knockdown_orig"]

In [None]:
# make predictions with the XGBoost model fitted above (bst) on the unfiltered data
preds_train = bst.predict(X)
preds_test = bst.predict(X_excluded)

# restore the values from ratio to regular log10 expression
preds_train = np.log10(1/(((y_restore_full - 1) * preds_train) + 1))
preds_test = np.log10(1/(((y_restore_excluded_full - 1) * preds_test) + 1))

# not needed here: y / y_excluded already hold the archived log10 values
# y_train_restored = np.log10(1/(((y_restore_full - 1) * y) + 1))
# y_test_restored = np.log10(1/(((y_restore_excluded_full - 1) * y_excluded) + 1))

In [None]:
# keep the XGBoost predictions under their own names; preds_train / preds_test are
# overwritten by the regression tree below and both are compared at the end of 5.4.3
preds_train_xgboost = preds_train.copy()
preds_test_xgboost = preds_test.copy()

In [None]:
plt.figure(figsize=(2.4, 1.8))

# calculate r2 (squared Pearson correlation; y / y_excluded are the archived log10 values)
r2_test = stats.pearsonr(y_excluded, preds_test)[0]**2
r2_train = stats.pearsonr(y, preds_train)[0]**2

# plot predictions vs true values
plt.scatter(y, preds_train, color = "blue", s=2, label="train, r$^2$: " + f"{r2_train:.2f}", rasterized=True)
plt.scatter(y_excluded, preds_test, color = "red", s=2, label=f"test, r$^2$: " + f"{r2_test:.2f}\n({excluded_mirna})", rasterized=True)

# plot a line for x=y
plt.plot([-1.8, 0], [-1.8, 0], color="black", linestyle="--", linewidth=2)

plt.xlabel(r"log$_{10}$(measured stability)")
plt.ylabel(r"log$_{10}$(predicted stability)")

plt.tight_layout()
plt.legend(loc = [1,0.5], fontsize=7)
# NOTE(review): unlike the other file names of this section there is no "_" between
# "XGBoost" and the miRNA name; left unchanged because other code may rely on the path
plt.savefig(f"{plot_folder}/5.4.2-XGBoost{excluded_mirna}_full.png", dpi=600)

## 5.4.3 Train a single regression tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error
import itertools

# hyperparameter grid for a single regression tree (every combination is fitted once);
# max_depths and total_combs rebind the names used for the XGBoost screen
max_depths = [5, 6, 7, 8, 9, 10]
min_samples_splits = [2, 3, 5, 10]
min_samples_leafs = [1, 2, 3, 4]
max_leaf_nodes = [None, 10, 20, 100]  # None means unlimited
total_combs = len(max_depths)*len(min_samples_splits)*len(min_samples_leafs)*len(max_leaf_nodes)

# DataFrame to store the results (rebinds screen_params from the XGBoost screen);
# the column "total_leaf_nodes" is not listed here and is added by the first concatenated row
screen_params = pd.DataFrame(columns=[
    "max_depth", "min_samples_split", "min_samples_leaf", "max_leaf_nodes",
    "training_r2", "test_r2", "mse_train", "mse_test"
])

i = 0
# Iterating over the parameter grid
for max_depth, min_samples_split, min_samples_leaf, max_leaf_node in itertools.product(
    max_depths, min_samples_splits, min_samples_leafs, max_leaf_nodes):
    
    # Create the model with current parameter combination
    tree = DecisionTreeRegressor(max_depth=max_depth, min_samples_split=min_samples_split,
                                 min_samples_leaf=min_samples_leaf, max_leaf_nodes=max_leaf_node,
                                 criterion="squared_error")
    
    # Fit model
    tree.fit(X_train, y_train)
    
    # Make predictions
    preds_train = tree.predict(X_train)
    preds_test = tree.predict(X_test)
    
    # restore the values from ratio to regular log10 expression
    preds_train = np.log10(1/(((y_restore - 1) * preds_train) + 1))
    preds_test = np.log10(1/(((y_restore_excluded - 1) * preds_test) + 1))
    
    # same back-transformation for the targets
    y_train_restored = np.log10(1/(((y_restore - 1) * y_train) + 1))
    y_test_restored = np.log10(1/(((y_restore_excluded - 1) * y_test) + 1))
    
    # Calculate R^2 and MSE
    # NOTE(review): r2_score is used here, whereas the XGBoost screen stores the squared
    # Pearson correlation under the same column names; the two are not directly comparable
    r2_train = r2_score(y_train_restored, preds_train)
    r2_test = r2_score(y_test_restored, preds_test)
    mse_train = mean_squared_error(y_train_restored, preds_train)
    mse_test = mean_squared_error(y_test_restored, preds_test)
    
    # Append the results to the DataFrame
    # ("max_leaf_nodes" is the requested limit, "total_leaf_nodes" the size of the fitted tree)
    new_row = {
        "max_depth": max_depth, "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf, "max_leaf_nodes": max_leaf_node,
        "training_r2": r2_train, "test_r2": r2_test,
        "mse_train": mse_train, "mse_test": mse_test,
        "total_leaf_nodes": tree.get_n_leaves()
    }
    screen_params = pd.concat([screen_params, pd.DataFrame([new_row])], ignore_index=True)
    
    # progress output every ten combinations
    i += 1
    if i % 10 == 0:
        print(f"Progress: {i}/{total_combs}")

In [None]:
parameters = ["max_depth", "min_samples_split", "min_samples_leaf", "max_leaf_nodes"]
# add 0.0001 * max_leaf_nodes to the mse_test to penalize large trees
# NOTE(review): combinations screened with max_leaf_nodes=None have no numeric penalty;
# presumably their penalized score becomes missing and they are sorted to the end, which
# means an unlimited tree is never selected here - confirm that this is intended
screen_params_mse = screen_params.copy()
screen_params_mse["mse_test"] = screen_params_mse["mse_test"] + 0.0001 * screen_params_mse["max_leaf_nodes"]
optimal_screen_params = screen_params_mse.sort_values("mse_test", ascending=True).head(10)
print(optimal_screen_params)

# Take the best row and cast the values back to the types the estimator expects:
# depending on the column dtypes of the screening table a row selection can return
# floats or a missing value instead of ints / None.
best_row = optimal_screen_params[parameters].iloc[0]
optimal_screen_params = {
    "max_depth": int(best_row["max_depth"]),
    "min_samples_split": int(best_row["min_samples_split"]),
    "min_samples_leaf": int(best_row["min_samples_leaf"]),
    "max_leaf_nodes": None if pd.isna(best_row["max_leaf_nodes"]) else int(best_row["max_leaf_nodes"]),
}

# Create model instance
tree = DecisionTreeRegressor(**optimal_screen_params, criterion="squared_error")

# Fit model
tree.fit(X_train, y_train)

# Make predictions
preds_train = tree.predict(X_train)
preds_test = tree.predict(X_test)

# restore the values from ratio to regular log10 expression
preds_train = np.log10(1/(((y_restore - 1) * preds_train) + 1))
preds_test = np.log10(1/(((y_restore_excluded - 1) * preds_test) + 1))

# same back-transformation for the targets
y_train_restored = np.log10(1/(((y_restore - 1) * y_train) + 1))
y_test_restored = np.log10(1/(((y_restore_excluded - 1) * y_test) + 1))

# Calculate R^2 and MSE
r2_train = r2_score(y_train_restored, preds_train)
r2_test = r2_score(y_test_restored, preds_test)
mse_train = mean_squared_error(y_train_restored, preds_train)
mse_test = mean_squared_error(y_test_restored, preds_test)

# Print the results
print(f"Training R^2: {r2_train:.4f}")
print(f"Test R^2: {r2_test:.4f}")
print(f"Training MSE: {mse_train:.4f}")
print(f"Test MSE: {mse_test:.4f}")

In [None]:
# save the tree screen, sorted by the unpenalized test MSE (screen_params, not screen_params_mse)
screen_params.sort_values("mse_test", ascending=True).to_csv(f"{output_folder}/5.4_tree_models_screen_params.csv")

In [None]:
from sklearn.tree import plot_tree

plt.figure(figsize=(9,2.4))

# draw the fitted tree; plot_tree returns the text annotations of the drawn nodes
nodes = plot_tree(tree, filled=True, feature_names=list(X_train.columns), rounded=True)

# Process the nodes to adjust labels
for node in nodes:
    node.set_fontsize(7)
    lines = node.get_text().split('\n')
    # Nodes with more than three text lines: keep only the first line
    # (presumably the split condition of an internal node, e.g. "no_high_impact <= 0.5" -
    # TODO confirm) and replace the leading "no" of the feature name by "#"
    if len(lines) > 3:
        text = lines[0]
        text = text.split("_")
        text[0] = "#"
        text = " ".join(text)
        node.set_text(text)
    # all other nodes (presumably the leaves): keep only the number after "value = "
    else:
        node.set_text(lines[-1].split('value = ')[1])

# plot_tree(tree, filled=True, feature_names=X_train.columns, rounded=True)
for format in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/5.4.3 - decision_tree.{format}")

In [None]:
import joblib

# Save the model to a file (it is loaded again in section 5.5)
filename = '5.4.3_decision_tree_model.joblib'
filepath = os.path.join(output_folder, filename)
joblib.dump(tree, filepath)

print(f"Model saved to {filepath}")

### create a log plot of the results (regression tree)

In [None]:
plt.figure(figsize=(2.4, 1.8))

# calculate r2 (squared Pearson correlation on the restored log10 scale;
# this rebinds the r2_score values computed when the tree was fitted)
r2_test = stats.pearsonr(y_test_restored, preds_test)[0]**2
r2_train = stats.pearsonr(y_train_restored, preds_train)[0]**2

# plot predictions vs true values
plt.scatter(y_train_restored, preds_train, color = "blue", s=2, label="train, r$^2$: " + f"{r2_train:.2f}")
plt.scatter(y_test_restored, preds_test, color = "red", s=2, label=f"test, r$^2$: " + f"{r2_test:.2f}\n({excluded_mirna})")

# plot a line for x=y
plt.plot([-1.8, 0], [-1.8, 0], color="black", linestyle="--", linewidth=2)

plt.xlabel(r"log$_{10}$(measured stability)")
plt.ylabel(r"log$_{10}$(predicted stability)")

plt.tight_layout()
# the legend is placed to the right of the axes (axes coordinates) after tight_layout
plt.legend(loc = [1,0.5], fontsize=7)
plt.savefig(f"{plot_folder}/5.4.3-RegressionTree_{excluded_mirna}.png", dpi=600)

### predict on the entire data (regression tree)

In [None]:
# unfiltered data (no knockdown_orig threshold) for the train miRNAs, the held-out miRNA
# and the confound data set
training_df_wo_nonmut = training_df.copy()
training_df_wo_nonmut_excluded = training_df_excluded.copy()
training_df_confound_wo_nonmut = training_df_confound.copy()

# exclude original mirnas (designs whose index label contains the literal text "5.0_");
# regex=False is required because "." would otherwise match any character
training_df_wo_nonmut = training_df_wo_nonmut[~training_df_wo_nonmut.index.str.contains("5.0_", regex=False)]
training_df_wo_nonmut_excluded = training_df_wo_nonmut_excluded[~training_df_wo_nonmut_excluded.index.str.contains("5.0_", regex=False)]
training_df_confound_wo_nonmut = training_df_confound_wo_nonmut[~training_df_confound_wo_nonmut.index.str.contains("5.0_", regex=False)]

In [None]:
# define X and y columns
# y is taken from "knockdown_mut_archive", i.e. the log10 value saved by
# process_training_df before the ratio transformation, so no restoring is needed for y
X = training_df_wo_nonmut[training_columns]
y = training_df_wo_nonmut["knockdown_mut_archive"]
X_excluded = training_df_wo_nonmut_excluded[training_columns]
y_excluded = training_df_wo_nonmut_excluded["knockdown_mut_archive"]
X_confound = training_df_confound_wo_nonmut[training_columns]
y_confound = training_df_confound_wo_nonmut["knockdown_mut_archive"]

# also save the original knockdown so that the data can be converted back to expression data later
y_restore_full = training_df_wo_nonmut["knockdown_orig"]
y_restore_excluded_full = training_df_wo_nonmut_excluded["knockdown_orig"]
y_restore_confound = training_df_confound_wo_nonmut["knockdown_orig"]

In [None]:
# NOTE(review): the positions below are relative to training_df, which still contains the
# non-mutated designs, and therefore do not line up with X / y / preds_train, which are
# built from training_df_wo_nonmut. The index labels (indices_21 / indices_19) are what
# the highlight plot uses.
df_index_array = np.array(training_df.index)

# index labels of all designs with orig_mi hsa-miR-21-5p
indices_21 = training_df_wo_nonmut[training_df_wo_nonmut["orig_mi"] == "hsa-miR-21-5p"].index
# positions of these labels in the DataFrame's index array
positions_21 = np.where(np.isin(df_index_array, indices_21))

# index labels of all designs with orig_mi hsa-miR-19b-3p
indices_19 = training_df_wo_nonmut[training_df_wo_nonmut["orig_mi"] == "hsa-miR-19b-3p"].index
# positions of these labels in the DataFrame's index array
# (this used indices_21 before, which made positions_19 a copy of positions_21)
positions_19 = np.where(np.isin(df_index_array, indices_19))

In [None]:
# make predictions with the regression tree on the unfiltered data sets
preds_train = tree.predict(X)
preds_test = tree.predict(X_excluded)
preds_confound = tree.predict(X_confound)

# restore the values from ratio to regular log10 expression
preds_train = np.log10(1/(((y_restore_full - 1) * preds_train) + 1))
preds_test = np.log10(1/(((y_restore_excluded_full - 1) * preds_test) + 1))
preds_confound = np.log10(1/(((y_restore_confound - 1) * preds_confound) + 1))

In [None]:
plt.figure(figsize=(2.4, 1.8))

# calculate r2 (squared Pearson correlation; y / y_excluded are the archived log10 values)
r2_test = stats.pearsonr(y_excluded, preds_test)[0]**2
r2_train = stats.pearsonr(y, preds_train)[0]**2

# plot predictions vs true values
plt.scatter(y, preds_train, color = "tab:blue", s=2, label="train, r$^2$: " + f"{r2_train:.2f}", rasterized=True)
plt.scatter(y_excluded, preds_test, color = "tab:red", s=2, label=f"test, r$^2$: " + f"{r2_test:.2f}", rasterized=True) #\n({excluded_mirna})"

# plot a line for x=y
plt.plot([-1.8, 0], [-1.8, 0], color="black", linestyle="--", linewidth=2)

plt.xlabel(r"log$_{10}$(meas. stability)")
plt.ylabel(r"log$_{10}$(pred. stability)")
# identical ticks on both axes
plt.xticks([-1.5, -1, -0.5, 0])
plt.yticks([-1.5, -1, -0.5, 0])

plt.tight_layout()
# the legend is placed in axes coordinates after tight_layout
plt.legend(loc = [0.7,0.0], fontsize=7)
for format in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/5.4.3-RegressionTree_{excluded_mirna}_full.{format}", dpi=600)

In [None]:
plt.figure(figsize=(2.4, 1.8))

# plot predictions vs true values for two individual miRNAs
# (indices_21 / indices_19 hold index labels, so y and preds_train are selected by label)
# plt.scatter(y, preds_train, color = "blue", s=5, label="train")
plt.scatter(y[indices_21], preds_train[indices_21], color = "tab:red", s=2, label="miR-21-5p", rasterized=True)
plt.scatter(y[indices_19], preds_train[indices_19], color = "tab:cyan", s=2, label="miR-19b-3p", rasterized=True)

# plot a line for x=y
plt.plot([-1.8, 0], [-1.8, 0], color="black", linestyle="--", linewidth=2)

plt.xlabel(r"log$_{10}$(meas. stability)")
plt.ylabel(r"log$_{10}$(pred. stability)")

plt.tight_layout()
plt.legend(loc = [0.7,0.0], fontsize=7)
for format in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/5.4.3-RegressionTree_highlighted_individuality.{format}", dpi=600)

In [None]:
plt.figure(figsize=(2.4, 1.8))

# calculate r2 (squared Pearson correlation) for the confound set and the train set
r2_confound = stats.pearsonr(y_confound, preds_confound)[0]**2
r2_train = stats.pearsonr(y, preds_train)[0]**2

# plot predictions vs true values (the confound set is labelled "let-7" in the legend)
plt.scatter(y, preds_train, color = "tab:blue", s=2, label="train, r$^2$: " + f"{r2_train:.2f}", rasterized=True)
plt.scatter(y_confound, preds_confound, color = "tab:red", s=2, label=f"let-7, r$^2$: " + f"{r2_confound:.2f}", rasterized=True) #\n({excluded_mirna})"

# plot a line for x=y
plt.plot([-1.8, 0], [-1.8, 0], color="black", linestyle="--", linewidth=2)

plt.xlabel(r"log$_{10}$(meas. stability)")
plt.ylabel(r"log$_{10}$(pred. stability)")
# identical ticks on both axes
plt.xticks([-1.5, -1, -0.5, 0])
plt.yticks([-1.5, -1, -0.5, 0])

plt.tight_layout()
plt.legend(loc = [0.7,0.0], fontsize=7)
for format in ["png", "svg"]:
    plt.savefig(f"{plot_folder}/5.4.3-RegressionTree_let7_full.{format}", dpi=600)

### compare XGBoost and Regression Tree predictions

In [None]:
plt.figure(figsize=(2.4, 1.8))
# regression-tree predictions (x) against XGBoost predictions (y) for the same designs;
# preds_train / preds_test hold the tree predictions at this point, the XGBoost ones
# were kept as preds_*_xgboost
plt.scatter(preds_train, preds_train_xgboost, label="train")
plt.scatter(preds_test, preds_test_xgboost, label="test")

# label the axes and the two groups so that the figure can be read on its own
plt.xlabel(r"log$_{10}$(pred. stability), regression tree")
plt.ylabel(r"log$_{10}$(pred. stability), XGBoost")
plt.legend(fontsize=7)

# 5.5 - Predict on miRbase miRNAs

In [None]:
# imported here as well so that section 5.5 can be run without first running the
# model-saving cell of section 5.4.3, which is the only other place joblib is imported
import joblib

# first, get all relevant microRNAs, either those in high confidence in mirbase or those in mirgenedb
# NOTE(review): pickle.load (and joblib.load below) can execute arbitrary code;
# only load files that were produced by this project
with open("../microrna_data/likely_real_mirnas.pkl", "rb") as f:
    relevant_mirna_list = pickle.load(f)

# load the regression tree saved in section 5.4.3
filename = '5.4.3_decision_tree_model.joblib'
filepath = os.path.join(output_folder, filename)
tree = joblib.load(filepath)

In [None]:
# restrict mirbase to the miRNAs from relevant_mirna_list (selected by index label)
mirbase_all_relevant = mirbase.loc[relevant_mirna_list, :]

In [None]:
# number of positions that are compared per miRNA
mirna_length = 21
# names of the per-position columns: pos_1 ... pos_21
position_columns = ["pos_" + str(i) for i in range(1, mirna_length+1)]
# the features the regression tree was trained on
impact_columns = ["no_high_impact", "no_mid_impact", "no_low_impact", "no_no_impact"]

# DO NOT RUN THIS CODE EVERY TIME - TAKES AROUND 43 MIN

This crosstalk dict gives the crosstalk for a specific target site (miRNA x) for all other miRNAs y.

In [None]:
# trial run for a single target (hsa-let-7b-5p): predict the crosstalk of every relevant
# miRNA on this target; the result is left in the notebook-level variable df
for index, row in mirbase.iterrows():
    if index == "hsa-let-7b-5p":
        # create the dataframe (one row per relevant miRNA, one column per position)
        df = pd.DataFrame(columns=position_columns, index=mirbase_all_relevant.index)
        target = row["target"]
        
        # add position-wise mutation info
        # (presumably 1 = mismatch and 2 = wobble per position, as expected by
        # get_median_mut_classification - verify against extract_pattern_from_seq)
        for mirna_index, mirna_row in mirbase_all_relevant.iterrows():
            mirna_seq = mirna_row["sequence_norm"]
            pattern = extract_pattern_from_seq(target=target, mirna=mirna_seq)
            df.loc[mirna_index, position_columns] = pattern
            
        # summarize mutation impact
        df = get_median_mut_classification(df, mutation_impact_df)
        
        # cast the impact count columns to integers
        for col in impact_columns:
            df[col] = df[col].astype("int")
        
        # use the tree to predict crosstalk
        # (the tree output is stored as is, i.e. on the scale of the training target)
        X = df[impact_columns]
        predictions = tree.predict(X)
        df["crosstalk"] = predictions

In [None]:
def get_family_mirnas(family, mirbase_df=None):
    """Return the index labels of all miRNAs that belong to ``family``.

    Parameters
    ----------
    family : str
        Value compared for equality against the ``family_extended`` column.
    mirbase_df : pandas.DataFrame, optional
        Table with a ``family_extended`` column. When omitted, the notebook-level
        ``mirbase`` frame is used, so existing one-argument calls are unchanged.

    Returns
    -------
    list
        Index labels of the matching rows in table order; empty if none match.
    """
    table = mirbase if mirbase_df is None else mirbase_df
    return table[table['family_extended'] == family].index.to_list()

# all miRNAs annotated with the let-7-5p family
let7_family = get_family_mirnas("let-7-5p")

# show their predicted crosstalk; `df` is the table left behind by the preceding cell
df["crosstalk"].loc[let7_family]

In [None]:
# For every target site in mirbase, predict the crosstalk of all relevant miRNAs
# and collect one result table per target.
full_crosstalk_dict = {}

# The candidate sequences are identical for every target, so take the column once
# and iterate over (name, sequence) pairs instead of re-running iterrows() on the
# whole table per target (iterrows builds a full row Series for each miRNA).
relevant_sequences = mirbase_all_relevant["sequence_norm"]

for index, row in mirbase.iterrows():
    # create the dataframe
    df = pd.DataFrame(columns=position_columns, index=mirbase_all_relevant.index)
    target = row["target"]

    # add position-wise mutation info
    for mirna_index, mirna_seq in relevant_sequences.items():
        pattern = extract_pattern_from_seq(target=target, mirna=mirna_seq)
        df.loc[mirna_index, position_columns] = pattern

    # summarize mutation impact
    df = get_median_mut_classification(df, mutation_impact_df)

    # cast the impact columns to integer dtype
    for col in impact_columns:
        df[col] = df[col].astype("int")

    # use the tree to predict crosstalk
    X = df[impact_columns]
    predictions = tree.predict(X)
    df["crosstalk"] = predictions

    # save to dict
    full_crosstalk_dict[index] = df

In [None]:
# save the crosstalk dict to a file
output_folder = "../outputs/5_mutations"

# make sure the folder exists so that open() does not fail on a fresh checkout
os.makedirs(output_folder, exist_ok=True)

with open(f"{output_folder}/5.5_full_crosstalk_dict.pkl", "wb") as f:
    pickle.dump(full_crosstalk_dict, f)

# 5.7 - Crosstalk filtering for Notebook 3

In [18]:
plot_folder = os.path.join(base_plot_folder, "5.7_crosstalk_filtering")
# create the folder if it doesn't exist; exist_ok avoids the check-then-create race
os.makedirs(plot_folder, exist_ok=True)

In [19]:
from library2_utils.transfer_functions import transfer_function

# fitted transfer-function parameters and per-cell-line scales
# (the "wo_crosstalk" variants of the combined-dataset fit)
with open("../outputs/3_fitting/combined_dataset/combined_dataset_popt_wo_crosstalk.pkl", "rb") as popt_file:
    popt = pickle.load(popt_file)
with open("../outputs/3_fitting/combined_dataset/combined_dataset_scale_dict_wo_crosstalk.pkl", "rb") as scale_file:
    scale_dict = pickle.load(scale_file)

# transfer curve evaluated on a log10 expression grid
x_range_log = np.arange(0, 5.5, 0.1)
x_range_lin = np.power(10, x_range_log)
y_transfer = np.log10(transfer_function(x_range_lin, *popt))

cell_lines_expression = ["HEK293T", "HeLa", "MCF7", "A549", "HaCaT", "HUH7", "PC3", "JEG3", "Tera1", "SKNSH"]

# work on a copy and add each cell line's scale to its column
df_expression = df_combined.copy()
for cell_line in cell_lines_expression:
    df_expression[cell_line] = df_expression[cell_line].add(scale_dict[cell_line])

### Calculate heuristically

In [16]:
output_folder = "../outputs/5_mutations"

# read the per-target crosstalk tables back from the 5.5 pickle
with open(f"{output_folder}/5.5_full_crosstalk_dict.pkl", "rb") as dict_file:
    full_crosstalk_dict = pickle.load(dict_file)

In [20]:
from library2_utils.crosstalk import merge_identical_mirnas

# Keep the pre-merge expression values; they are compared against the merged
# values further down to find miRNAs whose expression changed through merging.
# NOTE(review): this cell overwrites df_expression. Re-running it without first
# re-running the cell that builds df_expression would copy the already merged
# frame into df_expression_orig.
df_expression_orig = df_expression.copy()
# presumably pools the expression of miRNAs that mirbase lists as identical -
# TODO confirm against library2_utils.crosstalk
df_expression, groups = merge_identical_mirnas(df_expression, mirbase)

In [89]:
# Predicted log10 values from the fitted transfer function for every miRNA and cell line
df_knockdown_predicted = np.log10((10**df_expression).apply(lambda x: transfer_function(x, *popt)))

# Heuristic limits (exclusive upper bounds) on the mutation counts a miRNA may
# have relative to a target site and still count as a potential crosstalk partner
MAX_TOTAL_MUTATIONS = 5
MAX_HIGH_IMPACT = 2
MAX_HIGH_MID_IMPACT = 4
# A target is flagged when the lowest predicted value among its partners is below
# a third (linear scale) of the target's own predicted value ...
MIN_FOLD_STRONGER = 3
# ... and that lowest value is itself below this log10 threshold
MAX_PARTNER_LOG10_KNOCKDOWN = -0.5

# Boolean mask (rows: miRNAs, columns: cell lines), True = filter because of likely
# crosstalk. Built with a real bool dtype instead of an object frame filled afterwards.
crosstalk_filter_df = pd.DataFrame(False, index=df_expression.index, columns=cell_lines_expression)

# a dedicated loop name keeps the notebook-level `df` from being overwritten
for key, crosstalk_df in full_crosstalk_dict.items():
    if key not in df_knockdown_predicted.index:
        continue

    high = crosstalk_df["no_high_impact"]
    mid = crosstalk_df["no_mid_impact"]
    low = crosstalk_df["no_low_impact"]

    # all row-wise criteria combined; equivalent to applying them one after another
    is_candidate = (
        # check for total less than 5 mutations
        (crosstalk_df["no_total_impact"] < MAX_TOTAL_MUTATIONS)
        # check for high impact mutations
        & (high < MAX_HIGH_IMPACT)
        # check for mid impact mutations
        & ((mid + high) < MAX_HIGH_MID_IMPACT)
        # check for all mutations
        & ((low + mid + high) < MAX_TOTAL_MUTATIONS)
        # never compare a miRNA against its own target site
        & (crosstalk_df.index != key)
        # only miRNAs for which a predicted value exists
        & crosstalk_df.index.isin(df_knockdown_predicted.index)
    )
    partners = crosstalk_df.index[is_candidate.to_numpy()]
    if len(partners) == 0:
        continue

    # Lowest predicted value per cell line across the remaining miRNAs. It is read
    # directly from the prediction table rather than first copied into the filtered
    # slice, which relied on suppressed chained-assignment warnings.
    min_by_cell_line = df_knockdown_predicted.loc[partners, cell_lines_expression].min(axis=0)

    # this is the expected knockdown for the miRNA itself
    knockdown_orig = df_knockdown_predicted.loc[key, cell_lines_expression]

    # we only filter if there is substantial expression of at least one of the potentially crosstalking miRNAs
    crosstalk_filter_df.loc[key, :] = (
        ((10**knockdown_orig) / MIN_FOLD_STRONGER > 10**min_by_cell_line)
        & (min_by_cell_line < MAX_PARTNER_LOG10_KNOCKDOWN)
    )

In [90]:
# per cell line, the list of miRNAs (row labels) flagged in crosstalk_filter_df
crosstalk_filter_dict = {
    cell_line: list(crosstalk_filter_df.loc[crosstalk_filter_df[cell_line].eq(True)].index)
    for cell_line in cell_lines_expression
}

## Plot crosstalk filtering to check if it makes sense

In [91]:
# True where the linear expression after merging is more than 3x the value before
# merging; these are substantially shifted to the right
ratio = (10**df_expression).div(10**df_expression_orig).gt(3)

In [None]:
# Measured values against expression for all cell lines, with the crosstalk-filtered
# and the merged miRNAs highlighted on top of the fitted transfer curve.
plt.figure(figsize=(2,1.5))
x_range_log = np.linspace(0, 5.5, 1000)
y_transfer = np.log10(transfer_function(10**x_range_log, *popt))

# marker settings shared by all point classes
point_style = dict(s=4, edgecolor="none", rasterized=True)

x_vals, y_vals = [], []
for i, cell_line in enumerate(df_expression.columns):
    df_ex = df_expression.loc[:, cell_line]
    df_knock = df_knockdown.loc[:, cell_line]
    # legend entries and the curve are only added once
    first_cell_line = i == 0

    # miRNAs filtered for crosstalk in this cell line, restricted to those in df_ex
    crosstalk_mirnas = [name for name in crosstalk_filter_dict[cell_line] if name in df_ex.index]
    # miRNAs flagged in `ratio` for this cell line
    is_shifted = ratio[cell_line]
    shifted_mirnas = is_shifted[is_shifted == True].index
    # everything that falls in neither class
    all_other_mirnas = df_ex.index.difference(crosstalk_mirnas).difference(shifted_mirnas)

    # background points
    plt.scatter(df_ex.loc[all_other_mirnas], df_knock.loc[all_other_mirnas],
                color="tab:blue", alpha=0.4, zorder=1, **point_style)

    # highlighted classes, drawn above the background
    highlighted_layers = [
        (crosstalk_mirnas, "tab:red", 2, "filtered crosstalk"),
        (shifted_mirnas, "tab:orange", 3, "merged miRNAs"),
    ]
    for layer_mirnas, layer_color, layer_zorder, layer_label in highlighted_layers:
        plt.scatter(df_ex.loc[layer_mirnas], df_knock.loc[layer_mirnas],
                    color=layer_color, zorder=layer_zorder,
                    label=layer_label if first_cell_line else "", **point_style)

    # collect transfer-function prediction (x_vals) and measurement (y_vals)
    # of the unflagged miRNAs for the fit statistics below
    x_vals.extend(np.log10(transfer_function(10**df_ex.loc[all_other_mirnas], *popt)))
    y_vals.extend(df_knock.loc[all_other_mirnas])

    if first_cell_line:
        plt.plot(x_range_log, y_transfer, ls="--", lw=1, color="black")

# squared Pearson correlation and RMSD between prediction and measurement
x_vals_flatten = np.asarray(x_vals).ravel()
y_vals_flatten = np.asarray(y_vals).ravel()
r2 = stats.pearsonr(x_vals_flatten, y_vals_flatten)[0]**2
rmsd = np.sqrt(np.mean((x_vals_flatten - y_vals_flatten)**2))

#plt.title(f"r$^2$: {r2:.4f}, RMSD: {rmsd:.4f}")

plt.xlabel(r"log$_{10}$(miRNA expression)")
plt.ylabel(r"log$_{10}$(stability)")

plt.xlim(0, 5.5)
plt.xticks([1,2,3,4,5])
plt.yticks([-2,-1,0])
plt.ylim(-2, 0.25)
plt.tight_layout()
plt.legend(loc="lower left", frameon=False, fontsize=7)
for format in ["png", "svg"]:
    plt.savefig(os.path.join(plot_folder, f"5.7_crosstalk_filtering.{format}"), dpi=600)

In [None]:
# Show where the miRNAs flagged in `ratio` sit relative to the transfer curve,
# using their expression before and after merging.
plt.figure(figsize=(2,1.5))
x_range_log = np.linspace(0, 5.5, 1000)
y_transfer = np.log10(transfer_function(10**x_range_log, *popt))

for i, cell_line in enumerate(df_expression.columns):
    df_ex = df_expression.loc[:, cell_line]
    df_ex_orig = df_expression_orig.loc[:, cell_line]
    df_knock = df_knockdown.loc[:, cell_line]

    # miRNAs flagged in `ratio` for this cell line
    shifted_mirnas = ratio[cell_line][ratio[cell_line] == True].index

    # same measured values, plotted once at the pre-merge and once at the
    # post-merge expression
    plt.scatter(df_ex_orig.loc[shifted_mirnas],
                df_knock.loc[shifted_mirnas], color="tab:blue", s=4, zorder=2, edgecolor="none",
                label="before merging" if i==0 else "", rasterized=True)
    plt.scatter(df_ex.loc[shifted_mirnas],
                df_knock.loc[shifted_mirnas], color="tab:orange", s=4, zorder=3, edgecolor="none",
                label="after merging" if i==0 else "", rasterized=True)

    # draw the transfer curve only once
    if i == 0:
        plt.plot(x_range_log, y_transfer, ls="--", lw=1, color="black")

plt.xlabel(r"log$_{10}$"+f"(miRNA expression)")
plt.ylabel(r"log$_{10}$(stability)")

plt.xlim(0, 5.5)
plt.xticks([1,2,3,4,5])
plt.yticks([-2,-1,0])
plt.ylim(-2, 0.25)
plt.tight_layout()
plt.legend(loc="lower left", frameon=False, fontsize=7)
for format in ["png", "svg"]:
    plt.savefig(os.path.join(plot_folder, f"5.7_shifted_mirnas.{format}"), dpi=600)

In [51]:
# save the crosstalk filter dict to a file
output_folder = "../outputs/5_mutations"

with open(f"{output_folder}/5.7_crosstalk_filter_dict.pkl", "wb") as filter_file:
    pickle.dump(crosstalk_filter_dict, filter_file)