In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import os
import pickle
import library2_utils.NA_sequence_utilities as seq_utils
from library2_utils.transfer_functions import transfer_function, inverse_transfer
from library2_utils.color_scheme import cell_line_colors, cell_line_symbols
from library2_utils.plotting_utilities import HandlerSize, return_pvalue_text

import warnings
# Global matplotlib styling: 7 pt text in every figure
plt.rcParams.update({'font.size': 7})
# Use Helvetica as the default font family for all figures
plt.rcParams['font.family'] = 'Helvetica'

# NOTE(review): the star import hides where names come from; the NUPACK names
# used further down include Model, Strand, ComplexSet, SetSpec and complex_analysis.
from nupack import *
# NUPACK model passed to every complex_analysis call below (RNA material, 37 degrees Celsius)
my_model = Model(material='rna', celsius=37)

# Silence FutureWarning messages for the rest of the session
warnings.simplefilter(action='ignore', category=FutureWarning)

# Cell lines, split into a first subset and the remaining ones
cell_lines_subset = ["HEK293T", "HeLa", "SKNSH", "MCF7", "HUH7", "A549"]
cell_lines_rest = ["HaCaT", "JEG3", "Tera1", "PC3"]
# Both lists contain every cell line; they are deliberately two separate list objects
cell_lines_expression = [*cell_lines_subset, *cell_lines_rest]
cell_lines_measured = [*cell_lines_subset, *cell_lines_rest]
# Same names with a "_3UTR" suffix appended
cell_lines_measured_UTR = [f"{name}_3UTR" for name in cell_lines_measured]

# Maps every suffixed name back to the plain cell line name (used to rename columns)
rename_dict = dict(zip(cell_lines_measured_UTR, cell_lines_measured))

# Root folder for all figures of this notebook
plot_folder = "../plots/6_context/"
# exist_ok=True creates the folder only when missing, without the
# check-then-create race of a separate os.path.exists() test
os.makedirs(plot_folder, exist_ok=True)

# Folder for derived tables written by this notebook
output_folder = "../outputs/6_context_impact"
os.makedirs(output_folder, exist_ok=True)

In [2]:
data_dir_input = "../measured_data/2_normalized_log10"

# every entry of the input folder (entries that are not CSV files are skipped below)
reference_files = os.listdir(data_dir_input)

# one DataFrame per CSV file, keyed by the file name up to its first "."
reference_dict = {
    file_name.split('.')[0]: pd.read_csv(os.path.join(data_dir_input, file_name), index_col=0)
    for file_name in reference_files
    if file_name.endswith(".csv")
}

In [11]:
# the two tables analysed in this notebook
context_data = reference_dict["0_lib2_controls"]
mirna_in_context = reference_dict["4_miRNA_full_single_context_controls"]

# apply rename_dict to the column names; done in place, so the frames stored
# in reference_dict are renamed as well
for table in (context_data, mirna_in_context):
    table.rename(columns=rename_dict, inplace=True)

# distinct entries of the "miRNA1" column
used_mirnas = mirna_in_context["miRNA1"].unique()

In [4]:
# get mirbase
# (table indexed by its first CSV column; its "sequence_norm" column is read
# further down to look up the sequence of each miRNA)
mirbase = pd.read_csv("../microrna_data/mirbase_extended.csv", index_col=0)

## Load fit values

In [5]:
# Which miRNA expression dataset / fit to use:
# "combined_dataset", "Alles_2019_unscaled" or "Alles_2019"
used_mirna_data = "Alles_2019"


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load files written by this project.
    with open(path, "rb") as f:
        return pickle.load(f)


if used_mirna_data == "combined_dataset":
    popt = _load_pickle("../outputs/3_fitting/combined_dataset/combined_dataset_popt_wo_crosstalk.pkl")
    scale_dict = _load_pickle("../outputs/3_fitting/combined_dataset/combined_dataset_scale_dict_wo_crosstalk.pkl")

    df_expression = pd.read_csv("../microrna_data/3_output/Alles_Keller_combined_expression_with_crosstalk.csv", index_col=0)

    # keep the untouched values, then add the per-cell-line scale term
    df_expression_unscaled = df_expression.copy()
    for cell_line in cell_lines_subset+cell_lines_rest:
        df_expression[cell_line] = df_expression[cell_line] + scale_dict[cell_line]

elif used_mirna_data == "Alles_2019_unscaled":
    popt = _load_pickle("../outputs/3_fitting/Alles2019/Alles2019_popt_unscaled.pkl")
    # the scale dictionary is loaded but not applied to the expression values here
    scale_dict = _load_pickle("../outputs/3_fitting/Alles2019/Alles2019_scale_dict.pkl")

    df_expression = pd.read_csv("../microrna_data/2_output/Alles2019_conormalized.csv", index_col=0)

elif used_mirna_data == "Alles_2019":
    popt = _load_pickle("../outputs/3_fitting/Alles2019/Alles2019_popt.pkl")
    scale_dict = _load_pickle("../outputs/3_fitting/Alles2019/Alles2019_scale_dict.pkl")

    df_expression = pd.read_csv("../microrna_data/2_output/Alles2019_conormalized.csv", index_col=0)
    # add the scale term in log10 space, then go back to the original scale
    for cell_line in scale_dict.keys():
        df_expression[cell_line] = 10**(np.log10(df_expression[cell_line]) + scale_dict[cell_line])

else:
    # fail here instead of with a NameError on popt / df_expression further down
    raise ValueError(
        f"Unknown used_mirna_data {used_mirna_data!r}; expected 'combined_dataset', "
        "'Alles_2019_unscaled' or 'Alles_2019'"
    )

# first fit parameter, used on its own further down
c1 = popt[0]

# Append the dataset name only once, so that re-running this cell does not
# produce nested folders such as ".../Alles_2019/Alles_2019"
if os.path.basename(os.path.normpath(plot_folder)) != used_mirna_data:
    plot_folder = os.path.join(plot_folder, used_mirna_data)
os.makedirs(plot_folder, exist_ok=True)

# 6.1 - Investigate context stability itself (no mirnas)

In [6]:
# Figures of section 6.1 go into their own sub-folder
current_plot_folder = os.path.join(plot_folder, "6.1_context_stability")
# exist_ok=True: create only when missing, without a separate existence check
os.makedirs(current_plot_folder, exist_ok=True)

In [None]:
# # only look at the context sequences without engineered secondary structure
# context_data = context_data.iloc[:31]
# print(len(context_data))

In [None]:
# GC content of every entry in the "seq" column
GC_content = context_data["seq"].apply(seq_utils.GC_content)

# histogram of the GC content, saved as SVG
fig_gc, ax_gc = plt.subplots(figsize=(2.4, 2))
ax_gc.hist(GC_content, bins=20, color="tab:blue", alpha=0.5)
ax_gc.set_xlabel("GC content")
ax_gc.set_ylabel("Number of sequences")
fig_gc.tight_layout()
fig_gc.savefig(os.path.join(current_plot_folder, "GC_content_histogram.svg"))

In [None]:
# Box plot of the context stabilities, one box per cell line
plt.figure(figsize=(3, 2))
# NOTE(review): the palette is taken positionally from cell_line_colors.values();
# this presumably relies on that order matching cell_lines_measured - verify.
sns.boxplot(data=context_data[cell_lines_measured], palette=cell_line_colors.values(),
            showfliers=True, flierprops=dict(marker='o', markersize=2))
plt.xticks(rotation=45)
plt.ylabel(r"log$_{10}$(stability)")
plt.ylim(-0.6,0.6)
plt.title("context stabilities without miRNA target sites")
plt.tight_layout()
# loop variable is named fmt so that the builtin format() is not shadowed
for fmt in ("png", "svg"):
    plt.savefig(os.path.join(current_plot_folder, f"context_stabilities_boxplot.{fmt}"), dpi=300)

In [15]:
# column-wise mean of the measured (log10) values, one entry per cell line
mean_log_stability = context_data[cell_lines_measured].mean(axis=0)

# express every cell line relative to HEK293T (the HEK293T entry becomes 0)
relative_stability = mean_log_stability.sub(mean_log_stability["HEK293T"])

# written to disk for later use
relative_stability.to_csv(os.path.join(output_folder, "relative_context_stability.csv"))

## 6.1.2 - Estimate the impact on the transfer function

In [None]:
# undo the log10 transform of the measured values
context_data_lin = 10**context_data[cell_lines_measured]


def _print_per_cell_line(values):
    """Print one "<cell line>: <value>" line (two decimals) per entry of ``values``."""
    for cell_line, stability in values.items():
        print(f"{cell_line}: {stability:.2f}")


# mean linear stability per cell line
mean_stability = context_data_lin.mean()
print("Mean stability:")
_print_per_cell_line(mean_stability)

# linear stability of the first row of the table
context1_stability = context_data_lin.iloc[0,:]
print("\nContext 1 stability:")
_print_per_cell_line(context1_stability)

print(
    "\nThis is suprisingly far away from 1 given our normalization method.\n"
    "Looking at a single construct as a reference is probably quite noisy.\nAssume it is 1 instead."
)

The constant $c$ is given by $c=k_{deg}/k_{on}$. If we assume that the mean stability is approximately the same across cell lines, then the relative $k_{deg}$ of context 1 across cell lines can be estimated as $k_{deg, 1} \approx 1/\mathrm{mRNA}_{mean}$.

In [None]:
# log10 of the inverse mean stability, one value per cell line
adjustment_factor = np.log10(1 / mean_stability)
for cell_line in adjustment_factor.index:
    factor = adjustment_factor[cell_line]
    print(f"{cell_line}: {factor:.3f}")

We should compare this to the fitted constant:

In [18]:
# Side-by-side table of the fitted scale factors and the factors inferred from
# the context stabilities: two rows, one column per cell line in scale_dict.
scaling_factor = pd.Series(scale_dict, dtype=float)
comp_df = pd.DataFrame({
    "scaling factor": scaling_factor,
    # align by cell line name rather than by position, so a different key order
    # (or a different number of keys) in scale_dict cannot mislabel the values
    "context factor": adjustment_factor.reindex(scaling_factor.index),
}).T

In [None]:
# Grouped bar plot comparing the two rows of comp_df for every cell line
positions = np.arange(comp_df.shape[1])
width = 0.35

fig, ax = plt.subplots(figsize=(4.2, 1.8))
# one bar pair per cell line: fitted factor on the left, context-derived factor on the right
rects1 = ax.bar(positions - width/2, comp_df.loc['scaling factor', :], width, label='from fitting the transfer function')
rects2 = ax.bar(positions + width/2, comp_df.loc['context factor', :], width, label='inferred factor from context stability')

# axis labelling
ax.set_ylabel('scaling factor')
ax.set_xticks(positions)
ax.set_xticklabels(comp_df.columns, rotation=45)
ax.set_ylim(-0.4, 0.4)
ax.legend()

# dashed grid
ax.grid(True, which='both', linestyle='--', linewidth=0.5)

plt.tight_layout()
# loop variable is named fmt so that the builtin format() is not shadowed
for fmt in ("png", "svg"):
    plt.savefig(os.path.join(current_plot_folder, f"6.1_context_scale_factor.{fmt}"), dpi=300)

# 6.2 - Investigate knockdown of microRNAs in differentially stable contexts

In [None]:
# this is the stability data for mirnas in different contexts
# (rows whose "context" is present in the index of context_data).
# .copy() makes m2 an independent frame, so columns can be added to it later
# without pandas' SettingWithCopyWarning.
m2 = mirna_in_context[mirna_in_context["context"].isin(context_data.index)].copy()
# number of distinct contexts (shown as the cell output)
len(m2["context"].unique())

In [21]:
# Figures of section 6.2 go into their own sub-folder
current_plot_folder = os.path.join(plot_folder, "6.2_mirna_context/")
# exist_ok=True: create only when missing, without a separate existence check
os.makedirs(current_plot_folder, exist_ok=True)

In [22]:
%%capture output
# One figure per miRNA: distribution of the measured values across cell lines
for mirna in used_mirnas:
    plt.figure(figsize=(7, 4))
    df = mirna_in_context[mirna_in_context["miRNA1"] == mirna]

    # constrain to those designs that are in the context data
    df = df[df["context"].isin(context_data.index)]

    sns.boxplot(data=df[cell_lines_measured])
    sns.stripplot(data=df[cell_lines_measured], color="black", jitter=0.2, size=2)
    plt.xlabel("cell line")
    plt.ylabel("log10(stability)")
    plt.title(mirna)
    plt.tight_layout()
    plt.savefig(os.path.join(current_plot_folder, "distribution_" + mirna + "_unnormalized.png"))
    # the figure is already on disk; close it so the loop does not keep one
    # open figure per miRNA in memory (matplotlib warns above 20 open figures)
    plt.close()

In [23]:
# A table whose largest value is below 10 is treated as log10-scaled and is
# converted back to a linear scale; otherwise the values are used as they are.
expression_is_log = df_expression.max().max() < 10
expression_subset = df_expression[cell_lines_expression].copy()
mirna_expr = 10**expression_subset if expression_is_log else expression_subset

## Try to adjust the transfer function

The constant $c$ is given by $c=k_{deg}/k_{on}$. Assuming that $k_{on}$ stays the same, if the base stability of a context is $m2_0$, and the baseline stability used to fit $c$ is $m1_0$,  
then $k_{on}=k_{deg, 1}/c_1$ and $c_2=k_{deg, 2}/k_{on}=c_1\cdot k_{deg, 2}/k_{deg, 1}=c_1\cdot m1_0/m2_0$. The last step uses that the baseline stability is inversely proportional to the degradation rate, $m_0 \propto 1/k_{deg}$.

In [24]:
# baseline stability of every context, converted from log10 to a linear scale
m2_0 = 10**context_data[cell_lines_expression]
# the reference baseline is fixed to 1 rather than taken from a single construct,
# to reduce noise (the data is normalized to make this the case)
m1_0 = 1

# fitted constant on a linear scale
c1_linear = 10**c1
# constant for each context and cell line: c2 = c1 * m1_0 / m2_0, stored as log10
c2 = np.log10(c1_linear * m1_0 / m2_0)
# c1 is kept in log10 as well
c1 = np.log10(c1_linear)

In [25]:
# Add three sets of predicted log10 stabilities to every row of m2:
#   predicted_baseline_*   - transfer function with the fitted constant c1 only
#   predicted_unadjusted_* - the same, multiplied by the context's baseline stability
#   predicted_*            - context baseline times the transfer function evaluated
#                            with the context-specific constant c2

# make sure m2 is an independent frame before writing new columns into it
m2 = m2.copy()

# the column names do not depend on the row, so build them once; this also keeps
# them defined for the following cells when m2 has no rows
expression_columns = list(mirna_expr.columns)
predicted_columns = ["predicted_" + x for x in expression_columns]
predicted_unadjusted_columns = ["predicted_unadjusted_" + x for x in expression_columns]
predicted_baseline_columns = ["predicted_baseline_" + x for x in expression_columns]

for index, row in m2.iterrows():
    mirna = row["miRNA1"]
    context = row["context"]

    curr_c2 = c2.loc[context, :]
    mirna_expression = mirna_expr.loc[mirna, :]
    # linear baseline stability of this context, one value per cell line
    context_baseline = m2_0.loc[context, expression_columns]

    # just use the baseline stability of context1 with the microRNA
    predicted_baseline = transfer_function(mirna_expression, c1, popt[1])
    # multiply unadjusted transfer function by the baseline stability of the context
    predicted_expression_non_adjusted = context_baseline*predicted_baseline
    # adjust the transfer function by the stability of the context
    predicted_expression = context_baseline*transfer_function(mirna_expression, curr_c2, popt[1])

    m2.loc[index, predicted_baseline_columns] = np.log10(predicted_baseline.values)
    m2.loc[index, predicted_unadjusted_columns] = np.log10(predicted_expression_non_adjusted.values)
    m2.loc[index, predicted_columns] = np.log10(predicted_expression.values)

In [None]:
# Scatter plot of measured versus predicted log10 stability for the three
# prediction variants, with r^2 and RMSD of each variant in the legend
plt.figure(figsize=(4.8, 3.6))
plt.clf()

df = m2.copy()
measured_values = df[cell_lines_expression].values
# measured minus unadjusted prediction, one column per cell line
df_deviation = pd.DataFrame(measured_values - df[predicted_unadjusted_columns].values,
                            columns=cell_lines_expression)

measured_flat = measured_values.flatten()


def _r2_and_rmsd(predicted_flat):
    """Return (squared Pearson r, RMSD) between the measured and the given predicted values."""
    r_squared = stats.pearsonr(measured_flat, predicted_flat)[0]**2
    rmsd = np.sqrt(np.mean((measured_flat - predicted_flat)**2))
    return r_squared, rmsd


r2_adjusted, rmsd_adjusted = _r2_and_rmsd(df[predicted_columns].values.flatten())
r2_unadjusted, rmsd_unadjusted = _r2_and_rmsd(df[predicted_unadjusted_columns].values.flatten())
r2_baseline, rmsd_baseline = _r2_and_rmsd(df[predicted_baseline_columns].values.flatten())

plt.scatter(df[predicted_baseline_columns], df[cell_lines_expression], s=1, alpha=1, color="tab:red", edgecolors="none",
            label=r"baseline, r$^2$: {:.2f}, rmsd: {:.3f}".format(r2_baseline, rmsd_baseline))
plt.scatter(df[predicted_unadjusted_columns], df[cell_lines_expression], s=1, alpha=1, color="tab:blue", edgecolors="none",
            label=r"relative stability, r$^2$: {:.2f}, rmsd: {:.3f}".format(r2_unadjusted, rmsd_unadjusted))
# label uses the same r$^2$ notation as the two entries above
plt.scatter(df[predicted_columns], df[cell_lines_expression], s=1, alpha=1, color="tab:orange", edgecolors="none",
            label=r"c adjusted, r$^2$: {:.2f}, rmsd: {:.3f}".format(r2_adjusted, rmsd_adjusted))

# identity line
plt.plot([-2, 0.5], [-2, 0.5], color="black", linestyle="--")
plt.xlabel(r"log$_{10}$(predicted stability)")
plt.ylabel(r"log$_{10}$(measured stability)")
plt.tight_layout()
plt.legend(fontsize=7, loc=[1,0.5])
# The legend sits to the right of the axes, outside the figure canvas;
# bbox_inches="tight" enlarges the saved area so it is not cut off in the files.
# The loop variable is named fmt so that the builtin format() is not shadowed.
for fmt in ("png", "svg"):
    plt.savefig(os.path.join(current_plot_folder, f"6.2_expression_all_normalized.{fmt}"),
                dpi=300, bbox_inches="tight")

## Plot the transfer function for individual context sequences

In [27]:
# measured stability with miRNA [all contexts]
# .copy() makes m2 an independent frame so that the prediction columns can be
# written into it without pandas' SettingWithCopyWarning
m2 = mirna_in_context[mirna_in_context['context'].isin(m2_0.index)].copy()
# the predicted_* column names do not depend on the row, so build them once
predicted_columns = ["predicted_" + x for x in cell_lines_expression]
for index, row in m2.iterrows():
    mirna = row["miRNA1"]
    context = row["context"]

    mirna_expression = mirna_expr.loc[mirna, cell_lines_expression]

    # context baseline (linear) times the fitted transfer function
    predicted_stability = m2_0.loc[context, cell_lines_expression]*transfer_function(mirna_expression, *popt)
    m2.loc[index, predicted_columns] = np.log10(predicted_stability.values)

In [None]:
# Measured stability relative to the context baseline versus miRNA expression,
# for all contexts and cell lines, together with the fitted transfer function
plt.figure(figsize=(2,1.5))

# fitted transfer function on a log10 expression grid
x_range_log = np.arange(1, 5.5, 0.1)
x_range_lin = 10**x_range_log
y_transfer = np.log10(transfer_function(x_range_lin, *popt))

# predicted (x) and measured (y) log10 relative stabilities, collected for r^2 / RMSE
x_vals = []
y_vals = []
for context in m2["context"].unique():

    # these are the measured values!
    m2_filter = m2[m2["context"] == context]
    mirnas = m2_filter["miRNA1"]
    for cell_line in cell_lines_expression:
        if cell_line not in df_expression.columns:
            continue

        # I want to plot the stability relative to the background!
        relative_stability = m2_filter[cell_line]-np.log10(m2_0.loc[context, cell_line])
        expression = mirna_expr.loc[mirnas, cell_line]

        x_vals.append(np.log10(transfer_function(expression, c1, popt[1])))
        y_vals.append(relative_stability.values)

        plt.scatter(np.log10(expression), relative_stability,
                    s=4, alpha=0.4, color="tab:blue", rasterized=True, edgecolor="none")

x_vals_flatten = np.concatenate(x_vals)
y_vals_flatten = np.concatenate(y_vals)
r2 = stats.pearsonr(x_vals_flatten, y_vals_flatten)[0]**2
rmse = np.sqrt(np.mean((x_vals_flatten - y_vals_flatten)**2))

plt.plot(x_range_log, y_transfer, color="black", ls="--")

plt.xlim(1,5.5)
plt.xticks([1,2,3,4,5])
plt.yticks([-2,-1,0])
plt.xlabel(r"log$_{10}$(miRNA expression)")
plt.ylabel(r"log$_{10}$(relative stability)")
plt.text(x=1.5, y=-1.5, s=r"r$^2$ = {:.2f}".format(r2)+"\n"+"rmse = {:.2f}".format(rmse))
plt.tight_layout()
# loop variable is named fmt so that the builtin format() is not shadowed
for fmt in ("svg", "png"):
    plt.savefig(os.path.join(current_plot_folder, f"6.2-knockdown_versus_expression_scaled.{fmt}"), dpi=600)

In [29]:
%%capture output
# Same plot as above, but one figure per context, saved in a sub-folder
individual_plot_folder = os.path.join(current_plot_folder, "invidivual")
os.makedirs(individual_plot_folder, exist_ok=True)

# fitted transfer function on a log10 expression grid
x_range_log = np.arange(1, 5.5, 0.1)
x_range_lin = 10**x_range_log
y_transfer = np.log10(transfer_function(x_range_lin, *popt))

for context in m2["context"].unique():
    plt.figure(figsize=(2,1.5))
    x_vals = []
    y_vals = []

    # these are the measured values!
    m2_filter = m2[m2["context"] == context]
    mirnas = m2_filter["miRNA1"]

    for cell_line in cell_lines_expression:
        if cell_line not in df_expression.columns:
            continue

        # I want to plot the stability relative to the background!
        relative_stability = m2_filter[cell_line]-np.log10(m2_0.loc[context, cell_line])
        expression = mirna_expr.loc[mirnas, cell_line]
        plt.scatter(np.log10(expression), relative_stability,
                    s=4, alpha=0.4, color="tab:blue", rasterized=True, edgecolor="none")
        x_vals.append(np.log10(transfer_function(expression, *popt)))
        y_vals.append(relative_stability.values)

    x_vals_flatten = np.concatenate(x_vals)
    y_vals_flatten = np.concatenate(y_vals)
    r2 = stats.pearsonr(x_vals_flatten, y_vals_flatten)[0]**2

    plt.plot(x_range_log, y_transfer, color="black", ls="--")

    plt.xlim(1,5.5)
    plt.xticks([1,2,3,4,5])
    plt.yticks([-2,-1,0])
    plt.xlabel(r"log$_{10}$(scaled miRNA expression)")
    plt.ylabel(r"log$_{10}$(relative stability)")
    plt.text(x=1.5, y=-1.5, s=r"r$^2$ = {:.2f}".format(r2))
    plt.title(f"{context}\nbase stability: {m2_0.loc[context,:].mean():.2f}", fontsize=7)
    plt.tight_layout()
    # loop variable is named fmt so that the builtin format() is not shadowed
    for fmt in ("svg", "png"):
        plt.savefig(os.path.join(individual_plot_folder, f"6.2-knockdown_{context}.{fmt}"), dpi=600)
    # the figure is on disk; close it so the loop does not accumulate one open
    # figure per context (matplotlib warns above 20 open figures)
    plt.close()

# 6.3 - Investigate secondary structure for microRNAs in different contexts

In [30]:
# Figures of section 6.3 go into their own sub-folder
current_plot_folder = os.path.join(plot_folder, "6.3_natural_context_ddG")
# exist_ok=True: create only when missing, without a separate existence check
os.makedirs(current_plot_folder, exist_ok=True)

In [31]:
# Fixed RNA sequences that calculate_context_ddG puts in front of (fivep_add) and
# behind (threep_add) each construct sequence before cutting out the folding windows.
fivep_add = "GCCCCGUGCUGCUGCCCGACAACCACUACCUGAGCACCCAGUCCGCCCUGAGCAAAGACCCCAACGAGAAGCGCGAUCACAUGGUCCUGCUGGAGUUCGUGACCGCCGCCGGGAUCACUCUCGGCAUGGACGAGCUGUACAAGUAAUUCUAGUUGUUUAAAGCCCAACGCUAGUUUCCCUACACGACGCUCUUCCGAUCU"
threep_add = "CUCUGGAUUUGCAACCGACAUAGACAAACAGGCAUGCAAGCUGAUCCGGCUGCUAACAAAGCCCGAAAGGAAGCUGAGUUGGCUGCUGCCACCGCUGAGCAAUAACUAGCAUAACCCCUUGGGGCGGCCGCUUCGAGCAGACAUGAUAAGAUACAUUGAUGAGUUUGGACAAACCACAACUAGAAUGCAG"

def _duplex_ddG(mirna, target_region):
    """Return dG(miRNA:region complex) - dG(miRNA) - dG(region) from NUPACK.

    Both arguments are sequence strings. Only the two monomers and the
    heterodimer are analysed (homodimers are excluded), using the
    module-level ``my_model``.
    """
    s_mir = Strand(mirna, name='mir')
    s_tar = Strand(target_region, name='seq')

    set1 = ComplexSet(strands=[s_mir, s_tar],
                      complexes=SetSpec(max_size=2, exclude=[[s_mir, s_mir], [s_tar, s_tar]]))

    complex_results = complex_analysis(complexes=set1, model=my_model, compute=['pfunc'])

    dG_mir = complex_results["(mir)"].free_energy
    dG_seq = complex_results["(seq)"].free_energy
    # the heterodimer can be listed under either strand order
    if "(mir+seq)" in str(complex_results.keys()):
        dG_complex = complex_results["(mir+seq)"].free_energy
    else:
        dG_complex = complex_results["(seq+mir)"].free_energy

    return dG_complex - dG_mir - dG_seq


def calculate_context_ddG(df, nts=20):
    """Calculate the ddG for a single microRNA in context.

    For every row of ``df`` the columns "seq", "target" and "mirna_seq" are read.
    The sequence is extended by ``fivep_add`` and ``threep_add``; the target is
    cut to its last ``nts`` nucleotides and the miRNA to its first ``nts``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns "seq", "target" and "mirna_seq".
    nts : int, default 20
        Number of nucleotides kept from the target (3' end) and the miRNA (5' end).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with three added columns:
        "ddG_mean" - mean ddG over four windows of the extended sequence around the target,
        "ddG_pure" - ddG of the miRNA with the target alone,
        "ddG_diff" - "ddG_mean" minus "ddG_pure".

    Raises
    ------
    ValueError
        If the (trimmed) target does not occur in the extended sequence.
    """
    df = df.copy()
    # (nucleotides before the target start, nucleotides after the target start)
    constrain_ranges = [(150, 170), (100, 120), (60, 80), (40, 60)]

    for index, row in df.iterrows():
        # full sequence including the 5' and 3' additions
        sequence = fivep_add + row["seq"] + threep_add

        target = row["target"]
        mirna = row["mirna_seq"]

        # apply the nt filter; max() keeps the whole target when it is shorter
        # than nts (a negative start would otherwise wrap around and keep only
        # the last few nucleotides)
        target = target[max(0, len(target) - nts):]
        mirna = mirna[:nts]

        # position of the target in the extended sequence (same for all windows)
        insert_pos = sequence.find(target)
        if insert_pos < 0:
            raise ValueError(f"target of row {index!r} was not found in its sequence")

        # ddG of the miRNA against windows of decreasing size around the target;
        # the window start is clamped at 0 so that it cannot wrap around
        ddGs = [
            _duplex_ddG(mirna, sequence[max(0, insert_pos - upstream):insert_pos + downstream])
            for upstream, downstream in constrain_ranges
        ]
        ddG_mean = np.mean(ddGs)

        # ddG for just the target interaction
        ddG_pure = _duplex_ddG(mirna, target)

        df.at[index, "ddG_mean"] = ddG_mean
        df.at[index, "ddG_pure"] = ddG_pure
        # also add the difference between the two
        df.at[index, "ddG_diff"] = ddG_mean - ddG_pure

    return df

def calculate_local_structure(df):
    """Add a "local_structure" column derived from NUPACK's pair matrix.

    For every row, the sequence in "seq" is analysed as a single strand with the
    module-level ``my_model``. The diagonal of the resulting pair matrix is summed
    over the positions covered by the first occurrence of "target" in "seq".

    NOTE(review): per the NUPACK documentation the diagonal of the pair matrix
    holds the probability that a base is unpaired, so larger values would mean
    less structure at the target site - confirm that this matches the intended
    meaning of "local_structure".

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns "seq" and "target".

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the added column "local_structure".

    Raises
    ------
    ValueError
        If the target does not occur in the sequence.
    """
    df = df.copy()
    for index, row in df.iterrows():
        sequence = row["seq"]
        target = row["target"]

        # find the insertion position
        insert_pos = sequence.find(target)
        if insert_pos < 0:
            # a position of -1 would silently select a wrong (mostly empty) slice below
            raise ValueError(f"target of row {index!r} was not found in its sequence")

        # single-strand analysis of the whole sequence
        strand = Strand(sequence, name='strand')
        set1 = ComplexSet(strands=[strand],
                          complexes=SetSpec(max_size=1))
        complex_results = complex_analysis(complexes=set1, model=my_model, compute=['pfunc', 'mfe', 'pairs'])
        matrix = complex_results["(strand)"].pairs.to_array()

        # diagonal of the pair matrix, one entry per nucleotide
        diagonal = np.diagonal(matrix)

        # sum over the positions of the microRNA target
        df.at[index, "local_structure"] = diagonal[insert_pos:insert_pos + len(target)].sum()

    return df

In [32]:
# rows whose context is listed in the index of context_data ("natural" contexts)
in_natural_context = mirna_in_context["context"].isin(context_data.index)
df = mirna_in_context[in_natural_context].copy()

# measured values: from log10 back to a linear scale
df.loc[:, cell_lines_measured] = 10**df.loc[:, cell_lines_measured]

# switch target and construct sequence from the DNA to the RNA alphabet
for column in ("target", "seq"):
    df.loc[:, column] = df.loc[:, column].str.replace("T", "U")

# look up the sequence of every row's miRNA in mirbase (assigned by position)
# and convert it to the RNA alphabet as well
mirna_sequences = mirbase.loc[df.loc[:, "miRNA1"], "sequence_norm"]
df.loc[:, "mirna_seq"] = mirna_sequences.str.replace("T", "U").values

In [33]:
# this takes ~45s to run
# (runs the NUPACK analyses row by row and returns a copy of df with the
# added columns "ddG_mean", "ddG_pure" and "ddG_diff")
df = calculate_context_ddG(df)

In [34]:
# Split the rows at a ddG difference of 10. Values of exactly 10 go to the low
# group, so every row with a finite ddG_diff belongs to exactly one group.
high_ddG = df[df["ddG_diff"] > 10].index
low_ddG = df[df["ddG_diff"] <= 10].index

In [None]:
# Relative stability versus miRNA expression, with rows from the high-ddG group
# highlighted in red; also fills df_deviation with measured minus predicted values
fig, ax = plt.subplots(1, 1, figsize=(2.6, 1.8))

# fitted transfer function on a log10 expression grid
x_range_log = np.arange(1, 5.5, 0.1)
x_range_lin = 10**x_range_log
y_transfer = np.log10(transfer_function(x_range_lin, *popt))
df_deviation = pd.DataFrame(columns=cell_lines_expression, index=m2.index)

# predicted (x) and measured (y) values of the low-ddG rows, for r^2 / RMSE
x_vals = []
y_vals = []
i = 0
sc_plots = []
for context in m2["context"].unique():
    # these are the measured values!
    m2_filter = m2[m2["context"] == context]
    mirnas = m2_filter["miRNA1"]

    # the group membership of the rows does not depend on the cell line
    high_ddG_index = [index for index in m2_filter.index if index in high_ddG]
    low_ddG_index = [index for index in m2_filter.index if index in low_ddG]

    for cell_line in cell_lines_expression:
        if cell_line not in df_expression.columns:
            continue

        # I want to plot the stability relative to the background!
        relative_stability = m2_filter[cell_line]-np.log10(m2_0.loc[context, cell_line])
        expression = mirna_expr.loc[mirnas, cell_line]

        x_vals.append(np.log10(transfer_function(expression.loc[mirnas.loc[low_ddG_index]], c1, popt[1])))
        y_vals.append(relative_stability.loc[low_ddG_index].values)

        # measured minus predicted log10 relative stability
        df_deviation.loc[relative_stability.index, cell_line] = relative_stability.values - np.log10(transfer_function(expression, c1, popt[1]).values)

        # raw string: "\D" is not a valid escape sequence in a normal string literal;
        # only the very first scatter gets a label so the legend has a single entry
        sc = plt.scatter(np.log10(expression.loc[mirnas.loc[high_ddG_index]]), relative_stability.loc[high_ddG_index],
                        s=5, alpha=1, color="tab:red", rasterized=True, edgecolor="none", zorder=2,
                        label=r"$\Delta\Delta$G > 10 kcal/mole" if i == 0 else None)
        if i == 0:
            sc_plots.append(sc)

        plt.scatter(np.log10(expression.loc[mirnas.loc[low_ddG_index]]), relative_stability.loc[low_ddG_index],
                    s=4, alpha=0.5, color="tab:blue", rasterized=True, edgecolor="none", zorder=1)
        i += 1

x_vals_flatten = np.concatenate(x_vals)
y_vals_flatten = np.concatenate(y_vals)
r2 = stats.pearsonr(x_vals_flatten, y_vals_flatten)[0]**2
rmse = np.sqrt(np.mean((x_vals_flatten - y_vals_flatten)**2))

plt.plot(x_range_log, y_transfer, color="black", ls="--")

plt.xlim(1,5.5)
plt.xticks([1,2,3,4,5])
plt.yticks([-2,-1,0])
plt.xlabel(r"log$_{10}$(miRNA expression)")
plt.ylabel(r"log$_{10}$(relative stability)")
# plt.text(x=1.5, y=-1.5, s=r"r$^2$ = {:.2f}".format(r2))
#plt.title("miRNA behavior in different contexts")
plt.legend(loc="lower left", frameon=False, fontsize=7, ncols=2, handler_map={sc: HandlerSize(12) for sc in sc_plots})
plt.tight_layout()
# loop variable is named fmt so that the builtin format() is not shadowed
for fmt in ("svg", "png"):
    plt.savefig(os.path.join(current_plot_folder, f"6.3-knockdown_versus_expression_ddG.{fmt}"), dpi=600)

In [None]:
# Histogram of the ddG difference of the target sites (bin edges at 0, 1, ..., 19).
bins = np.arange(0, 20, 1)
plt.figure(figsize=(2.4, 1.8))
plt.hist(df["ddG_diff"], color="tab:blue", bins=bins)
# dashed vertical marker at 10
plt.axvline(10, linestyle="--", color="black")
plt.xlabel(r"$\Delta\Delta$G(actual) - $\Delta\Delta$G(ideal) (kcal/mole)")
plt.ylabel("Count")
plt.tight_layout()
# write the figure once per output format
for format in ["png", "svg"]:
    file_name = f"6.3-ddG_distribution_target_sites.{format}"
    plt.savefig(os.path.join(current_plot_folder, file_name), dpi=300)

In [37]:
# Copy the ddG difference and the miRNA name from df into df_deviation.
# The values are assigned positionally (.values drops the index), so this
# assumes both frames share the same row order — TODO confirm.
for column in ("ddG_diff", "miRNA1"):
    df_deviation.loc[:, column] = df[column].values

In [None]:
# Long format: one row per (design, cell line) holding the deviation from the fit.
df_deviation_unrolled = df_deviation.melt(id_vars=["miRNA1", "ddG_diff"],
                                          value_vars=cell_lines_expression,
                                          var_name="cell_line",
                                          value_name="stability")

# Look up the log10 expression of each row's miRNA in that row's cell line.
df_deviation_unrolled["expression"] = np.nan
for cell_line in cell_lines_expression:
    if cell_line not in mirna_expr.columns:
        # no expression data for this cell line: its rows stay NaN and are filtered out below
        continue
    cell_line_indices = df_deviation_unrolled[df_deviation_unrolled["cell_line"] == cell_line].index
    df_deviation_unrolled.loc[cell_line_indices, "expression"] = \
        np.log10(mirna_expr.loc[df_deviation_unrolled.loc[cell_line_indices, "miRNA1"], cell_line].values)

# Keep highly expressed miRNAs (log10 expression > 3.5) that have both a ddG value and a
# deviation; NaNs would otherwise turn the test result into NaN.
# .copy() makes the subset an independent frame so the column assignments below are safe.
df_deviation_unrolled = df_deviation_unrolled[df_deviation_unrolled["expression"] > 3.5]
df_deviation_unrolled = df_deviation_unrolled.dropna(subset=["ddG_diff", "stability"]).copy()
df_deviation_unrolled["stability"] = df_deviation_unrolled["stability"].astype(float)

# Classify every remaining row: "> 10" is high, everything else (including exactly 10) is low,
# so no row is left without a class.
df_deviation_unrolled["high_ddG"] = df_deviation_unrolled["ddG_diff"].astype(float) > 10
high_ddG_index = df_deviation_unrolled[df_deviation_unrolled["high_ddG"]].index
low_ddG_index = df_deviation_unrolled[~df_deviation_unrolled["high_ddG"]].index

# perform a non-parametric, one-sided test (high-ddG deviations greater than low-ddG ones)
stat, p_value = stats.mannwhitneyu(df_deviation_unrolled.loc[high_ddG_index, "stability"],
                                   df_deviation_unrolled.loc[low_ddG_index, "stability"],
                                   alternative="greater")

# create a boxplot and indicate significance
plt.figure(figsize=(1.4, 1.8))
sns.boxplot(data=df_deviation_unrolled, x="high_ddG", y="stability",
            order=[False, True], palette=["tab:blue", "tab:red"])
# p-value annotation, centred between the two boxes (data coordinates)
plt.text(x=0.5, y=1.2, s=return_pvalue_text(p_value), ha="center")
plt.ylabel("deviation from fit")
plt.xticks([0, 1], [r"$\leq$10", ">10"])
# raw string: "\D" is not a valid Python escape sequence
plt.xlabel(r"$\Delta\Delta$G (kcal/mole)")
plt.tight_layout()
plt.savefig(os.path.join(current_plot_folder, "6.3-ddG_deviation_boxplot.svg"), dpi=300)

In [None]:
# create the plot folder if it does not exist
os.makedirs(os.path.join(plot_folder, "devation_vs_ddG"), exist_ok=True)

x_vals = []
y_vals = []

plt.figure(figsize=(2.6, 1.8))
for cell_line in cell_lines_measured:
    if cell_line not in df_expression.columns:
        continue

    # Keep miRNAs with an expression above 10^3.5 in this cell line.
    # df_expression is on a linear scale (other cells take np.log10 of it before
    # comparing with 3.5), so the threshold has to be 10**3.5, not 3.5.
    df_expression_cell_line = df_expression[cell_line].dropna()
    df_expression_cell_line = df_expression_cell_line[df_expression_cell_line > 10**3.5]
    df_deviation_filter = df_deviation[df_deviation["miRNA1"].isin(df_expression_cell_line.index)]
    # only require the two plotted columns; NaNs in other cell lines must not drop the point
    df_deviation_filter = df_deviation_filter.dropna(subset=["ddG_diff", cell_line])
    plt.scatter(df_deviation_filter["ddG_diff"], df_deviation_filter[cell_line], alpha=0.5, s=3,
                color="tab:blue", edgecolor="none")

    x_vals.append(df_deviation_filter["ddG_diff"].astype(float))
    y_vals.append(df_deviation_filter[cell_line].astype(float))

# pool the points of all cell lines
x_vals_flatten = np.concatenate(x_vals)
y_vals_flatten = np.concatenate(y_vals)

# fit a linear model
slope, intercept, r_value, p_value, std_err = stats.linregress(x_vals_flatten, y_vals_flatten)
print(f"p-value: {p_value:.2e}")

# plot the linear fit
x_range = np.arange(0, 21, 1)
y_range = slope*x_range + intercept
plt.plot(x_range, y_range, color="black", linestyle="--", label=r"$r^2$ = {:.2f}".format(r_value**2))

plt.xlabel(r"$\Delta\Delta$G(actual) - $\Delta\Delta$G(ideal) (kcal/mole)")
plt.ylabel("deviation from fit")
plt.legend(loc="lower right", fontsize=7)
plt.ylim([-1, 1])
plt.title("MicroRNAs with expression > 10$^{3.5}$ tpm", fontsize=7)
plt.tight_layout()
for fmt in ["svg", "png"]:
    plt.savefig(os.path.join(current_plot_folder, f"6.3-deviation_vs_ddG.{fmt}"), dpi=300)

In [None]:
# Per-miRNA scatter of the deviation from the fit versus the ddG difference,
# pooling all measured cell lines into one figure per miRNA.
# create the plot folder if it does not exist
os.makedirs(os.path.join(plot_folder, "devation_vs_ddG"), exist_ok=True)
for mirna in used_mirnas:
    fig = plt.figure(figsize=(2.6, 1.8))
    for cell_line in cell_lines_measured:
        if cell_line not in df_expression.columns:
            continue

        # Keep miRNAs with an expression above 10^3.5 in this cell line.
        # df_expression is on a linear scale (other cells take np.log10 of it before
        # comparing with 3.5), so the threshold has to be 10**3.5, not 3.5.
        df_expression_cell_line = df_expression[cell_line].dropna()
        df_expression_cell_line = df_expression_cell_line[df_expression_cell_line > 10**3.5]
        # literal substring match on the miRNA name (no regex interpretation of the name)
        df_expression_cell_line = df_expression_cell_line[
            df_expression_cell_line.index.str.contains(mirna, regex=False)]
        df_deviation_filter = df_deviation[df_deviation["miRNA1"].isin(df_expression_cell_line.index)]
        # only require the two plotted columns to be present
        df_deviation_filter = df_deviation_filter.dropna(subset=["ddG_diff", cell_line])
        plt.scatter(df_deviation_filter["ddG_diff"], df_deviation_filter[cell_line], alpha=0.5, s=3,
                    color="tab:blue", label=cell_line, edgecolor="none")

    # create a vertical line
    plt.axvline(x=10, ymin=0, ymax=1, color='black', linewidth=2, linestyle='--')

    plt.xlabel(r"$\Delta\Delta$G(actual) - $\Delta\Delta$G(ideal) (kcal/mole)")
    plt.ylabel("deviation from fit")
    plt.ylim([-1, 1])
    plt.title(f"{mirna}", fontsize=7)
    plt.tight_layout()
    for fmt in ["svg", "png"]:
        plt.savefig(os.path.join(current_plot_folder, f"6.3-deviation_vs_ddG_{mirna}.{fmt}"), dpi=300)
    # Show the finished figure, then release it. The previous plt.clf() at the top of the
    # loop wiped the figure of the preceding miRNA and left every figure open.
    plt.show()
    plt.close(fig)

# 6.4 - NUPACK analysis of the designs with engineered secondary structure

In [58]:
# Output folder for the section 6.4 figures.
current_plot_folder = os.path.join(plot_folder, "6.4_designed_context_ddG")
# create it if it doesn't exist (single call, no separate existence check)
os.makedirs(current_plot_folder, exist_ok=True)

In [59]:
# delete unwanted designs [these cause a loss of stability simply by being strong hairpins]
# this is unrelated to miRNA function and does not ever occur in natural UTRs, so it is not relevant
unwanted_context1 = "inhib_[-4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]"
unwanted_context2 = "inhib_[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]"

# grab all data that is NOT in the context data and not one of the unwanted designs
is_designed_context = ~mirna_in_context["context"].isin(context_data.index)
is_unwanted = mirna_in_context["context"].isin([unwanted_context1, unwanted_context2])
# .copy(): the columns added below must not be written to a slice of mirna_in_context
df = mirna_in_context[is_designed_context & ~is_unwanted].copy()

# use linear data (the measured columns are raised to the power of 10 here)
df[cell_lines_measured] = 10**df[cell_lines_measured]
# get the mirna seq (positional assignment, one entry per row of df)
df["mirna_seq"] = mirbase.loc[df["miRNA1"], "sequence_norm"].values
# replace Ts by Us in all sequence columns
for sequence_column in ["target", "seq", "abbrev_seq", "mirna_seq"]:
    df[sequence_column] = df[sequence_column].str.replace("T", "U")

## Some basic testing

In [None]:
# Sanity check on the first design: fold the full sequence and look at the
# diagonal of the NUPACK pair matrix, globally and at the target site.
sequence = df.iloc[0]["seq"]
sequence_length = len(sequence)
target = df.iloc[0]["target"]
target_length = len(target)
# find the insertion position
insert_pos = sequence.find(target)
if insert_pos == -1:
    # str.find returns -1 when absent, which would silently select the wrong slice below
    raise ValueError("target site not found in the sequence of the first design")

# Define strand species
strand = Strand(sequence, name='strand')

set1 = ComplexSet(strands=[strand],
            complexes=SetSpec(max_size=1))

complex_results = complex_analysis(complexes=set1, model=my_model, compute=['pfunc', 'mfe', 'pairs'])

matrix = complex_results["(strand)"].pairs.to_array()

# diagonal of the pair matrix, one value per nucleotide
# NOTE(review): in NUPACK the diagonal is presumably the unpaired probability — confirm.
trace_indices = np.arange(0, sequence_length)
trace = matrix[trace_indices, trace_indices]

plt.figure(figsize=(20, 3))
plt.plot(trace_indices, trace)
plt.xlabel("position in sequence (nt)")
plt.ylabel("pair-matrix diagonal")

# get the local structure at the microRNA target position
local_indices = trace_indices[:target_length]
local_structure = trace[insert_pos:insert_pos+target_length]

plt.figure(figsize=(20, 3))
plt.plot(local_indices, local_structure)
plt.xlabel("position in target site (nt)")
plt.ylabel("pair-matrix diagonal")
plt.show()

In [None]:
# First 28 context labels, kept as one-element rows so that str(row) is the array repr.
patterns = df[["context"]].iloc[0:28].values
patterns_path = os.path.join(plot_folder, "patterns.txt")
with open(patterns_path, "w") as f:
    for pattern in patterns:
        print(pattern)
        # Part after the first underscore (up to a second one, if any) with the last two
        # characters removed; for a single-underscore label those are the closing quote
        # and bracket of the array repr.
        label_part = str(pattern).split("_")[1]
        f.write(label_part[:-2] + "\n")

# Calculate ddG values

In [None]:
# For every design, fold a window around the target site and sum the diagonal of the
# NUPACK pair matrix over the analysed stretch; the sum is stored in "local_structure".
flank = 30      # nucleotides of context kept on each side of the analysed stretch
seed_len = 14   # length of the analysed stretch

for i, (row_index, row) in enumerate(df.iterrows()):
    # get the sequences
    sequence = row["abbrev_seq"]
    target = row["target"]

    # find the insertion position
    insert_pos = sequence.find(target)
    if insert_pos == -1:
        # target not present: record a missing value instead of analysing a wrong window
        df.at[row_index, "local_structure"] = np.nan
        continue
    seed_pos = insert_pos + 7

    # Clamp the window start at 0: a negative slice start would wrap around to the
    # end of the sequence and silently select the wrong (often empty) window.
    window_start = max(seed_pos - flank, 0)
    window = sequence[window_start:seed_pos + seed_len + flank]

    # Perform NUPACK calculation
    strand = Strand(window, name='strand')
    set1 = ComplexSet(strands=[strand],
                complexes=SetSpec(max_size=1))
    complex_results = complex_analysis(complexes=set1, model=my_model, compute=['pfunc', 'mfe', 'pairs'])

    # get the matrix
    matrix = complex_results["(strand)"].pairs.to_array()

    # positions of the analysed stretch inside the window (offset equals flank unless clamped)
    first = seed_pos - window_start
    trace_indices = np.arange(first, min(first + seed_len, len(window)))
    trace = matrix[trace_indices, trace_indices]

    # print the diagonal of the first design as a quick visual check
    if i == 0:
        print(trace)

    # add as a new column
    df.at[row_index, "local_structure"] = trace.sum()

In [63]:
# this takes around 50s to run
# Both helpers are defined outside this section; each returns the frame that is rebound to df.
# Presumably they add the ddG columns (e.g. "ddG_diff") and "local_structure" that the
# plots below read — TODO confirm against their definitions.
df = calculate_context_ddG(df, nts=20)
df = calculate_local_structure(df)

In [None]:
# Histogram of the ddG difference of the designed sequences (bin edges at 0, 1, ..., 29).
bins = np.arange(0, 30, 1)
plt.figure(figsize=(2.4, 1.8))
plt.hist(df["ddG_diff"], color="tab:blue", bins=bins)
plt.xlabel(r"$\Delta\Delta$G(actual) - $\Delta\Delta$G(ideal) (kcal/mole)")
plt.ylabel("Number")
plt.title("Distribution for designed inhibitory sequences")
plt.tight_layout()
# write the figure once per output format
for format in ["png", "svg"]:
    file_name = f"6.4-ddG_distribution_target_sites.{format}"
    plt.savefig(os.path.join(current_plot_folder, file_name), dpi=300)

In [None]:
# Histogram of the "local_structure" column (bin edges at 0, 1, ..., 13).
bins = np.arange(0, 14, 1)
plt.figure(figsize=(2.4, 1.8))
plt.hist(df["local_structure"], color="tab:blue", bins=bins)
plt.xlabel(r"local structure")
plt.ylabel("Number")
plt.title("Distribution for designed inhibitory sequences")
plt.tight_layout()
# write the figure once per output format
for format in ["png", "svg"]:
    file_name = f"6.4-local_struct_distribution_target_sites.{format}"
    plt.savefig(os.path.join(current_plot_folder, file_name), dpi=300)

In [None]:
# One line per (miRNA, cell line): log10 of the measured value versus
# ddG_mean - ddG_pure. Lines of the same miRNA share a colour.
mirnas = used_mirnas
color_palette = sns.color_palette("tab10", len(mirnas))

plt.figure(figsize=(3.5, 1.8))
for i, mirna in enumerate(mirnas):
    # sort by ddG so each line is drawn from left to right
    df_filtered = df[df["miRNA1"] == mirna].sort_values(by="ddG_mean")
    ddG_relative = df_filtered["ddG_mean"] - df_filtered["ddG_pure"]
    for j, cell_line in enumerate(cell_lines_measured):
        # only the first line of each miRNA carries a legend label
        plt.plot(ddG_relative, np.log10(df_filtered[cell_line]),
                 label=mirna if j == 0 else None,
                 color=color_palette[i], alpha=1)

plt.xlabel(r"$\Delta\Delta$G(actual) - $\Delta\Delta$G(ideal) (kcal/mole)")
plt.ylabel("log10(expression)")
plt.legend(loc=[1.05, -0.1], fontsize=6)
plt.tight_layout()
for fmt in ["png", "svg"]:
    plt.savefig(os.path.join(current_plot_folder, f"6.4-ddG_vs_expression.{fmt}"), dpi=300)

In [None]:
# One panel per miRNA: log10 of the measured value versus the ddG difference,
# one coloured line per cell line.
# Assuming 'df' and 'used_mirnas' are predefined data structures
mirnas = used_mirnas

# Grid with two columns and as many rows as needed (5 rows for 10 miRNAs), so that
# axes[i] exists for every miRNA instead of raising IndexError beyond 10.
num_cols = 2
num_rows = max(1, int(np.ceil(len(mirnas) / num_cols)))
# panel that carries the cell-line legend (the fifth one, when it exists)
legend_panel = min(4, len(mirnas) - 1)

fig, axes = plt.subplots(num_rows, num_cols, figsize=(2.0 * num_cols, 1.2 * num_rows), sharey='row')

# Flatten the axes array
axes = axes.flatten()

# Plot data in each subplot
for i, mirna in enumerate(mirnas):
    ax = axes[i]  # Select the corresponding subplot
    df_filtered = df[df["miRNA1"] == mirna].sort_values(by="ddG_diff")

    for cell_line in cell_lines_measured:
        ax.plot(df_filtered["ddG_diff"], np.log10(df_filtered[cell_line]),
                color=cell_line_colors[cell_line], alpha=1,
                label=cell_line if i == legend_panel else None)

    ax.set_xlim(0, 30)
    ax.set_ylim(-1.75, 0.25)
    ax.set_title(mirna)

    # y label on the left column, x label on the last panel of each column
    if i % num_cols == 0:
        ax.set_ylabel(r"log$_{10}$(stability)")
    if i >= len(mirnas) - num_cols:
        ax.set_xlabel(r"$\Delta\Delta$G(actual) - $\Delta\Delta$G(ideal) (kcal/mole)")

# hide panels that have no miRNA assigned
for unused_ax in axes[len(mirnas):]:
    unused_ax.set_visible(False)

plt.tight_layout()
if len(mirnas) > 0:
    axes[legend_panel].legend(loc=[1.05, 0.5], fontsize=6)
# Save figures
for fmt in ["png", "svg"]:
    plt.savefig(os.path.join(current_plot_folder, f"6.4-ddG_vs_expression-invidual-plots-alt.{fmt}"), dpi=300)

In [None]:
# One panel per miRNA: log10 of the measured value versus the "local_structure"
# column, one grey line per cell line.
# Assuming 'df' and 'used_mirnas' are predefined data structures
mirnas = used_mirnas

# Grid with two columns and as many rows as needed (5 rows for 10 miRNAs), so that
# axes[i] exists for every miRNA instead of raising IndexError beyond 10.
num_cols = 2
num_rows = max(1, int(np.ceil(len(mirnas) / num_cols)))

# Set up the figure with subplots
fig, axes = plt.subplots(num_rows, num_cols, figsize=(3.0 * num_cols, 1.2 * num_rows), sharey='row')

# Flatten the axes array for easy iteration, in case of multiple rows
axes = axes.flatten()

# Plot data in each subplot
for i, mirna in enumerate(mirnas):
    ax = axes[i]  # Select the corresponding subplot
    df_filtered = df[df["miRNA1"] == mirna].sort_values(by="local_structure")

    for cell_line in cell_lines_measured:
        ax.plot(df_filtered["local_structure"], np.log10(df_filtered[cell_line]),
                color="grey", alpha=0.5)

    ax.set_xlim(0, 8)
    ax.set_ylim(-1.75, 0.25)
    ax.set_title(mirna)

    # y label on the left column, x label on the last panel of each column
    if i % num_cols == 0:
        # braces are required: "$_10$" would only subscript the "1"
        ax.set_ylabel(r"log$_{10}$(stability)")
    if i >= len(mirnas) - num_cols:
        ax.set_xlabel(r"structure in seed region")

# hide panels that have no miRNA assigned
for unused_ax in axes[len(mirnas):]:
    unused_ax.set_visible(False)

# Adjust layout to prevent overlap
plt.tight_layout()

# Save figures
for fmt in ["png", "svg"]:
    plt.savefig(os.path.join(current_plot_folder, f"6.4-local_struct_vs_expression-invidual-plots.{fmt}"), dpi=300)

# 6.5 - Investigate ddG impact for regular mirnas in the main context

In [72]:
# Output folder for the section 6.5 figures.
current_plot_folder = os.path.join(plot_folder, "6.5_ddG_impact_individual_mirnas")

# create it if it doesn't exist (single call, no separate existence check)
os.makedirs(current_plot_folder, exist_ok=True)

## 6.5.1 - pairwise target interactions

This code takes 7 min to run.

In [73]:
# Load the pickled collection of likely real miRNAs (local project file).
likely_real_path = "../microrna_data/likely_real_mirnas.pkl"
with open(likely_real_path, "rb") as f:
    likely_real_mirnas = pickle.load(f)

# Restrict mirbase to those miRNAs and express their target sites with U instead of T.
mirbase_real = mirbase.loc[likely_real_mirnas].copy()
rna_targets = mirbase_real["target"].str.replace("T", "U")
mirbase_real["target"] = rna_targets

In [66]:
from concurrent.futures import ProcessPoolExecutor

def compute_interaction(pair):
    """Free-energy change of pairing the target sites of two miRNAs.

    Parameters
    ----------
    pair : tuple
        (miRNA1, miRNA2), both labels of ``mirbase_real.index``.

    Returns
    -------
    tuple
        (miRNA1, miRNA2, ddG) where ddG is the free energy of the two-strand
        complex minus the free energies of the two single strands, as reported
        by NUPACK's ``complex_analysis`` under ``my_model``.
    """
    first_name, second_name = pair

    tar1_strand = Strand(mirbase_real.loc[first_name, "target"], name='tar1')
    tar2_strand = Strand(mirbase_real.loc[second_name, "target"], name='tar2')

    # all complexes of up to two strands, except the two homodimers
    homodimers = [[tar1_strand, tar1_strand], [tar2_strand, tar2_strand]]
    complex_spec = SetSpec(max_size=2, exclude=homodimers)
    complex_set = ComplexSet(strands=[tar1_strand, tar2_strand], complexes=complex_spec)

    analysis = complex_analysis(complexes=complex_set, model=my_model, compute=['pfunc'])

    # the heterodimer may be listed under either strand order
    if "(tar1+tar2)" in str(analysis.keys()):
        dimer_key = "(tar1+tar2)"
    else:
        dimer_key = "(tar2+tar1)"

    dimer_energy = analysis[dimer_key].free_energy
    ddG = dimer_energy - analysis["(tar1)"].free_energy - analysis["(tar2)"].free_energy
    return (first_name, second_name, ddG)

# Create the dataframe to store results.
# dtype=float: without it the empty frame has object dtype and stays that way when filled.
pairwise_ddG = pd.DataFrame(index=mirbase_real.index, columns=mirbase_real.index, dtype=float)

# Create pairs: every unordered pair once, including each miRNA with itself
pairs = [(miRNA1, miRNA2) for i, miRNA1 in enumerate(mirbase_real.index) for miRNA2 in mirbase_real.index[i:]]

# Compute the results in worker processes and fill the symmetric matrix as they arrive.
# Results are consumed inside the with-block, i.e. while the executor is still alive;
# chunksize batches the many small tasks to cut inter-process overhead.
with ProcessPoolExecutor(max_workers=8) as executor:
    for miRNA1, miRNA2, ddG in executor.map(compute_interaction, pairs, chunksize=256):
        pairwise_ddG.loc[miRNA1, miRNA2] = ddG
        pairwise_ddG.loc[miRNA2, miRNA1] = ddG

In [67]:
output_folder = "../outputs/6_context_impact"
# create it if it doesn't exist (single call, no separate existence check)
os.makedirs(output_folder, exist_ok=True)

# Persist the pairwise ddG matrix (a DataFrame) so later sections can reload it.
# pickle is already imported in the notebook's import cell.
with open(os.path.join(output_folder, "6_pairwise_target_ddG.pkl"), "wb") as f:
    pickle.dump(pairwise_ddG, f)

In [None]:
# Histogram of the pairwise target-target values.
# The matrix is symmetric, so only the upper triangle (including the diagonal) is used;
# flattening the whole matrix would count every pair of different miRNAs twice.
ddG_matrix = pairwise_ddG.to_numpy(dtype=float)
ddG_pairs = ddG_matrix[np.triu_indices_from(ddG_matrix)]
ddG_pairs = ddG_pairs[~np.isnan(ddG_pairs)]

plt.figure(figsize=(2.4, 1.8))
plt.hist(ddG_pairs, bins=100)
# the plotted value is dG(complex) - dG(tar 1) - dG(tar 2), so the terms are dG, not ddG
plt.xlabel(r"$\Delta$G(tar 1 + tar 2) - $\Delta$G(tar 1) - $\Delta$G(tar 2)")
plt.ylabel("Number")
plt.tight_layout()
plt.savefig(os.path.join(output_folder, "6_pairwise_ddG_hist.svg"), dpi=300)

## 6.5.2 - ddG and deviation

In [74]:
# MAKE SURE THIS ONE DOESN'T REMOVE CONSISTENT OUTLIERS
# this should have removed crosstalk, but not deviating microRNAs
# alternative input, currently unused:
# "../outputs/3_fitting/combined_dataset/combined_dataset_deviation_bias_aware.csv"
deviation_csv = "../outputs/3_fitting/Alles2019/Alles2019_deviation_all_mirnas.csv"
deviation_df = pd.read_csv(deviation_csv, index_col=0)

In [75]:
# Read both inserted-design tables, stack them and index the result by miRNA name.
design_files = [
    "../design_files/inserted_designs/1_mirna_full_single_high_conf_inserted.csv",
    "../design_files/inserted_designs/2_mirna_full_single_low_conf_mirgenedb_inserted.csv",
]
inserted_designs_1, inserted_designs_2 = (pd.read_csv(path, index_col=0) for path in design_files)
inserted_designs = pd.concat([inserted_designs_1, inserted_designs_2]).set_index("miRNA1")

In [76]:
# Annotate every row of deviation_df (indexed by miRNA name) with its sequences.
design_names = deviation_df.index

# full inserted design sequence
deviation_df["seq"] = inserted_designs.loc[design_names, "seq"]

# miRNA sequence (sequence_norm) and target site, both looked up in mirbase
for column, source_column in (("mirna_seq", "sequence_norm"), ("target", "target")):
    deviation_df[column] = mirbase.loc[design_names, source_column]

In [77]:
# Convert the three sequence columns to the RNA alphabet (T -> U).
for sequence_column in ("seq", "mirna_seq", "target"):
    deviation_df[sequence_column] = deviation_df[sequence_column].str.replace("T", "U")

In [78]:
# this takes ~2 min to run
# calculate_context_ddG is defined outside this section and is called with its default
# arguments here; presumably it adds the "ddG_diff" column read below — TODO confirm.
deviation_df = calculate_context_ddG(deviation_df)

In [None]:
# Histogram of the ddG difference of the individual target sites (bin edges at 0, 1, ..., 19).
bins = np.arange(0, 20, 1)
plt.figure(figsize=(2.4, 1.8))
plt.hist(deviation_df["ddG_diff"], color="tab:blue", bins=bins)
plt.xlabel(r"$\Delta\Delta$G(actual) - $\Delta\Delta$G(ideal)")
plt.ylabel("Count")
# dashed vertical marker at x=10
plt.axvline(x=10, linestyle="--", color="black")
plt.tight_layout()
# write the figure once per output format
for format in ["png", "svg"]:
    file_name = f"6.5-ddG_distribution_actual_target_sites.{format}"
    plt.savefig(os.path.join(current_plot_folder, file_name), dpi=300)

In [80]:
%%capture output
def hill_func_log_regular(x, c1=3, c2=4.5):
    """Evaluate the two-parameter response curve in log10 space.

    The expression is assumed to be normalized to one.

    Parameters
    ----------
    x : float or array-like
        log10 microRNA expression.
    c1, c2 : float
        log10 values of the two constants of the curve.

    Returns
    -------
    float or ndarray
        log10 of ``(1 / (1 + X / C1)) * (1 + X / C2)`` with
        ``X = 10**x``, ``C1 = 10**c1`` and ``C2 = 10**c2``.
    """
    # move all three quantities from log10 to linear scale
    x_linear = 10**x
    c1_linear = 10**c1
    c2_linear = 10**c2

    first_factor = 1 / (1 + x_linear / c1_linear)
    second_factor = 1 + x_linear / c2_linear
    return np.log10(first_factor * second_factor)

df_knockdown = pd.read_csv("../outputs/3_fitting/Alles2019/Alles2019_knockdown.csv", index_col=0)

# get those designs with high ddG; every other design counts as low
high_ddG_designs = deviation_df[deviation_df["ddG_diff"] > 10].index
low_ddG_designs = deviation_df[~deviation_df.index.isin(high_ddG_designs)].index

# single-panel figure
fig, ax = plt.subplots(1, 1, figsize=(2.6, 1.8))

sc_plots = []
for i, cell_line in enumerate(cell_lines_expression):

    # Low-ddG designs. Expression and knockdown are paired per design and rows missing
    # either value are dropped together; dropping NaNs from each series separately
    # could leave x and y with different lengths.
    low_points = pd.DataFrame({"expression": df_expression.loc[low_ddG_designs, cell_line],
                               "knockdown": df_knockdown.loc[low_ddG_designs, cell_line]}).dropna()
    df_ex = np.log10(low_points["expression"])
    df_knock = low_points["knockdown"]
    ax.scatter(df_ex, df_knock, s=4, alpha=0.5, color="tab:blue", edgecolors="none", zorder=1, rasterized=True)

    # High-ddG designs, drawn on top in red.
    high_points = pd.DataFrame({"expression": df_expression.loc[high_ddG_designs, cell_line],
                                "knockdown": df_knockdown.loc[high_ddG_designs, cell_line]}).dropna()
    df_ex = np.log10(high_points["expression"])
    df_knock = high_points["knockdown"]

    sc = ax.scatter(df_ex, df_knock, s=5, color="tab:red", edgecolors="none",
               label=r"$\Delta\Delta$G > 10" if i==0 else None, zorder=2, rasterized=True)

    if i==0:
        # Draw the fitted curve once; x_range_log and popt come from earlier cells.
        ax.plot(x_range_log, hill_func_log_regular(x_range_log,
                        *popt), color="black", linewidth=1.5, ls="--")
        # only the first (labelled) scatter is registered for the legend handler
        sc_plots.append(sc)

plt.xlim(0, 5.5)
plt.ylim(-1.7, 0.25)

ax.set_xlabel(r"log$_{10}$(miRNA expression)")
ax.set_ylabel(r"log$_{10}$(stability)")
plt.legend(loc="lower left", frameon=False, fontsize=7, handler_map={sc: HandlerSize(12) for sc in sc_plots})

plt.tight_layout()
for fmt in ["png", "svg"]:
    plt.savefig(os.path.join(current_plot_folder, f"6.5_ddG_outliers.{fmt}"), dpi=300)

In [None]:
# Deviation from the fit versus the ddG difference for highly expressed miRNAs,
# pooled over all cell lines, with a linear regression through the pooled points.
plt.figure(figsize=(2.6, 1.8))
x_vals = []
y_vals = []
for cell_line in cell_lines_expression:
    # filter to those mirnas with a log10 expression > 3.5
    df_ex = np.log10(df_expression[cell_line])
    filter_index = df_ex[df_ex > 3.5].dropna().index
    filter_index = deviation_df.index.isin(filter_index)
    deviation_df_filtered = deviation_df.loc[filter_index]
    plt.scatter(deviation_df_filtered["ddG_diff"], deviation_df_filtered[cell_line], alpha=1, s=3,
                color="tab:blue", edgecolor="none")

    # Collect exactly the plotted points for the regression. This previously read
    # df_deviation_filter, a leftover frame from section 6.3, so the fit did not
    # describe the data shown in this figure.
    fit_points = deviation_df_filtered.dropna(subset=["ddG_diff", cell_line])
    x_vals.append(fit_points["ddG_diff"].astype(float))
    y_vals.append(fit_points[cell_line].astype(float))

# flatten
x_vals_flatten = np.concatenate(x_vals)
y_vals_flatten = np.concatenate(y_vals)

# fit a linear model
slope, intercept, r_value, p_value, std_err = stats.linregress(x_vals_flatten, y_vals_flatten)

# plot the linear fit
x_range = np.arange(0, 21, 1)
y_range = slope*x_range + intercept
plt.plot(x_range, y_range, color="black", linestyle="--", label=r"$r^2$ = {:.2f}".format(r_value**2))

plt.xlabel(r"$\Delta\Delta$G(actual) - $\Delta\Delta$G(ideal) (kcal/mole)")

plt.ylabel("deviation from fit")
plt.legend(loc="lower right", fontsize=7)
plt.ylim([-1.05, 1.05])
plt.xlim([0, 17])
plt.title("MicroRNAs with an expression > 10$^{3.5}$ tpm", fontsize=7)
plt.tight_layout()
for fmt in ["png", "svg"]:
    plt.savefig(os.path.join(current_plot_folder, f"6.5-deviation_vs_ddG.{fmt}"), dpi=300)

In [None]:
# Long format: one row per (miRNA, cell line) holding the deviation from the fit.
deviation_df["miRNA1"] = deviation_df.index
df_deviation_unrolled = deviation_df.melt(id_vars=["miRNA1", "ddG_diff"],
                                          value_vars=cell_lines_expression,
                                          var_name="cell_line",
                                          value_name="stability")
# Look up the log10 expression of each row's miRNA in that row's cell line.
for cell_line in cell_lines_expression:
    cell_line_indices = df_deviation_unrolled[df_deviation_unrolled["cell_line"] == cell_line].index
    df_deviation_unrolled.loc[cell_line_indices, "expression"] = \
        np.log10(mirna_expr.loc[df_deviation_unrolled.loc[cell_line_indices, "miRNA1"], cell_line].values)

# Keep highly expressed miRNAs (log10 expression > 3.5).
df_deviation_unrolled = df_deviation_unrolled[df_deviation_unrolled["expression"] > 3.5]
# drop nan in expression, stability or ddG; .copy() makes the subset an independent
# frame so the column assignments below are safe
df_deviation_unrolled = df_deviation_unrolled.dropna(subset=["expression", "stability", "ddG_diff"]).copy()
df_deviation_unrolled["stability"] = df_deviation_unrolled["stability"].astype(float)

# Classify every remaining row: "> 10" is high, everything else (including exactly 10) is low,
# so no row is left without a class.
df_deviation_unrolled["high_ddG"] = df_deviation_unrolled["ddG_diff"].astype(float) > 10
high_ddG_index = df_deviation_unrolled[df_deviation_unrolled["high_ddG"]].index
low_ddG_index = df_deviation_unrolled[~df_deviation_unrolled["high_ddG"]].index

# perform a non-parametric, one-sided test (high-ddG deviations greater than low-ddG ones)
stat, p_value = stats.mannwhitneyu(df_deviation_unrolled.loc[high_ddG_index, "stability"],
                                   df_deviation_unrolled.loc[low_ddG_index, "stability"],
                                   alternative="greater")

# create a boxplot and indicate significance
plt.figure(figsize=(1.4, 1.8))
sns.boxplot(data=df_deviation_unrolled, x="high_ddG", y="stability",
            order=[False, True], palette=["tab:blue", "tab:red"])
# p-value annotation, centred between the two boxes (data coordinates)
plt.text(x=0.5, y=1.2, s=return_pvalue_text(p_value), ha="center")
plt.ylabel("deviation from fit")
plt.ylim(-1, 1)
plt.xticks([0, 1], [r"$\leq$10", ">10"])
# raw string: "\D" is not a valid Python escape sequence
plt.xlabel(r"$\Delta\Delta$G (kcal/mole)")
plt.tight_layout()
plt.savefig(os.path.join(current_plot_folder, "6.5-ddG_deviation_boxplot.svg"), dpi=300)

# 6.6 - Look into the ddG distribution for all designs with multiple target sites

In [59]:
# Output folder for the section 6.6 figures.
current_plot_folder = os.path.join(plot_folder, "6.6_ddG_impact_multiple")
# create it if it doesn't exist (single call, no separate existence check)
os.makedirs(current_plot_folder, exist_ok=True)

In [None]:
# Numeric prefixes of the reference tables to analyse: "6_" to "15_" and "24_" to "35_".
relevant_keys = [f"{number}_" for number in [*range(6, 16), *range(24, 36)]]
# keep the reference tables whose name starts with one of these prefixes
ref_keys = [key for key in reference_dict.keys() if key.startswith(tuple(relevant_keys))]
ref_keys

#### This code takes ~45 min to run

One option for this calculation is to mask the sequence of all other identical miRNAs in the repeat data. The downside is that any interaction between the repeats would then go undetected, so for now we do not mask them.

In [61]:
# Reload the pairwise target ddG matrix written in section 6.5.1.
pairwise_ddG_path = os.path.join(output_folder, "6_pairwise_target_ddG.pkl")
with open(pairwise_ddG_path, "rb") as f:
    pairwise_ddG = pickle.load(f)

In [None]:
def find_substring_positions(sequence, substrings):
    """Locate each substring in `sequence`, searching strictly left to right.

    The substrings are looked up in the order given. After a hit, the next
    search starts directly behind the matched substring, so matches never
    overlap and never go backwards.

    Returns a list with one entry per substring: the start index of the match,
    or the string "Not found" when the substring does not occur at or after the
    current search position.
    """
    hits = []
    cursor = 0  # first position that is still allowed to be part of a match

    for needle in substrings:
        hit = sequence.find(needle, cursor)
        if hit == -1:
            # a miss leaves the cursor where it is
            hits.append("Not found")
            continue
        hits.append(hit)
        # continue searching behind this match
        cursor = hit + len(needle)

    return hits

# fixed sequences that are prepended (5') and appended (3') to every design sequence
fivep_add = (
    "GCCCCGUGCUGCUGCCCGACAACCACUACCUGAGCACCCAGUCCGCCCUGAGCAAAGACCCCAACGAGAAGCGCGAUCACAUGGUCCUGCUG"
    "GAGUUCGUGACCGCCGCCGGGAUCACUCUCGGCAUGGACGAGCUGUACAAGUAAUUCUAGUUGUUUAAAGCCCAACGCUAGUUUCCCUACACGACGCUCUUCCGAUCU"
)
threep_add = (
    "CUCUGGAUUUGCAACCGACAUAGACAAACAGGCAUGCAAGCUGAUCCGGCUGCUAACAAAGCCCGAAAGGAAGCUGAGUUGGCUGCUGCCA"
    "CCGCUGAGCAAUAACUAGCAUAACCCCUUGGGGCGGCCGCUUCGAGCAGACAUGAUAAGAUACAUUGAUGAGUUUGGACAAACCACAACUAGAAUGCAG"
)

def compute_duplex_ddG(mirna_seq, target_seq):
    """Return dG(mir+seq) - dG(mir) - dG(seq) for two RNA strands.

    Both strands are analysed with NUPACK using the notebook-wide `my_model`.
    The complex set holds complexes of up to two strands, with the two
    homodimers excluded; only the partition function ('pfunc') is computed.

    mirna_seq  : sequence of the strand named 'mir'
    target_seq : sequence of the strand named 'seq'
    """
    s_mir = Strand(mirna_seq, name='mir')
    s_tar = Strand(target_seq, name='seq')

    complex_set = ComplexSet(strands=[s_mir, s_tar],
                             complexes=SetSpec(max_size=2, exclude=[[s_mir, s_mir], [s_tar, s_tar]]))

    complex_results = complex_analysis(complexes=complex_set, model=my_model, compute=['pfunc'])

    dG_mir = complex_results["(mir)"].free_energy
    dG_seq = complex_results["(seq)"].free_energy
    # the heterodimer may be reported under either strand order
    if "(mir+seq)" in str(complex_results.keys()):
        dG_complex = complex_results["(mir+seq)"].free_energy
    else:
        dG_complex = complex_results["(seq+mir)"].free_energy

    return dG_complex - dG_mir - dG_seq


# (upstream, downstream) sizes of the context windows cut out around insert_pos;
# the context ddG of a site is averaged over these windows
constrain_ranges = [(150, 170), (100, 120)]
# the total length of the sequence is 200 + 164 + 190 = 554 nt. The middle is therefore approximately at position 270.
insert_pos = 270
# not used in this cell; kept in case other cells rely on it -- TODO confirm and remove
mirna_len = 21

inserted_design_dict = {}
for key in ref_keys:
    print(key)
    df = reference_dict[key]
    inserted_designs = pd.read_csv(f"../design_files/inserted_designs/{key}_inserted.csv", index_col=0)

    # get the mirna and target columns
    # NOTE(review): a bare "target" column is excluded but a bare "miRNA" column is not;
    # the two lists are paired by position below, so confirm they always line up
    mirna_columns = [col for col in inserted_designs.columns if "miRNA" in col]
    target_columns = [col for col in inserted_designs.columns if "target" in col and col != "target"]

    # replace T by U
    inserted_designs["seq"] = inserted_designs["seq"].str.replace("T", "U")
    for target_column in target_columns:
        inserted_designs[target_column] = inserted_designs[target_column].str.replace("T", "U")

    for row_index, row in inserted_designs.iterrows():
        # add the 5' and 3' additions; identical for every site of this design, so done once per row
        seq = fivep_add + row["seq"] + threep_add
        seq_windows = [seq[insert_pos-upstream:insert_pos+downstream] for upstream, downstream in constrain_ranges]

        for curr_index in range(len(mirna_columns)):
            mirna = row[mirna_columns[curr_index]]
            target = row[target_columns[curr_index]]

            # get the sequence of the miRNA (taken as the reverse complement of the target site)
            mirna_seq = seq_utils.reverse_complement(target, alph="RNA")

            # ----------------------------------------------------------------------------------------
            # ddG against the sequence context, averaged over the context windows
            ddGs = [compute_duplex_ddG(mirna_seq, seq_window) for seq_window in seq_windows]
            ddG_mean = np.mean(ddGs)
            inserted_designs.at[row_index, f"ddG_mean_{curr_index+1}"] = ddG_mean
            df.at[row_index, f"ddG_mean_{curr_index+1}"] = ddG_mean

            # ----------------------------------------------------------------------------------------
            # also calculate the ddG for just the target interaction
            ddG_pure = compute_duplex_ddG(mirna_seq, target)
            inserted_designs.at[row_index, f"ddG_pure_{curr_index+1}"] = ddG_pure
            df.at[row_index, f"ddG_pure_{curr_index+1}"] = ddG_pure

            # add the difference
            ddG_diff = ddG_mean - ddG_pure
            inserted_designs.at[row_index, f"ddG_diff_{curr_index+1}"] = ddG_diff
            df.at[row_index, f"ddG_diff_{curr_index+1}"] = ddG_diff

            # ----------------------------------------------------------------------------------------
            # also calculate the pairwise target interactions with other targets
            # (looked up in the precomputed pairwise_ddG table; the minimum over all other sites is stored)
            other_mirnas = [row[col] for col in mirna_columns if col != mirna_columns[curr_index]]
            interaction_energies = [pairwise_ddG.loc[mirna, other_mirna] for other_mirna in other_mirnas]
            min_interaction_energy = np.min(interaction_energies)
            inserted_designs.at[row_index, f"ddG_pairwise_target_{curr_index+1}"] = min_interaction_energy
            df.at[row_index, f"ddG_pairwise_target_{curr_index+1}"] = min_interaction_energy

    inserted_design_dict[key] = inserted_designs
    reference_dict[key] = df

In [63]:
# pickle dump the two dicts (design tables and reference tables, both with the ddG columns added)
for pickle_name, annotated_dict in (
    ("6_inserted_design_dict_with_ddG.pkl", inserted_design_dict),
    ("6_reference_dict_with_ddG.pkl", reference_dict),
):
    with open(os.path.join(output_folder, pickle_name), "wb") as f:
        pickle.dump(annotated_dict, f)

In [None]:
# load the dicts that were pickled with the ddG columns added
inserted_design_pickle = os.path.join(output_folder, "6_inserted_design_dict_with_ddG.pkl")
reference_pickle = os.path.join(output_folder, "6_reference_dict_with_ddG.pkl")

with open(inserted_design_pickle, "rb") as f:
    inserted_design_dict = pickle.load(f)

with open(reference_pickle, "rb") as f:
    reference_dict = pickle.load(f)

In [66]:
%%capture output
bins = np.arange(0,30,1)

# For every design and cell line: flag the design (True) when the target site of the
# most highly expressed miRNA has a ddG_diff (ddG_mean - ddG_pure) above 15.
exclusion_dict_ddG = {}
for key in ref_keys:
    df = reference_dict[key]
    ddG_mean_columns = [col for col in df.columns if "ddG_mean" in col]
    ddG_pure_columns = [col for col in df.columns if "ddG_pure" in col]

    # create ddG_diff_columns
    # (mean and pure columns are paired by position, so both lists must be in the same site order)
    ddG_diff_columns = [f"ddG_diff_{i}" for i in range(1, len(ddG_mean_columns)+1)]
    df[ddG_diff_columns] = df[ddG_mean_columns].values - df[ddG_pure_columns].values

    mirna_columns = [col for col in df.columns if "miRNA" in col and col != "miRNA"]

    exclusion_df = pd.DataFrame(index=df.index, columns=cell_lines_measured)
    for index, row in df.iterrows():
        # get all mirna sites with a high ddG
        high_ddG_sites = row[ddG_diff_columns] > 15
        # get the expression for all sites
        mirnas = row[mirna_columns]
        expression = df_expression.loc[mirnas, :]
        for cell_line in cell_lines_measured:
            # get the mirna with the highest expression
            highest_expr_mirna = expression[cell_line].idxmax()

            # first miRNA column of this design that holds that miRNA
            indices = mirnas[mirnas == highest_expr_mirna].index
            if len(indices) == 0:
                raise ValueError(
                    f"could not find the most highly expressed miRNA ({highest_expr_mirna!r}) "
                    f"among the miRNA columns of design {index!r} for cell line {cell_line!r}"
                )
            first_index = indices[0]

            # get the number of the associated miRNA site: all trailing digits of the
            # column name, so that sites numbered 10 and above are resolved correctly
            site_number = first_index[len(first_index.rstrip("0123456789")):]
            number = "ddG_diff_" + site_number

            # is this mirna in the high_ddG_sites?
            exclusion_df.at[index, cell_line] = bool(high_ddG_sites[number])
    exclusion_dict_ddG[key] = exclusion_df

In [None]:
# collect, per reference table, the designs that are flagged in at least one cell line
flagged_indices = []
for exclusion_df in exclusion_dict_ddG.values():
    # are any of the values True?
    flagged_in_any_cell_line = exclusion_df.any(axis=1)
    # filter to those that are True and remember their index labels
    flagged_indices.append(exclusion_df[flagged_in_any_cell_line].index)

# one flat array with the index labels of all flagged designs
high_ddG_designs = np.concatenate(flagged_indices)

# store both the flat list and the per-table exclusion frames
for pickle_name, pickled_object in (
    ("6_high_ddG_designs.pkl", high_ddG_designs),
    ("6_exclusion_dict_ddG.pkl", exclusion_dict_ddG),
):
    with open(os.path.join(output_folder, pickle_name), "wb") as f:
        pickle.dump(pickled_object, f)