In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os
import ast
from lib.NA_sequence_utilities import reverse_complement
from lib.transfer_functions import transfer_function
from lib.context_insertion import distance_to_start_codon, determine_generic_ins_positions, insert_miRNA_sites

# Folder that receives the plots of this notebook.
plot_folder = "../plots/4_context/"
# exist_ok=True creates the folder (and missing parents) only when needed, so the
# cell is safe to re-run and has no gap between an existence check and the creation.
os.makedirs(plot_folder, exist_ok=True)

# Read the miRBase table; the first CSV column becomes the index
# (later cells look rows up by miRNA name via mirbase_df.loc[...]).
mirbase_df = pd.read_csv(
    '../input_data/mirbase_with_families_and_targets.csv',
    index_col=0,
)

# The two ATG columns arrive as text from the CSV; parse every cell back into
# the Python literal it spells out.
mirbase_df["ATG_pos"] = mirbase_df["ATG_pos"].map(ast.literal_eval)
mirbase_df["ATG_pos_mod3"] = mirbase_df["ATG_pos_mod3"].map(ast.literal_eval)

# 4.1 - Test miRNAs in normal context sequences

## Load contexts

In [9]:
# Load the microRNA expression table and raise 10 to the power of every value
# (presumably the file stores log10 expression - confirm against the upstream notebook).
mirna_expression = pd.read_csv(
    '../input_data/miRNA_expression_data/1_output/1.10_alles_quantile_no_crosstalk_filter.csv',
    index_col=0,
).rpow(10)

In [None]:
# Layout parameters of the oligo
var_region_len = 164   # length of the variable region of the oligo
three_p_seq_len = 61   # length of the 3' sequence (distance to start codon for 5' UTR designs)
target_len = 21        # length of the target sequence
dist_between = 6       # distance between the insertion positions

# Ask the helper for the insertion position of a single target site
insertion_positions = determine_generic_ins_positions(
    var_region_len=var_region_len,
    three_p_seq_len=three_p_seq_len,
    target_len=target_len,
    dist_between=dist_between,
    no_of_inserts=1,
)

insertion_positions

In [11]:
# Load the control context sequences and keep positional rows 1..30 (zero-based).
# NOTE(review): this yields 30 sequences but skips the very first row (position 0);
# confirm that leaving out row 0 is intended rather than an off-by-one.
context_sequences = pd.read_csv(
    "../designs/0_lib2_controls.csv",
    index_col=0,
).iloc[1:31, :]

## Insert the miRNAs into the context

In [12]:
# Hand-picked miRNAs whose target sites are placed into the different contexts.
context_mirnas = [
    "hsa-let-7a-5p",
    "hsa-miR-19b-3p",
    "hsa-miR-365a-3p",
    "hsa-miR-23a-3p",
    "hsa-miR-21-5p",
    "hsa-miR-31-5p",
    "hsa-miR-22-3p",
    "hsa-miR-16-5p",
    "hsa-miR-107",
    "hsa-miR-24-3p",
]

In [13]:
# Build one design per (miRNA, control context) pair: the miRNA target site is
# inserted into the context sequence at the precomputed insertion position.
# Rows are collected first and the DataFrame is created once at the end; growing
# a frame cell by cell through .loc enlargement reallocates it on every new row.
design_ids = []
design_rows = []
i = 1
for mirna in context_mirnas:
    # the target site only depends on the miRNA, so look it up once per miRNA
    target = mirbase_df.loc[mirna, "target"]
    for label, row in context_sequences.iterrows():
        design_ids.append(f"4_miRNA_full_single_context_controls_{i}")
        design_rows.append({
            "context": label,
            "miRNA1": mirna,
            "target": target,
            "seq": insert_miRNA_sites(row["seq"], insertion_positions, mirbase_df, [mirna]),
        })
        i += 1

# dtype=object keeps the columns generic, as in a frame created empty and filled
# afterwards; the column order is fixed explicitly.
result = pd.DataFrame(
    design_rows,
    index=design_ids,
    columns=["context", "miRNA1", "target", "seq"],
    dtype=object,
)

# 4.2 - Design context sequences with specific ddG values

In [None]:
# NOTE(review): the wildcard import is presumably what brings Model, Strand, ComplexSet,
# SetSpec and complex_analysis into scope for the cells below; an explicit import list
# would make that dependency visible - confirm no other nupack names are needed first.
from nupack import *
# Model handed to every complex_analysis call below: material 'rna' at 37 degrees Celsius.
my_model = Model(material='rna', celsius=37)

In [19]:
# Read the universal context sequence that the inhibitory designs are built on.
# read() returns the file verbatim, including a trailing newline if the file has
# one; strip surrounding whitespace so it can never end up inside a designed sequence.
with open("../designs/universal_lib2_context.txt") as f:
    base_context = f.read().strip()

In [20]:
# Pairing partner of each DNA base.
complement_dict = dict(zip("ACGT", "TGCA"))

# Bases that may be placed opposite a given base to create a mismatch. The
# complement is always left out; for G and T the lists additionally leave out
# T and G respectively (presumably to avoid G-U wobble pairs - confirm).
non_complement_dict = {
    base: list(choices)
    for base, choices in (("A", "ACG"), ("C", "ACT"), ("G", "AG"), ("T", "CT"))
}

In [None]:
# Patterns of complementarity relative to the miRNA. Every pattern lists the
# positions that are made complementary; each range below is one contiguous
# block (the stop value is exclusive). Negative positions are handled separately
# by the cell that builds the inhibitory sequences.
patterns = [
    # 5 nt blocks
    list(range(-2, 3)),
    list(range(3, 8)),
    list(range(11, 16)),
    # 2 times 4 nt blocks
    [*range(-2, 2), *range(5, 9)],
    [*range(0, 4), *range(7, 11)],
    [*range(8, 12), *range(15, 19)],
    # 7 nt blocks
    list(range(-3, 4)),
    list(range(0, 7)),
    list(range(7, 14)),
    list(range(14, 21)),
    # 2 times 6 nt blocks
    [*range(-3, 3), *range(5, 11)],
    [*range(0, 6), *range(8, 14)],
    [*range(7, 13), *range(15, 21)],
    # 9 nt blocks
    list(range(-4, 5)),
    # NOTE(review): this entry covers 10 positions (0-9), not 9 - confirm it is intended
    list(range(0, 10)),
    list(range(12, 21)),
    # 2 times 8 nt blocks
    [*range(-4, 4), *range(9, 17)],
    [*range(0, 8), *range(9, 17)],
    [*range(4, 12), *range(13, 21)],
    # 11 nt blocks
    list(range(-4, 7)),
    list(range(0, 11)),
    list(range(10, 21)),
    # 13 nt block
    list(range(-4, 9)),
    list(range(4, 17)),
    # 15 nt block
    list(range(3, 18)),
    # 17 nt block
    list(range(-4, 13)),
    list(range(2, 19)),
    # 19 nt block
    list(range(1, 20)),
    # everything (positions 0-20)
    list(range(0, 21)),
    # full block (positions -4 to 19)
    list(range(-4, 20)),
]
len(patterns)

In [22]:
# The mismatch bases below are drawn with random.choice. Seed the generator so that
# the designed sequences are the same on every Restart & Run All.
random.seed(42)

# we want to insert somewhat upstream of the target position
insertion_pos_inhib = insertion_positions[0] - 26

# the 4 bases right next to the miRNA target position (21 is presumably the target
# length, cf. target_len); they are needed for the negative pattern positions,
# the most negative of which is -4
adjacent_bases = base_context[insertion_positions[0]+21:insertion_positions[0]+25]

# continue the running design number after the designs that already exist
i = len(result) + 1
for mirna in context_mirnas:
    tar_seq = mirbase_df.loc[mirna, "target"]
    for pattern in patterns:
        # Walk the target in reverse: positions listed in the pattern get the
        # complementary base, all other positions a randomly chosen base from
        # non_complement_dict.
        inhib_insert = ''.join(
            complement_dict[letter] if pos in pattern
            else random.choice(non_complement_dict[letter])
            for pos, letter in enumerate(tar_seq[::-1])
        )

        # overwrite (not insert) the stretch starting at insertion_pos_inhib,
        # so the context keeps its length
        context = (
            base_context[:insertion_pos_inhib]
            + inhib_insert
            + base_context[insertion_pos_inhib+len(inhib_insert):]
        )

        # negative pattern positions: position -(pos+1) is the base (pos+1) places
        # before the inhibitory stretch; it is replaced by the complement of the
        # matching base that follows the target site
        for pos, letter in enumerate(adjacent_bases):
            if -(pos+1) in pattern:
                context = context[:insertion_pos_inhib-(pos+1)] + complement_dict[letter] + context[insertion_pos_inhib-pos:]

        index = f"4_miRNA_full_single_context_controls_{i}"
        result.loc[index, "miRNA1"] = mirna
        result.loc[index, "context"] = 'inhib_' + str(pattern)
        result.loc[index, "target"] = tar_seq
        result.loc[index, "seq"] = insert_miRNA_sites(context, insertion_positions, mirbase_df, [mirna])
        i += 1

### Calculate ddG values

In [29]:
# For every design compute ddG = dG(miRNA:sequence complex) - dG(miRNA) - dG(sequence),
# with all free energies taken from the NUPACK complex analysis (compute=['pfunc']).
ddG_list = []
for _, design in result.iterrows():
    # the miRNA is the reverse complement of its target site; T -> U for the RNA model
    mirna_rna = reverse_complement(design["target"], alph="DNA").replace("T", "U")
    # this is the full variable sequence (164 nt), also converted to RNA letters
    seq_rna = design["seq"].replace("T", "U")

    # Define strand species
    strand_mir = Strand(mirna_rna, name='mir')
    strand_seq = Strand(seq_rna, name='seq')

    # complexes of up to two strands, with both same-strand pairs excluded
    complexes = ComplexSet(
        strands=[strand_mir, strand_seq],
        complexes=SetSpec(
            max_size=2,
            exclude=[[strand_mir, strand_mir], [strand_seq, strand_seq]],
        ),
    )
    analysis = complex_analysis(complexes=complexes, model=my_model, compute=['pfunc'])

    dG_mir = analysis["(mir)"].free_energy
    dG_seq = analysis["(seq)"].free_energy
    # the two-strand complex may be listed under either strand order
    dimer_name = "(mir+seq)" if "(mir+seq)" in str(analysis.keys()) else "(seq+mir)"
    dG_complex = analysis[dimer_name].free_energy

    ddG_list.append(dG_complex - dG_mir - dG_seq)

In [31]:
# Attach the ddG values (one per design, in row order) to the result table
result["ddG"] = ddG_list
# Abbreviated sequence: from 10 positions before the inhibitory stretch to 31 positions
# past the start of the target site; trims the flanks to make the analysis cleaner
result['abbrev_seq'] = [
    full_seq[insertion_pos_inhib-10:insertion_positions[0]+31]
    for full_seq in result['seq']
]

In [None]:
# Distribution of the ddG values of all designs, one box per miRNA
sns.boxplot(data=result, x="miRNA1", y="ddG")

# 4.3 - Investigate the designs before saving them

In [None]:
# Count how many designed sequences contain one of the listed polyadenylation
# signal hexamers (the pattern is a regex alternation of the five motifs)
point_mutants_or = 'AATAAA|ATTAAA|AGTAAA|TATAAA|ACTAAA'
result["seq"].str.contains(point_mutants_or).value_counts()

In [None]:
# Count how many designed sequences contain a BsaI restriction site
# (GGTCTC, or GAGACC which is its reverse complement)
restriction_sites_or = '|'.join(["GAGACC", "GGTCTC"])
result["seq"].str.contains(restriction_sites_or).value_counts()

In [None]:
# Count how many designed sequences contain an ATG anywhere in the full sequence
result["seq"].str.contains(pat="ATG").value_counts()

In [38]:
# Write all designs (index = design id, including the ddG and abbrev_seq columns) to disk
result.to_csv(path_or_buf="../designs/4_miRNA_full_single_context_controls.csv")