In [1]:
%load_ext autoreload
%autoreload 2

# Designing a synthetic ChiX with imperfect binding to GFP mRNA

## Imports

In [2]:
import pandas as pd
import numpy as np
from synbio_morpher.srv.io.manage.script_manager import script_preamble
from synbio_morpher.srv.parameter_prediction.simulator import process_raw_stdout
from synbio_morpher.utils.common.setup import prepare_config, expand_config
from synbio_morpher.utils.data.data_format_tools.common import load_json_as_dict
from synbio_morpher.utils.data.data_format_tools.manipulate_fasta import load_seq_from_FASTA, write_fasta_file
from synbio_morpher.utils.misc.type_handling import flatten_listlike
from subprocess import Popen, PIPE

# Load data

Load ChiX, synChiX, and GFP mRNA.

In [3]:
# Sequence inputs (FASTA), each loaded as a dict.
fn_muts = 'data/scott_mutations.fasta'
muts = load_seq_from_FASTA(fn_muts, as_type='dict')

fn_fps = 'data/mRNA.fasta'
fps = load_seq_from_FASTA(fn_fps, as_type='dict')

# Tabular inputs (CSV); the first column is used as the index.
fn_all = '../data/sRNA/merged_EcoCyc_RNAInter_sRNATarBase.csv'
data = pd.read_csv(fn_all, index_col=0)

fn_tus = '../data/sRNA/EcoCyc/EcoCyc_TUs.csv'
tus = pd.read_csv(fn_tus, index_col=0)

In [4]:
# Previously simulated interaction data: a nested dict whose outer values are
# themselves dicts keyed by partner name.
fn_sim = './data/tests/2023_12_12_170150/inter_data_raw.json'
sim_data_l = load_json_as_dict(fn_sim)

# Placeholder record used for any pair that is absent from the loaded data.
default_vals = {
    'id1': '', 'id2': '', 'E': 0.0, 'E_norm': 0.0, 'bpList': '', 'hybridDPfull': '', 'seedPu1': '', 'seedPu2': '', 'seedStart1': '', 'seedStart2': '', 'seedEnd1': '', 'seedEnd2': ''
}

# Union of all inner keys; sorted so the order is the same on every run
# (iteration order of a set of strings is not stable between kernels).
mrnas = sorted(set(flatten_listlike([list(v.keys()) for v in sim_data_l.values()])))

for v in sim_data_l.values():
    for d in set(mrnas) - set(v.keys()):
        # Each missing pair gets its OWN copy of the placeholder; sharing one
        # dict object would make a later edit of one entry change all of them.
        v[d] = dict(default_vals)
        

In [5]:
# Inspect one loaded interaction record to see which fields are available.
sim_data_l['Syn_ChiX_20-D']['cyRFP1']

{'id1': 'cyRFP1',
 'id2': 'Syn_ChiX_20-D',
 'E': '-7.51',
 'E_norm': '-0.676255',
 'bpList': '(392,58):(393,57):(394,56):(395,55):(396,54):(397,53):(401,51):(403,49):(404,48):(405,47):(406,46):(407,45):(408,44):(409,43):(410,42)',
 'hybridDPfull': '.......................................................................................................................................................................................................................................................................................................................................................................................................((((((...(.((((((((.................................................................................................................................................................................................................................................................................................................................&...................

In [6]:
def convert_seed_to_int(seedstr):
    """Parse a colon-separated string of integers into a list of ints.

    Example: '44:64' -> [44, 64].
    """
    return [int(token) for token in seedstr.split(':')]

# bpList is a ':'-separated list of '(a,b)' pairs; keep the second index of each pair.
# NOTE(review): presumably the second index refers to the sRNA (id2) - confirm.
egfp_pairs = sim_data_l['Syn_ChiX_20-D']['EGFP']['bpList'].split(':')
paired_positions = [int(pair.strip('()').split(',')[-1]) for pair in egfp_pairs]

# The region is bounded by the first and last listed pair, in ascending order.
seed_region = sorted([paired_positions[0], paired_positions[-1]])
seed_region

[44, 64]

# Mutate seed region

Do 1, 2, 3, 5, 7, and 10 mutations.

In [21]:
def choose_positions(start, end, bpcount):
    """Sample `bpcount` distinct integer positions from the half-open range [start, end).

    Uses the global NumPy random state and returns the positions as an array.
    """
    candidates = np.arange(start, end)
    picked = np.random.choice(candidates, bpcount, replace=False)
    return np.array(picked)


def generate_unique_tuples(tot, tup_len, start, end):
    """Draw `tot` distinct ordered tuples of `tup_len` different integers from [start, end).

    Tuples are compared as ordered sequences, so (1, 2) and (2, 1) count as
    different tuples. Sampling uses the global NumPy random state.

    Args:
        tot: number of distinct tuples to return.
        tup_len: number of (non-repeating) positions per tuple.
        start: first candidate position (inclusive).
        end: end of the candidate range (exclusive).

    Returns:
        Array with one row per tuple; shape (tot, tup_len) for tot > 0.

    Raises:
        ValueError: if `tot` exceeds the number of distinct ordered tuples that
            exist, which would otherwise make the sampling loop run forever.
    """
    candidates = np.arange(start, end)

    # Number of ordered selections without repetition: n * (n-1) * ... * (n-tup_len+1).
    n_possible = 1
    for k in range(tup_len):
        n_possible *= max(len(candidates) - k, 0)
    if tot > n_possible:
        raise ValueError(
            f'Cannot draw {tot} unique tuples of length {tup_len} from '
            f'{len(candidates)} positions: only {n_possible} exist.')

    unique_tuples = set()
    # Rejection sampling: duplicates are absorbed by the set, so keep drawing
    # until enough distinct tuples have been collected.
    while len(unique_tuples) < tot:
        new_tuple = tuple(np.random.choice(candidates, tup_len, replace=False))
        unique_tuples.add(new_tuple)
    return np.array(list(unique_tuples))


def choose_types(seq, positions, nuc_map, nucs, bpcount):
    """Pick a random replacement nucleotide for each position in `positions`.

    For every original base `seq[p]`, one entry of `nuc_map[base]` is selected
    with a random index in [0, len(nucs) - 1). `seq` must support array
    indexing with `positions`. Returns the replacements as an array, in the
    same order as `positions`.
    """
    alt_indices = np.random.randint(0, len(nucs)-1, bpcount)
    original_bases = seq[positions]
    return np.array([nuc_map[base][k] for base, k in zip(original_bases, alt_indices)])


def implement_mutation(mutation_count: int, seq: str, seed_region: tuple, nucs: list, nuc_map: dict, mutations_per_count: int):
    """Create `mutations_per_count` mutants of `seq`, each with `mutation_count` substitutions.

    Substitution positions are drawn from the half-open range
    [seed_region[0], seed_region[-1]) and every substituted base is replaced by
    an entry of `nuc_map[original_base]`.

    Args:
        mutation_count: number of substituted positions per mutant.
        seq: sequence to mutate.
        seed_region: sequence whose first and last items bound the mutated region.
        nucs: alphabet of nucleotides.
        nuc_map: maps each nucleotide to its allowed replacements.
        mutations_per_count: number of mutants to generate.

    Returns:
        Array of single characters with shape (mutations_per_count, len(seq));
        each row is one mutant.

    Raises:
        AssertionError: if any mutant does not differ from `seq` at exactly
            `mutation_count` positions.
    """
    seq_arr = np.array(list(seq))

    # Transposed so that positions[j, m] is the j-th mutated position of mutant m.
    positions = generate_unique_tuples(tot=mutations_per_count, tup_len=mutation_count, start=seed_region[0], end=seed_region[-1]).T
    mutation_types = choose_types(seq_arr, positions.flatten(), nuc_map, nucs, bpcount=mutation_count*mutations_per_count)
    mutation_types = mutation_types.reshape((mutation_count, mutations_per_count))

    # One copy of the original sequence per mutant (rows = mutants).
    mutants = np.tile(seq_arr, (mutations_per_count, 1))

    # Row index m broadcasts against positions[:, m], so each column of
    # `positions` / `mutation_types` is written into its own mutant row.
    rows = np.arange(mutations_per_count)[None, :]
    mutants[rows, positions] = mutation_types

    # Real invariant (reading back what was just written can never fail):
    # every mutant must differ from the input at exactly mutation_count sites.
    n_changed = (mutants != seq_arr[None, :]).sum(axis=1)
    assert np.all(n_changed == mutation_count), 'Mutants do not differ from the original sequence at exactly mutation_count positions.'
    return mutants

In [22]:
# Nucleotide alphabet, in sorted order.
nucs = sorted(['A', 'C', 'G', 'T'])

# For each nucleotide, the sorted list of the other nucleotides it can mutate into.
nuc_map = {
    base: sorted(other for other in nucs if other != base)
    for base in nucs
}
# nuc_map_ints = {}
# for k, v in nuc_map.items():
#     nuc_map_ints[k] = list(map(lambda x: nucs.index(x), v))

In [24]:
# Seed the global NumPy RNG (the mutation helpers draw from np.random) so that
# re-running this cell, or Restart & Run All, regenerates the same mutants.
MUTATION_SEED = 0
np.random.seed(MUTATION_SEED)

mutation_counts = [1,2,3,5,7,10]
mutations_per_count = 20
seq = muts['Syn_ChiX_20-D']

# Maps number of substitutions -> array of mutants. The number of mutants grows
# with the substitution count: mutations_per_count * mutation_count per entry.
all_mutants = {}
for mutation_count in mutation_counts:
    all_mutants[mutation_count] = implement_mutation(mutation_count, seq, seed_region, nucs, nuc_map, mutations_per_count*mutation_count)


In [34]:
def pad_integers_with_zeros(num, n):
    """Return `num` as a string, left-padded with zeros to a width of at least `n`."""
    as_text = str(num)
    return as_text.zfill(n)

# Width of the zero-padded id: enough digits for the largest group of mutants.
highest_iters = len(str(mutations_per_count * max(mutation_counts)))

# FASTA records: name encodes the substitution count and the mutant's index
# within its group; the value is the mutant joined back into a string.
mutant_fasta = {
    'SynChiX_mutated_' + str(n_subs) + 'x_id-' + pad_integers_with_zeros(idx, highest_iters): ''.join(chars)
    for n_subs, group in all_mutants.items()
    for idx, chars in enumerate(group)
}
mutant_fasta

{'SynChiX_mutated_1x_id-000': 'ACACCGTCGCTTAAAGTGACGGCATAATAATAAAAAAATGAAATTCTTCTCGTTTACGCATATGGCCAATAGCGATATTGGCCATTTTTTT',
 'SynChiX_mutated_1x_id-001': 'ACACCGTCGCTTAAAGTGACGGCATAATAATAAAAAAATGAAATTCTTCTCCTTTACACATATGGCCAATAGCGATATTGGCCATTTTTTT',
 'SynChiX_mutated_1x_id-002': 'ACACCGTCGCTTAAAGTGACGGCATAATAATAAAAAAATGAAATTCTTCTCCTTTACGCATATAGCCAATAGCGATATTGGCCATTTTTTT',
 'SynChiX_mutated_1x_id-003': 'ACACCGTCGCTTAAAGTGACGGCATAATAATAAAAAAATGAAATTCTTCTCCTTTACGCAAATGGCCAATAGCGATATTGGCCATTTTTTT',
 'SynChiX_mutated_1x_id-004': 'ACACCGTCGCTTAAAGTGACGGCATAATAATAAAAAAATGAAATCCTTCTCCTTTACGCATATGGCCAATAGCGATATTGGCCATTTTTTT',
 'SynChiX_mutated_1x_id-005': 'ACACCGTCGCTTAAAGTGACGGCATAATAATAAAAAAATGAAATTCTCCTCCTTTACGCATATGGCCAATAGCGATATTGGCCATTTTTTT',
 'SynChiX_mutated_1x_id-006': 'ACACCGTCGCTTAAAGTGACGGCATAATAATAAAAAAATGAAATTCTTCTTCTTTACGCATATGGCCAATAGCGATATTGGCCATTTTTTT',
 'SynChiX_mutated_1x_id-007': 'ACACCGTCGCTTAAAGTGACGGCATAATAATAAAAAAATGAAATTCTTCTCCTTTAAGCATATGGCCAATAGCGATATTGGCCATTTTTTT',


# Simulate interactions

GFP vs. file with all mutants in it. Set thread count to max.

In [36]:
# Persist every mutant record to a single FASTA file.
fn = 'data/15_imperfect_binding/mutants.fasta'
write_fasta_file(data=mutant_fasta, out_path=fn, byseq=True)

In [None]:
# Switch for the simulation set-up below (and the IntaRNA run that follows).
run_intarna = True

# Keyword arguments handed to the IntaRNA call.
intarna_kwargs = {
    "outcsvcols": "id1, id2, E, E_norm, bpList, hybridDPfull, seedPu1, seedPu2, seedStart1, seedStart2, seedEnd1, seedEnd2",
    "threads": 12,
    "n": 1,
    "raw_stdout": True,
}

molecular_params = {
    "avg_mRNA_per_cell": 100,
    "cell_doubling_time": 1200,
    "creation_rate": 2.35,
    "starting_copynumbers": 200,
    "degradation_rate": 0.01175,
    "association_binding_rate": 1000000,
}

config = {
    "experiment": {
        "purpose": "tests",
    },
    "data": {},
    "system_type": "RNA",
    "interaction_simulator": {
        "name": "IntaRNA",
        "postprocess": True,
        "simulator_kwargs": intarna_kwargs,
    },
    "molecular_params": molecular_params,
}

if run_intarna:
    # No data writer exists yet, so None is passed in and one is returned.
    config, data_writer = script_preamble(config, None)
    config = prepare_config(expand_config(config=config))

In [None]:

def simulate_IntaRNA_local(fn_query: str,
                           fn_targets: str,
                           sim_kwargs=None):
    """Run the `IntaRNA` command-line tool on a query and a target and parse its output.

    Args:
        fn_query: value passed to IntaRNA's `-q` option.
        fn_targets: value passed to IntaRNA's `-t` option.
        sim_kwargs: further keyword arguments for the IntaRNA call
            (`qidxpos0`, `tidxpos0`, `outcsvcols`, `threads` are required;
            `n`, `param_file`, `extra_params`, `raw_stdout` are optional).
            The dict is copied, so the caller's object is left unchanged.

    Returns:
        Whatever `process_raw_stdout` returns for IntaRNA's standard output.

    Raises:
        TypeError: if a required keyword is missing from `sim_kwargs` or an
            unknown one is supplied.
    """
    # Work on a copy: the original mutated both the shared default dict and the
    # caller's config by writing 'query' / 'target' into it.
    run_kwargs = dict(sim_kwargs) if sim_kwargs else {}
    run_kwargs['query'] = fn_query
    run_kwargs['target'] = fn_targets

    def run(query: str, target: str, qidxpos0: int, tidxpos0: int, outcsvcols: str, threads: int, n: int = 1,
            param_file: str = '', extra_params: list = None, raw_stdout: bool = False):
        # NOTE(review): an empty `param_file` is still passed as an empty
        # argument, as before - confirm IntaRNA tolerates that.
        command = ['IntaRNA', '-q', query, '-t', target,
                   '--outMode=C', f'--outcsvcols={outcsvcols}',
                   f'--qIdxPos0={qidxpos0}',
                   f'--tIdxPos0={tidxpos0}',
                   f'--outNumber={n}',
                   f'--threads={threads}', param_file]
        command = command + list(extra_params or [])
        p = Popen(command, stdout=PIPE, stderr=PIPE, universal_newlines=True)
        # stderr is captured but not inspected; only stdout is parsed.
        stdout, stderr = p.communicate()
        return process_raw_stdout(stdout)

    return run(**run_kwargs)


# Run IntaRNA once per sequence in `muts` and store the raw results as JSON.
# NOTE(review): this cell cannot run as written and needs a decision before use:
#   * `simulate_IntaRNA_local` is defined with parameters `fn_query` and
#     `fn_targets`, so the keywords `query=` / `targets=` raise a TypeError.
#   * The function forwards its arguments to IntaRNA's -q / -t options, but
#     here a {name: sequence} dict and `mrnas` (a list of names) are passed.
#   * The loop iterates over `muts` (loaded from scott_mutations.fasta), not
#     over the mutants written to `fn`; confirm which set is intended.
if run_intarna:
    sim_data = {}
    for s, sseq in muts.items():
        sim_data[s] = {}
        sim_data[s] = simulate_IntaRNA_local(query={s: sseq},
                                            targets=mrnas,
                                            sim_kwargs=config['interaction_simulator']['simulator_kwargs'])

        # Written on every iteration with overwrite=True, so the file always
        # holds the results accumulated so far.
        data_writer.output(data=sim_data, out_type='json',
                        out_name='inter_data_raw', overwrite=True)
    print(data_writer.write_dir)

# Pick scale of lowest - highest sRNA binders