In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Designing a synthetic ChiX with imperfect binding to GFP mRNA

## Imports

In [2]:
import pandas as pd
import numpy as np
from synbio_morpher.utils.data.data_format_tools.common import load_json_as_dict
from synbio_morpher.utils.data.data_format_tools.manipulate_fasta import load_seq_from_FASTA
from synbio_morpher.utils.misc.type_handling import flatten_listlike

# Load data

Load ChiX, synChiX, and GFP mRNA.

In [3]:
# Input files.
# NOTE(review): the FASTA paths are relative to the notebook directory while the
# CSV paths go through the parent directory ('../data/...') — confirm both
# locations resolve from wherever this notebook is run.
fn_muts = 'data/scott_mutations.fasta'
fn_fps = 'data/mRNA.fasta'
fn_all = '../data/sRNA/merged_EcoCyc_RNAInter_sRNATarBase.csv'
fn_tus = '../data/sRNA/EcoCyc/EcoCyc_TUs.csv'

# FASTA files are loaded as dicts; presumably record name -> sequence string
# (muts is later indexed by 'Syn_ChiX_20-D') — verify against load_seq_from_FASTA.
muts = load_seq_from_FASTA(fn_muts, as_type='dict')
fps = load_seq_from_FASTA(fn_fps, as_type='dict')
# Tabular data; the first CSV column is used as the DataFrame index.
data = pd.read_csv(fn_all, index_col=0)
tus = pd.read_csv(fn_tus, index_col=0)

In [4]:
# Raw interaction results, loaded as a nested dict: outer key -> inner key -> record.
fn_sim = './data/tests/2023_12_12_170150/inter_data_raw.json'
sim_data_l = load_json_as_dict(fn_sim)

# Placeholder record used when an outer entry has no result for some inner key.
# NOTE(review): 'E' and 'E_norm' default to floats here, whereas the loaded
# records store them as strings (e.g. '-7.51') — confirm downstream code copes
# with both.
default_vals = {
    'id1': '', 'id2': '', 'E': 0.0, 'E_norm': 0.0, 'bpList': '', 'hybridDPfull': '', 'seedPu1': '', 'seedPu2': '', 'seedStart1': '', 'seedStart2': '', 'seedEnd1': '', 'seedEnd2': ''
}
# Union of the inner keys seen across all outer entries.
mrnas = list(set(flatten_listlike([list(v.keys()) for v in sim_data_l.values()])))
for s, v in sim_data_l.items():
    # Inner keys that this entry lacks (empty when the entry is complete).
    diffs = set(mrnas) - set(v.keys())
    for d in diffs:
        # Give every filled-in slot its OWN copy of the placeholder. Assigning
        # `default_vals` itself would make all placeholders the same object, so
        # editing one of them later would silently edit every other one too.
        sim_data_l[s][d] = dict(default_vals)
        

In [5]:
# Display one interaction record (the 'Syn_ChiX_20-D' / 'cyRFP1' pair) to show
# which fields each record carries.
sim_data_l['Syn_ChiX_20-D']['cyRFP1']

{'id1': 'cyRFP1',
 'id2': 'Syn_ChiX_20-D',
 'E': '-7.51',
 'E_norm': '-0.676255',
 'bpList': '(392,58):(393,57):(394,56):(395,55):(396,54):(397,53):(401,51):(403,49):(404,48):(405,47):(406,46):(407,45):(408,44):(409,43):(410,42)',
 'hybridDPfull': '.......................................................................................................................................................................................................................................................................................................................................................................................................((((((...(.((((((((.................................................................................................................................................................................................................................................................................................................................&...................

In [6]:
def convert_seed_to_int(seedstr):
    """Parse a colon-separated string of integers into a list of ints.

    Args:
        seedstr: text such as ``'44:64'``; every ``:``-delimited field must be
            a valid integer literal.

    Returns:
        list[int]: the parsed values, in their original order.

    Raises:
        ValueError: if any field (including an empty one) is not an integer.
    """
    return [int(field) for field in seedstr.split(':')]

# Each bpList entry looks like '(i,j)'; keep the second index of every pair for
# the Syn_ChiX_20-D / EGFP interaction.
seed_region = [
    int(pair.strip('()').split(',')[-1])
    for pair in sim_data_l['Syn_ChiX_20-D']['EGFP']['bpList'].split(':')
]
# Collapse to an ascending [low, high] pair taken from the first and last
# base pairs of the list.
seed_region = sorted((seed_region[0], seed_region[-1]))
seed_region

[44, 64]

# Mutate seed region

Do 2, 3, 5, 7, and 10 mutations (matching `mutation_counts` in the code below).

In [46]:
def choose_positions(start, end, bpcount):
    """Draw ``bpcount`` distinct integer positions from ``[start, end)``.

    Sampling uses NumPy's global random state and is done without
    replacement, so no position appears twice in the result.

    Args:
        start: first candidate position (inclusive).
        end: end of the candidate range (exclusive).
        bpcount: how many positions to draw.

    Returns:
        np.ndarray: the sampled positions, in the order they were drawn.

    Raises:
        ValueError: if ``bpcount`` exceeds the number of candidates.
    """
    candidates = np.arange(start, end)
    picked = np.random.choice(candidates, size=bpcount, replace=False)
    return np.array(picked)


def generate_unique_tuples(tot, tup_len, tup1, tup2):
    """Generate ``tot`` distinct random tuples by rejection sampling.

    Every tuple holds ``tup_len`` integers drawn from ``[tup1[0], tup1[1])``
    followed by ``tup_len`` integers drawn from ``[tup2[0], tup2[1])``.
    Values may repeat inside a tuple; only whole tuples are kept unique.
    Draws come from NumPy's global random state.

    Args:
        tot: number of distinct tuples to return.
        tup_len: how many values to draw from each of the two ranges.
        tup1: ``(low, high)`` bounds (high exclusive) for the first half.
        tup2: ``(low, high)`` bounds (high exclusive) for the second half.

    Returns:
        list[tuple]: ``tot`` distinct tuples of length ``2 * tup_len``, in
        arbitrary (set iteration) order.

    Raises:
        ValueError: if ``tot`` is larger than the number of distinct tuples
            the two ranges can produce. Without this check the sampling loop
            below could never finish.
    """
    # Count how many different tuples exist at all; an empty or inverted range
    # contributes zero choices.
    width1 = max(int(tup1[1]) - int(tup1[0]), 0)
    width2 = max(int(tup2[1]) - int(tup2[0]), 0)
    capacity = (width1 * width2) ** tup_len
    if tot > capacity:
        raise ValueError(
            f'Cannot generate {tot} unique tuples: only {capacity} distinct '
            f'tuples exist for tup_len={tup_len}, tup1={tuple(tup1)}, tup2={tuple(tup2)}')

    unique_tuples = set()
    while len(unique_tuples) < tot:
        t1 = [np.random.randint(tup1[0], tup1[1]) for _ in range(tup_len)]
        t2 = [np.random.randint(tup2[0], tup2[1]) for _ in range(tup_len)]
        # The set silently drops repeats, so keep drawing until `tot` survive.
        unique_tuples.add(tuple(t1 + t2))
    return list(unique_tuples)


def choose_types(seq, positions, nuc_map, nucs, bpcount):
    """Pick a random replacement for the base at each requested position.

    For every position, one of the ``len(nucs) - 1`` alternatives listed in
    ``nuc_map`` for the current base is selected using NumPy's global random
    state.

    Args:
        seq: NumPy array of single-character bases (indexed with ``positions``).
        positions: integer index array into ``seq``.
        nuc_map: maps each base to an indexable collection of its
            ``len(nucs) - 1`` alternatives.
        nucs: the full nucleotide alphabet; only its length is used here.
        bpcount: number of random choices to draw; pairs are formed with
            ``zip``, so the shorter of ``positions`` and ``bpcount`` wins.

    Returns:
        np.ndarray: one chosen alternative per position, in position order.
    """
    option_idx = np.random.randint(0, len(nucs) - 1, bpcount)
    return np.array([
        nuc_map[base][choice]
        for base, choice in zip(seq[positions], option_idx)
    ])


def implement_mutation(mutation_count: int, seq: str, seed_region: tuple, nucs: list, nuc_map: dict, mutations_per_count: int):
    """Build a batch of mutants of ``seq`` with point mutations in the seed region.

    Each mutant receives exactly ``mutation_count`` substitutions at distinct
    positions inside ``seed_region`` (both ends inclusive). Positions and
    replacement bases are drawn independently for every mutant from NumPy's
    global random state, so two mutants in the batch may coincide.

    Args:
        mutation_count: number of substitutions per mutant.
        seq: the parent sequence; every character must occur in ``nucs``.
        seed_region: ``(first, last)`` positions (0-based, inclusive) that may
            be mutated.
        nucs: nucleotide alphabet; a base is encoded as its index in this list.
        nuc_map: maps each base letter to the *integer codes* (indices into
            ``nucs``) of the bases it may be replaced with.
        mutations_per_count: number of mutants to generate.

    Returns:
        np.ndarray: integer array of shape ``(mutations_per_count, len(seq))``;
        row ``m`` is mutant ``m`` encoded with indices into ``nucs``.

    Raises:
        ValueError: if ``seq`` contains a character missing from ``nucs``, if
            ``mutation_count`` exceeds the seed-region width, or if
            ``mutations_per_count`` is 0 (nothing to concatenate).
    """
    # One draw of distinct positions per mutant, laid out as
    # (mutations_per_count, mutation_count): row m holds the positions of mutant m.
    positions = np.concatenate([
        choose_positions(start=seed_region[0], end=seed_region[-1] + 1, bpcount=mutation_count)
        for _ in range(mutations_per_count)
    ]).reshape((mutations_per_count, mutation_count)).astype(int)

    # Replacement bases are chosen from the flattened positions, then folded
    # back with the SAME shape as `positions`. That keeps every replacement
    # paired with the base it was chosen for, so a substitution can never
    # reproduce the original nucleotide.
    mutation_types = choose_types(
        np.array(list(seq)), positions.ravel(), nuc_map=nuc_map, nucs=nucs,
        bpcount=positions.size).reshape(positions.shape)

    # Encode the parent sequence and stack one copy per mutant.
    seq_ints = np.array([nucs.index(base) for base in seq])
    mutants = np.tile(seq_ints, (mutations_per_count, 1))

    # Row index m broadcasts across that mutant's positions. Positions within a
    # row are distinct (sampled without replacement), so no write collides.
    rows = np.arange(mutations_per_count)[:, np.newaxis]
    mutants[rows, positions] = mutation_types

    return mutants

In [42]:
# Nucleotide alphabet in sorted order; a base's integer code is its index here.
nucs = sorted('ACGT')

# For every base, the sorted list of the other bases it can be mutated into.
nuc_map = {base: sorted(other for other in nucs if other != base) for base in nucs}

# Same mapping, with the alternatives expressed as integer codes (indices into nucs).
nuc_map_ints = {
    base: [nucs.index(alt) for alt in alternatives]
    for base, alternatives in nuc_map.items()
}

In [47]:
# Seed NumPy's global RNG so the mutant library is identical on every
# Restart & Run All (the mutation helpers all draw from np.random).
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

# Number of substitutions per mutant, and how many mutants to make for each count.
mutation_counts = [2,3,5,7,10]
mutations_per_count = 20
seq = muts['Syn_ChiX_20-D']

# all_mutants[k] holds the mutants carrying k substitutions in the seed region.
all_mutants = {}
for mutation_count in mutation_counts:
    all_mutants[mutation_count] = implement_mutation(mutation_count, seq, seed_region, nucs, nuc_map_ints, mutations_per_count)


[(64, 73, 11), (19, 21, 70), (45, 70, 73), (100, 32, 0), (79, 75, 100)]


In [28]:
# Sanity check on the 10-mutation batch: implement_mutation returns a 2-D array
# with one row per mutant and one column per base.
# NOTE(review): the saved output shows a 3-D shape, which the current
# implement_mutation cannot produce — re-run this cell to refresh it.
all_mutants[10].shape

(10, 20, 91)

# Simulate interactions

Simulate GFP against a single file containing all of the mutants. Set the thread count to the maximum.

# Pick scale of lowest - highest sRNA binders