In [29]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Desiging a synthetic ChiX with imperfect binding to GFP mRNA

## Imports

In [30]:
import pandas as pd
import numpy as np
from synbio_morpher.utils.data.data_format_tools.common import load_json_as_dict
from synbio_morpher.utils.data.data_format_tools.manipulate_fasta import load_seq_from_FASTA
from synbio_morpher.utils.misc.type_handling import flatten_listlike

# Load data

Load ChiX, synChiX, and GFP mRNA.

In [31]:
fn_muts = 'data/scott_mutations.fasta'
fn_fps = 'data/mRNA.fasta'
fn_all = '../data/sRNA/merged_EcoCyc_RNAInter_sRNATarBase.csv'
fn_tus = '../data/sRNA/EcoCyc/EcoCyc_TUs.csv'

muts = load_seq_from_FASTA(fn_muts, as_type='dict')
fps = load_seq_from_FASTA(fn_fps, as_type='dict')
data = pd.read_csv(fn_all, index_col=0)
tus = pd.read_csv(fn_tus, index_col=0)

In [32]:
fn_sim = './data/tests/2023_12_12_170150/inter_data_raw.json'
sim_data_l = load_json_as_dict(fn_sim)

default_vals = {
    'id1': '', 'id2': '', 'E': 0.0, 'E_norm': 0.0, 'bpList': '', 'hybridDPfull': '', 'seedPu1': '', 'seedPu2': '', 'seedStart1': '', 'seedStart2': '', 'seedEnd1': '', 'seedEnd2': ''
}
mrnas = list(set(flatten_listlike([list(v.keys()) for v in sim_data_l.values()])))
for s, v in sim_data_l.items():
    if len(v.values()) < len(mrnas):
        diffs = set(mrnas) - set(v.keys())
        for d in diffs:
            sim_data_l[s][d] = default_vals
        

In [33]:
sim_data_l['Syn_ChiX_20-D']['cyRFP1']

{'id1': 'cyRFP1',
 'id2': 'Syn_ChiX_20-D',
 'E': '-7.51',
 'E_norm': '-0.676255',
 'bpList': '(392,58):(393,57):(394,56):(395,55):(396,54):(397,53):(401,51):(403,49):(404,48):(405,47):(406,46):(407,45):(408,44):(409,43):(410,42)',
 'hybridDPfull': '.......................................................................................................................................................................................................................................................................................................................................................................................................((((((...(.((((((((.................................................................................................................................................................................................................................................................................................................................&...................

In [34]:
def convert_seed_to_int(seedstr):
    return list(map(lambda x: int(x), seedstr.split(':')))

seed_region = list(map(lambda x: int(x.strip('()').split(',')[-1]), sim_data_l['Syn_ChiX_20-D']['EGFP']['bpList'].split(':')))
seed_region = sorted([seed_region[0], seed_region[-1]])
seed_region

[44, 64]

# Mutate seed region

Do 1, 2, 3, 5, 7, and 10 mutations.

In [42]:
def choose_positions(start, end, bpcount):
    return np.random.choice(np.arange(start, end), bpcount, replace=False)


def choose_types(seq, positions, nuc_map, nucs, bpcount):
    int_types = np.random.randint(0, len(nucs)-1, bpcount)
    mutation_types = []
    for bp, i in zip(seq[positions], int_types):
        mutation_types.append(nuc_map[bp][i])
    return mutation_types


def implement_mutation(mutation_count: int, seq: str, seed_region: tuple, nucs: list, nuc_map: dict, mutations_per_count: int):
    positions = choose_positions(start=seed_region[0], end=seed_region[-1] + 1, bpcount=mutations_per_count*mutation_count)
    mutation_types = choose_types(np.array(list(seq)), p, nuc_map=nuc_map, nucs=nucs, bpcount=mutations_per_count*mutation_count)
    positions, mutation_types = positions.reshape((mutations_per_count, mutation_count)).astype(int), mutation_types.reshape((mutations_per_count, mutation_count))
    
    mutants = np.repeat(np.array(list(seq)), repeats=mutations_per_count).reshape((mutations_per_count, len(seq)))
    mutants[positions] = mutation_types
        
    return mutants

In [36]:
nucs = ['A', 'C', 'G', 'T']
nuc_map = {}
for n1 in nucs:
    nuc_map[n1] = sorted([n2 for n2 in nucs if n2 != n1])

In [40]:
mutation_counts = [1,2,3,5,7,10]
mutations_per_count = 20
seq = muts['Syn_ChiX_20-D']
for mutation_count in mutation_counts:
    mutants = implement_mutation(mutation_count, seq, seed_region, nucs, nuc_map, mutations_per_count)

ValueError: cannot reshape array of size 64 into shape (20,1)

# Simulate interactions

GFP vs. file with all mutants in it. Set thread count to max.

# Pick scale of lowest - highest sRNA binders