In [3]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Orthogonality of all known sRNAs to all known mRNAs, recorded as transcription units, in E. coli

Previously, we looked at how the sRNAs documented in the RNAInter database interact with the mRNAs in that same database, in other words the activity of the sRNAs on the mRNAs that are already known to be regulated by sRNAs. Now we look at how orthogonal the sRNAs are to any mRNA present in E. coli. The transcription units from the EcoCyc database stand in for all known mRNAs, and the sRNAs are pulled from both the RNAInter and EcoCyc databases.

# Imports

In [4]:
import os
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import pandas as pd

from synbio_morpher.srv.io.manage.script_manager import script_preamble
from synbio_morpher.srv.parameter_prediction.IntaRNA.bin.copomus.IntaRNA import IntaRNA
from synbio_morpher.srv.parameter_prediction.simulator import process_raw_stdout
from synbio_morpher.utils.circuit.common.config_setup import retrieve_default_args, load_simulator_kwargs

from subprocess import Popen, PIPE, run
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import jax



font_manager.py:_load_fontmanager():1624: generated new fontManager INFO


In [5]:
# Input tables, located relative to the notebook: the EcoCyc transcription
# units and the merged EcoCyc/RNAInter table.
fn_tus = os.path.join('..', 'data', 'sRNA', 'EcoCyc', 'EcoCyc_TUs.csv')
fn_merged = os.path.join('..', 'data', 'sRNA', 'merged_EcoCyc_RNAInter.csv')
d_tus = pd.read_csv(fn_tus)
d_merged = pd.read_csv(fn_merged)

# Remove the 'Unnamed: 0' column (presumably an index saved alongside the
# data) when it is present. errors='ignore' treats each frame independently:
# a missing column in one table no longer prevents the other from being
# cleaned, and unrelated exceptions are no longer swallowed.
d_tus = d_tus.drop(columns=['Unnamed: 0'], errors='ignore')
d_merged = d_merged.drop(columns=['Unnamed: 0'], errors='ignore')


In [6]:
# Preview the first rows of the merged EcoCyc/RNAInter table.
d_merged.head()

Unnamed: 0,Name,ID,Category,Symbol,Sequence,Database
0,acnA,NCBI:946724,mRNA,acnA,ATTCGGAACGAGGCCTGAAGCAGTGTCGCCGTCCCTCTGCCTTGCA...,RNAInter
1,acrZ,NCBI:945365,mRNA,acrZ,ACTTACTACTGTCTTCGGGGGGTCCGAGGTTTCTGGGGGGTCGTAC...,RNAInter
2,argR,NCBI:947861,mRNA,argR,GACAATGGCGATAGTATGGCGGTTGTTTCTTTCCCATCTCTACTCA...,RNAInter
3,asr,NCBI:945103,mRNA,asr,TGCTCCTTCTGCTGATGCCCCCATGTTTGTGATGGGCGTGAACCAT...,RNAInter
4,chbC,NCBI:945982,mRNA,chbC,NCTATCATACTTTAGAAAAGCCATAGAGGGAAATTTTGTGAATAAG...,RNAInter


In [7]:
# Report how many transcription units were loaded and the problem size that
# follows from it: nn is 1 + 2 + ... + n for n transcription units, plus n.
n_tus = len(d_tus)
print(n_tus)
nn = np.arange(1, n_tus + 1).sum() + n_tus
print('Expected number of species: ', nn)
# NOTE(review): this counts every row of d_merged, whatever its Category.
print('Expected number of circuits: ', len(d_merged))
d_tus.head()

3698
Expected number of species:  6843149
Expected number of circuits:  144


Unnamed: 0,Common-Name,Transcription-Units,Sequence - DNA sequence,Regulated-By,Regulator
0,ygdG,TU0-13740,GTGGCTGTTCATTTGCTTATTGTCGATGCACTGAATCTTATTCGTC...,,
1,ahpCF,TU0-14761,GTTGTTGCATTTGTAAGGGCAACACCTCAGCCTGCAGGCAGGCACT...,,
2,fur,TU00121,ATTATCTCAAGAGCAAATTCTGTCACTTCTTCTAATGAAGTGAACC...,,
3,ymdAB-clsC,TU0-42664,CCGGACGATCGGGTGAAAATAGTTGTTACTGTTTCTGATGGACAGT...,REG0-16016,CPLX0-3930
4,ydeTSR,TU0-13294,ATGAGTGGTTACACCGTCAAGCCTCCTACCGGAGACACCAATGAGC...,,


In [8]:
# Name -> sequence lookups. sRNAs are the rows of the merged table whose
# Category is 'sRNA'; the mRNAs are taken from every transcription unit.
# As with any dict, a repeated name keeps only its last sequence.
is_srna = d_merged['Category'] == 'sRNA'
srnas = dict(zip(d_merged.loc[is_srna, 'Name'],
                 d_merged.loc[is_srna, 'Sequence']))
mrnas = dict(zip(d_tus['Common-Name'],
                 d_tus['Sequence - DNA sequence']))

In [9]:
# Experiment configuration: an RNA system whose interactions are computed by
# the IntaRNA simulator.
config = dict(
    experiment=dict(
        purpose="tests",
        no_visualisations=False,
        no_numerical=False,
        debug_mode=False,
    ),
    data=dict(),
    system_type="RNA",
    interaction_simulator=dict(
        name="IntaRNA",
        postprocess=True,
    ),
)

# No data writer exists yet, so None is passed in; script_preamble returns the
# (config, data_writer) pair that the rest of the notebook uses.
config, data_writer = script_preamble(config, None)

# Derive the simulator keyword arguments from the default arguments and the
# config, then store them on the config.
default_args = retrieve_default_args()
simulator_kwargs = load_simulator_kwargs(default_args, config)
config['interaction_simulator']['simulator_kwargs'] = simulator_kwargs


In [11]:
# Override the thread count in the simulator kwargs; the simulation cell
# forwards this value to IntaRNA as --threads.
config['interaction_simulator']['simulator_kwargs']['threads'] = 8

# Simulate

In [13]:
def simulate_IntaRNA_local(query: dict,
                           fn_targets: str,
                           sim_kwargs=None):
    """Run the IntaRNA command-line tool for `query` against a target FASTA file.

    The query is written to a FASTA file through the notebook-level
    `data_writer`, IntaRNA is invoked in CSV output mode (`--outMode=C`), and
    its standard output is passed to `process_raw_stdout`.

    Args:
        query: Sequences to use as the IntaRNA query, handed to
            `data_writer.output` as `data` (the caller passes a
            name -> sequence dict).
        fn_targets: Path of the FASTA file holding the target sequences.
        sim_kwargs: Keyword arguments for the IntaRNA call. Must provide
            `qidxpos0`, `tidxpos0`, `outcsvcols` and `threads`; may provide
            `n`, `param_file`, `extra_params` and `raw_stdout`. Any `query` /
            `target` entries are replaced by the paths used for this call.
            The mapping is copied, so the caller's dict is left untouched.

    Returns:
        The result of `process_raw_stdout` applied to IntaRNA's stdout.
    """
    import warnings

    # Work on a copy: the caller typically passes the shared config dict, and
    # writing 'query' / 'target' into it would leak per-call state.
    call_kwargs = dict(sim_kwargs) if sim_kwargs else {}
    call_kwargs['query'] = data_writer.output(
        data=query, out_type='fasta', out_name='query', byseq=True, return_path=True)
    call_kwargs['target'] = fn_targets

    # Named so that it does not shadow `run` imported from subprocess.
    def _run_intarna(query: str, target: str, qidxpos0: int, tidxpos0: int,
                     outcsvcols: str, threads: int, n: int = 1,
                     param_file: str = '', extra_params=None,
                     raw_stdout: bool = False):
        # `raw_stdout` is accepted so that the simulator kwargs can be
        # splatted in unchanged, but the output is always post-processed.
        command = ['IntaRNA', '-q', query, '-t', target,
                   '--outMode=C', f'--outcsvcols={outcsvcols}',
                   f'--qIdxPos0={qidxpos0}',
                   f'--tIdxPos0={tidxpos0}',
                   f'--outNumber={n}',
                   f'--threads={threads}']
        # Only forward a parameter-file argument when one was given, so that
        # IntaRNA never receives an empty-string argument.
        if param_file:
            command.append(param_file)
        command.extend(extra_params or [])

        p = Popen(command, stdout=PIPE, stderr=PIPE, universal_newlines=True)
        stdout, stderr = p.communicate()
        # Surface failures instead of silently post-processing empty output;
        # a warning (not an exception) keeps long batch runs going.
        if p.returncode != 0:
            warnings.warn(
                f'IntaRNA exited with status {p.returncode}: {stderr.strip()}')
        return process_raw_stdout(stdout)

    return _run_intarna(**call_kwargs)


# Write all target (mRNA) sequences to a single FASTA file; the same file is
# used as the IntaRNA target for every sRNA below.
fn_targets = data_writer.output(data=mrnas, out_type='fasta', out_name='target', byseq=True, return_path=True)
sim_data = {}
intarna_kwargs = config['interaction_simulator']['simulator_kwargs']

for srna_name, srna_seq in srnas.items():
    # Register the key before the run; the value is replaced as soon as the
    # simulation returns.
    sim_data[srna_name] = {}
    started_at = datetime.now()

    sim_data[srna_name] = simulate_IntaRNA_local(
        query={srna_name: srna_seq},
        fn_targets=fn_targets,
        sim_kwargs=intarna_kwargs)

    elapsed = datetime.now() - started_at
    print('Finished ', srna_name, ' in ', elapsed.total_seconds(), ' s')

    # Rewrite the accumulated results after every sRNA, so the JSON on disk
    # always holds everything finished so far.
    data_writer.output(data=sim_data, out_type='json',
                       out_name='inter_data_raw', overwrite=True)