In [8]:
%load_ext autoreload
%autoreload 2

# Comparison of binding sequences for documented and predicted sRNA-mRNA interactions

## Imports

In [9]:
import os
import pandas as pd

from synbio_morpher.utils.misc.type_handling import flatten_listlike
from synbio_morpher.utils.data.data_format_tools.common import load_json_as_dict
from synbio_morpher.srv.io.manage.script_manager import script_preamble

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import jax
import subprocess



# Load data

### Database

In [10]:
# Input tables: the RNAInter download and the merged EcoCyc/RNAInter table
# (the latter provides the Name / Symbol / Category / Sequence columns used
# further down).
fn_RNAInter = os.path.join('..', 'data', 'sRNA', 'RNAInter', 'Download_data_RR.csv')
fn_merged = os.path.join('..', 'data', 'sRNA', 'merged_EcoCyc_RNAInter.csv')

# Drop the stray 'Unnamed: 0' column if it is present. Each frame is cleaned
# independently (errors='ignore'), so a frame that lacks the column no longer
# prevents the other one from being cleaned, and unrelated errors are not
# swallowed.
data = pd.read_csv(fn_RNAInter).drop(columns=['Unnamed: 0'], errors='ignore')
merged = pd.read_csv(fn_merged).drop(columns=['Unnamed: 0'], errors='ignore')


### Predicted interactions

In [11]:
fn = './data/05_orthogonality_mRNA_official/2023_11_21_095238/inter_data_raw.json'

# Two-level mapping: sim_data_l[key_a][key_b] -> record of one predicted interaction.
sim_data_l = load_json_as_dict(fn)

# The first entry is used as the reference for partner keys and record fields.
k1 = list(sim_data_l.keys())[0]
k2 = list(sim_data_l[k1].keys())[0]
header = list(sim_data_l[k1][k2].keys())
default_vals = {
    'id1': '', 'id2': '', 'E': 0.0, 'E_norm': 0.0, 'bpList': '', 'hybridDPfull': ''
}
# Pad entries that have fewer partners than the reference entry with an empty
# placeholder record, so that every entry exposes the reference partner keys.
reference_partners = set(sim_data_l[k1].keys())
for s, v in sim_data_l.items():
    if len(v) < len(reference_partners):
        for d in reference_partners - set(v.keys()):
            # One fresh dict per slot: the records are modified in place later
            # on, and a shared object would propagate those edits to every
            # placeholder at once.
            v[d] = dict(default_vals)

header

['id1', 'id2', 'E', 'E_norm', 'bpList', 'hybridDPfull']

# Get interaction subsequences 

In [12]:
# Display the merged table read from `fn_merged` for reference.
merged

Unnamed: 0,Name,ID,Category,Symbol,Sequence,Database
0,acnA,NCBI:946724,mRNA,acnA,ATTCGGAACGAGGCCTGAAGCAGTGTCGCCGTCCCTCTGCCTTGCA...,RNAInter
1,acrZ,NCBI:945365,mRNA,acrZ,ACTTACTACTGTCTTCGGGGGGTCCGAGGTTTCTGGGGGGTCGTAC...,RNAInter
2,arcZ,NCBI:2847690,sRNA,arcZ,CTCATGTTGACCGCTTGTTTAGCAGCTTCAAGGAAGCTGAAGGGCA...,RNAInter
3,argR,NCBI:947861,mRNA,argR,GACAATGGCGATAGTATGGCGGTTGTTTCTTTCCCATCTCTACTCA...,RNAInter
4,arrS,NCBI:11115378,sRNA,arrS,CTCTCCCTCTCTTTCTCTCTTCTCCGCGGCGATACGATCCGAGATG...,RNAInter
...,...,...,...,...,...,...
344,small regulatory RNA SroC,RNA0-124,sRNA,sroC,ACTAATTACAAGAACCAGGGGCGGAAATTCCAGCCCTCTCGATTGT...,EcoCyc
345,small regulatory RNA ZbiJ,RNA0-417,sRNA,zbiJ,TTTCCCTGTCTGTTTGCCGACAGACGCATATGCTCTAACCCTCATT...,EcoCyc
346,small RNA SibB,RYED-RNA,sRNA,sibB,GAGGGTAGAGCGGGGTTTCCCCCGCCCTGGTAGTCTTAGTAAGCGG...,EcoCyc
347,small RNA SibD,C0730-RNA,sRNA,sibD,ACAAGGGTGAGGGAGGATTTCTCCCCCCTCTGATTGGCTGTTAATA...,EcoCyc


In [13]:
def process_bplist(bplist, idx):
    """Extract one integer field from every parenthesised group of `bplist`.

    The string is cut at each '('; whatever precedes the first '(' is
    discarded. Inside every remaining chunk ')' is treated like ',' and the
    chunk is split on ','.

    Args:
        bplist: Text containing parenthesised, comma-separated integers
            (e.g. '(4,10):(5,9)').
        idx: Position of the field to take from each group (0 for the first
            number, 1 for the second).

    Returns:
        List with the selected field of every group converted to int, in
        order of appearance; empty when `bplist` contains no '('.
    """
    selected = []
    for chunk in bplist.split('(')[1:]:
        fields = chunk.replace(')', ',').split(',')
        selected.append(int(fields[idx]))
    return selected


def create_groups(x, y):
    """Split `x` into consecutive runs, opening a new run at each position in `y`.

    Args:
        x: Sequence of items to partition.
        y: Collection of positions (indices into `x`) at which a new run
            begins. A position only closes the current run if that run
            already holds at least one item.

    Returns:
        List of lists covering `x` in order. The trailing run is always
        appended, so an empty `x` gives `[[]]`.
    """
    runs = []
    current = []
    for position, item in enumerate(x):
        starts_new_run = position in y
        if starts_new_run and current:
            runs.append(current)
            current = []
        current.append(item)
    runs.append(current)
    return runs

In [14]:
def _sequence_and_category(rna_id):
    """Return [Sequence, Category] of the first `merged` row whose Name or Symbol equals `rna_id`."""
    hits = merged[(merged['Name'] == rna_id) | (merged['Symbol'] == rna_id)]
    return hits[['Sequence', 'Category']].iloc[0].to_list()


# Attach to every interaction record the stretches of the mRNA sequence that
# sit at consecutive base-paired positions.
for k1 in sim_data_l:
    for k2, v in sim_data_l[k1].items():
        # Every record receives the key, so later cells can read it without
        # guarding against skipped pairs.
        v['subsequence_groups'] = []

        # Placeholder records (empty ids) carry no interaction.
        if v['id1'] == '' or v['id2'] == '':
            continue
        seq1, cat1 = _sequence_and_category(v['id1'])
        seq2, cat2 = _sequence_and_category(v['id2'])

        if (cat1 == 'mRNA') and (cat2 == 'mRNA'):
            # Report mRNA-mRNA pairs and move on to the next partner; leaving
            # the loop here would drop all remaining partners of k1.
            print(k1, k2)
            continue

        # Use the first sequence if it is an mRNA, otherwise the second, and
        # read the matching coordinate of each base pair.
        subsequence, i = (seq1, 0) if cat1 == 'mRNA' else (seq2, 1)
        idxs = np.array(sorted(process_bplist(v['bpList'], i)), dtype=int)
        if idxs.size == 0:
            # No base pairs listed: nothing to extract (an empty index array
            # cannot be used for the lookup below).
            continue

        # Positions where the sorted indices jump by more than one start a new group.
        breakup = np.where(idxs[1:] - idxs[:-1] > 1)[0] + 1
        bases = np.array(list(subsequence))
        # The -1 converts the listed positions to 0-based array indices.
        v['subsequence_groups'] = [
            ''.join(bases[np.array(g) - 1]) for g in create_groups(idxs, breakup)
        ]
        

In [15]:
# Spot-check the groups stored for a single pair (arcZ / acnA).
sim_data_l['arcZ']['acnA']['subsequence_groups']

['CTCA', 'C', 'CACCT', 'GTGGG', 'CCAGCACACT', 'TCGGAGGCCA', 'GC']

# Run bpRNA on all sequences

In [16]:
# Experiment settings handed to synbio_morpher's `script_preamble`.
config = {
    "experiment": {
        "purpose": "tests",
        "no_visualisations": False,
        "no_numerical": False,
        "debug_mode": False
    }
}
# No writer exists yet; `script_preamble` returns the (config, data_writer)
# pair that the following cells use.
# NOTE(review): presumably the preamble builds the writer and its output
# directory from `config` — confirm against synbio_morpher.
data_writer = None
config, data_writer = script_preamble(config, data_writer)

In [80]:
def write_dbn(outname, outdir, id_name, seq, db):
    """Write a three-line `.dbn` file and return its path.

    The file contains a `>id_name` header line, then `seq`, then `db`, each
    terminated by a newline.

    Args:
        outname: File name without extension; '.dbn' is appended.
        outdir: Directory in which the file is created (must already exist).
        id_name: Identifier written after '>' on the header line.
        seq: Sequence written on the second line.
        db: Structure string written on the third line.

    Returns:
        Path of the written file.
    """
    fn = os.path.join(outdir, outname + '.dbn')
    with open(fn, 'w') as f:
        # A single write: `writelines` given a str iterates it character by
        # character.
        f.write(f'>{id_name}\n{seq}\n{db}\n')
    return fn


# def make_db(bplist, seq_len):
#     db = np.array(list('.' * seq_len))
#     db[process_bplist(bplist, idx=0)] = '('
#     db[process_bplist(bplist, idx=1)] = ')'
#     return ''.join(db)


def execute_perl_script(*args, script_path='./bpRNA.pl'):
    """Run a Perl script with the given command-line arguments.

    Execution is best effort: problems are printed rather than raised, so a
    single failing call does not interrupt a loop over many inputs.

    Args:
        *args: Arguments passed to the script after its path.
        script_path: Perl script to run; defaults to './bpRNA.pl'.

    Returns:
        The `subprocess.CompletedProcess` of the run, or None when the
        process could not be started.
    """
    try:
        result = subprocess.run(["perl", script_path, *args])
    except Exception as e:
        print(f"Error executing Perl script: {e}")
        return None
    if result.returncode != 0:
        # `subprocess.run` only raises on a failing exit status when
        # check=True, so report it explicitly instead of ignoring it.
        print(f"Perl script {script_path} exited with status {result.returncode}")
    return result


# Write one .dbn file per interaction pair and run bpRNA on it.
for k1 in sim_data_l:
    # NOTE(review): presumably `subdivide_writing` moves the writer into a
    # sub-directory and `unsubdivide` resets it, so this prepares 'st/<k1>'
    # and leaves the writer in 'dbn/<k1>' — confirm against synbio_morpher.
    data_writer.subdivide_writing('st')
    data_writer.subdivide_writing(k1, safe_dir_change=False)
    data_writer.unsubdivide()
    data_writer.subdivide_writing('dbn')
    data_writer.subdivide_writing(k1, safe_dir_change=False)

    for k2 in sim_data_l[k1]:
        # Structure of the pair with the '&' strand separator removed.
        db = sim_data_l[k1][k2]['hybridDPfull'].replace('&', '')
        if not db:
            # Placeholder records have no structure; there is nothing to annotate.
            continue
        # NOTE(review): the sequences are concatenated in (k1, k2) order;
        # assumes the structure string lists the strands in that same order
        # — verify against the record's id1/id2.
        seq = merged[merged['Symbol'] == k1]['Sequence'].iloc[0] + \
            merged[merged['Symbol'] == k2]['Sequence'].iloc[0]
        pair_name = k1 + '_' + k2
        # The header names the pair being written rather than a fixed RNA.
        fn = write_dbn(pair_name, data_writer.write_dir, id_name=pair_name, seq=seq, db=db)
        # Second argument: the .dbn path with the extension removed and 'dbn'
        # replaced by 'st'.
        execute_perl_script(fn, fn.replace('.dbn', '').replace('dbn', 'st'))

zone3 at ./bpRNA.pl line 1566, <IN> line 3.
zone3 at ./bpRNA.pl line 1566, <IN> line 3.
zone3 at ./bpRNA.pl line 1566, <IN> line 3.
zone3 at ./bpRNA.pl line 1566, <IN> line 3.
zone3 at ./bpRNA.pl line 1566, <IN> line 3.
zone3 at ./bpRNA.pl line 1566, <IN> line 3.
zone3 at ./bpRNA.pl line 1566, <IN> line 3.


In [None]:
import difflib

def longest_common_substring(x, y):
    """Return the longest contiguous block that occurs in both `x` and `y`.

    Args:
        x: First sequence (typically a str); the result is sliced from it.
        y: Second sequence.

    Returns:
        The longest common block as a slice of `x`, or "" when the two
        sequences share nothing. If several blocks have the maximal length,
        the one `difflib` reports first is returned.
    """
    # autojunk=False: with the default heuristic, elements that are frequent
    # in `y` are ignored once `y` has 200 or more elements. For sequences over
    # a small alphabet (e.g. nucleotides) that discards every character and
    # no match is ever found.
    matcher = difflib.SequenceMatcher(None, x, y, autojunk=False)

    # Explicit bounds: the arguments only have defaults from Python 3.9 on.
    match = matcher.find_longest_match(0, len(x), 0, len(y))

    return x[match.a:match.a + match.size] if match.size != 0 else ""
    

In [None]:
# motifs = {}
# for s in sim_data_l:
    
#     subsequences = flatten_listlike([[vv['subsequence_groups'] for vv in v.values()] for v in sim_data_l.values()])
    
#     motifs[s] = []
#     for subseq1 in subsequences:
#         for subseq2 in subsequences:
        
#             if subseq1 != subseq2:
#                 motifs[s].append(longest_common_substring(subseq1, subseq2))
    