In [1]:
%load_ext autoreload
%autoreload 2

# Merge the databases that have information about RNA interactions

## Imports

In [2]:
import os
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from subprocess import Popen, PIPE, run
from datetime import datetime
import pandas as pd

from synbio_morpher.srv.io.manage.script_manager import script_preamble
from synbio_morpher.srv.parameter_prediction.IntaRNA.bin.copomus.IntaRNA import IntaRNA
from synbio_morpher.srv.parameter_prediction.simulator import process_raw_stdout
from synbio_morpher.utils.common.setup import prepare_config, expand_config
from synbio_morpher.utils.data.data_format_tools.common import load_json_as_dict

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np



In [3]:
fn_RNAInter = os.path.join('..', 'data', 'sRNA', 'RNAInter', 'Download_data_RR.csv')
fn_srnatarbase = os.path.join('..', 'data', 'sRNA', 'sRNATarBase', 'sRNATarBase.csv')
fn_merged = os.path.join('..', 'data', 'sRNA', 'merged_EcoCyc_RNAInter.csv')
data_rnainter = pd.read_csv(fn_RNAInter)
data_srnatarbase = pd.read_csv(fn_srnatarbase)
data_merged = pd.read_csv(fn_merged)
try:
    data_rnainter = data_rnainter.drop(columns=['Unnamed: 0', 'level_0'])
    data_srnatarbase = data_srnatarbase.drop(columns=['Unnamed: 0'])
    data_merged = data_merged.drop(columns=['Unnamed: 0'])
except:
    pass


In [4]:
print('Overlapping:', len(set([s for s in data_srnatarbase['sRNA'] if (s in data_rnainter[data_rnainter['Category1'] == 'sRNA']
      ['Interactor1.Symbol'].unique()) or (s in data_rnainter[data_rnainter['Category2'] == 'sRNA']['Interactor2.Symbol'].unique())])))
print('Non-overlapping:', len(set([s for s in data_srnatarbase['sRNA'] if (
    s not in data_rnainter[data_rnainter['Category1'] == 'sRNA']['Interactor1.Symbol'].unique()) and (s not in data_rnainter[data_rnainter['Category1'] == 'sRNA']['Interactor2.Symbol'].unique())])))

print('Overlapping:', len(set([s for s in data_rnainter[data_rnainter['Category1'] == 'sRNA']['Interactor1.Symbol'] if (
    s in data_srnatarbase['sRNA'].unique())])) +
    len(set([s for s in data_rnainter[data_rnainter['Category2'] == 'sRNA']['Interactor2.Symbol'] if (
        s not in data_srnatarbase['sRNA'].unique())])))

Overlapping: 36
Non-overlapping: 32
Overlapping: 23


In [5]:
data_rnainter.head()

Unnamed: 0,index,RNAInterID,Interactor1.Symbol,Category1,Species1,Interactor2.Symbol,Category2,Species2,Raw_ID1,Raw_ID2,score,strong,weak,predict,Sequence1,Sequence2
0,1029724,RR05384747,acnA,mRNA,Escherichia coli str. K-12 substr. MG1655,ryhB,sRNA,Escherichia coli str. K-12 substr. MG1655,NCBI:946724,NCBI:2847761,0.2292,Northern blot//Reporter assay,,,ATTCGGAACGAGGCCTGAAGCAGTGTCGCCGTCCCTCTGCCTTGCA...,TTTGAGAGCGTCTCTGTCCCTCGTTTTGCGGTTAAGCCGCATCCAT...
1,1029725,RR05384845,acrZ,mRNA,Escherichia coli str. K-12 substr. MG1655,omrB,sRNA,Escherichia coli str. K-12 substr. MG1655,NCBI:945365,NCBI:2847747,0.1778,Northern blot,Microarray,,ACTTACTACTGTCTTCGGGGGGTCCGAGGTTTCTGGGGGGTCGTAC...,TGTTCTATACTTGGGTTCGACTTGGGTTAGACTTGTCTTTACTGTC...
2,1029732,RR05387056,arcZ,sRNA,Escherichia coli str. K-12 substr. MG1655,flhD,mRNA,Escherichia coli str. K-12 substr. MG1655,NCBI:2847690,NCBI:945442,0.2314,Northern blot//Reporter assay,,,CTCATGTTGACCGCTTGTTTAGCAGCTTCAAGGAAGCTGAAGGGCA...,CTCGTGCCGAATTCGGCACGAGCGATATTTCATCAGTTATCGGTAA...
3,1029733,RR05387057,arcZ,sRNA,Escherichia coli str. K-12 substr. MG1655,rpoS,mRNA,Escherichia coli str. K-12 substr. MG1655,NCBI:2847690,NCBI:947210,0.1996,Northern blot//RACE//RT-PCR//Beta-galactosidas...,,,CTCATGTTGACCGCTTGTTTAGCAGCTTCAAGGAAGCTGAAGGGCA...,ATCCTCGGGTCTTGCAGGCCACACAGGACACCCTGAACCGTCATGG...
4,1029734,RR05387126,argR,mRNA,Escherichia coli str. K-12 substr. MG1655,dsrA,sRNA,Escherichia coli str. K-12 substr. MG1655,NCBI:947861,NCBI:946470,0.2139,Primer extension assay,,,GACAATGGCGATAGTATGGCGGTTGTTTCTTTCCCATCTCTACTCA...,TGAGCAACTTTATTCACATAATTTCTACACCAAGAACTCGAGGTTA...


In [6]:
data_srnatarbase.head()

Unnamed: 0,Target,sRNA,Target ID,sRNA ID,Regulation,Target Type,sRNA Type,Target Binding Position,sRNA Binding Position,Target Sequence,sRNA Sequence
0,hns,dsrA,"chromosome:NC_000913.3, Gene ID:945829","chromosome:NC_000913.3, Gene ID:946470",Repression,mRNA,trans-encoded antisense RNA,"['7..19', 'NA', '7..19; 401..411']","['31..43', 'NA', '31..54']",ATGAGCGAAGCACTTAAAATTCTGAACAACATCCGTACTCTTCGTG...,AACACATCAGATTTCCTGGTGTAACGAATTTTTTAAGTGCTTCTTG...
1,rbsD,dsrA,"chromosome:NC_000913.3, Gene ID:948267","chromosome:NC_000913.3, Gene ID:946470",Repression,mRNA,trans-encoded antisense RNA,['NA'],['NA'],ATGAAAAAAGGCACCGTTCTTAATTCTGATATTTCATCGGTGATCT...,AACACATCAGATTTCCTGGTGTAACGAATTTTTTAAGTGCTTCTTG...
2,argR,dsrA,"chromosome:NC_000913.3, Gene ID:947861","chromosome:NC_000913.3, Gene ID:946470",Repression,mRNA,trans-encoded antisense RNA,['NA'],['NA'],ATGCGAAGCTCGGCTAAGCAAGAAGAACTAGTTAAAGCATTTAAAG...,AACACATCAGATTTCCTGGTGTAACGAATTTTTTAAGTGCTTCTTG...
3,ilvI,dsrA,"chromosome:NC_000913.3, Gene ID:948793","chromosome:NC_000913.3, Gene ID:946470",Repression,mRNA,trans-encoded antisense RNA,['NA'],['NA'],ATGGAGATGTTGTCTGGAGCCGAGATGGTCGTCCGATCGCTTATCG...,AACACATCAGATTTCCTGGTGTAACGAATTTTTTAAGTGCTTCTTG...
4,rpoS,dsrA,"chromosome:NC_000913.3, Gene ID:947210","chromosome:NC_000913.3, Gene ID:946470",Induction,mRNA,trans-encoded antisense RNA,"['-119..-97', 'NA', 'NA']","['10..32', 'NA', 'NA']",ATGAGTCAGAATACGCTGAAAGTTCATGATTTAAATGAAGATGCGG...,AACACATCAGATTTCCTGGTGTAACGAATTTTTTAAGTGCTTCTTG...


# Rename columns

In [7]:
cols = ['Interactor 1', 'Interactor 2', 'ID 1', 'ID 2', 'Category 1', 'Category 2', 'Regulation', 'Binding position 1', 'Binding position 2', 'Sequence 1', 'Sequence 2']
cols_srnatarbase = {
    'sRNA': 'Interactor 1',
    'Target': 'Interactor 2',
    'sRNA ID': 'ID 1',
    'Target ID': 'ID 2',
    'sRNA Type': 'Category 1',
    'Target Type': 'Category 2', 
    'sRNA Binding Position': 'Binding position 1',
    'Target Binding Position': 'Binding position 2',
    'sRNA Sequence': 'Sequence 1',
    'Target Sequence': 'Sequence 2'
}
cols_rnainter = {
    'Interactor1.Symbol': 'Interactor 1',
    'Interactor2.Symbol': 'Interactor 2',
    'Raw_ID1': 'ID 1',
    'Raw_ID2': 'ID 2',
    'Category1': 'Category 1',
    'Category2': 'Category 2', 
    'Sequence1': 'Sequence 1',
    'Sequence2': 'Sequence 2'
}
data_rnainter = data_rnainter.rename(columns=cols_rnainter)
data_srnatarbase = data_srnatarbase.rename(columns=cols_srnatarbase)

data_rnainter['Regulation'] = ''
data_rnainter['Binding position 1'] = ''
data_rnainter['Binding position 2'] = ''
data_rnainter['Source DB'] = 'RNAInter'

data_srnatarbase['Source DB'] = 'sRNATarBase'

In [8]:
data = pd.concat([data_rnainter[cols], data_srnatarbase[cols]], axis=0)
data

Unnamed: 0,Interactor 1,Interactor 2,ID 1,ID 2,Category 1,Category 2,Regulation,Binding position 1,Binding position 2,Sequence 1,Sequence 2
0,acnA,ryhB,NCBI:946724,NCBI:2847761,mRNA,sRNA,,,,ATTCGGAACGAGGCCTGAAGCAGTGTCGCCGTCCCTCTGCCTTGCA...,TTTGAGAGCGTCTCTGTCCCTCGTTTTGCGGTTAAGCCGCATCCAT...
1,acrZ,omrB,NCBI:945365,NCBI:2847747,mRNA,sRNA,,,,ACTTACTACTGTCTTCGGGGGGTCCGAGGTTTCTGGGGGGTCGTAC...,TGTTCTATACTTGGGTTCGACTTGGGTTAGACTTGTCTTTACTGTC...
2,arcZ,flhD,NCBI:2847690,NCBI:945442,sRNA,mRNA,,,,CTCATGTTGACCGCTTGTTTAGCAGCTTCAAGGAAGCTGAAGGGCA...,CTCGTGCCGAATTCGGCACGAGCGATATTTCATCAGTTATCGGTAA...
3,arcZ,rpoS,NCBI:2847690,NCBI:947210,sRNA,mRNA,,,,CTCATGTTGACCGCTTGTTTAGCAGCTTCAAGGAAGCTGAAGGGCA...,ATCCTCGGGTCTTGCAGGCCACACAGGACACCCTGAACCGTCATGG...
4,argR,dsrA,NCBI:947861,NCBI:946470,mRNA,sRNA,,,,GACAATGGCGATAGTATGGCGGTTGTTTCTTTCCCATCTCTACTCA...,TGAGCAACTTTATTCACATAATTTCTACACCAAGAACTCGAGGTTA...
...,...,...,...,...,...,...,...,...,...,...,...
406,ryhB,msrA,"chromosome:NC_000913.3, Gene ID:2847761","chromosome:NC_000913.3, Gene ID:948734",,mRNA,No Interaction,['NA'],['NA'],GCGATCAGGAAGACCCTCGCGGAGAACCTGAAAGCACGACATTGCT...,ATGAGTTTATTTGATAAAAAGCATCTGGTTTCCCCCGCCGATGCCC...
407,gcvB,hdeA,"chromosome:NC_000913.3, Gene ID:2847720","chromosome:NC_000913.3, Gene ID:948025",trans-encoded antisense RNA,mRNA,Induction,['NA'],['NA'],ACTTCCTGAGCCGGAACGAAAAGTTTTATCGGAATGCGTGTTCTGG...,ATGAAAAAAGTATTAGGCGTTATTCTTGGTGGTCTGCTTCTTCTGC...
408,gcvB,hdeB,"chromosome:NC_000913.3, Gene ID:2847720","chromosome:NC_000913.3, Gene ID:948026",trans-encoded antisense RNA,mRNA,Induction,['NA'],['NA'],ACTTCCTGAGCCGGAACGAAAAGTTTTATCGGAATGCGTGTTCTGG...,ATGAATATTTCATCTCTCCGTAAAGCGTTTATTTTTATGGGCGCTG...
409,Esr41,fliC,"chromosome:NC_002695.1, Gene ID:","chromosome:NC_002695.1, Gene ID:",trans-encoded antisense RNA,mRNA,Induction,['NA'],['NA'],GATGCTCTAGGCATCACATTTTCTCCATGGGGTATTCCCTCCGCCG...,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...


# Correct for RNAInter (false) sequences

In [9]:
for i, s in [('Interactor 1', 'Sequence 1'), ('Interactor 2', 'Sequence 2')]:
    for rna, seq in data[[i, s]].value_counts().index.tolist():
        
        if (rna in data_srnatarbase['Interactor 1'].to_list()):
            if data_srnatarbase[data_srnatarbase['Interactor 1'] == rna]['Sequence 1'].iloc[0] != seq:
                data.loc[data[i] == rna, s] = data_srnatarbase[data_srnatarbase['Interactor 1'] == rna]['Sequence 1'].iloc[0]
                
        elif (rna in data_srnatarbase['Interactor 2'].to_list()):
            if data_srnatarbase[data_srnatarbase['Interactor 2'] == rna]['Sequence 2'].iloc[0] != seq:
                data.loc[data[i] == rna, s] = data_srnatarbase[data_srnatarbase['Interactor 2'] == rna]['Sequence 2'].iloc[0] 
            
        elif (rna in data_merged['Symbol'].to_list()):
            if (data_merged[data_merged['Symbol'] == rna]['Sequence'].iloc[0] != seq):
                data.loc[data[i] == rna, s] = data_merged[data_merged['Symbol'] == rna]['Sequence'].iloc[0]
            
        else: 
            print('Could not correct the sequence for', rna)
            

# Drop duplicates

In [10]:
for d in data[['Interactor 1', 'Interactor 2']].value_counts()[data[['Interactor 1', 'Interactor 2']].value_counts() > 1].index:
    
    data = data.drop(index=data[(data['Interactor 1'] == d[0]) & (data['Interactor 2'] == d[1])].index[0])
    
# data[data['Symbol'] == d].index[0]    
    

In [11]:
data

Unnamed: 0,Interactor 1,Interactor 2,ID 1,ID 2,Category 1,Category 2,Regulation,Binding position 1,Binding position 2,Sequence 1,Sequence 2
0,acnA,ryhB,NCBI:946724,NCBI:2847761,mRNA,sRNA,,,,ATGTCGTCAACCCTACGAGAAGCCAGTAAGGACACGTTGCAGGCCA...,GCGATCAGGAAGACCCTCGCGGAGAACCTGAAAGCACGACATTGCT...
1,acrZ,omrB,NCBI:945365,NCBI:2847747,mRNA,sRNA,,,,ATGTTAGAGTTATTAAAAAGTCTGGTATTCGCCGTAATCATGGTAC...,CCCAGAGGTATTGATAGGTGAAGTCAACTTCGGGTTGAGCACATGA...
4,argR,dsrA,NCBI:947861,NCBI:946470,mRNA,sRNA,,,,ATGCGAAGCTCGGCTAAGCAAGAAGAACTAGTTAAAGCATTTAAAG...,AACACATCAGATTTCCTGGTGTAACGAATTTTTTAAGTGCTTCTTG...
6,asr,rybB,NCBI:945103,NCBI:2847774,mRNA,sRNA,,,,ATGAAAAAAGTATTAGCTCTGGTTGTTGCCGCTGCTATGGGTCTGT...,ACTGCTTTTCTTTGATGTCCCCATTTTGTGGAGCCCATCAACCCCG...
7,bfr,ryhB,NCBI:947839,NCBI:2847761,mRNA,sRNA,,,,ATGAAAGGTGATACTAAAGTTATAAATTATCTCAACAAACTGTTGG...,GCGATCAGGAAGACCCTCGCGGAGAACCTGAAAGCACGACATTGCT...
...,...,...,...,...,...,...,...,...,...,...,...
406,ryhB,msrA,"chromosome:NC_000913.3, Gene ID:2847761","chromosome:NC_000913.3, Gene ID:948734",,mRNA,No Interaction,['NA'],['NA'],GCGATCAGGAAGACCCTCGCGGAGAACCTGAAAGCACGACATTGCT...,ATGAGTTTATTTGATAAAAAGCATCTGGTTTCCCCCGCCGATGCCC...
407,gcvB,hdeA,"chromosome:NC_000913.3, Gene ID:2847720","chromosome:NC_000913.3, Gene ID:948025",trans-encoded antisense RNA,mRNA,Induction,['NA'],['NA'],ACTTCCTGAGCCGGAACGAAAAGTTTTATCGGAATGCGTGTTCTGG...,ATGAAAAAAGTATTAGGCGTTATTCTTGGTGGTCTGCTTCTTCTGC...
408,gcvB,hdeB,"chromosome:NC_000913.3, Gene ID:2847720","chromosome:NC_000913.3, Gene ID:948026",trans-encoded antisense RNA,mRNA,Induction,['NA'],['NA'],ACTTCCTGAGCCGGAACGAAAAGTTTTATCGGAATGCGTGTTCTGG...,ATGAATATTTCATCTCTCCGTAAAGCGTTTATTTTTATGGGCGCTG...
409,Esr41,fliC,"chromosome:NC_002695.1, Gene ID:","chromosome:NC_002695.1, Gene ID:",trans-encoded antisense RNA,mRNA,Induction,['NA'],['NA'],GATGCTCTAGGCATCACATTTTCTCCATGGGGTATTCCCTCCGCCG...,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...


In [32]:
data.to_csv(os.path.join('..', 'data', 'sRNA', 'merged_inter.csv'))