In [15]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Downloading data from sRNATarBase

This notebook web-scrapes the sequences and interaction targets for each sRNA from [sRNATarBase](http://aibcenter.com/srnatarbase/index.php?r=srnaTarget/index).

We also used [RNAInter](http://www.rnainter.org/showBrowse/?sType=Species&sItem=Escherichia+coli+str.+K-12+substr.+MG1655&number=Escherichia+coli+str.+K-12+substr.+MG1655%3A+436+entries) as a reference database for RNA interactions.


# Imports

In [16]:
import os
import requests
from bs4 import BeautifulSoup
import re


import pandas as pd
import numpy as np

# Webscrape

In [17]:
def find_lines_with_keyword(soup, keyword):
    """Return every element of ``soup`` whose text contains ``keyword``.

    ``soup`` must provide ``find_all()`` (called with no arguments, i.e. every
    element it yields is inspected) and each yielded element must expose a
    ``.text`` string. Elements are returned in the order ``find_all()``
    produced them; the match is a plain, case-sensitive substring test.
    """
    return [element for element in soup.find_all() if keyword in element.text]

def pull_table_info_srnatarbase(soup, rowname):
    """Return the text of the element following each ``rowname`` label.

    A label is an element whose inline ``style`` attribute contains ``'width'``
    and whose text is exactly ``rowname``. For every such label, in document
    order, the ``.text`` of its ``next_sibling`` is collected. The result is a
    list because the same label can occur more than once on a page; it is
    empty when no label matches.
    """
    def is_label(element):
        # Exact text equality keeps 'Target' from also matching 'Target Alias' etc.
        return 'width' in element.attrs.get('style', '') and (element.text == rowname)

    label_cells = [element for element in find_lines_with_keyword(soup, rowname) if is_label(element)]
    return [label.next_sibling.text for label in label_cells]

def scrape_rna_sequences(url, timeout=30):
    """Scrape one sRNATarBase record page into a single-row DataFrame.

    Parameters
    ----------
    url : str
        Address of the record page to download.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without it a stalled connection would block
        forever; a timeout raises a ``RequestException`` subclass, which is
        reported like any other request failure.

    Returns
    -------
    pandas.DataFrame
        One row with the columns built below, when the page was fetched with
        status 200 and its first 'Strain Name' value contains 'Escherichia coli'.
    int
        ``0`` when the first 'Strain Name' value does not contain
        'Escherichia coli'.
    None
        When the request fails, the status code is not 200, or any lookup or
        parsing step raises. A message including the URL is printed.
    """
    try:
        response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            print(f"Failed to retrieve data. Status code: {response.status_code}. URL: {url}")
            return None

        soup = BeautifulSoup(response.text, 'html.parser')

        # Each lookup walks the whole document, and several labels are needed
        # more than once ('Target', 'sRNA', 'Position'), so remember the results.
        rows = {}

        def row_values(rowname):
            if rowname not in rows:
                rows[rowname] = pull_table_info_srnatarbase(soup, rowname=rowname)
            return rows[rowname]

        def bracketed(text):
            # Join every "[...]" group found in the text with ", ".
            return ', '.join(re.findall(r'\[([^]]*)\]', text))

        if 'Escherichia coli' not in row_values('Strain Name')[0]:
            return 0

        # For 'Target' and 'sRNA' the first occurrence is parsed for bracketed
        # identifiers and the last occurrence is used as the name.
        seqs = {
            'Target': row_values('Target')[-1],
            'sRNA': row_values('sRNA')[-1],
            'Target Alias': row_values('Target Alias')[-1],
            'sRNA Alias': row_values('sRNA Alias')[-1],
            'Target ID': bracketed(row_values('Target')[0]),
            'sRNA ID': bracketed(row_values('sRNA')[0]),
            'Regulation': row_values('Regulation')[-1],
            'Target Type': row_values('Target Type')[0],
            'sRNA Type': row_values('sRNA Type')[0],
            'Target Strand': row_values('Target Strand')[0],
            'sRNA Strand': row_values('sRNA Strand')[0],
            # Binding positions are kept as the full list of matches.
            'Target Binding Position': row_values('Target Binding Position'),
            'sRNA Binding Position': row_values('sRNA Binding Position'),
            # Of the 'Position' matches, the first is taken for the sRNA and
            # the second for the target.
            'Target Genome Position': row_values('Position')[1],
            'sRNA Genome Position': row_values('Position')[0],
            'Target Sequence': row_values('Target Seq')[0],
            'sRNA Sequence': row_values('sRNA Seq')[0],
        }

        # Wrap each value in a list so list-valued fields stay in a single cell.
        return pd.DataFrame.from_dict({k: [v] for k, v in seqs.items()})
    except requests.exceptions.RequestException as e:
        print(f"An error occurred during the request: {e}\nURL: {url}")
    except Exception as e:
        # Deliberately best-effort: a page with an unexpected layout (e.g. a
        # missing row -> IndexError) is reported and skipped, not fatal.
        print(f"An error occurred: {e}\nURL: {url}")
    return None

In [18]:

# Smoke test: scrape a single record before looping over the whole database.
# The database index page is http://aibcenter.com/srnatarbase/index.php?r=srnaTarget/index;
# the URL used here is the per-record `view` route with id 1 appended.

url = "http://aibcenter.com/srnatarbase/index.php?r=srnaTarget/view&id=" + '1'

# On success this is a one-row DataFrame, shown as the cell output.
seqs = scrape_rna_sequences(url)
seqs

Unnamed: 0,Target,sRNA,Target Alias,sRNA Alias,Target ID,sRNA ID,Regulation,Target Type,sRNA Type,Target Strand,sRNA Strand,Target Binding Position,sRNA Binding Position,Target Genome Position,sRNA Genome Position,Target Sequence,sRNA Sequence
0,hns,dsrA,B1; bglY; cur; drc; drdX; drs; ECK1232; fimG; ...,ECK1952; IS095; JWR0036,"chromosome:NC_000913.3, Gene ID:945829","chromosome:NC_000913.3, Gene ID:946470",Repression,mRNA,trans-encoded antisense RNA,reverse,reverse,"[7..19, NA, 7..19; 401..411]","[31..43, NA, 31..54]",1292509..1292922,2025227..2025313,ATGAGCGAAGCACTTAAAATTCTGAACAACATCCGTACTCTTCGTG...,AACACATCAGATTTCCTGGTGTAACGAATTTTTTAAGTGCTTCTTG...


### Run for all available sRNA targets

In [19]:
# Scrape record ids 1..771 and stack the per-record rows into one table.
url = "http://aibcenter.com/srnatarbase/index.php?r=srnaTarget/view&id="
frames = []
for i in range(1, 772):
    seqs = scrape_rna_sequences(url + str(i))
    # The scraper returns None on failure and 0 for non-E. coli records;
    # only real one-row DataFrames are kept.
    if isinstance(seqs, pd.DataFrame):
        frames.append(seqs)

# Concatenate once instead of re-copying a growing frame on every iteration.
# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(frames) if frames else pd.DataFrame()
    
    

Failed to retrieve data. Status code: 500. URL: http://aibcenter.com/srnatarbase/index.php?r=srnaTarget/view&id=17
Failed to retrieve data. Status code: 500. URL: http://aibcenter.com/srnatarbase/index.php?r=srnaTarget/view&id=394


In [20]:
# Remove the line breaks embedded in the scraped sequence text.
for seq_col in ('Target Sequence', 'sRNA Sequence'):
    df[seq_col] = df[seq_col].str.replace('\n', '')

In [21]:
# Renumber rows 0..n-1. drop=True discards the old index directly instead of
# materialising it as an 'index' column and dropping it afterwards (which
# would fail if the index ever carried a name).
df = df.reset_index(drop=True)
df

Unnamed: 0,Target,sRNA,Target Alias,sRNA Alias,Target ID,sRNA ID,Regulation,Target Type,sRNA Type,Target Strand,sRNA Strand,Target Binding Position,sRNA Binding Position,Target Genome Position,sRNA Genome Position,Target Sequence,sRNA Sequence
0,hns,dsrA,B1; bglY; cur; drc; drdX; drs; ECK1232; fimG; ...,ECK1952; IS095; JWR0036,"chromosome:NC_000913.3, Gene ID:945829","chromosome:NC_000913.3, Gene ID:946470",Repression,mRNA,trans-encoded antisense RNA,reverse,reverse,"[7..19, NA, 7..19; 401..411]","[31..43, NA, 31..54]",1292509..1292922,2025227..2025313,ATGAGCGAAGCACTTAAAATTCTGAACAACATCCGTACTCTTCGTG...,AACACATCAGATTTCCTGGTGTAACGAATTTTTTAAGTGCTTCTTG...
1,rbsD,dsrA,ECK3742; JW5857; rbsP,ECK1952; IS095; JWR0036,"chromosome:NC_000913.3, Gene ID:948267","chromosome:NC_000913.3, Gene ID:946470",Repression,mRNA,trans-encoded antisense RNA,forward,reverse,[NA],[NA],3933351..3933770,2025227..2025313,ATGAAAAAAGGCACCGTTCTTAATTCTGATATTTCATCGGTGATCT...,AACACATCAGATTTCCTGGTGTAACGAATTTTTTAAGTGCTTCTTG...
2,argR,dsrA,ECK3226; JW3206; Rarg; xerA,ECK1952; IS095; JWR0036,"chromosome:NC_000913.3, Gene ID:947861","chromosome:NC_000913.3, Gene ID:946470",Repression,mRNA,trans-encoded antisense RNA,forward,reverse,[NA],[NA],3384703..3385173,2025227..2025313,ATGCGAAGCTCGGCTAAGCAAGAAGAACTAGTTAAAGCATTTAAAG...,AACACATCAGATTTCCTGGTGTAACGAATTTTTTAAGTGCTTCTTG...
3,ilvI,dsrA,ECK0079; JW0076,ECK1952; IS095; JWR0036,"chromosome:NC_000913.3, Gene ID:948793","chromosome:NC_000913.3, Gene ID:946470",Repression,mRNA,trans-encoded antisense RNA,forward,reverse,[NA],[NA],85630..87354,2025227..2025313,ATGGAGATGTTGTCTGGAGCCGAGATGGTCGTCCGATCGCTTATCG...,AACACATCAGATTTCCTGGTGTAACGAATTTTTTAAGTGCTTCTTG...
4,rpoS,dsrA,abrD; appR; csi2; dpeB; ECK2736; JW5437; katF;...,ECK1952; IS095; JWR0036,"chromosome:NC_000913.3, Gene ID:947210","chromosome:NC_000913.3, Gene ID:946470",Induction,mRNA,trans-encoded antisense RNA,reverse,reverse,"[-119..-97, NA, NA]","[10..32, NA, NA]",2866559..2867551,2025227..2025313,ATGAGTCAGAATACGCTGAAAGTTCATGATTTAAATGAAGATGCGG...,AACACATCAGATTTCCTGGTGTAACGAATTTTTTAAGTGCTTCTTG...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
406,msrA,ryhB,ECK4215; JW4178; pms; pmsR,ECK3426; IS176; JWR0219; psrA18; sraI,"chromosome:NC_000913.3, Gene ID:948734","chromosome:NC_000913.3, Gene ID:2847761",No Interaction,mRNA,,reverse,reverse,[NA],[NA],4441538..4442176,3580927..3581016,ATGAGTTTATTTGATAAAAAGCATCTGGTTTCCCCCGCCGATGCCC...,GCGATCAGGAAGACCCTCGCGGAGAACCTGAAAGCACGACATTGCT...
407,hdeA,gcvB,ECK3494; JW3478; yhhC; yhiB,ECK2804; IS145; JWR0247; psrA11,"chromosome:NC_000913.3, Gene ID:948025","chromosome:NC_000913.3, Gene ID:2847720",Induction,mRNA,trans-encoded antisense RNA,reverse,forward,[NA],[NA],3656408..3656740,2942696..2942901,ATGAAAAAAGTATTAGGCGTTATTCTTGGTGGTCTGCTTCTTCTGC...,ACTTCCTGAGCCGGAACGAAAAGTTTTATCGGAATGCGTGTTCTGG...
408,hdeB,gcvB,ECK3493; JW5669; yhhD; yhiC,ECK2804; IS145; JWR0247; psrA11,"chromosome:NC_000913.3, Gene ID:948026","chromosome:NC_000913.3, Gene ID:2847720",Induction,mRNA,trans-encoded antisense RNA,reverse,forward,[NA],[NA],3655966..3656292,2942696..2942901,ATGAATATTTCATCTCTCCGTAAAGCGTTTATTTTTATGGGCGCTG...,ACTTCCTGAGCCGGAACGAAAAGTTTTATCGGAATGCGTGTTCTGG...
409,fliC,Esr41,ECs2662,,"chromosome:NC_002695.1, Gene ID:","chromosome:NC_002695.1, Gene ID:",Induction,mRNA,trans-encoded antisense RNA,reverse,forward,[NA],[NA],2624379..2626136,1422406..1422479,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...,GATGCTCTAGGCATCACATTTTCTCCATGGGGTATTCCCTCCGCCG...


In [22]:
# Keep only records where both the sRNA and the target sequence are non-empty.
has_srna_seq = df['sRNA Sequence'] != ''
has_target_seq = df['Target Sequence'] != ''
df = df[has_srna_seq & has_target_seq]

In [23]:
# Write the scraped table to disk. to_csv does not create missing parent
# directories, so make sure the output folder exists first.
out_dir = os.path.join('..', 'data', 'sRNA', 'sRNATarBase')
os.makedirs(out_dir, exist_ok=True)
df.to_csv(os.path.join(out_dir, 'sRNATarBase.csv'))

### A few stats

In [24]:
# Summary statistics for the scraped table.

def _row_fraction(mask):
    """Fraction of the rows of df selected by a boolean mask."""
    return len(df[mask]) / len(df)


def _first_entry_has_minus(positions):
    """True when the first listed binding position contains a '-' character."""
    return '-' in positions[0]


def _first_na_of_several(positions):
    """True when the list has more than one entry and the first one is 'NA'."""
    return positions[0] == 'NA' and (len(positions) > 1)


n_srnas = len(df['sRNA'].unique())
n_targets = len(df['Target'].unique())
n_negative_sites = sum(df['Target Binding Position'].apply(_first_entry_has_minus))
regulation_counts = df['Regulation'].value_counts()

print('Number of sRNAs:', n_srnas)
print('\nNumber of targets:', n_targets)
print('\nNumber of distal binding sites on target (negative binding position):', n_negative_sites)

# NOTE(review): if rows with empty sequences were already filtered out of df
# (an earlier cell does this), both fractions below are 0.0 by construction.
print('\nFraction of incomplete sequences:')
print('\tsRNA sequences:', _row_fraction(df['sRNA Sequence'] == ''))
print('\ttarget sequences:', _row_fraction(df['Target Sequence'] == ''))

# NOTE(review): only lists with more than one entry whose first entry is 'NA'
# are counted; single-entry ['NA'] lists are not. Confirm this is intended.
print('\nFraction of unknown binding positions')
print('\tsRNA binding positions:', _row_fraction(df['sRNA Binding Position'].apply(_first_na_of_several)))
print('\ttarget binding positions:', _row_fraction(df['Target Binding Position'].apply(_first_na_of_several)))

print('\nTypes of regulation:\n', regulation_counts, '\n')
print('Types of regulation percent:\n', regulation_counts / len(df), '\n')

Number of sRNAs: 52

Number of targets: 272

Number of distal binding sites on target (negative binding position): 91

Fraction of incomplete sequences:
	sRNA sequences: 0.0
	target sequences: 0.0

Fraction of unknown binding positions
	sRNA binding positions: 0.02444987775061125
	target binding positions: 0.02444987775061125

Types of regulation:
 Regulation
Repression           193
No Interaction       190
Induction             20
Protein titration      6
Name: count, dtype: int64 

Types of regulation percent:
 Regulation
Repression           0.471883
No Interaction       0.464548
Induction            0.048900
Protein titration    0.014670
Name: count, dtype: float64 

