In [1]:
%load_ext autoreload
%autoreload 2

# Downloading data from sRNATarBase

This notebook scrapes the sequence and the interaction targets of each sRNA from [sRNATarBase](http://aibcenter.com/srnatarbase/index.php?r=srnaTarget/index).

We also used [RNAInter](http://www.rnainter.org/showBrowse/?sType=Species&sItem=Escherichia+coli+str.+K-12+substr.+MG1655&number=Escherichia+coli+str.+K-12+substr.+MG1655%3A+436+entries) as a database for referencing RNA interactions.


# Imports

In [2]:
import os
import requests
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np

# Webscrape

In [3]:
def find_lines_with_keyword(soup, keyword):
    """Collect every element of ``soup`` whose text contains ``keyword``.

    Args:
        soup: Parsed document (any object exposing ``find_all()``).
        keyword: Substring searched for in each element's ``.text``.

    Returns:
        list: The matching elements, in the order ``find_all()`` yields them.
    """
    return [element for element in soup.find_all() if keyword in element.text]

def pull_table_info_srnatarbase(soup, rowname):
    """Return the values found next to the ``rowname`` label cells of a page.

    A label cell is an element whose text is exactly ``rowname`` and whose
    inline ``style`` attribute mentions ``width``. The value is the text of the
    node that immediately follows that cell.

    Args:
        soup: Parsed page to search.
        rowname: Exact label text to look for (e.g. ``'sRNA'``).

    Returns:
        list: One text value per matching label cell, in document order;
        empty when the label does not occur.
    """
    values = []
    for candidate in find_lines_with_keyword(soup, rowname):
        has_width_style = 'width' in candidate.attrs.get('style', '')
        if has_width_style and candidate.text == rowname:
            values.append(candidate.next_sibling.text)
    return values

def scrape_rna_sequences(url, timeout=30):
    """Scrape one sRNATarBase record page into a one-row DataFrame.

    Args:
        url: Address of the record page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled
            connection, which would freeze a crawl over many pages.

    Returns:
        pandas.DataFrame: A single row with the columns ``sRNA``, ``Target``,
            ``Regulation``, ``sRNA Binding Position``,
            ``Target Binding Position``, ``sRNA Sequence`` and
            ``Target Sequence`` when the page was parsed successfully. The two
            binding-position cells hold lists (one entry per matching row on
            the page).
        int: ``0`` when the first ``Strain Name`` value on the page does not
            contain ``'Escherichia coli'``.
        None: When the request fails, the status code is not 200, or the page
            cannot be parsed. A message is printed in each of these cases.
    """
    try:
        # Send an HTTP GET request to the URL (bounded by `timeout`)
        response = requests.get(url, timeout=timeout)

        # Anything but a 200 response is reported and skipped
        if response.status_code != 200:
            print(f"Failed to retrieve data. Status code: {response.status_code}. URL: {url}")
            return None

        # Parse the page content with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')
        if 'Escherichia coli' not in pull_table_info_srnatarbase(soup, rowname='Strain Name')[0]:
            return 0

        seqs = {}
        # The label text can match several cells on the page: the last match
        # is used for the names/regulation and the first one for the sequences.
        seqs['sRNA'] = pull_table_info_srnatarbase(soup, rowname='sRNA')[-1]
        seqs['Target'] = pull_table_info_srnatarbase(soup, rowname='Target')[-1]
        seqs['Regulation'] = pull_table_info_srnatarbase(soup, rowname='Regulation')[-1]
        # Binding positions are kept as the full list of matches
        seqs['sRNA Binding Position'] = pull_table_info_srnatarbase(soup, rowname='sRNA Binding Position')
        seqs['Target Binding Position'] = pull_table_info_srnatarbase(soup, rowname='Target Binding Position')
        seqs['sRNA Sequence'] = pull_table_info_srnatarbase(soup, rowname='sRNA Seq')[0]
        seqs['Target Sequence'] = pull_table_info_srnatarbase(soup, rowname='Target Seq')[0]

        # Wrap each value in a list so that list-valued cells stay in one row
        return pd.DataFrame.from_dict({k: [v] for k, v in seqs.items()})
    except requests.exceptions.RequestException as e:
        # Includes connection errors and timeouts
        print(f"An error occurred during the request: {e}\nURL: {url}")
    except Exception as e:
        # Best effort: e.g. a page missing one of the expected rows (IndexError)
        print(f"An error occurred: {e}\nURL: {url}")
    return None

In [4]:

# Try the scraper on a single record page before crawling the whole database.
record_id = '1'
url = "http://aibcenter.com/srnatarbase/index.php?r=srnaTarget/view&id=" + record_id

# Download and parse that record, then display the result
seqs = scrape_rna_sequences(url)
seqs

Unnamed: 0,sRNA,Target,Regulation,sRNA Binding Position,Target Binding Position,sRNA Sequence,Target Sequence
0,dsrA,hns,Repression,"[31..43, NA, 31..54]","[7..19, NA, 7..19; 401..411]",AACACATCAGATTTCCTGGTGTAACGAATTTTTTAAGTGCTTCTTG...,ATGAGCGAAGCACTTAAAATTCTGAACAACATCCGTACTCTTCGTG...


### Run for all available sRNA targets

In [5]:
url = "http://aibcenter.com/srnatarbase/index.php?r=srnaTarget/view&id="

# Record ids 1..771 are requested one by one. The per-record tables are
# collected in a list and concatenated once at the end: calling pd.concat
# inside the loop re-copies every previous row on each iteration.
frames = []
for i in range(1, 772):
    seqs = scrape_rna_sequences(url + str(i))
    # The scraper returns None on failure and 0 for non-E. coli records;
    # only real tables are kept.
    if isinstance(seqs, pd.DataFrame):
        frames.append(seqs)

# pd.concat raises on an empty list, so fall back to an empty frame
df = pd.concat(frames) if frames else pd.DataFrame()
    
    

Failed to retrieve data. Status code: 500. URL: http://aibcenter.com/srnatarbase/index.php?r=srnaTarget/view&id=17
Failed to retrieve data. Status code: 500. URL: http://aibcenter.com/srnatarbase/index.php?r=srnaTarget/view&id=394


In [6]:
# Every scraped one-row table carries index 0; renumber the rows 0..n-1
# and discard the old index instead of keeping it as a column.
df = df.reset_index(drop=True)
df

Unnamed: 0,sRNA,Target,Regulation,sRNA Binding Position,Target Binding Position,sRNA Sequence,Target Sequence
0,dsrA,hns,Repression,"[31..43, NA, 31..54]","[7..19, NA, 7..19; 401..411]",AACACATCAGATTTCCTGGTGTAACGAATTTTTTAAGTGCTTCTTG...,ATGAGCGAAGCACTTAAAATTCTGAACAACATCCGTACTCTTCGTG...
1,dsrA,rbsD,Repression,[NA],[NA],AACACATCAGATTTCCTGGTGTAACGAATTTTTTAAGTGCTTCTTG...,ATGAAAAAAGGCACCGTTCTTAATTCTGATATTTCATCGGTGATCT...
2,dsrA,argR,Repression,[NA],[NA],AACACATCAGATTTCCTGGTGTAACGAATTTTTTAAGTGCTTCTTG...,ATGCGAAGCTCGGCTAAGCAAGAAGAACTAGTTAAAGCATTTAAAG...
3,dsrA,ilvI,Repression,[NA],[NA],AACACATCAGATTTCCTGGTGTAACGAATTTTTTAAGTGCTTCTTG...,ATGGAGATGTTGTCTGGAGCCGAGATGGTCGTCCGATCGCTTATCG...
4,dsrA,rpoS,Induction,"[10..32, NA, NA]","[-119..-97, NA, NA]",AACACATCAGATTTCCTGGTGTAACGAATTTTTTAAGTGCTTCTTG...,ATGAGTCAGAATACGCTGAAAGTTCATGATTTAAATGAAGATGCGG...
...,...,...,...,...,...,...,...
406,ryhB,msrA,No Interaction,[NA],[NA],GCGATCAGGAAGACCCTCGCGGAGAACCTGAAAGCACGACATTGCT...,ATGAGTTTATTTGATAAAAAGCATCTGGTTTCCCCCGCCGATGCCC...
407,gcvB,hdeA,Induction,[NA],[NA],ACTTCCTGAGCCGGAACGAAAAGTTTTATCGGAATGCGTGTTCTGG...,ATGAAAAAAGTATTAGGCGTTATTCTTGGTGGTCTGCTTCTTCTGC...
408,gcvB,hdeB,Induction,[NA],[NA],ACTTCCTGAGCCGGAACGAAAAGTTTTATCGGAATGCGTGTTCTGG...,ATGAATATTTCATCTCTCCGTAAAGCGTTTATTTTTATGGGCGCTG...
409,Esr41,fliC,Induction,[NA],[NA],GATGCTCTAGGCATCACATTTTCTCCATGGGGTATTCCCTCCGCCG...,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...


In [7]:
# Create the output folder first: to_csv does not create missing directories
# and would fail on a fresh checkout.
out_dir = os.path.join('..', 'data', 'sRNA', 'sRNATarBase')
os.makedirs(out_dir, exist_ok=True)
df.to_csv(os.path.join(out_dir, 'sRNATarBase.csv'))

### A few stats

In [8]:
def _first_is_na_with_more(positions):
    """True when the first listed position is 'NA' and the list has more than one entry."""
    return positions[0] == 'NA' and (len(positions) > 1)


# Distinct sRNA and target names
n_srnas = df['sRNA'].nunique(dropna=False)
print('Number of sRNAs:', n_srnas)
n_targets = df['Target'].nunique(dropna=False)
print('\nNumber of targets:', n_targets)

# Only the FIRST listed target position of each record is inspected for a '-' sign
first_has_minus = df['Target Binding Position'].apply(lambda positions: '-' in positions[0])
print('\nNumber of distal binding sites on target (negative binding position):', int(first_has_minus.sum()))

# A sequence counts as incomplete when the scraped text is the empty string
print('\nFraction of incomplete sequences:')
n_empty_srna = int((df['sRNA Sequence'] == '').sum())
print('\tsRNA sequences:', n_empty_srna / len(df))
n_empty_target = int((df['Target Sequence'] == '').sum())
print('\ttarget sequences:', n_empty_target / len(df))

# Counts records whose first position is 'NA' AND that list several positions;
# records whose list is just ['NA'] are not counted here.
# NOTE(review): confirm that leaving out single-entry ['NA'] records is intended.
print('\nFraction of unknown binding positions')
n_unknown_srna = int(df['sRNA Binding Position'].apply(_first_is_na_with_more).sum())
print('\tsRNA binding positions:', n_unknown_srna / len(df))
n_unknown_target = int(df['Target Binding Position'].apply(_first_is_na_with_more).sum())
print('\ttarget binding positions:', n_unknown_target / len(df))

# Absolute and relative frequency of each regulation type
regulation_counts = df['Regulation'].value_counts()
print('\nTypes of regulation:\n', regulation_counts, '\n')
print('Types of regulation percent:\n', regulation_counts / len(df), '\n')

Number of sRNAs: 54

Number of targets: 273

Number of distal binding sites on target (negative binding position): 91

Fraction of incomplete sequences:
	sRNA sequences: 0.004866180048661801
	target sequences: 0.0024330900243309003

Fraction of unknown binding positions
	sRNA binding positions: 0.024330900243309004
	target binding positions: 0.024330900243309004

Types of regulation:
 Regulation
Repression           194
No Interaction       191
Induction             20
Protein titration      6
Name: count, dtype: int64 

Types of regulation percent:
 Regulation
Repression           0.472019
No Interaction       0.464720
Induction            0.048662
Protein titration    0.014599
Name: count, dtype: float64 

