In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Imports

In [2]:
import os
from Bio import SeqIO, Entrez
from urllib.error import HTTPError
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt


In [3]:
def load_seq_from_FASTA(filename, as_type="list"):
    """
    Reads every record of a FASTA file and returns the sequences as strings.

    Parameters:
    filename (str): Path of the FASTA file to read.
    as_type (str): "list" (default) returns the sequences in file order;
        "dict" returns a mapping from record ID to sequence.

    Returns:
    list or dict: The sequences in the requested container.

    Raises:
    ValueError: If `as_type` is neither "list" nor "dict".
    """
    # Reject an unsupported container before touching the file system.
    if as_type not in ("dict", "list"):
        raise ValueError(f"Desired type {as_type} not supported.")

    # The context manager closes the file once all records have been consumed;
    # the parser is lazy, so it must be exhausted inside the `with` block.
    with open(filename) as handle:
        fasta_records = SeqIO.parse(handle, 'fasta')
        if as_type == "dict":
            # A later record replaces an earlier one that shares the same ID.
            return {record.id: str(record.seq) for record in fasta_records}
        return [str(record.seq) for record in fasta_records]


Download the RNA-RNA interactions file from http://www.rnainter.org/download/

In [4]:
# Workbook path; only its 'sRNAs' sheet is loaded into `data`.
fn_db = '../data/sRNA/TableS2_E_coli.xlsx'
data = pd.read_excel(io=fn_db, sheet_name='sRNAs')

In [5]:
# Derive helper columns from the raw annotation fields.
# `name`: the text between 'name=' and the next ';' of the attributes string.
data['name'] = data['attributes'].map(lambda attrs: attrs.split('name=')[1].split(';')[0])
# `strand num`: 1 for '+', 2 for any other strand symbol.
data['strand num'] = data['strand'].map(lambda sign: 1 if sign == '+' else 2)
# `sRNA_type`: everything that follows 'sRNA_type=' (not trimmed at ';').
data['sRNA_type'] = data['attributes'].map(lambda attrs: attrs.split('sRNA_type=')[1])
data

Unnamed: 0,seqID,source,feature,start,end,score,strand,phase,attributes,name,strand num,sRNA_type
0,NC_000913.2,RendSeq,sRNA,75517,75608,.,-,.,name=sroA; E_coli_WT; start_dist=-0.0; stop_di...,sroA,2,intergenic
1,NC_000913.2,RendSeq,sRNA,92418,92658,.,+,.,name=ftsI_ftsO; E_coli_WT; start_dist=-67.0; s...,ftsI_ftsO,1,intragenic
2,NC_000913.2,RendSeq,sRNA,92485,92658,.,+,.,name=ftsI_ftsO; E_coli_WT; start_dist=0.0; sto...,ftsI_ftsO,1,intragenic
3,NC_000913.2,RendSeq,sRNA,344476,344638,.,+,.,name=xtpA_yahM; E_coli_WT; start_dist=0.0; sto...,xtpA_yahM,1,5'UTR
4,NC_000913.2,RendSeq,sRNA,506428,506511,.,+,.,name=chiX; E_coli_WT; start_dist=0.0; stop_dis...,chiX,1,independent
...,...,...,...,...,...,...,...,...,...,...,...,...
86,NC_000913.2,RendSeq,sRNA,4188346,4188510,.,-,.,name=sroH; E_coli_WT; start_dist=-0.0; stop_di...,sroH,2,independent
87,NC_000913.2,RendSeq,sRNA,4261197,4261269,.,+,.,name=pspH; E_coli_WT; start_dist=35.0; stop_di...,pspH,1,independent
88,NC_000913.2,RendSeq,sRNA,4525999,4526135,.,+,.,name=ryjB; E_coli_WT; start_dist=-1.0; stop_di...,ryjB,1,independent_w_isoform
89,NC_000913.2,RendSeq,sRNA,4526044,4526135,.,+,.,name=ryjB; E_coli_WT; start_dist=44.0; stop_di...,ryjB,1,independent_w_isoform


# Get sequences

In [6]:
from Bio import Entrez, SeqIO
import ssl

# Disable SSL verification temporarily
# NOTE(review): this swaps the ssl module's default HTTPS context factory for
# the unverified one, so it applies to the whole Python process and nothing in
# this cell restores the original afterwards. Certificate checks are skipped
# for any HTTPS request that relies on the default context; prefer fixing the
# local CA bundle over keeping this override.
ssl._create_default_https_context = ssl._create_unverified_context


def get_dna_sequence(sequence_id, start, end, strand=1):
    """
    Retrieves the DNA sequence for the given sequence ID, start, and end positions.

    The region is requested from the NCBI "nucleotide" database through
    Entrez.efetch, so every call performs one network request.

    Parameters:
    sequence_id (str): The ID of the DNA sequence to retrieve.
    start (int): The starting position of the DNA sequence to retrieve
        (forwarded to efetch as `seq_start`).
    end (int): The ending position of the DNA sequence to retrieve
        (forwarded to efetch as `seq_stop`).
    strand (int): Strand selector forwarded to efetch as `strand`; NCBI
        documents 1 as the plus strand and 2 as the minus strand. Defaults to 1.

    Returns:
    str: The DNA sequence for the given sequence ID, start, and end positions.
    """
    # Credentials are taken from the environment so that no personal e-mail
    # address or API key is stored in the notebook. When a variable is unset,
    # the corresponding Entrez setting is left untouched.
    email = os.environ.get("NCBI_EMAIL")
    if email:
        Entrez.email = email
    api_key = os.environ.get("NCBI_API_KEY")
    if api_key:
        Entrez.api_key = api_key

    handle = Entrez.efetch(
        db="nucleotide",
        id=sequence_id,
        rettype="fasta",
        retmode="text",
        strand=strand,
        seq_start=start,
        seq_stop=end,
    )
    try:
        # SeqIO.read expects exactly one record in the response.
        record = SeqIO.read(handle, "fasta")
    finally:
        # Release the HTTP response even when parsing fails.
        handle.close()

    return str(record.seq)
    

# One get_dna_sequence call per row, in row order, always against NC_000913.2.
data['sequence'] = [
    get_dna_sequence('NC_000913.2', region_start, region_end, strand_num)
    for region_start, region_end, strand_num in zip(data['start'], data['end'], data['strand num'])
]

In [7]:
# Write the table, including the new `sequence` column, to CSV without the
# DataFrame index, then display it.
data.to_csv(path_or_buf='../data/sRNA/TableS2_E_coli_with_seq.csv', index=False)
data

Unnamed: 0,seqID,source,feature,start,end,score,strand,phase,attributes,name,strand num,sRNA_type,sequence
0,NC_000913.2,RendSeq,sRNA,75517,75608,.,-,.,name=sroA; E_coli_WT; start_dist=-0.0; stop_di...,sroA,2,intergenic,GTTCTCAACGGGGTGCCACGCGTACGCGTGCGCTGAGAAAATACCC...
1,NC_000913.2,RendSeq,sRNA,92418,92658,.,+,.,name=ftsI_ftsO; E_coli_WT; start_dist=-67.0; s...,ftsI_ftsO,1,intragenic,ATTAACGGCCACGAAATCAAAGACGTGGCACGCTACAGCGAATTAA...
2,NC_000913.2,RendSeq,sRNA,92485,92658,.,+,.,name=ftsI_ftsO; E_coli_WT; start_dist=0.0; sto...,ftsI_ftsO,1,intragenic,AGTCGAGTAACGTCGGTGTTTCCAAGCTGGCGTTAGCGATGCCGTC...
3,NC_000913.2,RendSeq,sRNA,344476,344638,.,+,.,name=xtpA_yahM; E_coli_WT; start_dist=0.0; sto...,xtpA_yahM,1,5'UTR,ATGTTTGTGTGGGGTCTTTTCTGTATCTTACGCATCGCACTCAAGC...
4,NC_000913.2,RendSeq,sRNA,506428,506511,.,+,.,name=chiX; E_coli_WT; start_dist=0.0; stop_dis...,chiX,1,independent,ACACCGTCGCTTAAAGTGACGGCATAATAATAAAAAAATGAAATTC...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,NC_000913.2,RendSeq,sRNA,4188346,4188510,.,-,.,name=sroH; E_coli_WT; start_dist=-0.0; stop_di...,sroH,2,independent,GAAAATAAGAACACATGTTCTCATCTTCCAGGATGCAGCAGACTGA...
87,NC_000913.2,RendSeq,sRNA,4261197,4261269,.,+,.,name=pspH; E_coli_WT; start_dist=35.0; stop_di...,pspH,1,independent,ATTCATCGTGCTGTACCCTACATACAGCCGAACTATAAAAAGAAAG...
88,NC_000913.2,RendSeq,sRNA,4525999,4526135,.,+,.,name=ryjB; E_coli_WT; start_dist=-1.0; stop_di...,ryjB,1,independent_w_isoform,ATCATCCGTCGTTGACTCCATGCCGATTCGGGTTAATCTGGTAGCG...
89,NC_000913.2,RendSeq,sRNA,4526044,4526135,.,+,.,name=ryjB; E_coli_WT; start_dist=44.0; stop_di...,ryjB,1,independent_w_isoform,GATCCCCGTCGATACTTTTGACGAAGGCGGCAGGGATCGCAGAAGG...
