In [1]:
%load_ext autoreload
%autoreload 2

# Imports

In [2]:
import os
from Bio import SeqIO, Entrez
from urllib.error import HTTPError
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt


In [3]:
def load_seq_from_FASTA(filename, as_type="list"):
    fasta_records = SeqIO.parse(open(filename),'fasta')
    if as_type=="dict":
        sequences = {}
        for fasta_record in fasta_records:
            sequences[fasta_record.id] = str(fasta_record.seq)
        return sequences
    elif as_type=="list":
        sequences = []
        for fasta_record in fasta_records:
            sequences.append(str(fasta_record.seq))
        return sequences
    else:
        raise ValueError(f"Desired type {as_type} not supported.")


Download the RNA-RNA interactions file from http://www.rnainter.org/download/

In [4]:
fn_db = '../data/sRNA/TableS2_E_coli.xlsx'
data = pd.read_excel(fn_db, sheet_name='sRNAs')

In [15]:
data['name'] = data['attributes'].apply(lambda x: x.split('name=')[1].split(';')[0])
data['sRNA_type'] = data['attributes'].apply(lambda x: x.split('sRNA_type=')[1])
data

Unnamed: 0,seqID,source,feature,start,end,score,strand,phase,attributes,name,sRNA_type
0,NC_000913.2,RendSeq,sRNA,75517,75608,.,-,.,name=sroA; E_coli_WT; start_dist=-0.0; stop_di...,sroA,intergenic
1,NC_000913.2,RendSeq,sRNA,92418,92658,.,+,.,name=ftsI_ftsO; E_coli_WT; start_dist=-67.0; s...,ftsI_ftsO,intragenic
2,NC_000913.2,RendSeq,sRNA,92485,92658,.,+,.,name=ftsI_ftsO; E_coli_WT; start_dist=0.0; sto...,ftsI_ftsO,intragenic
3,NC_000913.2,RendSeq,sRNA,344476,344638,.,+,.,name=xtpA_yahM; E_coli_WT; start_dist=0.0; sto...,xtpA_yahM,5'UTR
4,NC_000913.2,RendSeq,sRNA,506428,506511,.,+,.,name=chiX; E_coli_WT; start_dist=0.0; stop_dis...,chiX,independent
...,...,...,...,...,...,...,...,...,...,...,...
86,NC_000913.2,RendSeq,sRNA,4188346,4188510,.,-,.,name=sroH; E_coli_WT; start_dist=-0.0; stop_di...,sroH,independent
87,NC_000913.2,RendSeq,sRNA,4261197,4261269,.,+,.,name=pspH; E_coli_WT; start_dist=35.0; stop_di...,pspH,independent
88,NC_000913.2,RendSeq,sRNA,4525999,4526135,.,+,.,name=ryjB; E_coli_WT; start_dist=-1.0; stop_di...,ryjB,independent_w_isoform
89,NC_000913.2,RendSeq,sRNA,4526044,4526135,.,+,.,name=ryjB; E_coli_WT; start_dist=44.0; stop_di...,ryjB,independent_w_isoform


# Get sequences

In [17]:
Entrez.email = "olivia.gallup@gmail.com"  # Add your email address
Entrez.api_key = "8167226cf01abaa985a1c23d9b5283d22208"  # Add your NCBI API key


def get_seq_from_genome(gene_id):
    try:
        handle = Entrez.efetch(db="gene", id=gene_id, rettype="fasta", retmode="xml")
        record = Entrez.read(handle)
        nuc_id = record[0]['Entrezgene_locus'][0]['Gene-commentary_accession'] + '.' + record[0]['Entrezgene_locus'][0]['Gene-commentary_version']

        dna_start = record[0]['Entrezgene_locus'][0]['Gene-commentary_seqs'][0]['Seq-loc_int']['Seq-interval']['Seq-interval_from']
        dna_end = record[0]['Entrezgene_locus'][0]['Gene-commentary_seqs'][0]['Seq-loc_int']['Seq-interval']['Seq-interval_to']

        search_handle = Entrez.efetch(db="nuccore", id=nuc_id, retmode="XML", rettype='fasta_cds_na', strand="complement", seq_start=int(dna_start), seq_stop=int(dna_end))
        # dna_sequence = SeqIO.read(search_handle, "fasta").seq
        r = search_handle.read()
        if r:
            dna_sequence = ''.join(r.split('>')[1].split('\n')[1:])
        else:
            dna_sequence = ''
        return dna_sequence
    except HTTPError:
        return ''


def get_seq_info(gene_id):
    try:
        handle = Entrez.efetch(db="nucleotide", id=gene_id, rettype="fasta", retmode="text")
        fast = handle.read()
        seq = ''.join(fast.split('\n')[1:])
        if not seq or ('N' in seq):
            seq = get_seq_from_genome(gene_id)
        return seq
    except HTTPError:
        seq = get_seq_from_genome(gene_id)
        return seq


In [20]:
import Bio
from Bio import Entrez

def get_dna_sequence(sequence_id, start, end):
    """
    Retrieves the DNA sequence for the given sequence ID, start, and end positions.
    
    Parameters:
    sequence_id (str): The ID of the DNA sequence to retrieve.
    start (int): The starting position of the DNA sequence to retrieve.
    end (int): The ending position of the DNA sequence to retrieve.
    
    Returns:
    str: The DNA sequence for the given sequence ID, start, and end positions.
    """
    # Set your email address for Entrez access
    Entrez.email = "olivia.gallup@gmail.com"  # Add your email address
    Entrez.api_key = "8167226cf01abaa985a1c23d9b5283d22208"  # Add your NCBI API key

    
    try:
        # Retrieve the DNA sequence using Entrez
        handle = Entrez.efetch(db="nucleotide", id=sequence_id, rettype="fasta", seq_start=start, seq_stop=end)
        record = Bio.Seq.SeqRecord.SeqRecord.from_fasta(handle.read())
        
        # Return the DNA sequence
        return str(record.seq)
    
    except Bio.Entrez.efetch.RuntimeError as e:
        print(f"Error retrieving DNA sequence: {e}")
        return None
    
    
sequence_id = 'NC_000913.2'
start = 75517
end = 75608

dna_sequence = get_dna_sequence(sequence_id, start, end)
if dna_sequence:
    print(f"DNA sequence: {dna_sequence}")

AttributeError: 'function' object has no attribute 'RuntimeError'

In [None]:
d = {k: get_seq_info(k) for k in list(data['Raw_ID1'].str.replace('NCBI:', '').apply(str).unique()) + list(data['Raw_ID2'].str.replace('NCBI:', '').apply(str).unique())}


In [17]:
# if ('Sequence1' not in data.columns) or ('Sequence2' not in data.columns):
# data['Sequence1'] = data['Raw_ID1'].str.replace('NCBI:', '').apply(str).apply(get_fasta)
d = {k: get_seq_info(k) for k in list(data['Raw_ID1'].str.replace('NCBI:', '').apply(str).unique()) + list(data['Raw_ID2'].str.replace('NCBI:', '').apply(str).unique())}

for i, l in enumerate(zip(data['Raw_ID1'].str.replace('NCBI:', '').apply(str), data['Raw_ID2'].str.replace('NCBI:', '').apply(str))):
    data[f'Sequence{i+1}'] = list(map(lambda x: d[x], l))
    # data[f'Gene_Start{i+1}'] = list(map(lambda x: d[x][1], l))
    # data[f'Gene_End{i+1}'] = list(map(lambda x: d[x][2], l))
# data['Sequence1'] = data['Raw_ID1'].str.replace('NCBI:', '').apply(str).apply(get_fasta)

print(sum((data['Sequence1'] != '') & (data['Sequence2'] != '')))
data.to_csv(fn)


In [None]:
data[data['Sequence1'].isna()]

Unnamed: 0,level_0,index,RNAInterID,Interactor1.Symbol,Category1,Species1,Interactor2.Symbol,Category2,Species2,Raw_ID1,Raw_ID2,score,strong,weak,predict,Sequence1,Sequence2
