# SRA Extractor

Input: TSV file (`BioProjects.tsv`) with BioProject accessions and their descriptions.
Output: Excel file with SRA IDs and BioSample IDs for each BioProject ID extracted from NCBI.

This script also filters out BioProjects based on their description.

In [None]:
# Importing Libraries

import pandas as pd
import pysradb
from Bio import Entrez
import re
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup

Pre-processing Data as per requirements.

In [None]:
# Read in the TSV file; the 'accession' and 'description' columns are the
# ones used by the steps below.
df = pd.read_csv('BioProjects.tsv', delimiter='\t')
print(f"Total Samples = {len(df.index)}")

# Filter for descriptions with "metatranscriptome" (case-insensitive).
# na=False treats a missing description as "no match"; without it the mask
# contains NaN and the boolean indexing raises a ValueError.
is_metatranscriptome = df['description'].str.contains(
    'metatranscriptome', case=False, na=False
)
df = df[is_metatranscriptome]

# Remove duplicate descriptions
df = df.drop_duplicates(subset='description', keep='first')

# Remove duplicate accession numbers, keeping the first occurrence
df = df.drop_duplicates(subset='accession', keep='first')

print(f"Total samples after removing duplicates = {len(df.index)}")
# Last expression of the cell: shows one random row as a quick sanity check.
df.sample()

In [None]:
# Select the projects whose description mentions "after" (case-insensitive);
# these are the entries that carry a time point.
# .copy() makes df_after an independent frame, so the columns written to it
# further down do not go through a slice of df (SettingWithCopyWarning).
df_after = df[df['description'].str.contains('after', case=False, na=False)].copy()
print(f"Total Samples including different timepoints = {len(df_after.index)}")
# Preview one random row; kept as the last expression so the notebook
# actually renders it.
df_after.sample()

In [None]:
# Splitting df.description into two columns at the first "after": the text
# before it stays in 'description', the remainder goes to 'Time_point'.
# Work on an independent copy so the assignment below does not write into a
# slice of df.
df_after = df_after.copy()
# n=1 caps the split at two parts; an unbounded split produces three or more
# columns as soon as one description contains "after" twice, and the
# two-column assignment then fails. reindex guarantees that both columns
# exist even when no row contains a lowercase "after" (the split itself is
# case-sensitive, unlike the filter that built df_after).
split_parts = (
    df_after['description']
    .str.split("after", n=1, expand=True)
    .reindex(columns=[0, 1])
)
df_after[['description', 'Time_point']] = split_parts
df_after.sample()

In [None]:
# Collapse projects that differ only in their time point: rows sharing the
# same 'description' are reduced to the first one encountered.
df_unique_time = df_after.drop_duplicates(subset=['description'])
print(df_unique_time.shape[0])

In [None]:
# Joining both columns back into a single description.
# Operate on a copy: df_unique_time is the result of drop_duplicates, and
# adding a column to it directly can trigger pandas' SettingWithCopyWarning.
df_unique_time = df_unique_time.copy()
time_point = df_unique_time["Time_point"]
# The split removed only the word "after" and left the surrounding spaces in
# both halves, so gluing with "after" (not " after ") restores the original
# text instead of doubling the spaces.
rejoined = df_unique_time["description"] + "after" + time_point
# Rows without a time point (NaN) keep their description; plain string
# concatenation with NaN would turn the whole description into NaN.
df_unique_time["whole_description"] = rejoined.where(
    time_point.notna(), df_unique_time["description"]
)
df_joined_col = df_unique_time.drop(["description", "Time_point"], axis=1)
df_joined_col = df_joined_col.rename(columns={'whole_description': 'description'})
# Preview one random row of the final frame (after the rename).
df_joined_col.sample()

In [None]:
# Creating a list of indices to drop: every row whose description mentions
# "after". Those projects are handled separately through df_after.
# The match is case-insensitive so that it selects exactly the rows picked by
# the df_after filter; a case-sensitive test leaves e.g. "After ..." rows in
# df_no_after as well, and they end up twice once both frames are combined.
has_after = df['description'].str.contains('after', case=False, na=False)
indices_to_drop = df.index[has_after].tolist()
df_no_after = df.drop(indices_to_drop)

print(len(df_no_after.index))

In [None]:
# Stack the two partial frames: first the projects without a time point,
# then the de-duplicated time-point projects.
frame = [df_no_after, df_joined_col]
new_df = pd.concat(frame)
print(new_df.shape[0])

In [None]:
# Creating a second list of indices to drop: all 'Tara Oceans' projects.
# The labels are looked up in df, with a vectorised literal (non-regex),
# case-sensitive match instead of a row-by-row loop.
is_tara = df['description'].str.contains('Tara Oceans', regex=False, na=False)
indices_to_drop_second = df.index[is_tara].tolist()
# Some of these labels may no longer exist in new_df (for example a row that
# was removed as a duplicate time point); errors='ignore' skips them instead
# of raising KeyError.
new_df = new_df.drop(indices_to_drop_second, errors='ignore')
print(len(new_df.index))
print(new_df[:30])

In [None]:
# Write the filtered BioProjects to an Excel workbook, without the row index.
new_df.to_excel("ena_project_filtered.xlsx", index=False)

Getting the SRA run accessions available for each BioProject ID.

In [None]:
# Collect, for every BioProject accession, the list of SRA run accessions
# reported by pysradb. Each result row is [accession, [run ids...]].
accession_numbers = new_df['accession'].tolist()

data = []
failed_accessions = []  # accessions for which no run list could be obtained
success_count = 0
# One SRAweb client is enough for the whole loop; it does not depend on the
# accession being processed.
sradb = pysradb.SRAweb()
for accession_number in tqdm(accession_numbers, desc='Processing accession numbers'):
    try:
        # Retrieve metadata associated with the BioProject ID
        metadata = sradb.sra_metadata(accession_number)
        if metadata is None:
            # Nothing returned for this project: record it and move on.
            failed_accessions.append(accession_number)
            continue
        # Extract list of SRA IDs
        sra_ids = metadata['run_accession'].tolist()
    except Exception as e:
        # Best effort: one failing project must not abort the whole run, but
        # the failure is reported instead of being silently discarded.
        failed_accessions.append(accession_number)
        tqdm.write(f"Could not fetch SRA metadata for {accession_number}: {e!r}")
        continue
    data.append([accession_number, sra_ids])
    success_count += 1

print(
    f"Retrieved SRA runs for {success_count} of {len(accession_numbers)} "
    f"BioProjects ({len(failed_accessions)} without result)"
)

sra_df = pd.DataFrame(data, columns=['accession', 'sra_ids'])

In [None]:
# Save the BioProject -> SRA run table as an Excel workbook (row index omitted).
sra_df.to_excel("SRA_ids.xlsx", index=False)

Getting BioSample IDs

In [None]:
# Look up one BioSample ID per BioProject through NCBI ELink and store it in
# a new 'BioSample Id' column ("" when nothing could be retrieved).
sra_test = pd.read_excel('SRA_ids.xlsx')

ELINK_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"

biosample_ids = []
# Iterate the frame that receives the new column, so the result list is
# aligned with it row by row and the cell does not depend on sra_df.
for bioproject_id in tqdm(sra_test['accession'], desc='Processing BioProject IDs for BioSamples'):
    biosample_id = ""
    try:
        # NOTE(review): ELink documents `id` as a list of UIDs; confirm that
        # BioProject accessions (PRJ...) are resolved here, otherwise
        # translate them to UIDs with ESearch first.
        response = requests.get(
            ELINK_URL,
            params={"dbfrom": "bioproject", "db": "biosample", "id": bioproject_id},
            timeout=30,  # never hang forever on a stalled connection
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "xml")
        # In ELink XML the target database is a <DbTo> child element of
        # <LinkSetDb>, not an attribute, so it has to be compared by text.
        for link in soup.find_all("LinkSetDb"):
            db_to = link.find("DbTo")
            if db_to is None or db_to.text.strip() != "biosample":
                continue
            first_id = link.find("Id")
            if first_id is not None:
                # Only the first linked BioSample is kept (one value per row).
                biosample_id = first_id.text
                break
    except Exception as e:
        # Best effort: keep "" for this project, but report what went wrong.
        biosample_id = ""
        tqdm.write(f"Could not fetch BioSample for {bioproject_id}: {e!r}")
    biosample_ids.append(biosample_id)

sra_test['BioSample Id'] = biosample_ids
sra_test.to_csv('sra_metadata_with_biosample.csv', index=False)  # replace with the path where you want to save the updated metadata file


In [None]:
# Save the table including the BioSample column as an Excel workbook
# (row index omitted).
sra_test.to_excel("SRA_Biosample_tes1.xlsx", index=False)