In [None]:
from Bio import Entrez
import pandas as pd

API_KEY = "" # Please extract api key from NCBI website (sign in and get access to API KEY) 
Entrez.email = '' # Mention the email address same as which is used to sign in NCBI.
Entrez.api_key = API_KEY

#### Nucleotide records search query

In [132]:
search_term = ("""Homo sapiens[ORGN] AND complete genome[TITLE] AND mitochondrion[FILT] AND 015400:016700[SLEN] 
               NOT (unverified OR Homo sp. Altai OR Denisova hominin OR neanderthalensis OR heidelbergensis OR consensus)""")

search_handle = Entrez.esearch(db= "nucleotide", 
                        term= search_term,
                        usehistory = "y",
                        retmax = 100000     # Total count was: 62173 so, to access all UIDs, set retmax to 100000
                        )
nucleotide_esearch_output = Entrez.read(search_handle)

#### Extracting linked BioSample Accession Ids based on nucleotide Ids. 

In [133]:
# Function to split a list into chunks
def chunk_list(lst, chunk_size):
    for i in range(0, len(lst), chunk_size):
        yield lst[i:i + chunk_size]
        
# Step 2: Link nucleotide IDs to BioSample IDs in batches
import time

bio_sample_results = []
biosample_ids = []
batch_size = 1000  # Adjust this to prevent long URL issues
i = 1
for batch in chunk_list(nucleotide_esearch_output["IdList"], batch_size):
    print(f"{i} batch is processing")
    try:
        link_handle = Entrez.elink(
            dbfrom="nucleotide",
            id=",".join(batch),
            db="biosample"
        )
        link_results = Entrez.read(link_handle)
        link_handle.close()

        bio_sample_results.append(link_results)
        print(f"{i} batch elink completed \n")
        i  = i+1
    except Exception as e:
        print(f"Error processing batch {batch}: {e}")


1 batch is processing
1 batch elink completed 

2 batch is processing
2 batch elink completed 

3 batch is processing
3 batch elink completed 

4 batch is processing
4 batch elink completed 

5 batch is processing
5 batch elink completed 

6 batch is processing
6 batch elink completed 

7 batch is processing
7 batch elink completed 

8 batch is processing
8 batch elink completed 

9 batch is processing
9 batch elink completed 

10 batch is processing
10 batch elink completed 

11 batch is processing
11 batch elink completed 

12 batch is processing
12 batch elink completed 

13 batch is processing
13 batch elink completed 

14 batch is processing
14 batch elink completed 

15 batch is processing
15 batch elink completed 

16 batch is processing
16 batch elink completed 

17 batch is processing
17 batch elink completed 

18 batch is processing
18 batch elink completed 

19 batch is processing
19 batch elink completed 

20 batch is processing
20 batch elink completed 

21 batch is proces

In [221]:
bio_sample_results

[[{'LinkSetDbHistory': [], 'ERROR': [], 'LinkSetDb': [{'Link': [{'Id': '3283347'}], 'DbTo': 'biosample', 'LinkName': 'nuccore_biosample'}], 'DbFrom': 'nuccore', 'IdList': ['2903184920', '2903184906', '2903184892', '2903184878', '2903184864', '2903184850', '2903184836', '2903184822', '2903184808', '2903184794', '2903184780', '2903184766', '2903184752', '2903184738', '2903184724', '2903184710', '2903184696', '2903184682', '2903184668', '2903184654', '2903184640', '2903184626', '2903184612', '2903184598', '2903184584', '2903184570', '2903184556', '2903184542', '2903184528', '2903184514', '2903184500', '2903184486', '2903184472', '2903184458', '2903184444', '2903184430', '2903184416', '2903184402', '2903184388', '2903184374', '2903184360', '2903184346', '2903184332', '2903184318', '2903184304', '2903184290', '2903184276', '2903184262', '2903184248', '2903184234', '2903184220', '2903184206', '2903184192', '2903184178', '2903184164', '2903184150', '2903184136', '2903184122', '2903184108', '2

##### Unique BioSample UIDs and Accession Ids associated with Nucleotide records

In [136]:
bio_sample_ids = []
for bio_sample_output in bio_sample_results:
    if(bio_sample_output[0]["LinkSetDb"] != []):
        for item in bio_sample_output[0]["LinkSetDb"][0]["Link"]:
            bio_sample_ids.append(item["Id"])
            

unique_bio_sample_ids = list(set(bio_sample_ids))
unique_bio_sample_ids

['4592681',
 '3283347',
 '4592679',
 '4592680',
 '4437172',
 '4592669',
 '4592682',
 '4592710',
 '15879935',
 '4592676',
 '4592714',
 '4592709',
 '3255769',
 '3955869',
 '4592678',
 '4592677']

In [None]:
biosample_df = pd.DataFrame(columns=["UID", "AccessionID"])
for uid in unique_bio_sample_ids:
    bio_sample_handle = Entrez.esummary(db = "biosample", id= uid)
    data = Entrez.read(bio_sample_handle)
    
    item = {
        "UID": uid,
        "AccessionID": data["DocumentSummarySet"]["DocumentSummary"][0]["Accession"],
    }
    biosample_df.loc[len(biosample_df)] = item
    
biosample_df

Unnamed: 0,UID,AccessionID
0,4592681,SAMN04592681
1,3283347,SAMN03283347
2,4592679,SAMN04592679
3,4592680,SAMN04592680
4,4437172,SAMN04437172
5,4592669,SAMN04592669
6,4592682,SAMN04592682
7,4592710,SAMN04592710
8,15879935,SAMD00243993
9,4592676,SAMN04592676


##### BioSample Records containing SRA records

In [139]:

sra_link_results_list = []
for bio_sample_id in bio_sample_ids:
    sra_link_handle = Entrez.elink(
        dbfrom="biosample",
        id=bio_sample_id,
        db="sra"
    )
    sra_link_results = Entrez.read(sra_link_handle)
    
    biosample_df.loc[biosample_df["UID"] == bio_sample_id, "HasSRARecord"] = (sra_link_results[0]["LinkSetDb"] != [])
    
    sra_link_results_list.append(sra_link_results)
    sra_link_handle.close()

In [140]:
sra_link_results_list

[[{'LinkSetDbHistory': [], 'ERROR': [], 'LinkSetDb': [{'Link': [{'Id': '35574709'}, {'Id': '35574708'}, {'Id': '32209633'}, {'Id': '32209632'}, {'Id': '32209631'}, {'Id': '32209630'}, {'Id': '32209629'}, {'Id': '32209628'}, {'Id': '32209627'}, {'Id': '32209626'}, {'Id': '32209625'}, {'Id': '32209624'}, {'Id': '32209623'}, {'Id': '32209622'}, {'Id': '32209621'}, {'Id': '32209620'}, {'Id': '32209619'}, {'Id': '31543732'}, {'Id': '30202934'}, {'Id': '30040525'}, {'Id': '27880816'}, {'Id': '27880815'}, {'Id': '27863613'}, {'Id': '27863612'}, {'Id': '27669440'}, {'Id': '27669439'}, {'Id': '27669438'}, {'Id': '27669411'}, {'Id': '26615891'}, {'Id': '21946299'}, {'Id': '21569896'}, {'Id': '21569895'}, {'Id': '21569894'}, {'Id': '20695292'}, {'Id': '20695291'}, {'Id': '20695290'}, {'Id': '20695289'}, {'Id': '20695288'}, {'Id': '20695287'}, {'Id': '20695286'}, {'Id': '20695285'}, {'Id': '20695284'}, {'Id': '20695283'}, {'Id': '20695282'}, {'Id': '20695281'}, {'Id': '20695280'}, {'Id': '20695279

In [141]:
biosample_df

Unnamed: 0,UID,AccessionID,HasSRARecord
0,4592681,SAMN04592681,False
1,3283347,SAMN03283347,True
2,4592679,SAMN04592679,False
3,4592680,SAMN04592680,False
4,4437172,SAMN04437172,False
5,4592669,SAMN04592669,False
6,4592682,SAMN04592682,False
7,4592710,SAMN04592710,False
8,15879935,SAMD00243993,False
9,4592676,SAMN04592676,False


Only two biosamples were found to have associated SRA records.

#### Data Validation for SRA IDs.

In [142]:
sra_link_handle = Entrez.elink(
    dbfrom="biosample",
    id=",".join(bio_sample_ids),
    db="sra"
)
sra_link_results = Entrez.read(sra_link_handle)
sra_link_handle.close()

In [143]:
sra_ids = []

for item in sra_link_results[0]["LinkSetDb"][0]["Link"]:
    sra_ids.append(item["Id"])


In [144]:
# Open the file in read mode
with open('sra_id.txt', 'r') as file:
    # Read all lines and split them based on newline
    uid_content = file.read()
    uid_list_from_query = uid_content.splitlines()


print(f"Total count: {len(uid_list_from_query)}")
print(f"Few sample records: {uid_list_from_query[:10]}")

Total count: 744
Few sample records: ['35574709', '35574708', '32209633', '32209632', '32209631', '32209630', '32209629', '32209628', '32209627', '32209626']


In [146]:
unique_ids = [item for item in sra_ids if item not in uid_list_from_query]
unique_ids

[]

#### Extracting BioProject records associated with Nucleotide Records

In [None]:
bio_project_results = []
batch_size = 100  # Adjust this to prevent long URL issues
i = 1
for batch in chunk_list(nucleotide_esearch_output["IdList"], batch_size):
    print(f"{i} batch is processing")
    try:
        link_handle = Entrez.elink(
            dbfrom="nucleotide",
            id=",".join(batch),
            db="bioproject"
        )
        link_results = Entrez.read(link_handle)
        link_handle.close()

        bio_project_results.append(link_results)
        print(f"{i} batch elink completed \n")
        i  = i+1
    except Exception as e:
        print(f"Error processing batch {batch}: {e}")


1 batch is processing
1 batch elink completed 

2 batch is processing
2 batch elink completed 

3 batch is processing
3 batch elink completed 

4 batch is processing
4 batch elink completed 

5 batch is processing
5 batch elink completed 

6 batch is processing
6 batch elink completed 

7 batch is processing
7 batch elink completed 

8 batch is processing
8 batch elink completed 

9 batch is processing
9 batch elink completed 

10 batch is processing
10 batch elink completed 

11 batch is processing
11 batch elink completed 

12 batch is processing
12 batch elink completed 

13 batch is processing
13 batch elink completed 

14 batch is processing
14 batch elink completed 

15 batch is processing
15 batch elink completed 

16 batch is processing
16 batch elink completed 

17 batch is processing
17 batch elink completed 

18 batch is processing
18 batch elink completed 

19 batch is processing
19 batch elink completed 

20 batch is processing
20 batch elink completed 

21 batch is proces

In [91]:
bio_project_results[:10]

[[{'LinkSetDbHistory': [], 'ERROR': [], 'LinkSetDb': [], 'DbFrom': 'nuccore', 'IdList': ['2903184920', '2903184906', '2903184892', '2903184878', '2903184864', '2903184850', '2903184836', '2903184822', '2903184808', '2903184794', '2903184780', '2903184766', '2903184752', '2903184738', '2903184724', '2903184710', '2903184696', '2903184682', '2903184668', '2903184654', '2903184640', '2903184626', '2903184612', '2903184598', '2903184584', '2903184570', '2903184556', '2903184542', '2903184528', '2903184514', '2903184500', '2903184486', '2903184472', '2903184458', '2903184444', '2903184430', '2903184416', '2903184402', '2903184388', '2903184374', '2903184360', '2903184346', '2903184332', '2903184318', '2903184304', '2903184290', '2903184276', '2903184262', '2903184248', '2903184234', '2903184220', '2903184206', '2903184192', '2903184178', '2903184164', '2903184150', '2903184136', '2903184122', '2903184108', '2903184094', '2903184080', '2903184066', '2903184052', '2903184038', '2903184024', '

In [None]:

bio_project_ids = []
for bio_project_result in bio_project_results:
    if(bio_project_result[0]["LinkSetDb"] != []):
        for link_set_db in bio_project_result[0]["LinkSetDb"]:
            for links in link_set_db["Link"]:
                    bio_project_ids.append(links["Id"])

In [54]:
unique_bio_project_ids = list(set(bio_project_ids))
unique_bio_project_ids

['291844', '927338', '727431', '559484', '658651']

In [None]:
bioproj_df = pd.DataFrame(columns=["UID", "AccessionID"])
for uid in unique_bio_project_ids:
    bioproj_handler = Entrez.efetch(db="bioproject", id= uid, rettype="acc", retmode="text")
    content = bioproj_handler.read()
    records = content.split("\n")
    for line in records:
        if "BioProject Accession" in line:
            item = {
                "UID": uid,
                "AccessionID": line.split(": ")[1]
            }
            bioproj_df.loc[len(bioproj_df)] = item

In [153]:
for uid in unique_bio_project_ids:
    sra_link = Entrez.elink(dbfrom="bioproject", id= uid, db="sra")
    content = Entrez.read(sra_link)
    bioproj_df.loc[bioproj_df["UID"] == uid, "HasSRARecord1"] = content[0]["LinkSetDb"] != []

In [154]:
bioproj_df

Unnamed: 0,UID,AccessionID,HasSRARecord1
0,291844,PRJNA291844,False
1,927338,PRJNA927338,False
2,727431,PRJNA727431,False
3,559484,PRJNA559484,True
4,658651,PRJDB10452,False


#### Reading extrected nucleotide metadata, SRA metadata

In [155]:
nucleotide_metadata = pd.read_csv("DATA_Nucleotide_Metadata.csv")
sra_metadata_bioproject = pd.read_csv("DATA_SRA_Metadata_linked_through_BioProject.csv")
sra_metadata_biosample = pd.read_csv("DATA_SRA_Metadata_linked_through_BioSample.csv")

In [156]:
nucleotide_metadata.head()

Unnamed: 0,UID,AccessionID,BioProject,BioSample,VERSION,ORGANISM,SEQ_LEN,CREATE_DATE,AUTHORS,TITLE,REFERENCE,TAXONOMY
0,,AP023485,PRJDB10452,SAMD00243993,1,Homo sapiens,16568,,"Takayama,J., Kinoshita,K., Yamamoto,M. and Tam...",Construction and Integration of Three De Novo ...,Unpublished,Eukaryota;Metazoa;Chordata;Craniata;Vertebrata...
1,,KX061457,PRJNA291844,SAMN04592714,1,Homo sapiens,16568,,"Ancora,M., Orsini,M., Colosimo,A., Russo,V., M...",Human mitochondrial heteroplasmy profiling in ...,Unpublished,Eukaryota;Metazoa;Chordata;Craniata;Vertebrata...
2,,KX061456,PRJNA291844,SAMN04592710,1,Homo sapiens,16568,,"Ancora,M., Orsini,M., Colosimo,A., Russo,V., M...",Human mitochondrial heteroplasmy profiling in ...,Unpublished,Eukaryota;Metazoa;Chordata;Craniata;Vertebrata...
3,,KX061455,PRJNA291844,SAMN04592709,1,Homo sapiens,16568,,"Ancora,M., Orsini,M., Colosimo,A., Russo,V., M...",Human mitochondrial heteroplasmy profiling in ...,Unpublished,Eukaryota;Metazoa;Chordata;Craniata;Vertebrata...
4,,KX061454,PRJNA291844,SAMN04592682,1,Homo sapiens,16568,,"Ancora,M., Orsini,M., Colosimo,A., Russo,V., M...",Human mitochondrial heteroplasmy profiling in ...,Unpublished,Eukaryota;Metazoa;Chordata;Craniata;Vertebrata...


In [157]:
sra_metadata_biosample.head()

Unnamed: 0,PRIMARY_ID,BioSample,BioProject,SRA_ID
0,SRX26348347,SAMN03283347,PRJNA200694,SRA1989932
1,SRX26348346,SAMN03283347,PRJNA200694,SRA1989932
2,SRX23904609,SAMN03283347,PRJNA731524,SRA1822549
3,SRX23904608,SAMN03283347,PRJNA731524,SRA1822549
4,SRX23904607,SAMN03283347,PRJNA731524,SRA1822549


In [160]:
sra_metadata_bioproject.head()

Unnamed: 0,PRIMARY_ID,BioSample,BioProject,SRA_ID
0,SRX14137770,SAMN03255769,PRJNA559484,SRA1369843
1,SRX14137769,SAMN03255769,PRJNA559484,SRA1369843
2,SRX14137768,SAMN03255769,PRJNA559484,SRA1369843
3,SRX14137767,SAMN03255769,PRJNA559484,SRA1369843
4,SRX14137766,SAMN03255769,PRJNA559484,SRA1369843


### Identifying association between nucleotide records, and SRA records through BioProject Id and BioSample Id

In [None]:
bioproj_value_count = (nucleotide_metadata["BioProject"].value_counts().reset_index())
# print(bioproj_value_count)
# print(nucleotide_metadata["BioSample"].value_counts().reset_index())

nucleotide_bioproject_sra_df = pd.DataFrame(columns= ["bioproject_id", "nucleotide_count", "sra_count"])
nucleotide_bioproject_sra_df["bioproject_id"] = bioproj_value_count["BioProject"]
nucleotide_bioproject_sra_df["nucleotide_count"] = bioproj_value_count["count"]

sra_bioproj_valuecount = (sra_metadata_bioproject["BioProject"].value_counts().reset_index())

for row in sra_bioproj_valuecount.itertuples():
    if(row.BioProject in nucleotide_bioproject_sra_df["bioproject_id"].values):
        nucleotide_bioproject_sra_df.loc[nucleotide_bioproject_sra_df["bioproject_id"] == row.BioProject, "sra_count"] = row.count
    else:
        nucleotide_bioproject_sra_df.loc[len(nucleotide_bioproject_sra_df)] = {
            "bioproject_id": row.BioProject,
            "nucleotide_count": 0,
            "sra_count": row.count
        }
        
nucleotide_bioproject_sra_df

Unnamed: 0,bioproject_id,nucleotide_count,sra_count
0,PRJNA291844,13,
1,PRJDB10452,1,
2,PRJNA559484,1,259.0
3,PRJNA727431,1,
4,PRJNA927338,1,


In [None]:
biosample_value_count = (nucleotide_metadata["BioSample"].value_counts().reset_index())

nucleotide_biosample_sra_df = pd.DataFrame(columns= ["biosample_id", "nucleotide_count", "sra_count"])
nucleotide_biosample_sra_df["biosample_id"] = biosample_value_count["BioSample"]
nucleotide_biosample_sra_df["nucleotide_count"] = bioproj_value_count["count"]

sra_biosample_valuecount = (sra_metadata_biosample["BioSample"].value_counts().reset_index())

for row in sra_biosample_valuecount.itertuples():
    if(row.BioSample in nucleotide_biosample_sra_df["biosample_id"].values):
        nucleotide_biosample_sra_df.loc[nucleotide_biosample_sra_df["biosample_id"] == row.BioSample, "sra_count"] = row.count
    else:
        nucleotide_biosample_sra_df.loc[len(nucleotide_biosample_sra_df)] = {
            "biosample_id": row.BioSample,
            "nucleotide_count": 0,
            "sra_count": row.count
        }
        
nucleotide_biosample_sra_df


Unnamed: 0,biosample_id,nucleotide_count,sra_count
0,SAMD00243993,13.0,
1,SAMN04592714,1.0,
2,SAMN04592710,1.0,
3,SAMN04592709,1.0,
4,SAMN04592682,1.0,
5,SAMN04592681,,
6,SAMN04592680,,
7,SAMN04592679,,
8,SAMN04592678,,
9,SAMN04592677,,


In [None]:
bioproj_value_count = (nucleotide_metadata["BioProject"].value_counts().reset_index())

nucleotide_bioproject_sra_df = pd.DataFrame(columns= ["bioproject_id", "nucleotide_count", "sra_count"])
nucleotide_bioproject_sra_df["bioproject_id"] = bioproj_value_count["BioProject"]
nucleotide_bioproject_sra_df["nucleotide_count"] = bioproj_value_count["count"]

sra_bioproj_valuecount = (sra_metadata_biosample["BioProject"].value_counts().reset_index())

for row in sra_bioproj_valuecount.itertuples():
    if(row.BioProject in nucleotide_bioproject_sra_df["bioproject_id"].values):
        nucleotide_bioproject_sra_df.loc[nucleotide_bioproject_sra_df["bioproject_id"] == row.BioProject, "sra_count"] = row.count
    else:
        nucleotide_bioproject_sra_df.loc[len(nucleotide_bioproject_sra_df)] = {
            "bioproject_id": row.BioProject,
            "nucleotide_count": 0,
            "sra_count": row.count
        }
        
nucleotide_bioproject_sra_df

Unnamed: 0,bioproject_id,nucleotide_count,sra_count
0,PRJNA291844,13,
1,PRJDB10452,1,
2,PRJNA559484,1,259.0
3,PRJNA727431,1,
4,PRJNA927338,1,
5,PRJNA200694,0,251.0
6,PRJNA335618,0,110.0
7,PRJNA731524,0,48.0
8,PRJNA269593,0,18.0
9,PRJNA558394,0,9.0
