In [1]:
from Bio import Entrez
import mpu
from urllib.error import HTTPError
import pandas as pd

In [2]:
# Location of the combined results CSV
file_path = '/home/tobamo/analize/project-tobamo/results/megan6_results_combined.csv'

# Read the CSV (first column becomes the index) and discard rows that have no 'nr_sseqid'
cleaned_df = pd.read_csv(file_path, index_col=0).dropna(subset=['nr_sseqid'])

# Distinct 'nr_sseqid' values, each reduced to the text before its first dot
nr_sseqid = [
    str(raw_id).partition('.')[0]
    for raw_id in cleaned_df['nr_sseqid'].unique()
]

In [4]:
def fetch_taxonomies_in_batch(accession_dict, db, batch_size):
    """Fetch taxonomy lineages for one batch of still-unresolved accessions.

    Entries of ``accession_dict`` whose value is the empty string are treated
    as pending. The first ``batch_size`` pending accessions are requested with
    a single ``Entrez.efetch`` call, and each returned record's
    ``GBSeq_taxonomy`` is stored under its ``GBSeq_primary-accession``.

    Args:
        accession_dict: Mapping of accession -> taxonomy string (``""`` while
            not fetched yet). It is updated in place.
        db: Name of the Entrez database to query (e.g. ``"protein"``).
        batch_size: Number of accessions to request in this call, 1 to 100.

    Returns:
        The same ``accession_dict`` object, with fetched taxonomies filled in.

    Raises:
        ValueError: If ``batch_size`` is greater than 100 or smaller than 1.

    Note:
        Request and parsing errors are reported with ``print`` and otherwise
        swallowed (best effort), so a call may leave every pending entry
        unchanged. A caller that loops until nothing is pending must guard
        against rounds that make no progress.
    """
    if batch_size > 100:
        raise ValueError("Batch size exceeds the limit of 100.")
    if batch_size < 1:
        # A negative size would turn the slice below into "all but the last n"
        # and silently bypass the limit; zero would request nothing at all.
        raise ValueError("Batch size must be at least 1.")

    accession_list = [key for key, val in accession_dict.items() if val == ""]
    batch_accessions = accession_list[:batch_size]
    if not batch_accessions:
        # Nothing is pending: skip the request instead of sending an empty id list.
        return accession_dict

    handle = None
    try:
        handle = Entrez.efetch(db=db, id=batch_accessions, retmode='xml')
        records = Entrez.read(handle)
        for record in records:
            try:
                accession = record["GBSeq_primary-accession"]
                taxonomy = record["GBSeq_taxonomy"]
            except KeyError as e:
                # One incomplete record must not discard the rest of the batch.
                print(f"Record skipped, missing field: {e}")
                continue
            accession_dict[accession] = taxonomy
    except HTTPError as e:
        print(f"HTTP Error: {e}")
    except Exception as e:
        print(f"Error occurred: {e}")
    finally:
        # Release the connection even when reading or parsing failed.
        if handle is not None:
            handle.close()

    return accession_dict

In [None]:
output_name = 'accession_dict'
Entrez.email = "neza.pajekarambasic@fri.uni-lj.si"

# Give up after this many consecutive batches that resolve nothing, so that
# accessions which never resolve (or a persistent request failure) cannot
# keep the loop running forever.
MAX_STALLED_ROUNDS = 3

# Resume from the JSON cache when it exists; otherwise start from scratch.
try:
    accession_dict = mpu.io.read(f'{output_name}.json')
except FileNotFoundError:
    accession_dict = {}

# Make sure every accession from the current input is tracked, even when the
# cache was written by an earlier run over a different set of accessions.
for accession in nr_sseqid:
    accession_dict.setdefault(accession, "")

# Fetch taxonomies batch by batch until every value is filled or progress stalls.
stalled_rounds = 0
while True:
    pending_before = sum(1 for taxonomy in accession_dict.values() if taxonomy == "")
    if pending_before == 0:
        break

    accession_dict = fetch_taxonomies_in_batch(accession_dict, "protein", 100) # batch_size <= 100
    # Persist after every batch so an interrupted run can pick up where it stopped.
    mpu.io.write(f'{output_name}.json', accession_dict)

    pending_after = sum(1 for taxonomy in accession_dict.values() if taxonomy == "")
    if pending_after < pending_before:
        stalled_rounds = 0
        continue

    stalled_rounds += 1
    if stalled_rounds >= MAX_STALLED_ROUNDS:
        print(
            f"Stopping: {pending_after} accession(s) still unresolved after "
            f"{MAX_STALLED_ROUNDS} consecutive batches without progress."
        )
        break