### Download FASTA and GFF files through RdRp numbers.

##### Import all libraries.

In [6]:
import time
from Bio import Entrez
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
import xlrd

##### Set the email address and API key for the Entrez API

In [7]:
# Read NCBI credentials from the environment instead of hardcoding them, so
# the API key is not leaked whenever this notebook is shared or committed.
# NOTE(review): the key that used to be stored on this line should be treated
# as exposed and regenerated in the NCBI account settings.
# ENTREZ_EMAIL overrides the contact address sent with each E-utilities request.
Entrez.email = os.environ.get("ENTREZ_EMAIL", "yash.panwar@aeratx.com")
# When NCBI_API_KEY is unset this is None, i.e. requests are sent without a
# key (NCBI then applies its lower, anonymous rate limit).
Entrez.api_key = os.environ.get("NCBI_API_KEY")

##### Read the RDRP numbers from an Excel file

In [8]:
# Open the spreadsheet and take its first worksheet.
workbook = xlrd.open_workbook('virus_list.xls')
sheet = workbook.sheet_by_index(0)
# Column 0 holds the identifiers; [1:] skips the first row (presumably a
# header row -- confirm against the spreadsheet) and keeps ALL remaining rows.
# Blank cells are dropped here because the download loop converts every value
# with int(), which would fail on an empty string.
rdrp_numbers = [
    value for value in sheet.col_values(0)[1:]
    if str(value).strip()
]

##### Download the FASTA and GFF files for each RDRP number

In [9]:
# Records are grouped into fixed-size batches for progress reporting only;
# each record still issues its own esearch/efetch requests.
batch_size = 200

# Create both output directories once, up front, rather than once per record.
fasta_dir = os.path.join(os.getcwd(), "downloads", "Fasta_files")
gff_dir = os.path.join(os.getcwd(), "downloads", "Gff_files")
os.makedirs(fasta_dir, exist_ok=True)
os.makedirs(gff_dir, exist_ok=True)

for i in range(0, len(rdrp_numbers), batch_size):
    batch = rdrp_numbers[i:i+batch_size]
    # Derive the 1-based batch number from the constant step size. The step
    # must not be overwritten with len(batch): a short final batch would then
    # report the wrong number, and true division would print it as a float.
    batch_number = i // batch_size + 1
    print(f"Processing batch {batch_number} with {len(batch)} records")

    for rdrp_num in batch:
        # Excel stores numbers as floats (e.g. 752455814.0); strip the decimals.
        rdrp_num = str(int(rdrp_num))
        print(f"Processing record {rdrp_num}...")

        # Look up the protein record for this RdRp number.
        search_handle = Entrez.esearch(db="protein", term=rdrp_num, retmax=1)
        try:
            search_results = Entrez.read(search_handle)
        finally:
            search_handle.close()

        if not search_results["IdList"]:
            # Report misses explicitly instead of skipping them silently.
            print(f"No protein record found for {rdrp_num}, skipping")
            continue
        protein_id = search_results["IdList"][0]

        # Fetch the GenBank-format record and re-emit its sequence as FASTA,
        # using the RdRp number as the sequence id.
        fetch_handle = Entrez.efetch(db="protein", id=protein_id, rettype="gb", retmode="text")
        try:
            record = SeqIO.read(fetch_handle, "genbank")
        finally:
            # Always release the HTTP connection, even if parsing fails.
            fetch_handle.close()

        fasta_path = os.path.join(fasta_dir, f"{rdrp_num}.fasta")
        new_seq_record = SeqRecord(Seq(str(record.seq)), id=rdrp_num, description=record.description)
        SeqIO.write(new_seq_record, fasta_path, "fasta")
        print(f"Downloaded {fasta_path}")

        # Fetch the annotation for the same record and save the response as-is.
        # NOTE(review): "gff3" could not be confirmed as a supported efetch
        # rettype for db="protein"; verify that the saved files really contain
        # GFF3 and not an error page or a fallback format.
        gff_handle = Entrez.efetch(db="protein", id=protein_id, rettype="gff3", retmode="text")
        try:
            gff_data = gff_handle.read()
        finally:
            gff_handle.close()

        gff_path = os.path.join(gff_dir, f"{rdrp_num}.gff")
        with open(gff_path, "w") as f:
            f.write(gff_data)
        print(f"Downloaded {gff_path}")
    print(f"Batch {batch_number} complete")

Processing batch 1.0 with 200 records
Processing record 752455814...
Downloaded /home/jupyter/Viral Meta/test 3/FINAL script/downloads/Fasta_files/752455814.fasta
Downloaded /home/jupyter/Viral Meta/test 3/FINAL script/downloads/Gff_files/752455814.gff
Processing record 998454037...
Downloaded /home/jupyter/Viral Meta/test 3/FINAL script/downloads/Fasta_files/998454037.fasta
Downloaded /home/jupyter/Viral Meta/test 3/FINAL script/downloads/Gff_files/998454037.gff
Processing record 1124091243...
Downloaded /home/jupyter/Viral Meta/test 3/FINAL script/downloads/Fasta_files/1124091243.fasta
Downloaded /home/jupyter/Viral Meta/test 3/FINAL script/downloads/Gff_files/1124091243.gff
Processing record 391738071...
Downloaded /home/jupyter/Viral Meta/test 3/FINAL script/downloads/Fasta_files/391738071.fasta
Downloaded /home/jupyter/Viral Meta/test 3/FINAL script/downloads/Gff_files/391738071.gff
Processing record 633266568...
Downloaded /home/jupyter/Viral Meta/test 3/FINAL script/downloads/Fa

KeyboardInterrupt: 