In [None]:
import pandas as pd
import requests
import os
from pathlib import Path
import gzip
import shutil

# Make sure one output folder exists per library before anything is written
for library_dir in ('viral', 'bacteria', 'archaea'):
    os.makedirs(library_dir, exist_ok=True)

# Load the link table and keep only rows that carry a non-blank URL
df = pd.read_csv("standard_fasta_files_link.tsv", sep='\t')
has_url = df['URL'].notna()
url_not_blank = df['URL'].str.strip() != ''
df = df[has_url & url_not_blank]

# Extract sequence ID
def extract_sequence_id(seq_name):
    """Return the first whitespace-delimited token of a sequence name.

    Args:
        seq_name: A value from the 'Sequence Name' column. May be a string
            or a missing value (None / NaN).

    Returns:
        The leading token as a string, or None when the value is missing,
        empty, or consists only of whitespace.
    """
    if pd.isna(seq_name):
        return None
    # split() on an empty/blank string yields [], so indexing [0] directly
    # would raise IndexError; treat such names as missing instead.
    tokens = str(seq_name).split()
    return tokens[0] if tokens else None

# Derive a per-row 'sequence_id' column from 'Sequence Name'; process_downloads
# later uses it to build the .fna.gz / .fna file names.
df['sequence_id'] = df['Sequence Name'].apply(extract_sequence_id)

# Download helper
def download_file(url, output_path, base_url='https://ftp.ncbi.nlm.nih.gov/', timeout=60):
    """Stream a remote file to disk, reporting success instead of raising.

    Args:
        url: Absolute http(s) URL, or a path relative to ``base_url``.
        output_path: Destination file path.
        base_url: Prefix applied when ``url`` is not an absolute http(s) URL.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        True when the file was fully written to ``output_path``; False on any
        failure (the error is printed). On failure no file is left at
        ``output_path`` by this call.
    """
    # Download into a sibling temp file so an interrupted transfer never
    # leaves a truncated file that looks like a finished download.
    partial_path = f"{output_path}.part"
    try:
        if not url.startswith(('http://', 'https://')):
            # Normalise the slash between prefix and path to avoid '//' or a
            # missing separator.
            url = base_url.rstrip('/') + '/' + url.lstrip('/')

        print(f"Downloading: {url}")
        # The context manager releases the connection even if streaming fails.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)

        os.replace(partial_path, output_path)
        print(f"✓ Downloaded: {output_path}")
        return True
    except Exception as e:
        # Deliberately broad: this is the best-effort boundary for a long
        # batch run, so one bad URL must not abort the remaining downloads.
        print(f"✗ Error downloading {url}: {e}")
        try:
            os.remove(partial_path)
        except OSError:
            pass  # nothing was written, or it is already gone
        return False

# Extract helper
def extract_gz(gz_path, output_path):
    """Decompress .gz → .fna and remove .gz

    The data is first written to ``<output_path>.part`` and renamed into
    place only after decompression succeeds, so a corrupt or truncated
    archive never leaves a partial file at ``output_path``.

    Args:
        gz_path: Path of the gzip-compressed input file.
        output_path: Path the decompressed data is written to.

    Returns:
        True when the file was extracted and the archive removed; False on
        any failure (the error is printed and the archive is kept).
    """
    partial_path = f"{output_path}.part"
    try:
        with gzip.open(gz_path, 'rb') as f_in, open(partial_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(partial_path, output_path)
        # Only drop the archive once the extracted file is safely in place.
        os.remove(gz_path)
        print(f"✓ Extracted: {output_path}")
        return True
    except Exception as e:
        # Deliberately broad: extraction is best-effort inside a batch loop.
        print(f"✗ Error extracting {gz_path}: {e}")
        try:
            os.remove(partial_path)
        except OSError:
            pass  # temp file was never created or is already gone
        return False

# Take at most the first 1000 rows of each library
library_column = df['Library']
viral_df = df.loc[library_column == 'viral'].head(1000)
archaea_df = df.loc[library_column == 'archaea'].head(1000)
bacteria_df = df.loc[library_column == 'bacteria'].head(1000)

# Report how many rows were selected per library
for library_label, library_rows in (
    ('viral', viral_df),
    ('archaea', archaea_df),
    ('bacteria', bacteria_df),
):
    print(f"Found {len(library_rows)} {library_label} sequences")
print("\n" + "="*60)

# Download + extract per category
def process_downloads(df_subset, folder_name):
    """Download and decompress every file listed in one library subset.

    Args:
        df_subset: DataFrame with 'sequence_id' and 'URL' columns.
        folder_name: Directory the files are written to; also used
            (upper-cased) in the progress banner.

    Each row is saved as ``<folder_name>/<sequence_id>.fna.gz`` and then
    extracted to ``<folder_name>/<sequence_id>.fna``. Rows without a
    sequence ID are skipped: they would otherwise all be written to
    ``<folder_name>/None.fna`` and overwrite one another.
    """
    print(f"\n### DOWNLOADING {folder_name.upper()} FILES ###")
    for seq_id, url in zip(df_subset['sequence_id'], df_subset['URL']):
        if pd.isna(seq_id):
            print(f"✗ Skipping row without a sequence ID: {url}")
            continue
        gz_path = f"{folder_name}/{seq_id}.fna.gz"
        fna_path = f"{folder_name}/{seq_id}.fna"
        # Only try to extract when the download reported success.
        if download_file(url, gz_path):
            extract_gz(gz_path, fna_path)

# Fetch each library in turn: viral, then archaea, then bacteria
for library_rows, library_dir in (
    (viral_df, "viral"),
    (archaea_df, "archaea"),
    (bacteria_df, "bacteria"),
):
    process_downloads(library_rows, library_dir)

# Closing summary of where the extracted files were written
print("\n" + "="*60)
print("Download and extraction complete!")
print("Extracted files saved as:")
for library_dir in ("viral", "archaea", "bacteria"):
    print(f"  ./{library_dir}/*.fna")

Found 1000 viral sequences
Found 1000 archaea sequences
Found 1000 bacteria sequences


### DOWNLOADING VIRAL FILES ###
Downloading: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/018/574/705/GCF_018574705.1_ASM1857470v1/GCF_018574705.1_ASM1857470v1_genomic.fna.gz
✓ Downloaded: viral/NC_073812.1.fna.gz
✓ Extracted: viral/NC_073812.1.fna
Downloading: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/023/141/495/GCF_023141495.1_ASM2314149v1/GCF_023141495.1_ASM2314149v1_genomic.fna.gz
✓ Downloaded: viral/NC_076816.1.fna.gz
✓ Extracted: viral/NC_076816.1.fna
Downloading: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/609/485/GCF_002609485.1_ASM260948v1/GCF_002609485.1_ASM260948v1_genomic.fna.gz
✓ Downloaded: viral/NC_047763.1.fna.gz
✓ Extracted: viral/NC_047763.1.fna
Downloading: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/937/255/GCF_002937255.1_ASM293725v1/GCF_002937255.1_ASM293725v1_genomic.fna.gz
✓ Downloaded: viral/NC_037062.1.fna.gz
✓ Extracted: viral/NC_037062.1.fna
Downloading: htt