In [None]:
import pandas as pd
from pathlib import Path

In [None]:
# Read the tab-separated taxa list (path is relative to the notebook's
# working directory) into a DataFrame.
taxa_list_path = Path("../data") / "taxa_list.tsv"
df = pd.read_csv(taxa_list_path, delimiter="\t")

In [None]:
# Preview the first rows of the taxa list as a quick sanity check of the load.
df.head()

In [None]:
%%bash
# Download and unpack the NCBI genome package for a single taxon.
#
# The cell is meant to be re-runnable: a step is skipped when its result is
# already on disk. All paths are relative to the notebook's working directory.
# Requires the NCBI `datasets` CLI and `unzip` on PATH.

set -euo pipefail

TAXON_ID="2171554"
RAW_DIR="../data/raw"
BASE_DIR="${RAW_DIR}/${TAXON_ID}"
ZIP_FILE="${RAW_DIR}/${TAXON_ID}.zip"
# The download is written under this temporary name and renamed only after
# `datasets` exits successfully, so an interrupted download can never be
# mistaken for a complete archive by the existence check below.
PARTIAL_NAME="${TAXON_ID}.partial.zip"

# Create directory for the taxon
mkdir -p "$BASE_DIR"

# Download genome zip only if it doesn't already exist
if [[ -f "$ZIP_FILE" ]]; then
    echo "[INFO] Zip file already exists: $ZIP_FILE — skipping download."
else
    echo "[INFO] Downloading genome for taxon $TAXON_ID..."
    (
        # Subshell keeps the `cd` from leaking into the rest of the cell.
        cd "$RAW_DIR"
        # Drop any leftover from a previously interrupted attempt.
        rm -f "$PARTIAL_NAME"
        datasets download genome taxon "$TAXON_ID" \
            --reference \
            --filename "$PARTIAL_NAME" \
            --include genome,gbff
        mv "$PARTIAL_NAME" "${TAXON_ID}.zip"
    )
fi

# Unzip only if the directory is empty
if [[ -n "$(ls -A "$BASE_DIR" 2>/dev/null)" ]]; then
    echo "[INFO] Target directory not empty: $BASE_DIR — skipping unzip."
else
    echo "[INFO] Extracting to $BASE_DIR..."
    unzip -d "$BASE_DIR" "$ZIP_FILE"
fi


In [None]:
from Bio import SeqIO


In [None]:
# Collect the qualifiers of every CDS feature in the GenBank flat file,
# keyed by the feature's locus_tag. For each qualifier stored as a list,
# only the first entry is kept.
# NOTE: if two CDS features share a locus_tag, the later one overwrites
# the earlier one.
feature_dict = {}

with open("../data/raw/2171554/ncbi_dataset/data/GCF_003073175.1/genomic.gbff") as handle:
    for rec in SeqIO.parse(handle, "genbank"):
        accession = rec.id
        for f in rec.features:
            if f.type != "CDS":
                continue
            q = f.qualifiers
            # A CDS without a locus_tag cannot be keyed. A missing key raises
            # KeyError (not TypeError), so look it up defensively, report the
            # feature and move on instead of aborting the whole parse.
            locus_tags = q.get("locus_tag")
            if not locus_tags:
                print(f"[WARN] CDS without locus_tag in record {accession}:")
                print(q)
                continue
            # Keep the first entry of each list-valued qualifier; non-list
            # values are skipped, and empty lists are ignored rather than
            # raising IndexError.
            feature_cds = {
                criteria: value[0]
                for criteria, value in q.items()
                if isinstance(value, list) and value
            }
            feature_dict[locus_tags[0]] = feature_cds

In [None]:
# Write the collected CDS qualifiers to a tab-separated file: after the
# transpose, each key of feature_dict is a row and each qualifier a column.
outdir = Path("../data/processed/")
outdir.mkdir(parents=True, exist_ok=True)

features_table = pd.DataFrame.from_dict(feature_dict).T
features_table.to_csv(outdir / "2171554.tsv", sep="\t")

In [None]:
# Scratch inspection of `f`, the loop variable left over from the parsing
# cell (i.e. the last feature visited, which is not necessarily a CDS).
# Only the last expression of a cell is displayed, so the three qualifiers
# are gathered into one dict; `.get` yields None instead of raising KeyError
# when the feature lacks a qualifier.
{key: f.qualifiers.get(key) for key in ("locus_tag", "product", "translation")}



In [None]:
# Show every qualifier of `f`. NOTE(review): `f` is not defined in this cell;
# it is the loop variable leaked by the parsing cell, so this only works
# after that cell has run and reflects the last feature it visited.
f.qualifiers