In [1]:
import pandas as pd
from pathlib import Path

In [2]:
df = pd.read_csv("../data/taxa_list.tsv", sep="\t")

In [3]:
df.head()

Unnamed: 0,spesies,accession,link,keterangan,tax_id
0,Peribacillus acanthi,NZ_QBBX00000000,"Peribacillus acanthi strain L28, whole genome ...",whole genome shotgun,2171554.0
1,Planctomicrobium piriforme,NZ_FOQD00000000,"Planctomicrobium piriforme strain DSM 26348, w...",whole genome shotgun,1576369.0
2,Hyphomicrobium album,NZ_WMBQ00000000,"Hyphomicrobium album strain XQ2, whole genome ...",whole genome shotgun,
3,Motilibacter rhizosphaera,-,-,,
4,Thiorhodospira sibirica,NZ_AGFD00000000,"Thiorhodospira sibirica ATCC 700588, whole gen...",whole genome shotgun,


In [4]:
%%bash
#!/usr/bin/env bash

set -euo pipefail

TAXON_ID="2171554"
BASE_DIR="../data/raw/${TAXON_ID}"
ZIP_FILE="../data/raw/${TAXON_ID}.zip"

# Create directory for the taxon
mkdir -p "$BASE_DIR"

# Download genome zip only if it doesn't already exist
if [[ -f "$ZIP_FILE" ]]; then
    echo "[INFO] Zip file already exists: $ZIP_FILE — skipping download."
else
    echo "[INFO] Downloading genome for taxon $TAXON_ID..."
    (
        cd ../data/raw
        datasets download genome taxon "$TAXON_ID" \
            --reference \
            --filename "${TAXON_ID}.zip" \
            --include genome,gbff
    )
fi

# Unzip only if the directory is empty
if [[ -n "$(ls -A "$BASE_DIR" 2>/dev/null)" ]]; then
    echo "[INFO] Target directory not empty: $BASE_DIR — skipping unzip."
else
    echo "[INFO] Extracting to $BASE_DIR..."
    (
        unzip -d "$BASE_DIR" ../data/raw/"${TAXON_ID}.zip"
    )
fi


[INFO] Downloading genome for taxon 2171554...


Collecting 1 genome record [------------------------------------------------]   0% 0/1
[1A[2KCollecting 1 genome record [------------------------------------------------]   0% 0/1
[1A[2KCollecting 1 genome record [------------------------------------------------]   0% 0/1
[1A[2KCollecting 1 genome record [------------------------------------------------]   0% 0/1
[1A[2KCollecting 1 genome record [------------------------------------------------]   0% 0/1
[1A[2KCollecting 1 genome record [------------------------------------------------]   0% 0/1
[1A[2KCollecting 1 genome record [------------------------------------------------]   0% 0/1
[1A[2KCollecting 1 genome record [------------------------------------------------]   0% 0/1
[1A[2KCollecting 1 genome record [------------------------------------------------]   0% 0/1
[1A[2KCollecting 1 genome record [------------------------------------------------]   0% 0/1
[1A[2KCollecting 1 genome record [----------------------

[INFO] Extracting to ../data/raw/2171554...
Archive:  ../data/raw/2171554.zip
  inflating: ../data/raw/2171554/README.md  
  inflating: ../data/raw/2171554/ncbi_dataset/data/assembly_data_report.jsonl  
  inflating: ../data/raw/2171554/ncbi_dataset/data/GCF_003073175.1/genomic.gbff  
  inflating: ../data/raw/2171554/ncbi_dataset/data/GCF_003073175.1/GCF_003073175.1_ASM307317v1_genomic.fna  
  inflating: ../data/raw/2171554/ncbi_dataset/data/dataset_catalog.json  
  inflating: ../data/raw/2171554/md5sum.txt  


In [None]:
from Bio import SeqIO


In [None]:
feature_dict = {}

with open("../data/raw/2171554/ncbi_dataset/data/GCF_003073175.1/genomic.gbff") as handle:
    for rec in SeqIO.parse(handle, "genbank"):
        accession = rec.id
        for f in rec.features:
            if f.type not in {"CDS"}:
                continue
            else:
                q = f.qualifiers
                feature_cds = {}
                for criteria, value in q.items():
                    if type(value) == list:
                        feature_cds[criteria] = value[0]
                try:
                    feature_dict[q["locus_tag"][0]] = feature_cds
                except TypeError:
                    print(q)

In [None]:
outdir = Path("../data/processed/")
outdir.mkdir(exist_ok=True, parents=True)
pd.DataFrame.from_dict(feature_dict).T.to_csv(outdir / "2171554.tsv", sep="\t")

In [None]:
f.qualifiers['locus_tag']
f.qualifiers['product']
f.qualifiers['translation']



In [None]:
f.qualifiers