### Create a custom DADA2 taxonomy database based on the Berry 16S dataset.

See the [DADA2 docs](https://benjjneb.github.io/dada2/training.html) for more details.

In [1]:
from Bio import SeqIO
import pandas as pd
import pytaxonkit

In [2]:
# Read the taxid list; the file has no header row, so the IDs end up in column 0.
taxids = pd.read_csv("../data/refdb/taxids_fish_only.txt", header=None)

# Look up a lineage for every taxid, then tidy the result into a table that
# can be indexed by taxid:
#   * rows without a "Lineage" value are dropped,
#   * only the first row per "TaxID" is kept, so the index is unique,
#   * "TaxID" becomes the index.
lineages = (
    pytaxonkit.lineage(taxids[0].to_list())
    .dropna(subset="Lineage")
    .drop_duplicates(subset=["TaxID"])
    .set_index("TaxID")
)

In [3]:
records = SeqIO.parse("../data/refdb/db_vnew_fish_only.fasta", "fasta")


def get_taxid_from_seq(seq):
    """Return the integer taxid embedded in a record's FASTA description.

    The value is the text between the last ``merged_taxid={`` marker and the
    first ``:`` that follows it, converted with ``int``.

    Raises:
        ValueError: if that text is not a valid integer (for example when the
            description carries no ``merged_taxid={`` annotation).
    """
    return int(seq.description.split("merged_taxid={")[-1].split(":")[0])


taxonomy_fasta = "../data/refdb/db_vnew_fish_only_dada2_custom.fasta"
species_fasta = "../data/refdb/db_vnew_fish_only_dada2_custom_sp.fasta"

# Each input record is written twice, once per output file, with a different
# header each time.
with open(taxonomy_fasta, "w") as f, open(species_fasta, "w") as fsp:
    for rec in records:
        taxid = get_taxid_from_seq(rec)

        # Look the lineage up once; it feeds both headers. Taxids that have no
        # row in `lineages` would otherwise fail with a bare integer KeyError,
        # so say which record is affected.
        try:
            lineage = lineages.loc[taxid, "Lineage"]
        except KeyError:
            raise KeyError(
                f"no lineage available for taxid {taxid} "
                f"(record {rec.id!r}); it is missing from `lineages`"
            ) from None

        rec.seq = rec.seq.upper()

        # Taxonomy file: id is the ';'-terminated lineage with spaces turned
        # into underscores, description is the taxid.
        rec.id = lineage.replace(" ", "_") + ";"
        rec.description = str(taxid)
        SeqIO.write(rec, f, "fasta")

        # Species file: id is the taxid, description is the last lineage entry.
        rec.id = str(taxid)
        rec.description = lineage.split(";")[-1]
        SeqIO.write(rec, fsp, "fasta")