# In [None]:
# Biopython to parse fasta files
from Bio import SeqIO
# pandas is for dataframes
import pandas as pd
# ete3 to convert from lineage to taxid (and vice versa)
from ete3 import NCBITaxa
ncbi = NCBITaxa()
# Uncomment to refresh the local NCBI taxonomy database before the lookups below.
# ncbi.update_taxonomy_database()

### convert each family name in the metadata to its NCBI taxid(s) ###
# load mgv metadata (tab-separated; the 'ictv_family' and 'contig_id' columns are used)
mgv_metadata = pd.read_csv(str(snakemake.input.metadata), sep='\t')

# Rename 'crAss-phage' to 'uncultured crAssphage' before the name lookup
# (presumably the spelling NCBI knows for this group -- confirm if the taxonomy changes).
# A vectorised replace avoids one Python-level call per metadata row.
mgv_metadata['ictv_family'] = mgv_metadata['ictv_family'].replace(
    'crAss-phage', 'uncultured crAssphage'
)

# Distinct family names with missing values dropped explicitly. Slicing a
# list built from a set ([1:]) is not a reliable way to skip NaN: set order
# is arbitrary, so it could discard a real family and keep the NaN instead.
mgv_families = mgv_metadata['ictv_family'].dropna().unique().tolist()

# Translate family names to taxids. Per the ete3 documentation,
# get_name_translator returns {name: [taxid, ...]} and omits names it cannot
# resolve, so the per-name results can be merged straight into one dict.
taxids_dict = {}
for family in mgv_families:
    taxids_dict.update(ncbi.get_name_translator([family]))

# Rows whose family is missing or was not resolved get NaN in 'taxid'.
mgv_metadata['taxid'] = mgv_metadata['ictv_family'].map(taxids_dict)

# Look up each contig's taxid by contig id. Only the 'taxid' column is
# converted, instead of building a dict for every metadata column.
contig_to_taxid = mgv_metadata.set_index('contig_id')['taxid'].to_dict()


def _format_taxid(taxid):
    """Return the text written after '|kraken:taxid|' for one metadata value.

    The 'taxid' column is expected to hold a list of taxids per contig (or NaN
    when no taxid was found). A list is reduced to its first entry so the
    sequence id never contains ', ' when a name resolved to several taxids.
    """
    if isinstance(taxid, (list, tuple)):
        return str(taxid[0]) if taxid else ''
    # NOTE(review): a missing taxid (NaN) is still written as the literal
    # 'nan', as before -- confirm the downstream database build tolerates it.
    return str(taxid)


def _tagged_records(records):
    """Yield each record with '|kraken:taxid|<taxid>' appended to its id."""
    for record in records:
        # A contig that is absent from the metadata raises KeyError here.
        taxid_text = _format_taxid(contig_to_taxid[record.id])
        record.id = f"{record.id}|kraken:taxid|{taxid_text}"
        yield record


# Parse the fasta file and write the retagged sequences out. Records are
# streamed from the parser to the writer, so the whole genome collection is
# never held in memory at once.
SeqIO.write(
    _tagged_records(SeqIO.parse(str(snakemake.input.genomes), "fasta")),
    str(snakemake.output),
    "fasta",
)