In [None]:
## Generates an XML for continuous phylogeography for ONE alignment, given an alignment FASTA, a lat/long file, empirical trees,
## and a template XML

In [3]:
import os
import re
import csv
from Bio import SeqIO


# === Input Files ===
fasta_file = "test_looped_xml_gen/alignment/Wisconsin_64_alignment_allWIdt_w_meta.fasta"
# Alternative: derive the lat/long table path from the FASTA path.
# tsv_file = fasta_file.replace(".fasta", ".tsv")
tsv_file = "test_looped_xml_gen/latlong_tsv/Wisconsin_64_allWIdt.tsv"
template_file = "test_looped_xml_gen/contphyl_emptre_template.xml"
# Swap only the final extension. str.replace(".fasta", ".xml") rewrites every
# ".fasta" in the path and returns the path unchanged when the extension is
# something else (e.g. ".fa"), in which case the final write step would
# overwrite the alignment itself.
output_file = os.path.splitext(fasta_file)[0] + ".xml"
if os.path.normpath(output_file) == os.path.normpath(fasta_file):
    raise ValueError(f"Output path would overwrite the input alignment: {fasta_file}")

# === Prefix for empirical tree ===
# Alternative: derive the tree file name from the FASTA prefix.
# prefix = fasta_file.split("_all")[0]
# tree_filename = f"{prefix}_emptre_500.trees"
tree_filename = "Wisconsin_64_alignment_allWIdt_w_meta.trees"

# === Load taxa from FASTA ===
# records keeps the parsed sequences; taxa keeps their ids in file order.
records = list(SeqIO.parse(fasta_file, "fasta"))
taxa = [record.id for record in records]

# === Load location data ===
# The TSV is read as headerless, with three tab-separated columns:
# taxon id, latitude, longitude. The date is taken from the second
# "|"-delimited field of the taxon id.
location_data = {}
with open(tsv_file, newline='') as tsvfile:
    reader = csv.DictReader(tsvfile, delimiter='\t', fieldnames=['taxon_id', 'latitude', 'longitude'])
    for row in reader:
        taxon_id = row['taxon_id']
        try:
            decimal_date = taxon_id.split("|")[1]
        except IndexError:
            # Ids without a "|" (e.g. a header line) land here and are skipped.
            print(f"Warning: Invalid taxon_id format: {taxon_id}")
            continue
        latitude = row["latitude"]
        longitude = row["longitude"]
        # DictReader fills columns missing from a short row with None; without
        # this check the literal text "None" would be written as a coordinate.
        if not (latitude and latitude.strip()) or not (longitude and longitude.strip()):
            print(f"Warning: Missing latitude/longitude for {taxon_id}")
            continue
        location_data[taxon_id] = {
            "decimal_date": decimal_date,
            "latitude": latitude,
            "longitude": longitude
        }

# === Build <taxa> block ===
# One <taxon> element is emitted per FASTA id that has an entry in
# location_data; ids without one are reported and left out.
taxa_block = ['\t<taxa id="taxa">']
for taxon in taxa:
    if taxon not in location_data:
        print(f"Warning: No location found for {taxon}")
        continue
    meta = location_data[taxon]
    lat = meta["latitude"]
    lon = meta["longitude"]
    taxa_block.extend([
        f'\t\t<taxon id="{taxon}">',
        f'\t\t\t<date value="{meta["decimal_date"]}" direction="forwards" units="years"/>',
        f'\t\t\t<attr name="LATITUDE">\n\t\t\t\t{lat}\n\t\t\t</attr>',
        f'\t\t\t<attr name="LONGITUDE">\n\t\t\t\t{lon}\n\t\t\t</attr>',
        '\t\t\t<!-- START Multivariate diffusion model -->',
        # Combined "latitude longitude" pair.
        f'\t\t\t<attr name="location">\n\t\t\t\t{lat} {lon}\n\t\t\t</attr>',
        '\t\t\t<!-- END Multivariate diffusion model -->',
        '\t\t</taxon>',
    ])
taxa_block.append('\t</taxa>')
taxa_block_str = "\n".join(taxa_block)

# === Build <alignment> block ===
# Only sequences whose taxon has an entry in location_data are written. Taxa
# without one are left out of the <taxa> block, so emitting their sequences
# would leave <taxon idref="..."/> pointing at an id that is never defined.
alignment_block = ['\t<alignment id="alignment" dataType="nucleotide">']
skipped_sequences = 0
for record in records:
    if record.id not in location_data:
        skipped_sequences += 1
        continue
    alignment_block.extend([
        '\t\t<sequence>',
        f'\t\t\t<taxon idref="{record.id}"/>',
        f'\t\t\t{str(record.seq)}',
        '\t\t</sequence>',
    ])
alignment_block.append('\t</alignment>')
alignment_block_str = "\n".join(alignment_block)
if skipped_sequences:
    print(f"Warning: {skipped_sequences} sequence(s) left out of the alignment (no location data)")

# === Load XML Template ===
with open(template_file, 'r') as f:
    template_xml = f.read()

# The replacements below are passed as functions, not strings. A replacement
# string is parsed for backslash escapes and group references, so "\1" followed
# by a file name that starts with a digit is read as a different escape
# (e.g. "\1" + "64.trees" becomes "\164"), and a backslash anywhere in the
# inserted text would be misinterpreted. A function's return value is inserted
# verbatim.

# === Replace <taxa> block ===
template_xml, n_taxa_subs = re.subn(
    r'<taxa id="taxa">.*?</taxa>',
    lambda _match: taxa_block_str,
    template_xml,
    flags=re.DOTALL
)
if n_taxa_subs == 0:
    print('Warning: no <taxa id="taxa"> block found in template; taxa not inserted')

# === Replace empirical tree fileName ===
template_xml, n_tree_subs = re.subn(
    r'(<empiricalTreeDistributionModel id="treeModel" fileName=")[^"]+(")',
    lambda match: f'{match.group(1)}{tree_filename}{match.group(2)}',
    template_xml
)
if n_tree_subs == 0:
    print('Warning: no empiricalTreeDistributionModel fileName found in template; tree file not set')

# === Replace <alignment> block ===
template_xml, n_alignment_subs = re.subn(
    r'<alignment id="alignment" dataType="nucleotide">.*?</alignment>',
    lambda _match: alignment_block_str,
    template_xml,
    flags=re.DOTALL
)
if n_alignment_subs == 0:
    print('Warning: no <alignment id="alignment"> block found in template; alignment not inserted')

# === Replace taxa number ===

# Count only the records that have location data: records without a location
# are skipped when the <taxa> block is built, so len(records) can overstate
# the number of taxa actually written.
ntax = sum(1 for record in records if record.id in location_data)
# Rewrites every "ntax=<digits>" occurrence in the template text.
template_xml = re.sub(r'ntax=\d+', f'ntax={ntax}', template_xml)


# === Write output ===
# Save the filled-in template, then report what was produced.
with open(output_file, 'w') as xml_out:
    xml_out.write(template_xml)

for message in (
    f"XML written to {output_file}",
    f"Empirical tree set to: {tree_filename}",
    f"Alignment and taxa updated based on {fasta_file}",
):
    print(message)


✅ XML written to test_looped_xml_gen/alignment/Wisconsin_64_alignment_allWIdt_w_meta.xml
🔁 Empirical tree set to: Wisconsin_64_alignment_allWIdt_w_meta.trees
🔃 Alignment and taxa updated based on test_looped_xml_gen/alignment/Wisconsin_64_alignment_allWIdt_w_meta.fasta
