In [1]:
import os
import pandas as pd
import numpy as np
from Bio import SeqIO
import json
from pathlib import Path

In [2]:
# Reference metadata table; the first spreadsheet column is used as the index.
reference_db_xlsx = '/home/tobamo/analize/project-tobamo/analysis/data/tobamo/reference_database.xlsx'
db = pd.read_excel(reference_db_xlsx, index_col=0)

# Reference nucleotide sequences, keyed by FASTA record ID.
reference_nt_fasta = '/home/tobamo/analize/project-tobamo/analysis/data/tobamo/reference_nukleotidne.fasta'
ref_fasta = SeqIO.to_dict(SeqIO.parse(reference_nt_fasta, 'fasta'))

In [17]:
# Inspect the fourth entry of the outgroup record-ID list.
# NOTE(review): in the visible notebook `outgroup` is only assigned in a later cell,
# so this cell raises NameError when the notebook is run top-to-bottom on a fresh kernel.
outgroup[3]

'AJ132577.1_Soil-borne_cereal_mosaic_virus_RNA2_complete_genome'

In [None]:
# Record IDs per database 'type' label.
is_tobamo = db['type'] == 'tobamo'
is_outgroup = db['type'] == 'outgroup'
tobamo = db.loc[is_tobamo, 'record_id'].tolist()
outgroup = db.loc[is_outgroup, 'record_id'].tolist()

# Tobamo IDs extended with a single outgroup record (the second one in the list).
tobamo_plus = tobamo + [outgroup[1]]

# Subsets of the reference FASTA dictionary restricted to the selected IDs.
tobamo_ref_fasta = {
    record_id: record
    for record_id, record in ref_fasta.items()
    if record_id in tobamo
}
tobamo_ref_fasta_plus_1og = {
    record_id: record
    for record_id, record in ref_fasta.items()
    if record_id in tobamo_plus
}

62

In [None]:
# Write each reference subset to its own FASTA file (tobamo only, then tobamo + 1 outgroup).
for subset_fasta_path, subset_records in (
    ('/home/tobamo/analize/project-tobamo/analysis/phylogenetic_placement/data/reference_nukleotidne_tobamo.fasta', tobamo_ref_fasta),
    ('/home/tobamo/analize/project-tobamo/analysis/phylogenetic_placement/data/reference_nukleotidne_tobamo_plus_1og.fasta', tobamo_ref_fasta_plus_1og),
):
    with open(subset_fasta_path, 'w') as fasta_handle:
        SeqIO.write(subset_records.values(), fasta_handle, 'fasta')

In [29]:
def create_simple_mapper(fasta_path, output_fasta, mapper_json, prefix="ref"):
    """Rename FASTA records to short sequential IDs and save the ID mapping as JSON.

    Records are read from ``fasta_path`` in file order and renamed to
    ``f"{prefix}{idx:03d}"`` with ``idx`` starting at 1 (``ref001``, ``ref002``, ...).
    The renamed records are written to ``output_fasta`` with an empty description;
    the original ID and description are kept only in the mapper.

    Parameters
    ----------
    fasta_path : str or Path
        Input FASTA file.
    output_fasta : str or Path
        Destination of the renamed FASTA; parent directories are created if missing.
    mapper_json : str or Path
        Destination of the mapper JSON; parent directories are created if missing.
    prefix : str, default "ref"
        Prefix placed in front of the zero-padded running number.

    Returns
    -------
    dict
        The structure that is also written to ``mapper_json``:
        ``"simple_to_original"`` (simple ID -> original ID and description),
        ``"original_to_simple"`` (original ID -> simple ID) and ``"metadata"``.

    Notes
    -----
    If several input records share the same original ID, ``"original_to_simple"``
    can hold only one simple ID per original ID (the last one wins). A warning is
    printed in that case; ``"simple_to_original"`` is always complete.
    """
    fasta_path = Path(fasta_path)
    output_fasta = Path(output_fasta)
    mapper_json = Path(mapper_json)

    # Create output directories if needed
    output_fasta.parent.mkdir(parents=True, exist_ok=True)
    mapper_json.parent.mkdir(parents=True, exist_ok=True)

    print(f"Reading sequences from: {fasta_path}")
    records = list(SeqIO.parse(fasta_path, "fasta"))
    print(f"Found {len(records)} sequences")

    mapper = {}
    renamed_records = []
    for idx, record in enumerate(records, start=1):
        simple_id = f"{prefix}{idx:03d}"

        # The original header is preserved here, not in the renamed FASTA.
        mapper[simple_id] = {
            "original_id": record.id,
            "original_description": record.description
        }

        # Work on a copy so the parsed record itself is left untouched.
        new_record = record[:]
        new_record.id = simple_id
        new_record.name = simple_id
        new_record.description = ""  # header becomes just the simple ID
        renamed_records.append(new_record)

    # Write renamed FASTA
    print(f"Writing renamed FASTA to: {output_fasta}")
    SeqIO.write(renamed_records, output_fasta, "fasta")

    # Reverse lookup (original ID -> simple ID); track IDs that occur more than once,
    # because a dict can keep only one simple ID for each of them.
    reverse_mapper = {}
    duplicate_ids = []
    for simple_id, entry in mapper.items():
        original_id = entry["original_id"]
        if original_id in reverse_mapper:
            duplicate_ids.append(original_id)
        reverse_mapper[original_id] = simple_id

    if duplicate_ids:
        print(
            f"WARNING: {len(duplicate_ids)} duplicate original ID(s) found "
            f"(e.g. {duplicate_ids[0]!r}); 'original_to_simple' keeps only the last "
            "simple ID for each of them"
        )

    full_mapper = {
        "simple_to_original": mapper,
        "original_to_simple": reverse_mapper,
        "metadata": {
            "total_sequences": len(records),
            "prefix": prefix,
            "input_file": str(fasta_path),
            "output_file": str(output_fasta)
        }
    }

    # Write mapper JSON
    print(f"Writing mapper to: {mapper_json}")
    with open(mapper_json, 'w') as f:
        json.dump(full_mapper, f, indent=2)

    print("\n✓ Success!")
    print(f"  - Renamed {len(records)} sequences")
    print(f"  - Output FASTA: {output_fasta}")
    print(f"  - Mapper JSON: {mapper_json}")

    return full_mapper

In [31]:
# Input FASTA (tobamo references plus one outgroup) and the two output locations
reference_fasta = "/home/tobamo/analize/project-tobamo/analysis/phylogenetic_placement/data/reference_nukleotidne_tobamo_plus_1og.fasta"
renamed_reference = "../data/tobamo_plus_1og_reference_renamed.fasta"
mapper_file = "../data/tobamo_plus_1og_reference_mapper.json"

# Rename the records and build the ID mapper
mapper = create_simple_mapper(reference_fasta, renamed_reference, mapper_file, prefix="ref")

# Preview the first two mappings
print("\nExample mappings:")
simple_to_original = mapper["simple_to_original"]
for simple_id, entry in list(simple_to_original.items())[:2]:
    original = entry["original_id"]
    print(f"  {simple_id} → {original}")

Reading sequences from: /home/tobamo/analize/project-tobamo/analysis/phylogenetic_placement/data/reference_nukleotidne_tobamo_plus_1og.fasta
Found 62 sequences
Writing renamed FASTA to: ../data/tobamo_plus_1og_reference_renamed.fasta
Writing mapper to: ../data/tobamo_plus_1og_reference_mapper.json

✓ Success!
  - Renamed 62 sequences
  - Output FASTA: ../data/tobamo_plus_1og_reference_renamed.fasta
  - Mapper JSON: ../data/tobamo_plus_1og_reference_mapper.json

Example mappings:
  ref001 → AJ132579.1_Oat_golden_stripe_virus_RNA2_complete_genome
  ref002 → AB015145.1_Kyuri_green_mottle_mosaic_virus_genomic_RNA,_complete_sequence_of_strain_Yodo


### align references (for reference tree)

In [None]:
# Align the renamed reference sequences; the alignment is written under results3/
mafft --auto --reorder \
    /home/tobamo/analize/project-tobamo/analysis/phylogenetic_placement/data/tobamo_plus_1og_reference_renamed.fasta \
    > results3/mafft_aligned_tobamo_plus_1og_refs_renamed.fasta

In [None]:
# Alternative: align using PaPaRa
# TODO: there are issues with the PaPaRa settings that still need to be fixed

### make reference tree

In [None]:
# Build the reference tree from the MAFFT reference alignment;
# output files use the prefix results3/iqtree/ref_aligned
iqtree3 \
    -s /home/tobamo/analize/project-tobamo/analysis/phylogenetic_placement/epa-ng/results3/mafft_aligned_tobamo_plus_1og_refs_renamed.fasta \
    -m MFP \
    -bb 1000 \
    -T 38 \
    --prefix results3/iqtree/ref_aligned

### align references + query contigs (to get contig alignments file)

In [None]:
# Make a combined FASTA file (done by hand)

In [None]:
# Add the query contigs to the existing reference alignment.
# --keeplength keeps the reference alignment columns fixed (insertions that occur only
# in the added sequences are dropped), so the combined alignment has the same length as
# the reference alignment the tree was built from.
# NOTE: the #fail / #success remarks below were recorded for runs WITHOUT --keeplength.
mafft --thread 32 --keeplength --addfragments ../data/tob2_contigs.fasta results3/mafft_aligned_tobamo_plus_1og_refs_renamed.fasta > results3/ref_plus_query_tob2_add-fragments.fasta #fail
mafft --thread 32 --keeplength --add ../data/tob2_contigs_selected4.fasta results3/mafft_aligned_tobamo_plus_1og_refs_renamed.fasta > results3/ref_plus_query_tob2_add_selected4.fasta #success

# issue with results3/iqtree/ref_aligned & results3/ref_plus_query_tob2_add_selected4.fasta (not the same length!)
# -> most likely caused by columns inserted during --add; --keeplength above is meant to prevent this (re-check the lengths after re-running)

In [None]:
# Second attempt: run plain `mafft --auto` on the combined FASTA
mafft --auto \
    /home/tobamo/analize/project-tobamo/analysis/phylogenetic_placement/data/ref_test_contigs_combined.fasta \
    > results3/mafft_aligned_ref_test_contigs_combined.fasta

# check alignment length
# keep query contigs
# try epa-ng


In [None]:
# Filter the combined alignment so that only the query sequences are kept (done by hand).
# File to filter: results3/ref_plus_query_tob2_add_selected4.fasta

In [None]:
# Place the queries onto the reference tree with epa-ng; results go to results3/epa-ng
epa-ng \
    --ref-msa results3/mafft_aligned_tobamo_plus_1og_refs_renamed.fasta \
    --tree results3/iqtree/ref_aligned.treefile \
    --model results3/iqtree/ref_aligned.iqtree \
    --query results3/ref_plus_query_tob2_add_selected4.fasta \
    --threads 32 \
    --redo \
    --outdir results3/epa-ng