In [1]:
from Bio import SeqIO

Let's try again, from scratch

A1. Collect reference sequences (done)

A2. Align the references

In [None]:
%%bash
# Align the reference sequences (inputs/refs.fasta) with MAFFT and store the
# reference MSA in outputs/mafft_ref_aligned.fasta.
# The %%bash cell magic is required: without it the Python kernel tries to
# parse the command as Python and the cell fails on Restart & Run All.
mkdir -p outputs   # the redirect below fails if the directory is missing
mafft --auto --reorder inputs/refs.fasta > outputs/mafft_ref_aligned.fasta

A3. build the reference tree

In [None]:
%%bash
# Build the reference tree with IQ-TREE from the reference MSA. All result
# files are written with the prefix outputs/iqtree/ref_aligned.
# The %%bash cell magic makes the cell runnable from the Python kernel.
# NOTE(review): -T 38 hard-codes the thread count of one machine; adjust it
# when running elsewhere.
mkdir -p outputs/iqtree   # make sure the prefix directory exists
iqtree3 \
    -s outputs/mafft_ref_aligned.fasta \
    -m MFP \
    -bb 1000 \
    -T 38 \
    --prefix outputs/iqtree/ref_aligned

B1.1 Align Query Sequences to the Reference w/ MAFFT or HMMER

In [None]:
%%bash
# Add the contigs (inputs/contigs.fasta) to the existing reference MSA and
# write the combined alignment to outputs/contigs_aligned.fasta.
# No --keeplength here, so the result may be wider than the reference MSA.
# The %%bash cell magic makes the cell runnable from the Python kernel.
mafft --auto --addfragments inputs/contigs.fasta --reorder \
    outputs/mafft_ref_aligned.fasta > outputs/contigs_aligned.fasta

In [None]:
%%bash
# Print the aligned length of the first non-empty record in the combined
# alignment (sequence lines of a record are joined before measuring).
# Only the END rule prints: awk still runs END after `exit`, so printing in
# both the header rule and END emits the length twice for multi-record files.
# %%bash hands the cell to bash verbatim, so the kernel never parses the
# braces and $0 of the awk program.
awk '
    /^>/ { if (seq != "") exit; next }   # next header reached: first record is complete
    { seq = seq $0 }                     # join wrapped sequence lines
    END { if (seq != "") print length(seq) }
' outputs/contigs_aligned.fasta

B1.2. Try with the --keeplength flag

In [None]:
# Debug log for `mafft --addfragments ... --keeplength`: each experiment is
# followed by a comment recording what was observed when it was run.

# all contigs
mafft --addfragments inputs/contigs.fasta --reorder --keeplength outputs/mafft_ref_aligned.fasta > outputs/contigs_aligned_keeplength.fasta
# Observed error (copied verbatim from the terminal):
# /home/tobamo/miniconda3/envs/phylo_placement/bin/mafft: line 2842: 3993494 Segmentation fault      (core dumped) "$prefix/pairlocalalign" $localparam $addarg -C $numthreads $seqtype $model -g $lexp -f $lgop -Q $spfactor -h $laof -Y $usenaivepairscore < infile > /dev/null 2>> "$progressfile"

# test one contig (FASTA output, then the same run in CLUSTAL format for inspection)
mafft --addfragments inputs/contigs_n1.fasta --reorder --keeplength outputs/mafft_ref_aligned.fasta > outputs/test/n1.fasta
mafft --addfragments inputs/contigs_n1.fasta --reorder --keeplength --clustalout outputs/mafft_ref_aligned.fasta > outputs/test/n1.clustal
# works fine

# test one contig without the --keeplength flag (FASTA and CLUSTAL output)
mafft --addfragments inputs/contigs_n1.fasta --reorder outputs/mafft_ref_aligned.fasta > outputs/test/n1_noflag.fasta
mafft --addfragments inputs/contigs_n1.fasta --reorder --clustalout outputs/mafft_ref_aligned.fasta > outputs/test/n1_noflag.clustal
# works fine

# test two contigs
mafft --addfragments inputs/contigs_n2.fasta --reorder --keeplength outputs/mafft_ref_aligned.fasta > outputs/test/n2.fasta
# segmentation fault

# test ten contigs
mafft --addfragments inputs/contigs_n10.fasta --reorder --keeplength outputs/mafft_ref_aligned.fasta > outputs/test/n10.fasta
# segmentation fault



In [3]:
from Bio import AlignIO

In [18]:
# Show every row of the single-contig alignment produced with --keeplength:
# first six characters of the record id, a tab, then the aligned sequence.
with open("outputs/test/n1.clustal") as clustal_handle:
    aln_fasta = AlignIO.read(clustal_handle, "clustal")
for record in aln_fasta:
    print(f"{record.id[:6]}\t{record.seq}")

ref001	----------------------gtattttcttttcacacagttca--------agcgttagcttcggacga-cg------------tggagactacctttcttaccaggataagaaataa-----------------------------------------------------------------------------------------------------------gatattttctgggttctgcaacaaacgttgtggaaattcaaaattgttgtcctagg---------------------------------------------------------------------------------------------------cctgccacggcaataaatcctaggcgtctcagttttggtttttcg-----------------------------------------c------------------------------------tacgttagttgttaccacgctgcgctggataagccacttaagttgaagggcgatatccatca------gtcgttgcaggcagctagttaaatcaccaaggagagtact-------------------------------------------------------------------gagg--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ctttggagaagctcgttactacggtcttaccaaag-------atgcctgttacaccgggttacac--------------------------------------------------------------

In [19]:
# Show every row of the single-contig alignment produced without --keeplength:
# first six characters of the record id, a tab, then the aligned sequence.
with open("outputs/test/n1_noflag.clustal") as clustal_handle:
    aln_fasta = AlignIO.read(clustal_handle, "clustal")
for record in aln_fasta:
    print(f"{record.id[:6]}\t{record.seq}")

ref001	----------------------gtattttcttttcacacagttca--------agcgttagcttcggacga-cg------------tggagactacctttcttaccaggataagaaataa-----------------------------------------------------------------------------------------------------------gatattttctgggttctgcaacaaacgttgtggaaattcaaaattgttgtcctagg---------------------------------------------------------------------------------------------------cctgccacggcaataaatcctaggcgtctcagttttggtttttcg-----------------------------------------c------------------------------------tacgttagttgttaccacgctgcgctggataagccacttaagttgaagggcgatatccatca------gtcgttgcaggcagctagttaaatcaccaaggagagtact-------------------------------------------------------------------gagg--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ctttggagaagctcgttactacggtcttaccaaag-------atgcctgttacaccgggttacac--------------------------------------------------------------

### compare approaches

In [20]:
# Print the final row of each single-contig alignment (without and with
# --keeplength) so the two results can be compared side by side.
# NOTE(review): this assumes the added contig is the last row; both files were
# produced with --reorder, so confirm that the order really holds.
for path in ("outputs/test/n1_noflag.clustal", "outputs/test/n1.clustal"):
    with open(path) as clustal_handle:
        aln = AlignIO.read(clustal_handle, "clustal")
    rec = aln[len(aln) - 1]  # last record
    print(f"{path}:")
    print(f"{rec.id[:6]}\t{rec.seq}")

outputs/test/n1_noflag.clustal:
NODE_5	-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [12]:
# memory issues or input issues?
# try smaller chunks so that the run does not crash because of memory
# check the --add / --addfragments defaults when combined with --keeplength

In [None]:
%%bash
# Print the aligned length of the first non-empty record in the --keeplength
# alignment (sequence lines of a record are joined before measuring).
# Only the END rule prints: awk still runs END after `exit`, so printing in
# both the header rule and END emits the length twice for multi-record files.
# %%bash hands the cell to bash verbatim, so the kernel never parses the
# braces and $0 of the awk program.
awk '
    /^>/ { if (seq != "") exit; next }   # next header reached: first record is complete
    { seq = seq $0 }                     # join wrapped sequence lines
    END { if (seq != "") print length(seq) }
' outputs/contigs_aligned_keeplength.fasta

B2. Filter aligned queries (drop references)

We keep only the contig/query sequences from the MAFFT keeplength output (`outputs/contigs_aligned_keeplength.fasta`). This guarantees the query alignment width matches the reference MSA and can be used as `--query` for EPA-NG. The result is saved as `outputs/queries_aligned.fasta`.

In [6]:
# Identifier of every query contig in the raw (unaligned) input, in file order.
# The filtering cell uses these ids to tell queries apart from references.
contig_ids = [
    query.id
    for query in SeqIO.parse('inputs/contigs.fasta', 'fasta')
]

In [None]:
# Keep only the query contigs from the combined MAFFT --keeplength alignment
# and write them to outputs/queries_aligned.fasta.
# Membership is tested against a set: checking `record.id in contig_ids` on the
# list rescans the whole list for every record of the alignment.
contig_id_set = set(contig_ids)
with open('outputs/contigs_aligned_keeplength.fasta') as fin, open('outputs/queries_aligned.fasta', 'w') as fout:
    records = [record for record in SeqIO.parse(fin, 'fasta') if record.id in contig_id_set]
    SeqIO.write(records, fout, 'fasta')

In [None]:
from Bio import SeqIO

def aln_len(fp) -> int:
    """Return the alignment width of a FASTA file, read from its first record.

    Only the first record is inspected; the other records are assumed to have
    the same length and are not checked here.

    Parameters
    ----------
    fp
        Path (or open handle) of a FASTA file, passed straight to
        ``SeqIO.parse``.

    Returns
    -------
    int
        Number of characters in the first record's sequence as stored in the
        file.

    Raises
    ------
    ValueError
        If the file contains no FASTA records.
    """
    # A bare next() on an empty file raises an opaque StopIteration; use a
    # sentinel so the failure names the offending file.
    first = next(SeqIO.parse(fp, 'fasta'), None)
    if first is None:
        raise ValueError(f"No FASTA records found in {fp!r}")
    return len(first.seq)

# Width of the reference MSA (read from its first record).
ref_len = aln_len('outputs/mafft_ref_aligned.fasta')

# Every distinct aligned length found among the filtered query records.
q_lengths = set(
    len(query.seq)
    for query in SeqIO.parse('outputs/queries_aligned.fasta', 'fasta')
)

print(f"Reference alignment length: {ref_len}")
print(f"Unique query alignment lengths: {sorted(q_lengths)[:5]}{'...' if len(q_lengths) > 5 else ''}")

# All queries must share a single width, and it must equal the reference width.
assert ref_len in q_lengths and len(q_lengths) == 1, f"Width mismatch: ref={ref_len}, queries={sorted(q_lengths)}"

In [None]:
# After filtering from the keeplength alignment, the query and reference widths should match.
# If this assertion still fails, inspect inputs and consider batching or sanitizing sequences.

C1. Run the Placement w/ pplacer or epa-ng

In [None]:
%%bash
# Place the aligned queries on the reference tree with EPA-ng, using the
# reference MSA, the IQ-TREE tree file and the IQ-TREE report as model source.
# Results are written to outputs/epa_output; --redo is passed so an earlier
# result in that directory does not block the run.
# The %%bash cell magic makes the cell runnable from the Python kernel.
mkdir -p outputs/epa_output   # make sure the output directory exists
epa-ng \
    --ref-msa outputs/mafft_ref_aligned.fasta \
    --tree outputs/iqtree/ref_aligned.treefile \
    --query outputs/queries_aligned.fasta \
    --model outputs/iqtree/ref_aligned.iqtree \
    -w outputs/epa_output \
    --redo

In [None]:
# Debug runs of EPA-ng on the single-contig test alignments.

# test with n1
# NOTE(review): outputs/test/n1_filtered.fasta is not created by any cell
# visible in this notebook - presumably n1.fasta with the references removed;
# confirm how it was produced.
# NOTE(review): this writes into outputs/epa_output with --redo, the same
# directory the full placement run uses, so it likely replaces those results.
epa-ng --ref-msa outputs/mafft_ref_aligned.fasta --tree outputs/iqtree/ref_aligned.treefile --query outputs/test/n1_filtered.fasta --model outputs/iqtree/ref_aligned.iqtree -w outputs/epa_output --redo

# test with n1 noflag (alignment made without --keeplength)
epa-ng --ref-msa outputs/mafft_ref_aligned.fasta --tree outputs/iqtree/ref_aligned.treefile --query outputs/test/n1_noflag.fasta --model outputs/iqtree/ref_aligned.iqtree -w outputs/epa_output_noflag --redo
# observed: fails because the query and reference alignments are not the same length

In [None]:
# Width mismatch should now be resolved by using the keeplength-based query file (`outputs/queries_aligned.fasta`).

### DEBUG

D1. Visualize or analyze the placements with tools like gappa or guppy