In [2]:
import pandas as pd
from Bio import SeqIO

In [12]:
# Read both FASTA files completely into memory as lists of SeqRecord objects:
# the representative contigs and the forward-oriented copies of them.
contigs = [
    *SeqIO.parse(
        "/home/tobamo/analize/project-tobamo/analysis/phylogenetic_placement/data/representative_contigs.fasta",
        "fasta",
    )
]
forward_contigs = [
    *SeqIO.parse(
        "/home/tobamo/analize/project-tobamo/analysis/phylogenetic_placement/data/representative_contigs_forward.fasta",
        "fasta",
    )
]

In [16]:
# Step 1: Check if all forward_contigs IDs are substrings of contigs IDs
contig_ids = [record.id for record in contigs]
forward_ids = [record.id for record in forward_contigs]

# Mapping from truncated (forward) ID to the full contig ID that contains it
contig_id_map = {}
# Forward IDs that are not contained in any contig ID
unmapped = []
# Forward IDs contained in more than one contig ID -> all candidate full IDs
ambiguous = {}

for forward_id in forward_ids:
    # Collect EVERY contig ID containing this forward ID. Stopping at the first
    # hit would hide ambiguous substring matches (one short ID contained in
    # several longer IDs), so the "multiple contigs" warning could never fire.
    matches = [contig_id for contig_id in contig_ids if forward_id in contig_id]

    if not matches:
        unmapped.append(forward_id)
        continue

    if len(matches) > 1:
        ambiguous[forward_id] = matches
        print(f"⚠️ Warning: {forward_id} matches multiple contigs!")
        for candidate in matches:
            print(f"    candidate: {candidate}")

    # As before, the first matching contig (in FASTA order) is used
    contig_id_map[forward_id] = matches[0]

# Report results
print(f"Total forward contigs: {len(forward_ids)}")
print(f"Successfully mapped: {len(contig_id_map)}")
print(f"Unmapped: {len(unmapped)}")
if ambiguous:
    print(f"Ambiguous (first match used): {len(ambiguous)}")

if unmapped:
    print(f"\n⚠️ Warning: {len(unmapped)} forward contig IDs are NOT substrings of any contig ID:")
    for uid in unmapped[:10]:  # Show first 10
        print(f"  - {uid}")
    if len(unmapped) > 10:
        print(f"  ... and {len(unmapped) - 10} more")
else:
    print("✓ All forward_contigs IDs are substrings of contigs IDs")

    # Step 2: Update forward_contigs with full names.
    # This branch only runs when nothing is unmapped, so every record ID has an
    # entry in contig_id_map. The records are renamed in place.
    for record in forward_contigs:
        record.id = contig_id_map[record.id]
        # Blank the description so the FASTA header carries only the new ID
        record.description = ''

    # Save updated fasta
    output_file = "/home/tobamo/analize/project-tobamo/analysis/phylogenetic_placement/data/representative_contigs_forward_fullnames.fasta"
    SeqIO.write(forward_contigs, output_file, 'fasta')
    print("\n✓ Forward contigs updated with full names")
    print(f"✓ New fasta file saved to: {output_file}")

Total forward contigs: 71
Successfully mapped: 37
Unmapped: 34

  - NODE_48752
  - NODE_14605
  - NODE_64879
  - NODE_72146
  - NODE_89351
  - NODE_28410
  - NODE_15726
  - NODE_26446
  - NODE_2985
  - NODE_12041
  ... and 24 more


In [10]:
# Orientation check: for every original contig, test whether its sequence is
# present among the forward contigs as-is, or only as its reverse complement.
# forward_seqs is a set, so identical forward sequences collapse to one entry
# and len(forward_seqs) below counts distinct sequences.
forward_seqs = {rec.seq for rec in forward_contigs}
original_seqs = [rec.seq for rec in contigs]

# Tally how many original contigs were found in each orientation
forward_count = 0
reverse_count = 0

for seq in original_seqs:
    if seq in forward_seqs:
        forward_count += 1
        continue
    # Only consider the reverse complement when the sequence itself was absent
    if seq.reverse_complement() in forward_seqs:
        reverse_count += 1

# Summary; contigs found in neither orientation are not counted in "Total compared"
print(f"Forward orientation: {forward_count}")
print(f"Reverse orientation: {reverse_count}")
print(f"Total compared: {forward_count + reverse_count}")
print(f"Total original contigs: {len(original_seqs)}")
print(f"Total forward contigs: {len(forward_seqs)}")

Forward orientation: 0
Reverse orientation: 37
Total compared: 37
Total original contigs: 57
Total forward contigs: 71


In [6]:
# Map each forward contig ID to its sequence, in FASTA order, and display it.
# NOTE(review): `forward_dict` is not defined in any cell visible here, so this
# cell fails with NameError on a fresh kernel. The definition below is
# reconstructed from the saved output (record IDs as keys, Seq objects as
# values) — confirm it matches the original intent.
forward_dict = {record.id: record.seq for record in forward_contigs}
forward_dict

{'NODE_3764': Seq('CCTTGTGGCAATAAGTGTCTGCTCCTCACTTATCCCCTATGGATCGACAACGTA...CCT'),
 'NODE_9460': Seq('ATTCTTAAAGAAATATGGGTATTTCTGTGGTAAATACGTGATCGTCCACTCTAC...GCC'),
 'NODE_26511': Seq('GGCCTATTCATTTGCAAGAAACTTCTCAGAATTCTGGATGGAAACATTTCATTC...AGC'),
 'NODE_28458': Seq('CAAAAATTCTATGAGGGCGGAAAATACACCCTCGTCTTCAAGAGGGAGGGATCT...ACG'),
 'NODE_31983': Seq('GCTGTTTATCCTGATTTAGAGAAGAAGAAAAATAGCTACTTTGCGCTCAAAAGC...CTG'),
 'NODE_48752': Seq('AAGGCCACGGACAAAGACCTAAAACATTATGTTTTAGCGGCCGGTGCTGTCCAC...AAT'),
 'NODE_50431': Seq('AGAAGCAATAATTCGAACAGAGAGATCCGTCCGAAATTGGAGGGTGAAGTTGTG...AAC'),
 'NODE_14605': Seq('CTTCGCTGCTCACATGTTCAAGGGCAGAGATTATGTGCATTGCTGCATGCCCAA...TAC'),
 'NODE_64879': Seq('ATATTTCATTATCTAAATTGCATTTTACTAAGATTGTTGATTTAAAACTCCGCG...GTT'),
 'NODE_70268': Seq('AAGTCAGAGTGGGACGTCCCAATCGAGTCCCTTACAGACATATCGTTCACTCTT...GCA'),
 'NODE_72146': Seq('ATCAGTACCAGTACTTCGCAGCAGCTTGGGCTCATCCCAAGCCTCTGCTTGACT...GCT'),
 'NODE_89351': Seq('TAGTTGAATATAACTTAGTGTCGGTTGAAACAGCGAAACGAACCCATTTCTCCT...T