In [22]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [23]:
def find_orfs(seq, min_length):
    """Find open reading frames (ORFs) on both strands of a DNA sequence.

    An ORF starts at the first ``ATG`` seen in a reading frame and ends at
    the next in-frame stop codon (``TAA``, ``TAG`` or ``TGA``). The stop
    codon is included in the reported ORF. Codon matching is
    case-insensitive, so lowercase (e.g. soft-masked) input is handled.

    Args:
        seq: DNA sequence object. It must provide ``reverse_complement()``
            and support ``str()`` and slicing (e.g. a ``Bio.Seq.Seq``).
        min_length: Minimum ORF length in nucleotides, counted from the
            first base of the start codon through the last base of the
            stop codon.

    Returns:
        A list of dicts, one per ORF, with the keys:
            ``strand``: ``+1`` for ``seq`` itself, ``-1`` for its reverse
                complement.
            ``frame``: reading-frame offset (0, 1 or 2) on that strand.
            ``start`` / ``end``: 0-based, end-exclusive offsets into the
                strand that was scanned. For ``strand == -1`` these index
                into the reverse complement, not into ``seq``.
            ``sequence``: the ORF sliced from the scanned strand, keeping
                the input's original letter case.
        ORFs are listed forward strand first, then by frame, then by
        position. A start codon with no in-frame stop before the end of
        the sequence is not reported. Empty input yields an empty list.
    """
    start_codons = ('ATG',)
    stop_codons = ('TAA', 'TAG', 'TGA')
    orfs = []

    # Loop over positive and reverse strands
    for strand, nucleotide_seq in [(+1, seq), (-1, seq.reverse_complement())]:
        # Scan an upper-cased plain-string copy: this makes codon matching
        # case-insensitive and avoids building a new sequence object for
        # every codon. The reported ORF is still sliced from nucleotide_seq.
        scan_text = str(nucleotide_seq).upper()
        seq_length = len(scan_text)

        # Loop over reading frames
        for frame in range(3):
            # None means "not inside an ORF"; otherwise the ORF's start offset
            orf_start = None

            # Loop through the sequence in codon-sized steps
            for i in range(frame, seq_length - 2, 3):
                codon = scan_text[i:i + 3]

                if orf_start is None:
                    # Only the first start codon opens an ORF; later in-frame
                    # ATGs inside an open ORF are ordinary codons.
                    if codon in start_codons:
                        orf_start = i
                elif codon in stop_codons:
                    orf_end = i + 3  # include the stop codon
                    if orf_end - orf_start >= min_length:
                        orfs.append({
                            'strand': strand,
                            'frame': frame,
                            'start': orf_start,
                            'end': orf_end,
                            'sequence': nucleotide_seq[orf_start:orf_end]
                        })
                    orf_start = None

    return orfs

In [25]:
# Multi-FASTA input to scan; point this at your own file
fasta_file_path = "data/test_subset.fasta"

# Report, record by record, every ORF that meets the minimum length
for record in SeqIO.parse(fasta_file_path, "fasta"):
    sequence = record.seq
    orfs = find_orfs(sequence, min_length=100)  # minimum ORF length in nucleotides

    # Nothing long enough in this record: say so and move on to the next one
    if not orfs:
        print(f"No ORFs found in {record.id} with the given minimum length.")
        continue

    print(f"ORFs in {record.id}:")
    for orf in orfs:
        # One line per ORF; start/end are offsets on the strand that was scanned
        print(
            f"  - Strand: {orf['strand']}, Frame: {orf['frame']}, Start: {orf['start']}, "
            f"End: {orf['end']}, ORF: {orf['sequence']}"
        )


ORFs in NODE_3447_length_1836_cov_746.295340_DRR146894:
  - Strand: 1, Frame: 2, Start: 1616, End: 1775, ORF: ATGTTGTCTTTTTCGATGTGTACTTATCAAGATATTGCGAAATGTATTCTTCAACCCGAACATAAATGGGTGTTGTTTCAATACATGGCATCTTGGAGTTTGTTTCCAACCGATTTAGAAACCACTTTCGTAGTTTTGATGGATCATCGCTGTTTTTAA
ORFs in NODE_3346_length_1986_cov_333.205593_DRR146906:
  - Strand: 1, Frame: 2, Start: 1037, End: 1163, ORF: ATGAGTTTCAGGTTCAGCATTCAGATTTGGAAGTACACGTCGACAATGTCAGAGTTGATCTCTCTAAAGACGGGGAAAGAGCCCACCGCTTTAGAGAAGAAAAAGTTGGATTTAGACCTAGGTTGA
  - Strand: 1, Frame: 2, Start: 1715, End: 1826, ORF: ATGCTTTGCTCGAAATCGATTTTTCAAAATTCGACAAGAGTCAGACAGACATATGCTTTAACCTGGAGATGCATATTTGGGAATTGTTGGGAATGGATCAGTACTTTTTGA
  - Strand: -1, Frame: 0, Start: 432, End: 642, ORF: ATGATGTTTTGTGGACTATTATATTCAAAAACTGCGTTTCTTTCGAGTTTGTTTTTTGGGTCTTTTTTAACCATCATTGTATACTCCACCATCCTTTCCAAGTCTAAACCTAAATTTTCGATGTTAATTCTTTTTAGTTTACTTGGTTCCTGTCCTGCCGCCCATTTTGTCAAATTTTCCCTAGTTAACGAAACGGGATTCTCCATGTAA
  - Strand: -1, Frame: 0, Start: 1311, End: 1467, ORF: ATGAGTTC