In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import random

In [2]:

def _has_n_free_window(seq_text: str, frag_len: int) -> bool:
    """Return True if seq_text has a stretch of at least frag_len bases without N/n."""
    return any(len(run) >= frag_len for run in seq_text.upper().split("N"))


def fragment_without_ns(fasta_file: str, length: int, random_seed: int, number_of_fragments: int) -> list:
    """
    Sample random fixed-length fragments from a multi-fasta file, avoiding Ns.

    Each fragment is drawn independently: a record is picked uniformly at
    random, then a start position is picked uniformly at random within it.
    Draws that land on a record shorter than ``length`` or on a window that
    contains ``N``/``n`` are rejected and redrawn, so fragments may overlap or
    repeat.

    Args:
        fasta_file: Path to input multi-fasta file
        length: Length of each fragment (converted with ``int()``; must be >= 1)
        random_seed: Seed for the global ``random`` module generator
        number_of_fragments: Number of fragments to generate

    Returns:
        List of SeqRecord objects containing the fragments. Each id has the
        form ``<record id>_frag<k>_<start>-<end>`` with 0-based, end-exclusive
        coordinates.

    Raises:
        ValueError: If ``length`` is smaller than 1, if the file holds no
            records, or if no record contains an N-free stretch of at least
            ``length`` bases (rejection sampling could then never finish).
    """
    frag_len = int(length)
    if frag_len < 1:
        raise ValueError(f"length must be a positive integer, got {length!r}")

    # Seeds the module-level generator, so results are reproducible per seed.
    random.seed(random_seed)

    # Load all records from multi-fasta
    records = list(SeqIO.parse(fasta_file, "fasta"))

    if not records:
        raise ValueError("No records found in fasta file")

    # Guard against an endless rejection loop: at least one record must offer
    # a window that can actually be accepted. any() stops at the first hit,
    # so this is cheap for typical inputs.
    if number_of_fragments > 0 and not any(
        _has_n_free_window(str(record.seq), frag_len) for record in records
    ):
        raise ValueError(
            f"No record contains an N-free stretch of at least {frag_len} bases"
        )

    fragments = []
    while len(fragments) < number_of_fragments:
        # Choose a random record; records that are too short are redrawn.
        record = random.choice(records)
        seq_len = len(record.seq)
        if frag_len > seq_len:
            continue

        # Choose random start position (no RNG draw when only one fits).
        max_start = seq_len - frag_len
        start = random.randint(0, max_start) if max_start > 0 else 0
        end = start + frag_len

        # Slice once; reject windows containing N in either case so that
        # soft-masked (lowercase) ambiguity codes are skipped as well.
        subseq = record.seq[start:end]
        if "N" in str(subseq).upper():
            continue

        # The running fragment number keeps ids unique even if the same
        # window is sampled twice.
        frag_number = len(fragments) + 1
        fragments.append(
            SeqRecord(
                subseq,
                id=f"{record.id}_frag{frag_number}_{start}-{end}",
                description=f"Fragment from {record.id}, positions {start}-{end}",
            )
        )

    return fragments

In [4]:
# Trial run: draw 10,000 fragments of 1,000 bases with seed 311 for inspection.
test_frags = fragment_without_ns(
    fasta_file="/data/tisza/analyses/mjt_projects/mjt_long_read_assembly/checkv/R000-605.mask_prophage.fna",
    length=1000,
    random_seed=311,
    number_of_fragments=10000,
)

In [11]:
# Sanity check: print the sequence of any sampled fragment that still holds an "N".
for frag in test_frags:
    if frag.seq.count("N"):
        print(frag.seq)

In [3]:
# 10,000 fragments of 1,000 bases, seed 311.
frags_1k_311 = fragment_without_ns(
    fasta_file="/data/tisza/analyses/mjt_projects/mjt_long_read_assembly/checkv/R000-605.mask_prophage.fna",
    length=1000,
    random_seed=311,
    number_of_fragments=10000,
)

In [5]:
# Write the seed-311 1,000-base fragments as FASTA; the cell shows the record count.
SeqIO.write(
    frags_1k_311,
    "../../../data/discovery/mags_no_phage/frags_1k_311.fasta",
    "fasta",
)

10000

In [8]:
# 10,000 fragments of 1,000 bases, seed 312, written out as FASTA.
frags_1k_312 = fragment_without_ns(
    fasta_file="/data/tisza/analyses/mjt_projects/mjt_long_read_assembly/checkv/R000-605.mask_prophage.fna",
    length=1000,
    random_seed=312,
    number_of_fragments=10000,
)

SeqIO.write(
    frags_1k_312,
    "../../../data/discovery/mags_no_phage/frags_1k_312.fasta",
    "fasta",
)

10000

In [9]:
# 10,000 fragments of 1,000 bases, seed 313, written out as FASTA.
frags_1k_313 = fragment_without_ns(
    fasta_file="/data/tisza/analyses/mjt_projects/mjt_long_read_assembly/checkv/R000-605.mask_prophage.fna",
    length=1000,
    random_seed=313,
    number_of_fragments=10000,
)

SeqIO.write(
    frags_1k_313,
    "../../../data/discovery/mags_no_phage/frags_1k_313.fasta",
    "fasta",
)

10000

In [10]:
# 10,000 fragments of 10,000 bases, seed 311, written out as FASTA.
frags_10k_311 = fragment_without_ns(
    fasta_file="/data/tisza/analyses/mjt_projects/mjt_long_read_assembly/checkv/R000-605.mask_prophage.fna",
    length=10000,
    random_seed=311,
    number_of_fragments=10000,
)

SeqIO.write(
    frags_10k_311,
    "../../../data/discovery/mags_no_phage/frags_10k_311.fasta",
    "fasta",
)

10000

In [11]:
# 10,000 fragments of 10,000 bases, seed 312, written out as FASTA.
frags_10k_312 = fragment_without_ns(
    fasta_file="/data/tisza/analyses/mjt_projects/mjt_long_read_assembly/checkv/R000-605.mask_prophage.fna",
    length=10000,
    random_seed=312,
    number_of_fragments=10000,
)

SeqIO.write(
    frags_10k_312,
    "../../../data/discovery/mags_no_phage/frags_10k_312.fasta",
    "fasta",
)

10000

In [12]:
# 10,000 fragments of 10,000 bases, seed 313, written out as FASTA.
frags_10k_313 = fragment_without_ns(
    fasta_file="/data/tisza/analyses/mjt_projects/mjt_long_read_assembly/checkv/R000-605.mask_prophage.fna",
    length=10000,
    random_seed=313,
    number_of_fragments=10000,
)

SeqIO.write(
    frags_10k_313,
    "../../../data/discovery/mags_no_phage/frags_10k_313.fasta",
    "fasta",
)

10000

In [13]:
# 1,000 fragments of 100,000 bases, seed 311, written out as FASTA.
frags_100k_311 = fragment_without_ns(
    fasta_file="/data/tisza/analyses/mjt_projects/mjt_long_read_assembly/checkv/R000-605.mask_prophage.fna",
    length=100000,
    random_seed=311,
    number_of_fragments=1000,
)

SeqIO.write(
    frags_100k_311,
    "../../../data/discovery/mags_no_phage/frags_100k_311.fasta",
    "fasta",
)

1000

In [14]:
# 1,000 fragments of 100,000 bases, seed 312, written out as FASTA.
frags_100k_312 = fragment_without_ns(
    fasta_file="/data/tisza/analyses/mjt_projects/mjt_long_read_assembly/checkv/R000-605.mask_prophage.fna",
    length=100000,
    random_seed=312,
    number_of_fragments=1000,
)

SeqIO.write(
    frags_100k_312,
    "../../../data/discovery/mags_no_phage/frags_100k_312.fasta",
    "fasta",
)

1000

In [15]:
# 1,000 fragments of 100,000 bases, seed 313, written out as FASTA.
frags_100k_313 = fragment_without_ns(
    fasta_file="/data/tisza/analyses/mjt_projects/mjt_long_read_assembly/checkv/R000-605.mask_prophage.fna",
    length=100000,
    random_seed=313,
    number_of_fragments=1000,
)

SeqIO.write(
    frags_100k_313,
    "../../../data/discovery/mags_no_phage/frags_100k_313.fasta",
    "fasta",
)

1000