In [5]:
from Bio import SeqIO

def rename_sequences(input_file, label_name, label_value, output_file):
    """Relabel every record of a FASTA file and append the result to another file.

    Each record id is replaced by ``<label_name>_<NN>|<label_value>``, where
    ``NN`` is the 1-based position of the record in ``input_file`` (zero-padded
    to at least two digits), and the record description is cleared.

    Args:
        input_file: Path of the FASTA file to read.
        label_name: Prefix placed before the running index in the new id.
        label_value: Value placed after the ``|`` separator in the new id.
        output_file: Path of the FASTA file the renamed records are appended to.
    """
    with open(input_file, "r") as source:
        entries = [entry for entry in SeqIO.parse(source, "fasta")]

    for position, entry in zip(range(1, len(entries) + 1), entries):
        entry.id = f"{label_name}_{position:02d}|{label_value}"
        entry.description = ""

    # Opened in append mode so that several calls can accumulate into one file;
    # existing content of output_file is kept.
    with open(output_file, "a") as sink:
        SeqIO.write(entries, sink, "fasta")
        
        


In [8]:
# rename_sequences appends to its output file, so start from an empty
# "all.fasta"; otherwise re-running this cell would duplicate every record.
combined_file = "all.fasta"
with open(combined_file, "w"):
    pass

# (source FASTA, group name used in the new ids, label value written after "|")
for source_file, group_name, label_value in (
    ("negative-pet.fasta", "negative-pet", 0),
    ("synthetic-pet-seq.fasta", "synthetic-pet", 1),
    ("real-pet-seq.fasta", "real-pet", 1),
):
    rename_sequences(source_file, group_name, label_value, combined_file)

In [20]:
from Bio import SeqIO

def count_sequences(input_file):
    """Count the records of a FASTA file per group name.

    Record ids are expected to look like ``<group>_<index>|<label>`` (the
    format written by ``rename_sequences``). The group name is the part of the
    id before the first ``|`` with the trailing ``_<index>`` removed. An id
    without any underscore before the ``|`` is counted under that whole prefix.

    Args:
        input_file: Path of the FASTA file to read.

    Returns:
        dict mapping each group name to its number of records, in order of
        first appearance.
    """
    group_counts = {}
    with open(input_file, "r") as f:
        for record in SeqIO.parse(f, "fasta"):
            group_label = record.id.split("|")[0]
            # Strip only the final "_<index>" so that group names which
            # themselves contain underscores are kept intact.
            group_name = group_label.rsplit("_", 1)[0]
            group_counts[group_name] = group_counts.get(group_name, 0) + 1

    return group_counts


In [22]:
# Summarise the combined dataset: number of records per group name.
input_file = "all.fasta"  # Replace with the path to your input fasta file

group_counts = count_sequences(input_file)
print(group_counts)

{'negative-pet': 130, 'synthetic-pet': 100, 'real-pet': 104}


In [23]:
from Bio import SeqIO
import random

def shuffle_and_split_fasta(input_file, train_file, test_file, split_ratio=0.8, seed=None):
    """Shuffle the records of a FASTA file and split them into train/test files.

    The first ``int(n_records * split_ratio)`` shuffled records go to
    ``train_file``; the remainder go to ``test_file``. Both output files are
    overwritten. The split is purely random (not stratified by group).

    Args:
        input_file: Path of the FASTA file to read.
        train_file: Path of the FASTA file that receives the training records.
        test_file: Path of the FASTA file that receives the test records.
        split_ratio: Fraction of records assigned to the training set; must be
            between 0 and 1 inclusive.
        seed: Optional seed for a private random generator, making the split
            reproducible. When ``None`` (the default) the module-level
            ``random`` state is used, exactly as before.

    Raises:
        ValueError: If ``split_ratio`` is outside the range [0, 1].
    """
    # A ratio outside [0, 1] would silently produce a nonsensical slice
    # (e.g. a negative index), so reject it up front.
    if not 0 <= split_ratio <= 1:
        raise ValueError(f"split_ratio must be between 0 and 1, got {split_ratio!r}")

    # Read the input FASTA file
    records = list(SeqIO.parse(input_file, "fasta"))

    # Shuffle the records; a dedicated generator keeps a seeded run from
    # touching (or depending on) the global random state.
    if seed is None:
        random.shuffle(records)
    else:
        random.Random(seed).shuffle(records)

    # Calculate the split index
    split_index = int(len(records) * split_ratio)

    # Write the train set to a new FASTA file
    with open(train_file, "w") as train_handle:
        SeqIO.write(records[:split_index], train_handle, "fasta")

    # Write the test set to a new FASTA file
    with open(test_file, "w") as test_handle:
        SeqIO.write(records[split_index:], test_handle, "fasta")



In [24]:
input_file = "all.fasta"  # Replace with the path to your input FASTA file
train_file = "train.fasta"
test_file = "test.fasta"
split_ratio = 0.8  # 80% for training, 20% for testing

# Seed the global random generator before shuffling so that re-running the
# notebook on the same input produces the same train/test split.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

shuffle_and_split_fasta(input_file, train_file, test_file, split_ratio)

In [25]:
# Per-group record counts for the train split, the test split and the full file.
for prefix, fasta_path in (
    (" train: ", train_file),
    (" test: ", test_file),
    (" all: ", input_file),
):
    print(f"{prefix}{count_sequences(fasta_path)}")



 train: {'negative-pet': 101, 'synthetic-pet': 84, 'real-pet': 82}
 test: {'real-pet': 22, 'negative-pet': 29, 'synthetic-pet': 16}
 all: {'negative-pet': 130, 'synthetic-pet': 100, 'real-pet': 104}
