# In [10]:
import os
import random

import pandas as pd

def _random_filler(rng, length):
    """Return ``length`` random characters drawn from the filler alphabet "XYZ"."""
    return ''.join(rng.choices("XYZ", k=length))


def _embed_motifs(rng, motifs, sequence_length):
    """Return a ``sequence_length``-character string holding ``motifs`` in order.

    Random filler is generated for the characters not taken by the motifs and
    split as evenly as possible into the gaps before, between and after them.
    """
    filler = _random_filler(rng, sequence_length - sum(len(m) for m in motifs))
    gap_count = len(motifs) + 1
    base, extra = divmod(len(filler), gap_count)

    pieces = []
    start = 0
    for gap_idx in range(gap_count):
        # The first ``extra`` gaps absorb the remainder, one character each.
        end = start + base + (1 if gap_idx < extra else 0)
        pieces.append(filler[start:end])
        if gap_idx < len(motifs):
            pieces.append(motifs[gap_idx])
        start = end
    return ''.join(pieces)


def create_test_data(sequence_length=20, num_sequences=100,
                     output_path="test_data/test_sequence_data.csv", seed=None):
    """Write a CSV fixture of random sequences with known motifs planted in it.

    The rows are produced in this order:

    * one row per planted single motif (2x "AAA", 1x "BBB", 3x "CCC",
      4x "ABC", 0x "BCA"), the motif inserted at a random position;
    * two rows containing "AAA" twice;
    * two rows containing both "AAA" and "BBB";
    * pure filler rows until ``num_sequences - 2`` rows exist;
    * two rows whose sequence is missing (``None``).

    Filler characters come from "XYZ", so a motif never appears by accident.
    Every non-missing sequence is exactly ``sequence_length`` characters long.
    Each row also gets a random "Count" (1-10) and a random "Amino Acid"
    (one of "X", "Y", "Z").

    Args:
        sequence_length: Length of every generated sequence. Must be at least
            6 so that two three-letter motifs fit.
        num_sequences: Target number of rows. The 14 planted rows and the 2
            missing rows are always written, so values below 16 still yield
            16 rows.
        output_path: Destination CSV file. Missing parent directories are
            created.
        seed: Optional seed for reproducible output. When ``None`` the
            module-level ``random`` generator is used, so a prior
            ``random.seed(...)`` call is still honoured.

    Raises:
        ValueError: If ``sequence_length`` is too short to hold the planted
            motif pairs.
    """
    filters = {
        "AAA": 2,
        "BBB": 1,
        "CCC": 3,
        "ABC": 4,
        "BCA": 0,
    }
    motif_pairs = (("AAA", "AAA"), ("AAA", "BBB"))

    min_length = max(sum(len(m) for m in pair) for pair in motif_pairs)
    if sequence_length < min_length:
        raise ValueError(
            f"sequence_length must be at least {min_length}, got {sequence_length}"
        )

    # ``random`` (the module) and ``random.Random`` expose the same methods.
    rng = random if seed is None else random.Random(seed)

    sequences = []

    # Rows with a single motif inserted at a random offset.
    for filter_seq, count in filters.items():
        for _ in range(count):
            seq = _random_filler(rng, sequence_length - len(filter_seq))
            insert_idx = rng.randint(0, sequence_length - len(filter_seq))
            sequences.append(seq[:insert_idx] + filter_seq + seq[insert_idx:])

    # Rows with two motifs each: "AAA" twice, then "AAA" together with "BBB".
    for pair in motif_pairs:
        for _ in range(2):
            sequences.append(_embed_motifs(rng, pair, sequence_length))

    # Pad with motif-free rows, leaving room for the two missing rows below.
    while len(sequences) < num_sequences - 2:
        sequences.append(_random_filler(rng, sequence_length))

    # Two rows with a missing sequence (written as empty CSV fields).
    sequences.extend([None, None])

    df = pd.DataFrame({
        "Sequence": sequences,
        "Count": [rng.randint(1, 10) for _ in sequences],
        "Amino Acid": [rng.choice("XYZ") for _ in sequences],
    })

    # DataFrame.to_csv does not create missing directories on its own.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(output_path, index=False)

# Reference table of filter motifs, written next to the sequence fixture.
# The "Filter Sequence" values are the same motifs that create_test_data
# plants into the generated sequences.
filters_data = {
    "ID": ["Ref1", "Ref2", "Ref3", "Ref4", "Ref5"],
    "Name": ["Filter1", "Filter2", "Filter3", "Filter4", "Filter5"],
    "Filter Sequence": ["AAA", "BBB", "CCC", "ABC", "BCA"]
}

filters_df = pd.DataFrame(filters_data)
# DataFrame.to_csv does not create missing directories, and this write runs
# before anything else has had a chance to create the folder.
os.makedirs("test_data", exist_ok=True)
filters_df.to_csv("test_data/test_filters.csv", index=False)

# Example run: generate the fixture with 20-character sequences and a
# target of 100 rows.
create_test_data(
    sequence_length=20,
    num_sequences=100,
)
