In [8]:
import pandas as pd
import random

def create_test_data():
    """
    Generates test sequence data for the purpose of testing the GeneSequenceAnalyzer class.
    
    The function aims to simulate a dataset that the GeneSequenceAnalyzer class would typically handle.
    Instead of real genetic sequences, we use fake "filters" (AAA, BBB, etc.) and sequences made up of XYZ combinations.
    These fake sequences have the embedded filter strings at random positions.
    
    Here's what is expected in the generated data:
    - "AAA" appears 2 times.
    - "BBB" appears 1 time.
    - "CCC" appears 3 times.
    - "ABC" appears 4 times.
    - "BCA" appears 0 times.
    - There are 2 rows with NaN values.
    - The rest of the sequences consist of randomly generated "XYZ" combinations.
    
    The generated data will have the structure that's compatible with the GeneSequenceAnalyzer's expected input.
    The data is saved in the directory `test_data` with the filename `test_sequence_data.csv`.
    """
    
    # Define filters and the number of times they should appear in the sequence data
    filters = {
        "AAA": 2,
        "BBB": 1,
        "CCC": 3,
        "ABC": 4,  
        "BCA": 0
    }

    sequences = []

    # Iterate over each filter and its corresponding count
    for filter_seq, count in filters.items():
        for _ in range(count):
            # Create a random sequence of "XYZ" with space for the filter sequence
            seq = ''.join(random.choices("XYZ", k=20 - len(filter_seq)))
            
            # Choose a random position to insert the filter sequence
            insert_idx = random.randint(0, 20 - len(filter_seq))
            
            # Insert the filter sequence at the chosen position
            seq = seq[:insert_idx] + filter_seq + seq[insert_idx:]
            sequences.append(seq)

    # Complete the sequences list with random sequences until we have 98 sequences
    while len(sequences) < 98:
        sequences.append(''.join(random.choices("XYZ", k=20)))

    # Add 2 rows with NaN values to simulate missing data
    sequences.extend([None, None])

    # Convert the sequences list to a pandas DataFrame
    df = pd.DataFrame({
        "Sequence": sequences,
        "Count": [random.randint(1, 10) for _ in range(100)],
        "Amino Acid": [random.choice("XYZ") for _ in range(100)]
    })
    
    # Save the sequence data to a CSV file
    df.to_csv("test_data/test_sequence_data.csv", index=False)

# Define the data for the filter reference
filters_data = {
    "ID": ["Ref1", "Ref2", "Ref3", "Ref4", "Ref5"],
    "Name": ["Filter1", "Filter2", "Filter3", "Filter4", "Filter5"],
    "Filter Sequence": ["AAA", "BBB", "CCC", "ABC", "BCA"]
}

# Convert the filters data to a pandas DataFrame and save to a CSV file
filters_df = pd.DataFrame(filters_data)
filters_df.to_csv("test_data/test_filters.csv", index=False)

# Call the create_test_data function to generate the test data
create_test_data()


FileNotFoundError: [Errno 2] No such file or directory: 'test_data/test_filters.csv'