In [1]:
import random
import os

# --- CONFIGURATION ---
# Output directory for the generated FASTA files.
# The original location stays the default so existing setups keep working;
# set the STRESS_TEST_DIR environment variable to write somewhere else
# without editing this machine-specific absolute path.
BASE_DIR = os.environ.get("STRESS_TEST_DIR") or (
    "/Users/user/Downloads/02. PROJECTS/Stress-region-predictor/software_test/random_seq_test"
)
os.makedirs(BASE_DIR, exist_ok=True)
print(f"Output Directory: {BASE_DIR}")

# Seed the global RNG so that a fresh "Restart Kernel -> Run All" regenerates
# exactly the same test sequences. The value itself is arbitrary.
# Note: re-running a single cell later continues from the current RNG state.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

def generate_random_dna(length):
    """Return a random DNA string of ``length`` bases.

    Every base is drawn independently and uniformly from A, T, G and C using
    the module-level ``random`` generator.
    """
    bases = random.choices("ATGC", k=length)
    return "".join(bases)

def save_fasta(filename, header, sequence):
    """Write ``sequence`` as a single-record FASTA file inside ``BASE_DIR``.

    The record starts with a ``>`` line carrying ``header``; the sequence
    follows, wrapped at 80 characters per line. Once the file is written a
    confirmation naming ``filename`` is printed.
    """
    line_width = 80
    with open(os.path.join(BASE_DIR, filename), "w") as handle:
        handle.write(f">{header}\n")
        handle.writelines(
            sequence[start:start + line_width] + "\n"
            for start in range(0, len(sequence), line_width)
        )
    print(f"Saved: {filename}")

# --- 1. GENERATE RANDOM NOISE (Negative Controls) ---
# One uniformly random sequence per entry, written in the order listed.
# Each entry is (file name, FASTA header, length in bp).
negative_controls = [
    # A. Short Region (200bp) - Proper input for Region Mode (--rg)
    ("random_200bp.fasta", "random_200bp_noise", 200),
    # B. Promoter Length (2kb) - Standard input for Promoter Scan (--pr)
    ("random_2kb.fasta", "random_2kb_noise", 2000),
    # C. Long Sequence (10kb) - Stress test for Slicing Logic
    ("random_10kb.fasta", "random_10kb_noise", 10000),
]

for control_file, control_header, control_length in negative_controls:
    save_fasta(control_file, control_header, generate_random_dna(control_length))

Output Directory: /Users/user/Downloads/02. PROJECTS/Stress-region-predictor/software_test/random_seq_test
Saved: random_200bp.fasta
Saved: random_2kb.fasta
Saved: random_10kb.fasta


In [2]:
# --- 2. GENERATE SYNTHETIC POSITIVES (Validation Test) ---
# Known stress motifs are written into a random background; the model is
# expected to report peaks at the positions where they were placed.
# Caveat: motifs this short (5-6bp) can also occur by chance in the background.

# Motif cores used for the injections
motif_abre, motif_dre, motif_myb = (
    "ACGTG",   # ABRE (Abscisic Acid Response Element)
    "GCCGAC",  # DRE (Dehydration Response Element)
    "TAACTG",  # MYB recognition site
)

# Random background (1.5kb), kept as a list so single bases can be overwritten
length = 1500
seq_list = [base for base in generate_random_dna(length)]
# Function to inject motif
def inject(seq_list, pos, motif):
    """Overwrite bases of ``seq_list`` in place with ``motif``, starting at ``pos``.

    Args:
        seq_list: Mutable sequence of single characters (modified in place).
        pos: 0-based index at which the first motif character is written.
        motif: String whose characters replace the existing bases.

    Returns:
        None. Motif characters that would land outside ``seq_list`` (before
        index 0 or past the last index) are skipped, so the sequence length
        never changes.
    """
    for offset, base in enumerate(motif):
        target = pos + offset
        # The lower bound matters: without it a negative ``pos`` would be
        # treated as a negative index and silently overwrite the tail.
        if 0 <= target < len(seq_list):
            seq_list[target] = base

# Inject motifs at specific positions, listed as (start index, motif) pairs
planted_motifs = [
    # 1. Cluster at Start (200-250bp)
    (200, motif_abre),
    (210, motif_abre),
    (230, motif_dre),
    # 2. Strong Signal at Middle (700bp)
    (700, motif_dre),
    (710, motif_myb),
    # 3. Isolated Signal at End (1200bp)
    (1200, motif_abre),
]
for motif_start, motif_seq in planted_motifs:
    inject(seq_list, motif_start, motif_seq)

synthetic_seq = "".join(seq_list)
save_fasta("synthetic_positive_1.5kb.fasta", "synthetic_stress_ABRE_DRE_MYB", synthetic_seq)

# Summary of where the motifs were planted, one line per item
print(
    "Generated Synthetic Positive Sequence (1.5kb) with motifs injected at:",
    " - 200-250bp (Cluster)",
    " - 700bp (Middle)",
    " - 1200bp (End)",
    sep="\n",
)

Saved: synthetic_positive_1.5kb.fasta
Generated Synthetic Positive Sequence (1.5kb) with motifs injected at:
 - 200-250bp (Cluster)
 - 700bp (Middle)
 - 1200bp (End)


In [3]:
# --- 3. SCENARIO: BIOLOGICAL BIAS (GC vs AT Rich) ---
# Checks whether the model reacts to plain base composition rather than motifs.
# In both draws the first two letters of the alphabet get weight 35 each
# (70% combined) and the last two get weight 15 each.
bias_weights = [35, 35, 15, 15]
bias_length = 2000

# A. High GC Content (70% G/C) - Often found in monocots or gene bodies
gc_rich_seq = "".join(random.choices("GCAT", weights=bias_weights, k=bias_length))
save_fasta("bias_high_gc.fasta", "bias_high_gc_70percent", gc_rich_seq)

# B. High AT Content (70% A/T) - Typical of plant promoters
at_rich_seq = "".join(random.choices("ATGC", weights=bias_weights, k=bias_length))
save_fasta("bias_high_at.fasta", "bias_high_at_70percent", at_rich_seq)

print(
    "Generated Bias Test Sequences:",
    " - bias_high_gc.fasta (Should be Negative if no motifs)",
    " - bias_high_at.fasta (Should be Negative if no motifs)",
    sep="\n",
)

Saved: bias_high_gc.fasta
Saved: bias_high_at.fasta
Generated Bias Test Sequences:
 - bias_high_gc.fasta (Should be Negative if no motifs)
 - bias_high_at.fasta (Should be Negative if no motifs)


In [4]:
# --- 4. SCENARIO: SUPER STRESS CLUSTER ---
# Simulates a strong promoter carrying many different stress signals packed
# closely together.

# Background: AT-rich (70% A/T) sequence, common in plants.
# random.choices already returns a list of single bases, ready for injection.
length = 1000
seq_list = random.choices("ATGC", weights=[35, 35, 15, 15], k=length)

strong_motifs = {
    "ABRE": "ACGTG",
    "G-Box": "CACGTG",
    "DRE": "GCCGAC",
    "LowTemp": "CCGAAA",
}

# "Hot Zone": 10 randomly picked motifs, one every 15bp starting at 400bp
# (start positions 400, 415, ..., 535), all inside the 400-600bp window.
start_zone = 400
motif_pool = list(strong_motifs.values())
for pos in range(start_zone, start_zone + 10 * 15, 15):
    inject(seq_list, pos, random.choice(motif_pool))

super_seq = "".join(seq_list)
save_fasta("scenario_super_stress.fasta", "scenario_super_stress_cluster", super_seq)

print(
    "Generated 'Super Stress' Cluster (Strong Positive Control):",
    " - Contains 10+ motifs packed into 400-600bp region.",
    sep="\n",
)

Saved: scenario_super_stress.fasta
Generated 'Super Stress' Cluster (Strong Positive Control):
 - Contains 10+ motifs packed into 400-600bp region.


In [5]:
# --- 5. SCENARIO: SLICING BOUNDARY TEST ---
# Puts a motif right on the position where the predictor is assumed to cut the
# sequence (1000bp), to reveal whether its slicing/overlap handling drops
# motifs that straddle a slice edge.

length = 2500
seq_list = list(generate_random_dna(length))

# Example slicing: windows 0-1000 and 800-1800, i.e. a 200bp overlap
# (800bp stride). With a hard cut at 1000 and no overlap the motif could be
# missed.
motif = "GCCGAC"  # DRE, 6bp
boundary_injections = (
    # Start at 997: covers 997, 998, 999 | 1000, 1001, 1002
    (997, motif),
    # Close to the end of the sequence (indices 2490-2494 of 2500) to check padding
    (2490, "ACGTG"),
)
for boundary_pos, boundary_motif in boundary_injections:
    inject(seq_list, boundary_pos, boundary_motif)

boundary_seq = "".join(seq_list)
save_fasta("scenario_boundary_test.fasta", "scenario_boundary_check", boundary_seq)

print(
    "Generated Boundary Test Sequence:",
    " - Motif at 997bp (Crossing the 1000bp 1st slice boundary).",
    " - Motif at 2490bp (End of sequence padding check).",
    sep="\n",
)

Saved: scenario_boundary_test.fasta
Generated Boundary Test Sequence:
 - Motif at 997bp (Crossing the 1000bp 1st slice boundary).
 - Motif at 2490bp (End of sequence padding check).


# --- 6. RUN COMMANDS ---
Copy and paste these commands into your terminal to run the pipeline on the generated test files. Run them from the project root, because the script and input paths below are relative.

## 1. Negative Controls (Random Noise)
```bash
# Region Mode Check (Should find 0 regions)
python stress_predictor/main.py --input "software_test/random_seq_test/random_200bp.fasta" --model-path "PlantBERT" --rg

# Promoter Mode Check (Should find 0 regions)
python stress_predictor/main.py --input "software_test/random_seq_test/random_2kb.fasta" --model-path "PlantBERT" --pr
```

## 2. Validation Test (Synthetic Positives)
```bash
# Expecting peaks at ~200bp, ~700bp, ~1200bp
python stress_predictor/main.py --input "software_test/random_seq_test/synthetic_positive_1.5kb.fasta" --model-path "PlantBERT" --pr
```

## 3. Biological Bias Test
```bash
# High GC (Should be Negative)
python stress_predictor/main.py --input "software_test/random_seq_test/bias_high_gc.fasta" --model-path "PlantBERT" --pr

# High AT (Should be Negative - crucial check for False Positives)
python stress_predictor/main.py --input "software_test/random_seq_test/bias_high_at.fasta" --model-path "PlantBERT" --pr
```

## 4. Stress Tests
```bash
# Super Cluster (Expecting very high confidence block at 400-600bp)
python stress_predictor/main.py --input "software_test/random_seq_test/scenario_super_stress.fasta" --model-path "PlantBERT" --pr

# Boundary Test (Checking motif at 997bp)
python stress_predictor/main.py --input "software_test/random_seq_test/scenario_boundary_test.fasta" --model-path "PlantBERT" --pr
```