# Functions:

# In [3]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

# Constants
DATA_DIR = Path("../data")  # Root folder for everything this script writes
# One sub-folder per split, all located directly under DATA_DIR.
TRAIN_DIR, TEST_DIR, VALIDATION_DIR = (
    DATA_DIR / split_name for split_name in ("train", "test", "validation")
)
CSV_FILE = DATA_DIR / "stmf_data_3.csv"  # Table of per-observation targets
NUM_SAMPLES = 500  # Total number of samples
# Dimensions of each spectrogram
DIM_X = 6
DIM_Y = 74
DIM_Z = 918
RANDOM_SEED = 42  # For reproducibility

# Ensure directories exist
for split_dir in (TRAIN_DIR, TEST_DIR, VALIDATION_DIR):
    # parents=True also creates the shared base folder; exist_ok=True makes
    # re-running the script harmless when the folders are already there.
    split_dir.mkdir(parents=True, exist_ok=True)

# Step 1: Generate noisy backgrounds and draw two thick lines into each sample
def _draw_thick_line(sample, start, end, columns, half_width=2):
    """Brighten a thick straight line inside ``sample``, modifying it in place.

    Parameters
    ----------
    sample : numpy.ndarray
        3-D array indexed as ``[:, y, z]``; the line is added to every slice
        along the first axis.
    start, end : tuple
        ``(z, y)`` coordinates of the two points that define the line.
    columns : array-like of int
        The z positions at which the line is drawn. Positions outside the
        array are skipped.
    half_width : int, optional
        The line covers the rows ``centre - half_width .. centre + half_width``.
        The added intensity is ``exp(-|offset| / 2)``, so it is strongest on
        the centre row and fades towards the edges.
    """
    (z0, y0), (z1, y1) = start, end
    columns = np.asarray(columns)
    # Row of the line centre at every requested column (linear interpolation).
    centres = y0 + (y1 - y0) * (columns - z0) / (z1 - z0)
    num_rows, num_cols = sample.shape[1], sample.shape[2]
    for offset in range(-half_width, half_width + 1):
        rows = (centres + offset).astype(int)  # truncates toward zero, like int()
        visible = (
            (rows >= 0) & (rows < num_rows) & (columns >= 0) & (columns < num_cols)
        )
        r, c = rows[visible], columns[visible]
        # Every column appears once, so the (r, c) pairs are unique and the
        # fancy-indexed read-modify-write below touches each pixel exactly once.
        sample[:, r, c] = np.clip(sample[:, r, c] + np.exp(-abs(offset) / 2), 0, 1)


np.random.seed(RANDOM_SEED)
# Uniform background plus Gaussian jitter, built in place so that only one
# extra full-size temporary exists at a time.
data = np.random.rand(NUM_SAMPLES, DIM_X, DIM_Y, DIM_Z)
data += np.random.normal(0, 0.1, data.shape)
np.clip(data, 0, 1, out=data)  # Ensure all values are between 0 and 1

targets = []
all_columns = np.arange(DIM_Z)

for d in range(NUM_SAMPLES):
    # Line 1: spans the whole z axis, starting at y = 30. The noisy end point
    # tilts it slightly, so the line differs from sample to sample.
    noise = np.random.normal(0, 5)
    x1_start, y1_start = 0, 30
    x1_end, y1_end = DIM_Z - 1, 30 + noise

    # Line 2: diagonal segment with jittered end points.
    x2_start, y2_start = 400 + np.random.normal(0, 5), 60 + np.random.normal(0, 5)
    x2_end, y2_end = 800 + np.random.normal(0, 5), 30 + np.random.normal(0, 1)

    # data[d] is a view, so the helper writes straight into ``data``.
    _draw_thick_line(data[d], (x1_start, y1_start), (x1_end, y1_end), all_columns)
    _draw_thick_line(
        data[d],
        (x2_start, y2_start),
        (x2_end, y2_end),
        np.arange(int(x2_start), int(x2_end)),
    )

    # The target for this observation depends only on where line 2 starts.
    target = -y2_start / 2
    targets.append(target)

# Step 2: Split data into train, test, and validation sets
# 40% of the observation indices are held out first; that hold-out is then
# halved into the test and validation sets.
observation_ids = np.arange(NUM_SAMPLES)
train_indices, test_val_indices = train_test_split(
    observation_ids,
    test_size=0.4,
    random_state=RANDOM_SEED,
)
test_indices, val_indices = train_test_split(
    test_val_indices,
    test_size=0.5,
    random_state=RANDOM_SEED,
)

# Step 3: Save spectrogram data into respective directories
def save_spectrograms(indices, target_dir, spectrograms=None):
    """
    Save one ``.npy`` file per observation into ``target_dir``.

    Parameters
    ----------
    indices : iterable of int
        Observation numbers to save. Each one is used both to index
        ``spectrograms`` and as the prefix of the output file name
        (``<obs_no>_stacked_spectrograms.npy``).
    target_dir : str or pathlib.Path
        Destination folder; it is created if it does not exist yet.
    spectrograms : array-like, optional
        Container indexed by observation number. When omitted, the
        module-level ``data`` array is used.
    """
    if spectrograms is None:
        spectrograms = data  # fall back to the array generated by this script
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    for obs_no in indices:
        filepath = target_dir / f"{obs_no}_stacked_spectrograms.npy"
        np.save(filepath, spectrograms[obs_no])

# Write every split into its own directory, in train / test / validation order.
for subset_indices, subset_dir in (
    (train_indices, TRAIN_DIR),
    (test_indices, TEST_DIR),
    (val_indices, VALIDATION_DIR),
):
    save_spectrograms(subset_indices, subset_dir)

# Step 4: Create a CSV file for all the data
# One row per observation: its number and the target value stored for it.
csv_data = [
    {"obs_no": idx, "BallVr": targets[idx]}
    for idx in range(NUM_SAMPLES)
]
df = pd.DataFrame(csv_data)

# Save the CSV file (without the DataFrame's own row index)
df.to_csv(CSV_FILE, index=False)

# Report what was written.
print(
    f"Data successfully saved in '{DATA_DIR}'!",
    f"CSV file '{CSV_FILE}' created with {len(df)} entries.",
    sep="\n",
)


# Output:
# Data successfully saved in '../data'!
# CSV file '../data/stmf_data_3.csv' created with 500 entries.
