# Synthetic Data Generator for ME/CFS
## Optimized & Modular Version

This notebook generates synthetic patient data based on SF-36 inputs using a cascade of deep learning models.
The methodology follows the graph theory analysis described in the paper *A synthetic data generation system for myalgic encephalomyelitis/chronic fatigue syndrome questionnaires*.

**Requirements:**
* TensorFlow
* Pandas
* Pre-trained `.h5` model files in the `./models` directory.

In [None]:
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import load_model

# GPU configuration
# Ask TensorFlow which GPUs it can see and switch each one to memory-growth
# mode, so memory is not all allocated at once.
physical_devices = tf.config.list_physical_devices('GPU')
for device in physical_devices:
    tf.config.experimental.set_memory_growth(device, True)

# Report the runtime environment; an empty device list means no GPU was found.
print(f"TensorFlow Version: {tf.__version__}")
print(f"GPU Available: {bool(physical_devices)}")

# ==========================================
# 1. Column Definitions
# ==========================================
# One list of column names per questionnaire. The names are meant to match
# the output layers of the pre-trained models, and each name is also
# substituted into the model filename templates to locate that column's model.

# HAD: had_1 ... had_14
COLS_HAD = ['had_' + str(item) for item in range(1, 15)]

# SCL-90-R: SCL1 ... SCL90
COLS_SCL90R = ['SCL' + str(item) for item in range(1, 91)]

# FIS-8: fis1 ... fis8
COLS_FIS8 = ['fis' + str(item) for item in range(1, 9)]

# FIS-40 keeps the mixed naming of the original dataset: items 1-8 carry a
# '_y' suffix (fis1_y ... fis8_y), items 9-40 use the plain name
# (fis9 ... fis40).
COLS_FIS40 = [
    *('fis' + str(item) + '_y' for item in range(1, 9)),
    *('fis' + str(item) for item in range(9, 41)),
]

# PSQI: comp1 ... comp7
COLS_PSQI = ['comp' + str(item) for item in range(1, 8)]

In [None]:
# ==========================================
# 2. Core Functions
# ==========================================

def predict_questionnaire(input_data, col_names, model_filename_template, models_dir='.'):
    """
    Generic function to predict a full questionnaire column by column.

    One pre-trained model is loaded per target column. Every model path is
    resolved and checked up front, so a missing file is reported before any
    model has been loaded or run instead of part-way through the questionnaire.

    Args:
        input_data (pd.DataFrame): The accumulated input matrix (previous steps).
        col_names (list): List of target column names to predict.
        model_filename_template (str): Filename template containing a ``{col}``
            placeholder (e.g., 'HADW_model_{col}.h5').
        models_dir (str): Directory path containing the .h5 files.

    Returns:
        pd.DataFrame: One column per entry of ``col_names`` holding the
        predicted class index for each input row. The frame is built from a
        plain dict, so it carries a default index, not the index of
        ``input_data``.

    Raises:
        FileNotFoundError: If the model file for any requested column does not
            exist. The message names the first missing path in ``col_names``
            order.
    """
    # Resolve every path first (dicts preserve insertion order, so the output
    # columns keep the order of col_names).
    model_paths = {
        col: os.path.join(models_dir, model_filename_template.format(col=col))
        for col in col_names
    }

    # Fail fast: validate the whole set before paying for any model load.
    for model_path in model_paths.values():
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Model not found: {model_path}")

    predictions = {}

    for col, model_path in model_paths.items():
        # Loading with compile=False skips restoring the optimizer and loss,
        # which are not needed for inference.
        model = load_model(model_path, compile=False)

        # Perform prediction
        pred_probs = model.predict(input_data, verbose=0)

        # Get the class with the highest probability (argmax)
        # Assumes the output is categorical classification (0, 1, 2, 3...)
        predictions[col] = pred_probs.argmax(axis=1)

        # Many models are loaded in sequence; drop this one and reset Keras'
        # global state so memory does not keep growing across iterations.
        del model
        tf.keras.backend.clear_session()

    return pd.DataFrame(predictions)

def generate_synthetic_data(sf36_input, models_dir='.'):
    """
    Run the full questionnaire cascade starting from SF-36 responses.

    Questionnaires are generated in the order defined in the paper:
    SF36 -> HAD -> SCL90R -> FIS8 -> FIS40 -> PSQI
    Each stage receives the SF-36 input plus every questionnaire generated
    by the stages before it.

    Args:
        sf36_input (pd.DataFrame): Validated SF-36 responses (n_samples, 36).
        models_dir (str): Path to the folder containing .h5 model weights.

    Returns:
        tuple: DataFrames for (HAD, SCL90R, FIS8, FIS40, PSQI)
    """
    # Input validation: a wrong column count is reported but does not stop
    # the run.
    n_columns = sf36_input.shape[1]
    if n_columns != 36:
        print(f"Warning: Expected 36 input columns, received {n_columns}.")

    # (progress banner, target columns, model filename template) per stage,
    # listed in cascade order.
    cascade = (
        ("--- Step 1: Generating HAD ---", COLS_HAD, 'HADW_model_{col}.h5'),
        ("--- Step 2: Generating SCL-90-R ---", COLS_SCL90R, 'SCLW_model_{col}.h5'),
        ("--- Step 3: Generating FIS-8 ---", COLS_FIS8, 'fis8_model_{col}.h5'),
        ("--- Step 4: Generating FIS-40 ---", COLS_FIS40, 'fis40_model_{col}.h5'),
        ("--- Step 5: Generating PSQI ---", COLS_PSQI, 'psqi_model_{col}.h5'),
    )
    final_stage = len(cascade) - 1

    # The first stage sees the SF-36 frame exactly as passed in; the running
    # feature matrix starts from an index-reset copy so it lines up row by
    # row with the default-indexed prediction frames.
    stage_input = sf36_input
    accumulated = sf36_input.reset_index(drop=True)

    generated_frames = []
    for stage_no, (banner, target_cols, filename_template) in enumerate(cascade):
        print(banner)
        stage_output = predict_questionnaire(
            stage_input, target_cols, filename_template, models_dir
        )
        generated_frames.append(stage_output)

        # Nothing consumes the last questionnaire, so skip the final concat.
        if stage_no < final_stage:
            accumulated = pd.concat([accumulated, stage_output], axis=1)
            stage_input = accumulated

    return tuple(generated_frames)

In [None]:
# ==========================================
# 3. Execution Block
# ==========================================

# Settings
# Point these at your own locations when running elsewhere, for example in
# Google Colab with Drive mounted:
#   MODELS_DIR = '/content/drive/MyDrive/SFCSyntheticDataGenerator/models'
MODELS_DIR = './models'
INPUT_FILE = 'sf36.csv'

if not os.path.exists(INPUT_FILE):
    print(f"Input file '{INPUT_FILE}' not found. Please upload it to the runtime.")
else:
    print("Loading input data...")
    sf36_df = pd.read_csv(INPUT_FILE)

    # Basic data cleaning: remove the 'Unnamed: 0' column when present
    # (presumably a saved index), then drop every row that has a missing value.
    if 'Unnamed: 0' in sf36_df.columns:
        sf36_df = sf36_df.drop(columns=['Unnamed: 0'])
    sf36_df = sf36_df.dropna()

    # The first 36 columns are assumed to be the SF-36 items.
    # Adjust the .iloc slice if your CSV is laid out differently.
    X_input = sf36_df.iloc[:, :36]

    try:
        # Run the generator
        print("Starting generation process...")
        had, scl90r, fis8, fis40, psqi = generate_synthetic_data(
            X_input,
            models_dir=MODELS_DIR,
        )

        print("\nGeneration completed successfully!")
        print(f"Output Shapes -> HAD: {had.shape}, SCL: {scl90r.shape}, PSQI: {psqi.shape}")

        # To save a result, write the frame out, e.g.:
        # had.to_csv('synthetic_HAD.csv', index=False)

    except Exception as exc:
        # Top-level boundary of the notebook: report the failure instead of
        # letting the traceback end the cell.
        print(f"\nError during execution: {exc}")
        print("Please ensure you have downloaded the weights and the path is correct.")