# Hackathon Data Preparation Pipeline
**Output:** Cleaned Snapshot Datasets for Uncertainty Quantification Hackathon
**Author:** [Your Name]
**Date:** 2026-01-18

## Overview
This notebook processes the raw NASA C-MAPSS turbofan degradation data into "Snapshot" datasets suitable for the hackathon. 

**Procedure:**
1.  **Load** raw `train_FD00x.txt` files.
2.  **Calculate RUL** (Remaining Useful Life) based on the run-to-failure cycle counts.
3.  **Split** data into Train (80%) and Test (20%) sets based on **Unit ID**.
    * *Crucial:* We split by Unit ID (e.g., Engine 1 vs Engine 2) so that every cycle of a given engine lands in exactly one set; no engine's degradation trajectory leaks from Train into Test.
4.  **Shuffle** all rows to destroy the time-series structure (creating independent snapshots).
5.  **Export** two versions of the data:
    * **Public:** Features + Target (RUL). Metadata (Unit/Cycle) is removed.
    * **Mapping:** Original Metadata (Unit/Cycle) plus RUL, row-aligned with the public file, for lineage tracking (kept internal).
6.  **Generate** an `info.json` for each scenario containing full sensor metadata.

**Reproducibility:**
* A fixed `RANDOM_SEED` is used for unit shuffling and row shuffling.

In [1]:
import pandas as pd
import numpy as np
import os
import json

# --- CONFIGURATION ---
# Seed used for both the unit-level shuffle and the row-level shuffle.
RANDOM_SEED = 42
# Location of raw train_FD00x.txt files.
DATA_DIR = '../data'
# Where clean per-scenario folders will be created.
OUTPUT_BASE = '../data/scenarios'

# One raw training file per scenario (FD001 .. FD004).
SOURCE_FILES = [f'train_FD00{idx}.txt' for idx in range(1, 5)]

# --- METADATA DEFINITIONS ---

# Phrase fragments shared by the scenario descriptions below.
_SINGLE_CONDITION = "One operating condition (Sea Level)."
_SIX_CONDITIONS = "Six operating conditions."
_ONE_FAULT = "One fault mode (HPC Degradation)."
_TWO_FAULTS = "Two fault modes (HPC or Fan Degradation)."

# Human-readable summary of each scenario, keyed by scenario name.
SCENARIO_DESCRIPTIONS = {
    "FD001": f"{_SINGLE_CONDITION} {_ONE_FAULT}",
    "FD002": f"{_SIX_CONDITIONS} {_ONE_FAULT}",
    "FD003": f"{_SINGLE_CONDITION} {_TWO_FAULTS}",
    "FD004": f"{_SIX_CONDITIONS} {_TWO_FAULTS}",
}

# Full Sensor Glossary
def _sensor_entry(symbol, description, unit, short_description):
    """Build one sensor record; its label is '<symbol>: <description> (<unit>)'."""
    return {
        "symbol": symbol,
        "description": description,
        "unit": unit,
        "short_description": short_description,
        "label": f"{symbol}: {description} ({unit})",
    }


# (column, symbol, description, unit, short description) for the 21 sensor columns.
_SENSOR_ROWS = [
    ("s1", "T2", "Total temperature at fan inlet", "°R", "temp fan inlet"),
    ("s2", "T24", "Total temperature at LPC outlet", "°R", "temp LPC outlet"),
    ("s3", "T30", "Total temperature at HPC outlet", "°R", "temp HPC outlet"),
    ("s4", "T50", "Total temperature at LPT outlet", "°R", "temp LPT outlet"),
    ("s5", "P2", "Pressure at fan inlet", "psia", "Press. fan inlet"),
    ("s6", "P15", "Total pressure in bypass-duct", "psia", "Press. bypass-duct"),
    ("s7", "P30", "Total pressure at HPC outlet", "psia", "Press. HPC outlet"),
    ("s8", "Nf", "Physical fan speed", "rpm", "Phys. fan speed"),
    ("s9", "Nc", "Physical core speed", "rpm", "Phys. core speed"),
    ("s10", "epr", "Engine pressure ratio (P50/P2)", "--", "Engine press. ratio"),
    ("s11", "Ps30", "Static pressure at HPC outlet", "psia", "Stat. press. HPC outlet"),
    ("s12", "phi", "Ratio of fuel flow to Ps30", "pps/psi", "Ratio fuel flow:Ps30"),
    ("s13", "NRf", "Corrected fan speed", "rpm", "Corr. fan speed"),
    ("s14", "NRc", "Corrected core speed", "rpm", "Corr. core speed"),
    ("s15", "BPR", "Bypass Ratio", "--", "Bypass Ratio"),
    ("s16", "farB", "Burner fuel-air ratio", "--", "Burner fuel-air ratio"),
    ("s17", "htBleed", "Bleed Enthalpy", "--", "Bleed Enthalpy"),
    ("s18", "Nf_dmd", "Demanded fan speed", "rpm", "Demand. fan speed"),
    ("s19", "PCNfR_dmd", "Demanded corrected fan speed", "rpm", "Demand. corr. fan speed"),
    ("s20", "W31", "HPT coolant bleed", "lbm/s", "HPT coolant bleed"),
    ("s21", "W32", "LPT coolant bleed", "lbm/s", "LPT coolant bleed"),
]

# Operating-setting columns come first; their labels are prefixed with the
# column name rather than the symbol, so they are spelled out explicitly.
SENSOR_METADATA = {
    "op1": dict(
        symbol="Alt",
        description="Altitude",
        unit="ft",
        short_description="Altitude",
        label="op1: Altitude (ft)",
    ),
    "op2": dict(
        symbol="Mach",
        description="Mach Number",
        unit="Mach",
        short_description="Mach Number",
        label="op2: Mach Number",
    ),
    "op3": dict(
        symbol="TRA",
        description="Throttle Resolver Angle",
        unit="%",
        short_description="Throttle Angle",
        label="op3: Throttle Resolver Angle (%)",
    ),
}

# Sensor columns s1..s21, inserted in table order.
SENSOR_METADATA.update(
    {column: _sensor_entry(*fields) for column, *fields in _SENSOR_ROWS}
)

# Target column; it carries no symbol or short description.
SENSOR_METADATA["RUL"] = dict(
    description="Remaining Useful Life",
    unit="Cycles",
    label="RUL: Remaining Useful Life (Cycles)",
)

In [2]:
def process_scenario(filename):
    """
    Reads a raw NASA file, splits by unit, shuffles, and saves clean CSVs.

    For the scenario derived from ``filename`` this writes, under
    ``OUTPUT_BASE/<scenario>/``:

    * ``train_<scenario>.csv`` / ``test_<scenario>.csv`` -- shuffled rows with
      the ``unit`` and ``cycle`` columns removed (features + ``RUL``).
    * ``source_mapping/<split>_<scenario>_source.csv`` -- the ``unit``,
      ``cycle`` and ``RUL`` columns in the same shuffled row order.
    * ``info.json`` -- scenario description, file names and column metadata.

    Args:
        filename: Name of a raw file inside ``DATA_DIR`` (e.g.
            ``'train_FD001.txt'``). The scenario name is obtained by removing
            ``'train_'`` and ``'.txt'`` from it.

    Returns:
        None. Results are written to disk and progress is printed.

    Raises:
        KeyError: If the derived scenario name has no entry in
            ``SCENARIO_DESCRIPTIONS``. This is checked before any directory
            or file is created, so an unknown scenario leaves no partial
            output behind.
    """
    scenario_name = filename.replace('train_', '').replace('.txt', '') # e.g. FD001

    # Fail fast: look the description up before touching the filesystem rather
    # than after all CSVs have already been written.
    scenario_description = SCENARIO_DESCRIPTIONS[scenario_name]

    output_dir = os.path.join(OUTPUT_BASE, scenario_name)
    mapping_dir = os.path.join(output_dir, 'source_mapping')

    # Ensure directories exist
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(mapping_dir, exist_ok=True)

    print(f"Processing {scenario_name}...")

    # 1. LOAD RAW DATA
    # The raw file is whitespace-delimited with no header row; columns are
    # unit, cycle, three operating settings and 21 sensor readings.
    path = os.path.join(DATA_DIR, filename)
    cols = ['unit', 'cycle', 'op1', 'op2', 'op3'] + [f's{i}' for i in range(1, 22)]
    df = pd.read_csv(path, sep=r'\s+', header=None, names=cols)

    # 2. CALCULATE RUL (Run-to-Failure)
    # The last recorded cycle of each unit is treated as its failure point,
    # so RUL is the distance from the current cycle to that unit's max cycle.
    max_cycles = df.groupby('unit')['cycle'].transform('max')
    df['RUL'] = max_cycles - df['cycle']

    # 3. SPLIT TRAIN/TEST (80/20 by Unit ID)
    units = df['unit'].unique()

    # Deterministic Shuffle of Units
    # A fresh generator is seeded on every call, so each scenario's split
    # depends only on RANDOM_SEED and the order units appear in the file.
    rng = np.random.default_rng(RANDOM_SEED)
    rng.shuffle(units)

    n_train = int(len(units) * 0.8)
    train_units = units[:n_train]
    test_units = units[n_train:]

    print(f"  Split: {len(train_units)} Train Units / {len(test_units)} Test Units")

    df_train = df[df['unit'].isin(train_units)].copy()
    df_test = df[df['unit'].isin(test_units)].copy()

    # 4. SAVE FUNCTIONS (Shuffle Rows + Vertical Split)
    def save_datasets(df_subset, split_name):
        """Shuffle ``df_subset`` and write its public and mapping CSVs.

        Both files are cut from the same shuffled frame, so row *i* of the
        mapping file describes row *i* of the public file. Returns the number
        of rows written.
        """
        # Shuffle Rows
        df_shuffled = df_subset.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

        # A. Public File (Features + Target) - No Unit/Cycle
        cols_public = [c for c in df_shuffled.columns if c not in ['unit', 'cycle']]
        df_public = df_shuffled[cols_public]

        public_path = os.path.join(output_dir, f"{split_name}_{scenario_name}.csv")
        df_public.to_csv(public_path, index=False)

        # B. Mapping File (Metadata for Lineage) - Kept Secret
        cols_mapping = ['unit', 'cycle', 'RUL']
        df_mapping = df_shuffled[cols_mapping]

        mapping_path = os.path.join(mapping_dir, f"{split_name}_{scenario_name}_source.csv")
        df_mapping.to_csv(mapping_path, index=False)

        return len(df_public)

    n_train_rows = save_datasets(df_train, "train")
    n_test_rows = save_datasets(df_test, "test")

    print(f"  Saved: Train ({n_train_rows} rows), Test ({n_test_rows} rows)")

    # 5. GENERATE METADATA JSON
    info_content = {
        "scenario": scenario_name,
        "description": scenario_description,
        "files": {
            "train": f"train_{scenario_name}.csv",
            "test": f"test_{scenario_name}.csv"
        },
        "column_metadata": SENSOR_METADATA
    }

    # ensure_ascii=False writes non-ASCII metadata (e.g. the degree sign in
    # "°R") verbatim, so the encoding is pinned to UTF-8 instead of relying on
    # the platform's locale default.
    with open(os.path.join(output_dir, 'info.json'), 'w', encoding='utf-8') as f:
        json.dump(info_content, f, indent=4, ensure_ascii=False)

In [3]:
# --- EXECUTION ---
# Run every configured scenario in order, framed by a banner and a footer.
if __name__ == "__main__":
    separator = "-" * 30

    print(f"Starting Data Prep. Seed={RANDOM_SEED}")
    print(separator)

    for source_file in SOURCE_FILES:
        process_scenario(source_file)

    print(separator)
    print("Done. All scenarios generated in:", OUTPUT_BASE)

Starting Data Prep. Seed=42
------------------------------
Processing FD001...
  Split: 80 Train Units / 20 Test Units
  Saved: Train (16527 rows), Test (4104 rows)
Processing FD002...
  Split: 208 Train Units / 52 Test Units
  Saved: Train (43301 rows), Test (10458 rows)
Processing FD003...
  Split: 80 Train Units / 20 Test Units
  Saved: Train (19842 rows), Test (4878 rows)
Processing FD004...
  Split: 199 Train Units / 50 Test Units
  Saved: Train (49521 rows), Test (11728 rows)
------------------------------
Done. All scenarios generated in: ../data/scenarios
