# PRISM Pipeline: Step 1 - Fetch Findings

**Dataset:** NASA C-MAPSS FD001 (Turbofan Engine Degradation)

**Date:** 2026-01-18

In [None]:
import polars as pl
from prism.db import get_path, OBSERVATIONS

## Load Observations

In [None]:
# Resolve where the observations table lives, then read it as a DataFrame.
observations_path = get_path(OBSERVATIONS)
obs = pl.read_parquet(observations_path)

# Report the table's size and column names before previewing it.
row_total = len(obs)
print(f"Total observations: {row_total:,}")
print(f"Columns: {obs.columns}")

# Last expression of the cell: preview of the first ten rows.
obs.head(10)

## Dataset Summary

| Metric | Value |
|--------|-------|
| **Rows** | 515,775 |
| **Entities** | 100 engines (FD001_U001 - FD001_U100) |
| **Signals** | 25 (21 sensors + 3 operating conditions + 1 RUL target) |
| **Columns** | entity_id, signal_id, timestamp, value |

## Entity Analysis (Engines)

In [None]:
# Distinct entity IDs in sorted order, so head/tail show both ends of the range.
entities = obs['entity_id'].unique().sort()

entity_total = len(entities)
first_entities = entities.head(10).to_list()
last_entities = entities.tail(10).to_list()

print(f"Unique entities: {entity_total}")
print(f"\nFirst 10: {first_entities}")
print(f"Last 10: {last_entities}")

## Signal Analysis (Sensors)

In [None]:
# Distinct signal IDs present in the observations table, in sorted order.
signals = obs['signal_id'].unique().sort()

signal_total = len(signals)
signal_names = signals.to_list()

print(f"Unique signals: {signal_total}")
print(f"\nAll signals: {signal_names}")

### Signal Categories

**21 Sensors:**
- Temperature: T2, T24, T30, T50
- Pressure: P2, P15, P30, Ps30
- Speed: Nf, Nc, NRf, NRc
- Ratios: epr, BPR, farB, phi
- Bleed: htBleed, W31, W32
- Demand: Nf_dmd, PCNfR_dmd

**3 Operating Conditions:** op1, op2, op3

**1 Target:** RUL (Remaining Useful Life)

## Timestamp Analysis (Cycles)

In [None]:
# Cycles per entity.
# Only the rows of a single signal ('T2') are used, so each cycle is looked at
# once per engine. The largest timestamp is reported as the engine's cycle
# count (assumes timestamps are 1-based cycle indices — TODO confirm).
t2_rows = obs.filter(pl.col('signal_id') == 'T2')
last_cycle = pl.col('timestamp').max().alias('max_cycle')
cycles_per_entity = (
    t2_rows.group_by('entity_id')
    .agg(last_cycle)
    .sort('entity_id')
)

# Pull the column out once instead of re-indexing the frame for every statistic.
max_cycles = cycles_per_entity['max_cycle']

print("Cycle range per engine:")
print(f"  Min cycles: {max_cycles.min()}")
print(f"  Max cycles: {max_cycles.max()}")
print(f"  Mean cycles: {max_cycles.mean():.1f}")

# Last expression of the cell: preview of the first ten engines.
cycles_per_entity.head(10)

## RUL Distribution

In [None]:
# Keep only the rows that belong to the RUL signal.
rul_data = obs.filter(pl.col('signal_id') == 'RUL')

# Pull the value column out once instead of re-indexing for every statistic.
rul_values = rul_data['value']

print(f"RUL observations: {len(rul_data):,}")
print("\nRUL statistics:")
print(f"  Min: {rul_values.min():.0f}")
print(f"  Max: {rul_values.max():.0f}")
print(f"  Mean: {rul_values.mean():.1f}")
print(f"  Std: {rul_values.std():.1f}")

## Sensor Value Ranges

In [None]:
# Summary statistics of `value`, one output row per signal_id.
value_col = pl.col('value')
summary_exprs = [
    value_col.min().alias('min'),
    value_col.max().alias('max'),
    value_col.mean().alias('mean'),
    value_col.std().alias('std'),
    value_col.count().alias('count'),
]

per_signal = obs.group_by('signal_id').agg(summary_exprs)
signal_stats = per_signal.sort('signal_id')

# Last expression of the cell: display the full statistics table.
signal_stats

## Data Quality Check

In [None]:
# Count the nulls in every column of the observations table.
null_counts = obs.null_count()
print("Null counts per column:")
print(null_counts)

# Average number of observations per signal: total rows divided by the number
# of distinct signals. The signal count is read from the data rather than
# hard-coded, so the figure stays right if signals are added or dropped
# upstream. An empty table yields 0.0 instead of dividing by zero.
n_signals = obs['signal_id'].n_unique()
expected_per_signal = len(obs) / n_signals if n_signals else 0.0
print(f"\nExpected observations per signal: {expected_per_signal:,.0f}")

## Next Steps

1. **Cohort Discovery** - Group signals by behavioral similarity
2. **Signal Vector** - Compute 51 behavioral metrics per signal
3. **Geometry** - Compute pairwise relationships
4. **State** - Track temporal dynamics
5. **ML Accelerator** - Train RUL prediction model