In [1]:
# %% [markdown]
# # Oxygen Anomaly Detection – Generic Minute-Level Model
#
# This notebook builds a generic anomaly detection model for minute-level
# oxygen readings, detecting:
# - Point anomalies (outliers)
# - Collective anomalies (sequences)
# - Contextual anomalies (conditional outliers)
# - Sensor-fault anomalies (stuck, spikes/glitches, high noise)
#
# It produces a per-minute anomaly severity score for each sensor.

# %% imports & config

import numpy as np
import pandas as pd
import os
from pathlib import Path
from dataclasses import dataclass

In [2]:
@dataclass
class AnomalyConfig:
    """
    Tunable parameters for the rule-based oxygen anomaly pipeline.

    The rolling windows are handed to ``Series.rolling`` as integer row
    counts, so they correspond to minutes only when every sensor has exactly
    one row per minute.  Each ``*_low`` / ``*_high`` pair defines the linear
    ramp that maps a statistic onto a score in [0, 1].

    Raises:
        ValueError: if a window is smaller than 1, a ``low`` bound is not
            strictly below its ``high`` bound, or a sensor-fault parameter
            would make the score scaling divide by zero or change sign.
    """

    # Rolling windows (in rows; one row per minute on gap-free data)
    roll_window_baseline: int = 60           # 1-hour baseline
    roll_window_collective: int = 120        # 2-hour window for collective anomalies
    roll_window_stuck: int = 60              # 1-hour rolling std for 'stuck'
    roll_window_noise: int = 60              # 1-hour rolling std for 'noise'

    # Point / contextual anomaly z-score thresholds for score scaling
    z_point_low: float = 2.0                 # start increasing score
    z_point_high: float = 5.0                # max severity for point anomalies
    z_ctx_low: float = 2.0
    z_ctx_high: float = 4.0

    # Collective anomaly thresholds (mean |z| over a window)
    collective_low: float = 1.0
    collective_high: float = 3.0

    # Sensor-fault parameters
    stuck_rel_std_factor: float = 0.1        # local std < 0.1 * typical std ⇒ stuck
    noise_factor: float = 2.0                # local std > 2 * typical std ⇒ noisy
    spike_z_threshold: float = 3.0           # how many std-devs for spikes

    # Small epsilon for numerical stability
    eps: float = 1e-6

    def __post_init__(self) -> None:
        # Fail fast on values that would otherwise surface later as inf/NaN
        # scores (division by zero) or as silently inverted scoring ramps.
        for window_name in (
            "roll_window_baseline",
            "roll_window_collective",
            "roll_window_stuck",
            "roll_window_noise",
        ):
            if getattr(self, window_name) < 1:
                raise ValueError(f"{window_name} must be >= 1")

        for low_name, high_name in (
            ("z_point_low", "z_point_high"),
            ("z_ctx_low", "z_ctx_high"),
            ("collective_low", "collective_high"),
        ):
            if not getattr(self, low_name) < getattr(self, high_name):
                raise ValueError(f"{low_name} must be strictly less than {high_name}")

        # These three appear as divisors in the sensor-fault score scaling.
        if self.stuck_rel_std_factor <= 0:
            raise ValueError("stuck_rel_std_factor must be > 0")
        if self.noise_factor <= 1:
            raise ValueError("noise_factor must be > 1")
        if self.spike_z_threshold <= 0:
            raise ValueError("spike_z_threshold must be > 0")

In [3]:
# %% helpers: loading & preparation

def load_oxygen_csv(csv_path: str) -> pd.DataFrame:
    """
    Read the raw oxygen export into a DataFrame and parse its timestamps.

    The file must contain a ``time`` column; the downstream pipeline also
    expects Oxygen[%sat], EquipmentUnit, SubUnit, System and Unit.
    Timestamps that cannot be parsed are turned into NaT instead of raising.
    """
    frame = pd.read_csv(csv_path)
    timestamps = pd.to_datetime(frame["time"], errors="coerce")
    frame["time"] = timestamps
    return frame

In [4]:
def prepare_sensor_frame(df_raw: pd.DataFrame) -> pd.DataFrame:
    """
    Turn the raw export into the tidy per-sensor frame used by the pipeline.

    Steps:
    - keep only rows that have both a timestamp and an oxygen reading;
    - derive ``sensor_id`` as "System|EquipmentUnit|SubUnit" (tags are cast
      with ``astype(str)``, so a missing tag appears as its string form,
      e.g. "nan");
    - add ``hour`` and ``dayofweek`` taken from ``time``;
    - sort by (sensor_id, time) and renumber the index from 0.

    Returns a new frame with columns (time, sensor_id, oxygen, hour,
    dayofweek); ``df_raw`` is not modified.
    """
    has_reading = df_raw["time"].notna() & df_raw["Oxygen[%sat]"].notna()
    frame = df_raw.loc[has_reading].rename(columns={"Oxygen[%sat]": "oxygen"})

    system, equipment, sub_unit = (
        frame[tag].astype(str) for tag in ("System", "EquipmentUnit", "SubUnit")
    )
    frame["sensor_id"] = system + "|" + equipment + "|" + sub_unit

    timestamps = frame["time"].dt
    frame["hour"] = timestamps.hour
    frame["dayofweek"] = timestamps.dayofweek

    ordered = frame.sort_values(["sensor_id", "time"]).reset_index(drop=True)
    return ordered[["time", "sensor_id", "oxygen", "hour", "dayofweek"]]

In [5]:
# %% helpers: score scaling

def squash_z(z: pd.Series, low: float, high: float) -> pd.Series:
    """
    Convert z-scores into a [0, 1] score using a linear ramp on |z|.

    |z| <= `low` gives 0, |z| >= `high` gives 1, and values in between are
    interpolated linearly.  NaN inputs stay NaN.
    """
    ramp_width = high - low
    scaled = z.abs().sub(low).div(ramp_width)
    return scaled.clip(lower=0.0, upper=1.0)

In [6]:
# %% components: baseline & point anomalies

def add_baseline_and_point_scores(df: pd.DataFrame, cfg: AnomalyConfig) -> pd.DataFrame:
    """
    Add the rolling baseline and the point-anomaly score, sensor by sensor.

    New columns on the returned copy:
    - roll_mean / roll_std: trailing rolling statistics of `oxygen` over
      `cfg.roll_window_baseline` rows (NaN until half a window is available);
    - z_global: residual from roll_mean divided by (roll_std + eps);
    - score_point: |z_global| squashed into [0, 1].

    The rolling window includes the current row, so each reading contributes
    to its own baseline.
    """
    out = df.copy()
    window = cfg.roll_window_baseline
    min_obs = window // 2

    def _trailing(series: pd.Series):
        # One shared window definition for both the mean and the std.
        return series.rolling(window=window, min_periods=min_obs)

    oxygen_by_sensor = out.groupby("sensor_id", group_keys=False)["oxygen"]
    out["roll_mean"] = oxygen_by_sensor.transform(lambda s: _trailing(s).mean())
    out["roll_std"] = oxygen_by_sensor.transform(lambda s: _trailing(s).std())

    residual = out["oxygen"] - out["roll_mean"]
    out["z_global"] = residual / (out["roll_std"] + cfg.eps)
    out["score_point"] = squash_z(out["z_global"], cfg.z_point_low, cfg.z_point_high)

    return out

In [7]:
# %% components: collective anomalies

def add_collective_scores(df: pd.DataFrame, cfg: AnomalyConfig) -> pd.DataFrame:
    """
    Score sustained deviations (collective anomalies).

    Expects `z_global` to be present.  Per sensor, the mean of |z_global| is
    taken over a trailing window of `cfg.roll_window_collective` rows and
    stored as `roll_mean_abs_z`; that value is then mapped linearly from
    [collective_low, collective_high] onto [0, 1] as `score_collective`.
    """
    out = df.copy()
    window = cfg.roll_window_collective
    min_obs = window // 2

    z_by_sensor = out.groupby("sensor_id", group_keys=False)["z_global"]
    out["roll_mean_abs_z"] = z_by_sensor.transform(
        lambda z: z.abs().rolling(window=window, min_periods=min_obs).mean()
    )

    ramp_width = cfg.collective_high - cfg.collective_low
    scaled = out["roll_mean_abs_z"].sub(cfg.collective_low).div(ramp_width)
    out["score_collective"] = scaled.clip(lower=0.0, upper=1.0)

    return out

In [8]:
# %% components: contextual anomalies

def add_contextual_scores(df: pd.DataFrame, cfg: AnomalyConfig) -> pd.DataFrame:
    """
    Score readings that are unusual for their hour of day.

    The per-hour mean and standard deviation of `oxygen` are computed over
    the whole frame, i.e. pooled across every sensor rather than per sensor.
    Each row gets `ctx_mean_hour`, `ctx_std_hour`, the resulting `z_context`
    and `score_context` (|z_context| squashed into [0, 1]).
    """
    out = df.copy()

    # One row per hour; named aggregation yields the final column names directly.
    hourly_stats = out.groupby("hour")["oxygen"].agg(
        ctx_mean_hour="mean",
        ctx_std_hour="std",
    )
    out = out.join(hourly_stats, on="hour")

    deviation = out["oxygen"] - out["ctx_mean_hour"]
    out["z_context"] = deviation / (out["ctx_std_hour"] + cfg.eps)
    out["score_context"] = squash_z(out["z_context"], cfg.z_ctx_low, cfg.z_ctx_high)

    return out

In [9]:
# %% components: sensor-fault anomalies

def add_sensor_fault_scores(df: pd.DataFrame, cfg: AnomalyConfig) -> pd.DataFrame:
    """
    Score three sensor-fault signatures and combine them.

    - score_stuck: rolling std far below the sensor's overall std.
    - score_spike: a jump that is immediately reversed on the next row, sized
      relative to the std of the sensor's row-to-row differences.
    - score_noise: rolling std well above the sensor's overall std.
    - score_sensor_fault: row-wise maximum of the three.

    Also keeps the intermediate `roll_std_long` and `roll_std_noise` columns.
    All statistics are computed per `sensor_id`.
    """
    out = df.copy()
    oxygen_by_sensor = out.groupby("sensor_id", group_keys=False)["oxygen"]

    def _rolling_std(window: int) -> pd.Series:
        # Trailing std per sensor; NaN until half a window has been seen.
        return oxygen_by_sensor.transform(
            lambda s: s.rolling(window=window, min_periods=window // 2).std()
        )

    # "Typical" volatility of each sensor, broadcast back onto its rows.
    typical_std = oxygen_by_sensor.transform("std")
    typical_step_std = oxygen_by_sensor.transform(lambda s: s.diff().std())

    # --- stuck: ratio 0 -> score 1, ratio >= stuck_rel_std_factor -> score 0 ---
    out["roll_std_long"] = _rolling_std(cfg.roll_window_stuck)
    stuck_ratio = out["roll_std_long"] / (typical_std + cfg.eps)
    stuck_raw = (cfg.stuck_rel_std_factor - stuck_ratio) / cfg.stuck_rel_std_factor
    out["score_stuck"] = stuck_raw.clip(lower=0.0, upper=1.0)

    # --- spikes / glitches ---
    step_in = oxygen_by_sensor.diff()         # x[t] - x[t-1]
    step_out = -oxygen_by_sensor.diff(-1)     # x[t+1] - x[t]
    reversal_size = np.minimum(step_in.abs(), step_out.abs())
    spike_norm = reversal_size / (typical_step_std + cfg.eps)
    # Opposite-signed steps mean the jump was undone on the very next row.
    is_spike = (step_in * step_out < 0) & (spike_norm > cfg.spike_z_threshold)
    spike_raw = (spike_norm - cfg.spike_z_threshold) / cfg.spike_z_threshold
    out["score_spike"] = spike_raw.clip(upper=1.0).where(is_spike, 0.0)

    # --- high noise: ratio 1 -> score 0, ratio >= noise_factor -> score 1 ---
    out["roll_std_noise"] = _rolling_std(cfg.roll_window_noise)
    noise_ratio = out["roll_std_noise"] / (typical_std + cfg.eps)
    noise_raw = (noise_ratio - 1.0) / (cfg.noise_factor - 1.0)
    out["score_noise"] = noise_raw.clip(lower=0.0, upper=1.0)

    # Worst of the three fault signatures (NaN sub-scores are skipped by max).
    fault_columns = ["score_stuck", "score_spike", "score_noise"]
    out["score_sensor_fault"] = out[fault_columns].max(axis=1)

    return out

In [10]:
# %% optional: synthetic anomaly injection (for illustration / testing)

def inject_synthetic_anomalies(
    df: pd.DataFrame,
    n_point_spikes: int = 20,
    n_collective_dips: int = 5,
    stuck_length: int = 60,
    noise_length: int = 60,
    random_state: int = 42,
) -> pd.DataFrame:
    """
    Inject synthetic anomalies into a copy of df (the input is not modified).

    For every sensor with at least 1000 rows, in this order:

    - `n_point_spikes` single-row jumps of +/- 20..40;
    - `n_collective_dips` segments of 30..119 rows scaled by 0.6..0.8;
    - one stuck segment of `stuck_length` rows held at its first value;
    - one noisy segment of `noise_length` rows with added Gaussian noise
      (sigma = 2 x the segment's own std).

    Adds column `synthetic_label` with:
    'normal', 'syn_point_spike', 'syn_collective_dip', 'syn_stuck', 'syn_high_noise'

    Segments may overlap; a row keeps the label of the last injection that
    touched it.  A stuck or noisy segment that does not fit inside the sensor
    (length <= 0 or >= number of rows; the noisy segment also needs at least
    2 rows for a defined std) is skipped instead of raising.

    Rows are addressed by index label, so the index is assumed to be unique
    (as produced by `prepare_sensor_frame`).
    """
    rng = np.random.default_rng(random_state)
    df = df.copy()
    df["synthetic_label"] = "normal"

    for _sensor_id, df_s in df.groupby("sensor_id"):
        idx = df_s.index.to_numpy()
        n = len(idx)
        # Short series are left untouched; this also guarantees the hard-coded
        # 120-row dip window below always fits.
        if n < 1000:
            continue

        # point spikes
        for _ in range(n_point_spikes):
            i = int(rng.integers(0, n))
            row_idx = idx[i]
            sign = rng.choice([-1, 1])
            magnitude = rng.uniform(20, 40)
            df.loc[row_idx, "oxygen"] += sign * magnitude
            df.loc[row_idx, "synthetic_label"] = "syn_point_spike"

        # collective dips
        for _ in range(n_collective_dips):
            start = int(rng.integers(0, n - 120))
            length = int(rng.integers(30, 120))
            segment_idx = idx[start : start + length]
            df.loc[segment_idx, "oxygen"] *= rng.uniform(0.6, 0.8)
            df.loc[segment_idx, "synthetic_label"] = "syn_collective_dip"

        # stuck: rng.integers needs low < high, and an empty segment has no
        # first value to hold, so only inject when the segment fits.
        if 0 < stuck_length < n:
            start = int(rng.integers(0, n - stuck_length))
            segment_idx = idx[start : start + stuck_length]
            stuck_value = float(df.loc[segment_idx[0], "oxygen"])
            df.loc[segment_idx, "oxygen"] = stuck_value
            df.loc[segment_idx, "synthetic_label"] = "syn_stuck"

        # high-noise: a single-row segment has NaN std, which would write NaN
        # readings, so require at least two rows.
        if 2 <= noise_length < n:
            start = int(rng.integers(0, n - noise_length))
            segment_idx = idx[start : start + noise_length]
            base = df.loc[segment_idx, "oxygen"]
            df.loc[segment_idx, "oxygen"] = base + rng.normal(
                0, base.std() * 2.0, size=len(segment_idx)
            )
            df.loc[segment_idx, "synthetic_label"] = "syn_high_noise"

    return df

In [11]:
# %% combine: overall severity

def add_severity_score(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add `severity`: the row-wise maximum of the four anomaly sub-scores.

    Requires score_point, score_collective, score_context and
    score_sensor_fault.  NaN sub-scores are ignored by the maximum, so a row
    is NaN only if all four are NaN.  Returns a copy; `df` is left untouched.
    """
    sub_scores = ["score_point", "score_collective", "score_context", "score_sensor_fault"]
    out = df.copy()
    worst = out[sub_scores].max(axis=1)
    out["severity"] = worst
    return out

In [12]:
# %% OxygenAnomalyDetector class

class OxygenAnomalyDetector:
    """
    Thin scikit-style front end for the scoring pipeline.

    Usage:
        det = OxygenAnomalyDetector()
        det.fit(df_raw)                # df_raw is the full raw dataset
        scored = det.score(df_raw)     # returns per-minute anomaly scores

    Attributes:
        cfg: the AnomalyConfig used by every pipeline stage.
        fitted_: True once `fit` has been called.
    """

    def __init__(self, cfg: AnomalyConfig | None = None):
        # Fall back to the default configuration when none is supplied.
        self.cfg = cfg if cfg else AnomalyConfig()
        self.fitted_ = False

    def fit(self, df_raw: pd.DataFrame):
        """
        Mark the detector as fitted and return it.

        Nothing is learned from `df_raw`: all statistics (per-hour context
        stats, per-sensor typical std, ...) are recomputed on every `score()`
        call.  The method exists for API symmetry and future extensions.
        """
        self.fitted_ = True
        return self

    def score(self, df_raw: pd.DataFrame, inject_synthetic: bool = False) -> pd.DataFrame:
        """
        Run the full anomaly detection pipeline on df_raw.

        The returned DataFrame contains, among the intermediate columns
        produced by each stage:
        - time, sensor_id, oxygen
        - score_point, score_collective, score_context, score_sensor_fault
        - severity
        - synthetic_label (only when `inject_synthetic` is True)

        Raises:
            RuntimeError: if `fit` has not been called first.
        """
        if not self.fitted_:
            raise RuntimeError("Call `fit(df_raw)` before `score(df_raw)`.")

        frame = prepare_sensor_frame(df_raw)

        # Synthetic anomalies go in before scoring so that they get scored too.
        if inject_synthetic:
            frame = inject_synthetic_anomalies(frame)

        # Order matters: the collective stage reads z_global from the baseline stage.
        config_driven_stages = (
            add_baseline_and_point_scores,
            add_collective_scores,
            add_contextual_scores,
            add_sensor_fault_scores,
        )
        for stage in config_driven_stages:
            frame = stage(frame, self.cfg)

        return add_severity_score(frame)

In [13]:
# Data is expected in a `data/` directory next to the notebook's working directory.
DATA_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir, "data"))  # kept as str
DATA_RAW_DIR = Path(DATA_DIR) / "raw"
# Path to the dataset (adjust this if needed)
csv_path = DATA_RAW_DIR / "oxygen.csv"

In [14]:
# Load the raw export; load_oxygen_csv parses `time` and turns unparseable values into NaT.
df_raw = load_oxygen_csv(csv_path)

In [15]:
# Build a detector with the default AnomalyConfig. `fit` learns nothing from df_raw:
# it only marks the detector as fitted and returns it, which is why the detector's
# repr is echoed as this cell's output.
detector = OxygenAnomalyDetector()
detector.fit(df_raw)

<__main__.OxygenAnomalyDetector at 0x10bcadc50>

In [16]:
# Echo the raw frame as a visual sanity check (the notebook shows a truncated repr).
# NOTE(review): `df_raw.head()` would keep the saved output smaller.
df_raw

Unnamed: 0,time,Oxygen[%sat],EquipmentUnit,SubUnit,System,Unit
0,2025-04-01 17:43:00,,EquipmentUnit_05,,System_01,Unit_01
1,2025-04-01 17:44:00,,EquipmentUnit_05,,System_01,Unit_01
2,2025-04-01 17:45:00,,EquipmentUnit_05,,System_01,Unit_01
3,2025-04-01 17:46:00,,EquipmentUnit_05,,System_01,Unit_01
4,2025-04-01 17:47:00,,EquipmentUnit_05,,System_01,Unit_01
...,...,...,...,...,...,...
7128268,2025-11-13 23:55:00,87.648140,EquipmentUnit_11,,System_10,Unit_01
7128269,2025-11-13 23:56:00,87.377449,EquipmentUnit_11,,System_10,Unit_01
7128270,2025-11-13 23:57:00,87.434769,EquipmentUnit_11,,System_10,Unit_01
7128271,2025-11-13 23:58:00,87.449242,EquipmentUnit_11,,System_10,Unit_01


In [17]:
# Score every sensor. With inject_synthetic=True the pipeline first plants synthetic
# anomalies (default seed 42) in the prepared frame, so the scores below describe the
# perturbed data rather than the raw readings.
scored = detector.score(df_raw, inject_synthetic=True)

In [18]:
# Inspect the first 20 rows of the main score columns. print() emits the plain-text
# repr, which wraps wide frames across several column blocks.
print(
    scored[
        [
            "time",
            "sensor_id",
            "oxygen",
            "score_point",
            "score_collective",
            "score_context",
            "score_sensor_fault",
            "severity",
            "synthetic_label",
        ]
    ].head(20)
)

                  time                       sensor_id     oxygen  \
0  2025-04-01 17:43:00  System_10|EquipmentUnit_05|nan  91.639549   
1  2025-04-01 17:44:00  System_10|EquipmentUnit_05|nan  91.443474   
2  2025-04-01 17:45:00  System_10|EquipmentUnit_05|nan  91.784111   
3  2025-04-01 17:46:00  System_10|EquipmentUnit_05|nan  92.180275   
4  2025-04-01 17:47:00  System_10|EquipmentUnit_05|nan  92.131180   
5  2025-04-01 17:48:00  System_10|EquipmentUnit_05|nan  92.002243   
6  2025-04-01 17:49:00  System_10|EquipmentUnit_05|nan  91.797531   
7  2025-04-01 17:50:00  System_10|EquipmentUnit_05|nan  91.981346   
8  2025-04-01 17:51:00  System_10|EquipmentUnit_05|nan  92.329308   
9  2025-04-01 17:52:00  System_10|EquipmentUnit_05|nan  92.779999   
10 2025-04-01 17:53:00  System_10|EquipmentUnit_05|nan  92.681908   
11 2025-04-01 17:54:00  System_10|EquipmentUnit_05|nan  92.421844   
12 2025-04-01 17:55:00  System_10|EquipmentUnit_05|nan  92.347351   
13 2025-04-01 17:56:00  System_10|

In [19]:
# Show the full scored frame, including the intermediate columns added by each stage.
# Leading rows of a sensor have NaN rolling statistics until half a window is available.
scored

Unnamed: 0,time,sensor_id,oxygen,hour,dayofweek,synthetic_label,roll_mean,roll_std,z_global,score_point,...,ctx_std_hour,z_context,score_context,roll_std_long,score_stuck,score_spike,roll_std_noise,score_noise,score_sensor_fault,severity
0,2025-04-01 17:43:00,System_10|EquipmentUnit_05|nan,91.639549,17,1,normal,,,,,...,5.891127,0.679695,0.0,,,0.0,,,0.0,0.000000
1,2025-04-01 17:44:00,System_10|EquipmentUnit_05|nan,91.443474,17,1,normal,,,,,...,5.891127,0.646412,0.0,,,0.0,,,0.0,0.000000
2,2025-04-01 17:45:00,System_10|EquipmentUnit_05|nan,91.784111,17,1,normal,,,,,...,5.891127,0.704234,0.0,,,0.0,,,0.0,0.000000
3,2025-04-01 17:46:00,System_10|EquipmentUnit_05|nan,92.180275,17,1,normal,,,,,...,5.891127,0.771482,0.0,,,0.0,,,0.0,0.000000
4,2025-04-01 17:47:00,System_10|EquipmentUnit_05|nan,92.131180,17,1,normal,,,,,...,5.891127,0.763148,0.0,,,0.0,,,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1612634,2025-11-14 09:27:00,System_10|EquipmentUnit_11|nan,85.481667,9,4,normal,83.634142,0.872716,2.116981,0.038994,...,5.999268,-0.407451,0.0,0.872716,0.0,0.0,0.872716,0.0,0.0,0.168845
1612635,2025-11-14 09:28:00,System_10|EquipmentUnit_11|nan,85.612083,9,4,normal,83.673079,0.907868,2.135775,0.045258,...,5.999268,-0.385712,0.0,0.907868,0.0,0.0,0.907868,0.0,0.0,0.170538
1612636,2025-11-14 09:29:00,System_10|EquipmentUnit_11|nan,85.724228,9,4,normal,83.713886,0.944015,2.129563,0.043188,...,5.999268,-0.367019,0.0,0.944015,0.0,0.0,0.944015,0.0,0.0,0.172094
1612637,2025-11-14 09:30:00,System_10|EquipmentUnit_11|nan,85.901230,9,4,normal,83.757643,0.983391,2.179789,0.059930,...,5.999268,-0.337515,0.0,0.983391,0.0,0.0,0.983391,0.0,0.0,0.173995


In [20]:
# Summary statistics of the combined severity score across all sensors and rows.
scored['severity'].describe()

count    1.612639e+06
mean     1.791623e-01
std      2.046654e-01
min      0.000000e+00
25%      3.845007e-02
50%      1.315895e-01
75%      2.260665e-01
max      1.000000e+00
Name: severity, dtype: float64

In [21]:
# List every column produced by the pipeline (inputs, intermediates and scores).
scored.columns

Index(['time', 'sensor_id', 'oxygen', 'hour', 'dayofweek', 'synthetic_label',
       'roll_mean', 'roll_std', 'z_global', 'score_point', 'roll_mean_abs_z',
       'score_collective', 'ctx_mean_hour', 'ctx_std_hour', 'z_context',
       'score_context', 'roll_std_long', 'score_stuck', 'score_spike',
       'roll_std_noise', 'score_noise', 'score_sensor_fault', 'severity'],
      dtype='object')

In [22]:
# NOTE(review): repeats the earlier `scored` display cell; the output is identical.
scored

Unnamed: 0,time,sensor_id,oxygen,hour,dayofweek,synthetic_label,roll_mean,roll_std,z_global,score_point,...,ctx_std_hour,z_context,score_context,roll_std_long,score_stuck,score_spike,roll_std_noise,score_noise,score_sensor_fault,severity
0,2025-04-01 17:43:00,System_10|EquipmentUnit_05|nan,91.639549,17,1,normal,,,,,...,5.891127,0.679695,0.0,,,0.0,,,0.0,0.000000
1,2025-04-01 17:44:00,System_10|EquipmentUnit_05|nan,91.443474,17,1,normal,,,,,...,5.891127,0.646412,0.0,,,0.0,,,0.0,0.000000
2,2025-04-01 17:45:00,System_10|EquipmentUnit_05|nan,91.784111,17,1,normal,,,,,...,5.891127,0.704234,0.0,,,0.0,,,0.0,0.000000
3,2025-04-01 17:46:00,System_10|EquipmentUnit_05|nan,92.180275,17,1,normal,,,,,...,5.891127,0.771482,0.0,,,0.0,,,0.0,0.000000
4,2025-04-01 17:47:00,System_10|EquipmentUnit_05|nan,92.131180,17,1,normal,,,,,...,5.891127,0.763148,0.0,,,0.0,,,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1612634,2025-11-14 09:27:00,System_10|EquipmentUnit_11|nan,85.481667,9,4,normal,83.634142,0.872716,2.116981,0.038994,...,5.999268,-0.407451,0.0,0.872716,0.0,0.0,0.872716,0.0,0.0,0.168845
1612635,2025-11-14 09:28:00,System_10|EquipmentUnit_11|nan,85.612083,9,4,normal,83.673079,0.907868,2.135775,0.045258,...,5.999268,-0.385712,0.0,0.907868,0.0,0.0,0.907868,0.0,0.0,0.170538
1612636,2025-11-14 09:29:00,System_10|EquipmentUnit_11|nan,85.724228,9,4,normal,83.713886,0.944015,2.129563,0.043188,...,5.999268,-0.367019,0.0,0.944015,0.0,0.0,0.944015,0.0,0.0,0.172094
1612637,2025-11-14 09:30:00,System_10|EquipmentUnit_11|nan,85.901230,9,4,normal,83.757643,0.983391,2.179789,0.059930,...,5.999268,-0.337515,0.0,0.983391,0.0,0.0,0.983391,0.0,0.0,0.173995


In [23]:
# Persist the scored frame to the current working directory. to_csv is called with its
# defaults, so the row index is written as the first (unnamed) column.
scored.to_csv("scored_v1.csv")

In [24]:
# scored[scored['severity'] > 1.801140e-01]

In [25]:
# scored[scored['severity'] <= 1.801140e-01]

In [26]:
# # %% example usage

# if __name__ == "__main__":
#     # Adjust to your actual path, e.g. "/mnt/data/oxygen.csv"
#     csv_path = "oxygen.csv"

#     df_raw = load_oxygen_csv(csv_path)

#     detector = OxygenAnomalyDetector()
#     detector.fit(df_raw)

#     scored = detector.score(df_raw, inject_synthetic=True)

#     # Inspect a few columns
#     print(
#         scored[
#             [
#                 "time",
#                 "sensor_id",
#                 "oxygen",
#                 "score_point",
#                 "score_collective",
#                 "score_context",
#                 "score_sensor_fault",
#                 "severity",
#                 "synthetic_label",
#             ]
#         ].head(20)
#     )
