In [None]:
# %%
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
# Notebook-wide plotting defaults: wide figures with a background grid.
plt.rcParams.update({
    "figure.figsize": (10, 5),
    "axes.grid": True,
})

In [None]:
# %%
# Location of the scored dataset; adjust the path if needed.
path_subset = "scored_v2.csv"

# Load the table and convert the timestamp column to datetime.
df = pd.read_csv(path_subset)
df["time"] = pd.to_datetime(df["time"])

# Drop index artefact columns, but only those actually present in the file.
index_artefacts = [
    name for name in ("Unnamed: 0", "Unnamed: 0.1") if name in df.columns
]
df = df.drop(columns=index_artefacts)

df.head()


In [None]:
# %%
# Quick overview: dimensions, column names, and row counts per sensor / label.
print("Shape:", df.shape)
print("\nColumns:", list(df.columns))

for _heading, _column in (
    ("\nSensors:", "sensor_id"),
    ("\nSynthetic labels:", "synthetic_label"),
):
    print(_heading)
    print(df[_column].value_counts())


In [None]:
# %%
# Summary statistics and upper-tail quantiles of the severity score.
severity = df["severity"]

print(severity.describe())
print("\nQuantiles:")
upper_tail = [0.9, 0.95, 0.99, 0.999]
print(severity.quantile(upper_tail))

# Histogram
fig, ax = plt.subplots()
ax.hist(severity, bins=50)
ax.set_title("Severity distribution (all points)")
ax.set_xlabel("severity")
ax.set_ylabel("count")
plt.show()


In [None]:
# %%
# Descriptive statistics of severity, one row per synthetic label.
label_stats = df["severity"].groupby(df["synthetic_label"]).describe()
label_stats

In [None]:
# %% [markdown]
# Boxplot of severity by synthetic label.

In [None]:
# %%
# Draw on an explicit Axes so the labels below target this figure directly.
fig, ax = plt.subplots()
df.boxplot(column="severity", by="synthetic_label", rot=45, ax=ax)
ax.set_title("Severity by synthetic_label")
fig.suptitle("")  # blank out the figure-level super-title
ax.set_xlabel("synthetic_label")
ax.set_ylabel("severity")
plt.show()

In [None]:
# %%
# True anomalies: every row whose label is anything other than 'normal'.
is_anom = df["synthetic_label"].ne("normal")

# Number of anomalous rows and of normal rows, shown as a pair.
total_anom, total_norm = is_anom.sum(), (~is_anom).sum()
total_anom, total_norm

In [None]:
# %%
def confusion_at_threshold(th, scores=None, truth=None):
    """Confusion counts, precision and recall when flagging ``score >= th``.

    Parameters
    ----------
    th : float
        Cut-off; rows whose score is greater than or equal to it are
        predicted anomalous.
    scores : pandas.Series, optional
        Scores to threshold. Defaults to the notebook-level
        ``df["severity"]`` so existing ``confusion_at_threshold(th)`` calls
        keep working.
    truth : pandas.Series of bool, optional
        Ground-truth anomaly mask aligned with ``scores``. Defaults to the
        notebook-level ``is_anom``.

    Returns
    -------
    dict
        Keys ``threshold``, ``tp``, ``fp``, ``fn``, ``tn``, ``precision`` and
        ``recall``. Counts are plain ``int`` and ratios plain ``float`` so the
        dict prints cleanly. Precision / recall are ``0.0`` when their
        denominator is zero.
    """
    # Fall back to the notebook globals only when no explicit data is given.
    scores = df["severity"] if scores is None else scores
    truth = is_anom if truth is None else truth

    pred = scores >= th

    # Cast NumPy scalars to built-in ints: keeps printed dicts free of
    # `np.int64(...)` wrappers and makes the result JSON-friendly.
    tp = int((pred & truth).sum())
    fp = int((pred & ~truth).sum())
    fn = int((~pred & truth).sum())
    tn = int((~pred & ~truth).sum())

    n_flagged = tp + fp  # everything predicted anomalous
    n_actual = tp + fn   # everything truly anomalous

    precision = tp / n_flagged if n_flagged > 0 else 0.0
    recall = tp / n_actual if n_actual > 0 else 0.0

    return {
        "threshold": th,
        "tp": tp,
        "fp": fp,
        "fn": fn,
        "tn": tn,
        "precision": precision,
        "recall": recall,
    }

# Compare the confusion metrics at a few increasingly strict cut-offs.
for th in (0.5, 0.7, 0.9, 0.95):
    metrics = confusion_at_threshold(th)
    print(metrics)

In [None]:
# %%
def label_stats_at_threshold(th, data=None):
    """Per-label count of rows and of rows flagged at ``severity >= th``.

    Parameters
    ----------
    th : float
        Severity cut-off; rows at or above it count as flagged.
    data : pandas.DataFrame, optional
        Frame with ``synthetic_label`` and ``severity`` columns. Defaults to
        the notebook-level ``df`` so existing ``label_stats_at_threshold(th)``
        calls keep working.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``synthetic_label`` with columns ``count`` (rows in the
        group), ``flagged`` (rows at or above ``th``) and ``flagged_pct``
        (share flagged, in percent). The two count columns are integers.
    """
    frame = df if data is None else data

    # Group the boolean "flagged" mask by label once and aggregate it with
    # built-in reductions instead of a Python-level apply per group.
    flagged_by_label = (frame["severity"] >= th).groupby(frame["synthetic_label"])

    res = pd.DataFrame({
        "count": flagged_by_label.size(),
        "flagged": flagged_by_label.sum(),
        "flagged_pct": flagged_by_label.mean() * 100,
    })
    return res

# Show the per-label flag rates at each cut-off of interest.
for th in (0.7, 0.9, 0.95):
    print(f"=== threshold = {th} ===")
    stats_table = label_stats_at_threshold(th)
    display(stats_table)

In [None]:
# %%
# Component score columns summarised in this cell and the next.
score_cols = [
    "score_point", "score_collective", "score_context",
    "score_sensor_fault",
    "score_stuck", "score_spike", "score_noise",
]

# Distribution summary of every component score over the whole dataset.
df.loc[:, score_cols].describe()


In [None]:
# %%
# Mean of each component score within every synthetic label.
df[score_cols].groupby(df["synthetic_label"]).mean()

In [None]:
# %%
# Severity summary for every (sensor, label) pair, with explicit column order.
summary_cols = ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
severity_groups = df.groupby(["sensor_id", "synthetic_label"])["severity"]
sev_by_sensor_label = severity_groups.describe().loc[:, summary_cols]
sev_by_sensor_label


In [None]:
# %%
# Baseline behaviour: severity statistics per sensor, 'normal' rows only.
normal = df.loc[df["synthetic_label"].eq("normal")]
normal["severity"].groupby(normal["sensor_id"]).describe()

In [None]:
# %%
# Columns to show for the highest-severity rows.
top_anom_cols = [
    "time", "sensor_id", "oxygen", "synthetic_label",
    "score_point", "score_collective", "score_context", "score_sensor_fault",
    "severity",
]

# The 50 rows with the largest severity, most severe first.
top_anoms = df.sort_values(by="severity", ascending=False).head(50)
top_anoms[top_anom_cols]


In [None]:
# Display the column index of the loaded frame (quick schema check).
df.columns

In [None]:
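# Pseudo-code (not executed): presumably how `score_noise` is derived upstream. TODO confirm against the scoring pipeline.
# df["roll_std_noise"] = rolling_std_over_noise_window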
# noise_ratio = df["roll_std_noise"] / (sensor_std + eps)
# df["score_noise"] = (noise_ratio - 1.0) / (cfg.noise_factor - 1.0)
# df["score_noise"] = df["score_noise"].clip(lower=0, upper=1)


In [None]:
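# Pseudo-code (not executed): severity as a noisy-OR of the component scores,
# i.e. one minus the product of (1 - score).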
# scores = [score_point, score_collective, score_context, score_sensor_fault]
# severity = 1 - np.prod(1 - np.array(scores))