# Step 2: Streaming Baseline – SAND

In this notebook, we evaluate the **SAND (Streaming Subsequence Anomaly Detection)** method on the Normality 1–3 datasets using a streaming simulation.

Data is processed in **small overlapping windows** (subsequences), mimicking real-time streaming input. SAND incrementally builds a model of normal subsequences and scores each new window to detect anomalies.


In [1]:
import os, numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from TSB_UAD.models.sand import SAND     # ✅ from the repo you just installed

DATA_DIR    = "generated_datasets"
RESULTS_DIR = "results_streaming"
os.makedirs(RESULTS_DIR, exist_ok=True)

# Global streaming hyper-params
SUBSEQ_LEN      = 20          # subsequence_length  (ℓ in the paper)
PATTERN_LEN     = 30          # pattern_length      (> SUBSEQ_LEN)
INIT_LEN        = 300        # initial batch used to bootstrap Θ
BATCH_SIZE      = 300        # points pushed per streaming step
ALPHA           = 0.5         # update rate for weights / stats
OVERLAP_RATE    = SUBSEQ_LEN // 4
CONTAMINATION   = 0.05        # used only for thresholding / evaluation
K_CLUSTERS      = 6

In [2]:
def run_sand_stream(time_series):
    ts_length = len(time_series)

    # Use smallest acceptable lengths
    subseq_len = SUBSEQ_LEN
    min_init_len = subseq_len * 2
    min_batch_size = subseq_len * 2
    min_pattern_len = subseq_len + 1

    # Abort if even minimum setup is too big
    min_total = min_init_len + min_batch_size + min_pattern_len
    if ts_length < min_total:
        print(f"   ❌ Too short: {ts_length} < {min_total}")
        return None, None

    # Dynamically scale within safe bounds
    init_len = min(INIT_LEN, ts_length // 2)
    batch_size = min(BATCH_SIZE, (ts_length - init_len) // 2)
    pattern_len = min(PATTERN_LEN, batch_size - 1)

    # If scaling goes too low, force minimum
    init_len = max(init_len, min_init_len)
    batch_size = max(batch_size, min_batch_size)
    pattern_len = max(pattern_len, min_pattern_len)

    try:
        sand = SAND(pattern_length=pattern_len,
                    subsequence_length=subseq_len,
                    k=K_CLUSTERS)

        sand.fit(X=time_series,
                 online=True,
                 alpha=ALPHA,
                 init_length=init_len,
                 batch_size=batch_size,
                 overlaping_rate=OVERLAP_RATE,
                 verbose=False)

        scores = sand.decision_scores_
        if len(scores) == 0:
            print("   ⚠️  No scores produced.")
            return None, None

        threshold = np.percentile(scores, 100 * (1 - CONTAMINATION))
        predictions = (scores > threshold).astype(int)
        return predictions, scores

    except Exception as e:
        print(f"   ⚠️  Skipped due to error: {e}")
        return None, None


In [3]:
summary = []

for fname in sorted(os.listdir(DATA_DIR)):
    if not fname.endswith(".npy") or "_boundaries" in fname:
        continue

    print(f"\n▶️  {fname}")
    ts = np.load(os.path.join(DATA_DIR, fname))

    preds, scores = run_sand_stream(ts)
    if preds is None or scores is None or len(scores) == 0:
        print(f"   ⚠️  Skipped: Dataset too short or no scores returned")
        continue

    labels = np.zeros(len(scores))
    labels[-int(len(scores) * CONTAMINATION):] = 1

    precision = precision_score(labels, preds, zero_division=0)
    recall = recall_score(labels, preds, zero_division=0)
    f1 = f1_score(labels, preds, zero_division=0)
    auc = roc_auc_score(labels, scores) if len(set(labels)) > 1 else 0.0

    print(f"   Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}, AUC: {auc:.3f}")

    plt.figure(figsize=(12, 4))
    plt.plot(ts, label="Time Series", alpha=0.6)
    subseq_centers = np.where(preds == 1)[0] + SUBSEQ_LEN // 2
    subseq_centers = subseq_centers[subseq_centers < len(ts)]
    plt.scatter(subseq_centers, ts[subseq_centers], color='red', s=10, label="Anomalies")
    plt.title(fname)
    plt.legend()
    plt.tight_layout()
    plt.show()

    summary.append(dict(
        dataset=fname,
        length=len(ts),
        precision=round(precision, 3),
        recall=round(recall, 3),
        f1=round(f1, 3),
        auc=round(auc, 3)
    ))


▶️  normality_1_daphnet.npy




   ⚠️  Skipped due to error: The window size must be less than or equal to 20
   ⚠️  Skipped: Dataset too short or no scores returned

▶️  normality_1_genesis.npy
   ⚠️  Skipped due to error: The window size must be less than or equal to 20
   ⚠️  Skipped: Dataset too short or no scores returned

▶️  normality_1_nasa-msl.npy




   ⚠️  Skipped due to error: The window size must be less than or equal to 20
   ⚠️  Skipped: Dataset too short or no scores returned

▶️  normality_2_1_daphnet_genesis.npy




   ⚠️  Skipped due to error: The window size must be less than or equal to 20
   ⚠️  Skipped: Dataset too short or no scores returned

▶️  normality_2_2_daphnet_nasa-msl.npy




   ⚠️  Skipped due to error: The window size must be less than or equal to 20
   ⚠️  Skipped: Dataset too short or no scores returned

▶️  normality_2_3_genesis_nasa-msl.npy




   ⚠️  Skipped due to error: The window size must be less than or equal to 20
   ⚠️  Skipped: Dataset too short or no scores returned

▶️  normality_3_1_daphnet_genesis_nasa-msl.npy




   ⚠️  Skipped due to error: The window size must be less than or equal to 20
   ⚠️  Skipped: Dataset too short or no scores returned

▶️  normality_3_2_genesis_nasa-msl_daphnet.npy




   ⚠️  Skipped due to error: The window size must be less than or equal to 20
   ⚠️  Skipped: Dataset too short or no scores returned

▶️  normality_3_3_nasa-msl_daphnet_genesis.npy
   ⚠️  Skipped due to error: The window size must be less than or equal to 20
   ⚠️  Skipped: Dataset too short or no scores returned


In [4]:
if summary:
    df = pd.DataFrame(summary).set_index("dataset")
    display(df)
    df.to_csv(os.path.join(RESULTS_DIR, "sand_streaming_results.csv"))
else:
    print("❌ No datasets were successfully processed.")

❌ No datasets were successfully processed.
