In [None]:
##### ANOMALY DETECTION WITH DBSCAN #####
#
# Density-based clustering that finds arbitrary shapes and outliers.
# DBSCAN groups by density, not distance or counts
#
# *cluster is the same as *interval
#
# Main Idea
# 1. After sorting, the code computes the difference between each neighboring pair.
# 2. If that difference is at most eps, the two points belong to the same region.
# 3. If a region contains at least min_samples points, it is a normal (dense) region.
# 4. Otherwise, the region is sparse and its points are anomalies.

In [None]:
import os
import glob
import time
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# psutil is optional: it is only used by rss_mb() to report process memory.
# When it is missing, HAVE_PSUTIL is False and memory figures come back as None.
try:
    import psutil
    HAVE_PSUTIL = True
except ImportError:
    HAVE_PSUTIL = False

# NOTE(review): this silences *every* warning for the whole session, including
# deprecation warnings from pandas/numpy/sklearn — consider narrowing the filter.
warnings.filterwarnings("ignore")

# -----------------------------
# Config - Parameters and Paths
# -----------------------------
# Root folder of the project data (presumably a mounted Google Drive in Colab — verify).
BASE = "/content/drive/MyDrive/Paper02_14Datasets"
# Input folder: the main loop globs "<residence>*.csv" files from here.
MERGED_DIR = f"{BASE}/MERGED"
# Output folder: one "<input name>_DBSCAN.csv" file is written here per input file.
OUT_DIR = f"{BASE}/ANOMALY_DBSCAN"
# Created below but not written to by the code visible in this section.
SUMMARY_DIR = f"{OUT_DIR}/Percentiles_Summary"
# Create the output folders up front; existing folders are left untouched.
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)
Path(SUMMARY_DIR).mkdir(parents=True, exist_ok=True)

# Filename prefixes of the residences to process, one model per residence.
RESIDENCES = [
    "REFIT_House01",
    "REFIT_House02",
    "REFIT_House03",
    "REFIT_House05",
    "REFIT_House07",
    "REFIT_House09",
    "REFIT_House15",
    "UKDALE_House01",
    "UKDALE_House02",
    "UKDALE_House05",
    "AMPds2_House01",
    "GREEND_House00",
    "GREEND_House01",
    "GREEND_House03"
]

# Maximum gap between neighboring sorted values for them to share a region.
# The gap is measured on z-scored values, so this is in standard-deviation units.
DBSCAN_EPS = 0.5
# Minimum number of points a region needs in order to count as a normal cluster.
DBSCAN_MIN_SAMPLES = 5

# -----------------------------
# Helper Functions
# -----------------------------
# Measures current process memory usage in megabytes.
def rss_mb():
    """Return the resident set size of the current process in MiB.

    Returns None when psutil is not installed or when the query fails, so
    callers must treat the measurement as best-effort.
    """
    if not HAVE_PSUTIL:
        return None

    bytes_per_mb = 1024 * 1024
    try:
        return psutil.Process().memory_info().rss / bytes_per_mb
    except Exception:
        # Memory reporting is purely informational; never let it break a run.
        return None

# Enforces formats and cleans the data
def normalize_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of *df* with typed timestamp and power columns.

    The input frame is not modified.

    * ``timestamp`` is parsed to datetime; unparseable entries become NaT
      (rows with NaT timestamps are kept).
    * ``active_power`` is parsed to numeric; unparseable or missing entries
      are filled from the previous valid value, then from the next valid
      value for any leading gap, and the column is stored as float32.
    * Rows whose ``active_power`` is still missing are dropped; this only
      happens when the column has no valid value at all.

    Raises:
        KeyError: if ``timestamp`` or ``active_power`` is not a column of *df*.
    """
    df = df.copy()
    df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")

    ap = pd.to_numeric(df["active_power"], errors="coerce")
    # Use the dedicated ffill()/bfill() methods: fillna(method=...) is
    # deprecated in pandas 2.x and rejected by newer releases.
    ap = ap.ffill().bfill().astype(np.float32)
    df["active_power"] = ap

    return df.dropna(subset=["active_power"])

# -----------------------------
# Lightweight 1D DBSCAN on z-scored values
#
# Standardizes training values into z-space
# Raw values are scaled to zero mean and unit variance so distances are comparable.
# Sorts z-values, splits by eps (maximum allowed gap between neighboring values)
# Consecutive points farther apart than eps are treated as belonging to different regions.
# Only dense runs with enough points are kept as valid clusters; sparse runs are ignored.
# The model stores the scaler, eps, min_samples and intervals; fit time and memory delta are returned alongside it
# -----------------------------
def fit_dbscan_1d(train_vals: np.ndarray, eps: float, min_samples: int):
    """Learn the dense z-score intervals of a 1-D training sample.

    The values are standardized, sorted, and cut into runs wherever two
    neighboring sorted values are more than ``eps`` apart. Runs holding at
    least ``min_samples`` points become "normal" intervals; shorter runs are
    discarded.

    Args:
        train_vals: training values; reshaped to a single column and cast to
            float32 before scaling.
        eps: largest gap (in z-score units) allowed inside one run.
        min_samples: smallest number of points a run needs to be kept.

    Returns:
        A ``(model, elapsed, mem_used)`` tuple. ``model`` is a dict with the
        fitted ``scaler``, ``eps``, ``min_samples`` and ``intervals`` (a list
        of ``(z_min, z_max)`` pairs sorted by start). ``elapsed`` is the
        interval-search time in seconds and ``mem_used`` the RSS change in MB,
        or None when memory could not be measured. Scaling happens before the
        timer and memory baseline are taken, so it is not included in either.
    """
    column = train_vals.reshape(-1, 1).astype(np.float32)
    scaler = StandardScaler()
    z_values = scaler.fit_transform(column).ravel()  # flat array in z-space

    start_t = time.perf_counter()
    mem_before = rss_mb()

    z_sorted = z_values[np.argsort(z_values)]

    # Position i is a break when the gap between sorted points i and i+1
    # exceeds eps, i.e. a run ends at i and the next one starts at i+1.
    break_idx = np.flatnonzero(np.diff(z_sorted) > eps)

    # First/last (inclusive) index of every run, then keep only the dense ones.
    run_first = np.concatenate(([0], break_idx + 1))
    run_last = np.append(break_idx, len(z_sorted) - 1)
    is_dense = (run_last - run_first + 1) >= min_samples
    intervals = list(zip(z_sorted[run_first[is_dense]], z_sorted[run_last[is_dense]]))

    # Safety pass: fold together any intervals that touch or overlap so the
    # result is a list of disjoint regions ordered by their start.
    if intervals:
        intervals.sort(key=lambda t: t[0])
        merged = [intervals[0]]
        for lo, hi in intervals[1:]:
            prev_lo, prev_hi = merged[-1]
            if lo <= prev_hi:
                merged[-1] = (prev_lo, max(prev_hi, hi))
            else:
                merged.append((lo, hi))
        intervals = merged

    elapsed = time.perf_counter() - start_t
    mem_after = rss_mb()
    if mem_before is None or mem_after is None:
        mem_used = None
    else:
        mem_used = mem_after - mem_before

    model = {
        "scaler": scaler,
        "eps": eps,
        "min_samples": min_samples,
        "intervals": intervals  # list of (z_min, z_max), sorted, non-overlapping
    }
    return model, elapsed, mem_used

# -----------------------------
# Predict using learned z-intervals
# Scales new values using training scaler
# Checks whether values fall in clusters
# Labels outside clusters as anomalies
# *cluster is the same as *interval
#
# If a new value falls inside any dense region learned from the
# training data, it is normal; otherwise, it is anomalous.
# -----------------------------
def predict_dbscan_1d(model, vals: np.ndarray) -> np.ndarray:
    """Label each value as normal (1) or anomalous (-1).

    Args:
        model: dict holding a fitted ``scaler`` (anything with a
            ``transform`` method) and ``intervals``, a list of
            ``(z_min, z_max)`` pairs sorted by start.
        vals: values to classify; reshaped to a single column and cast to
            float32 before scaling.

    Returns:
        Integer array with one label per value: 1 when the scaled value lies
        inside some interval (bounds inclusive), -1 otherwise. When the model
        has no intervals, every value is labelled -1.
    """
    column = vals.reshape(-1, 1).astype(np.float32)
    z = model["scaler"].transform(column).ravel()

    regions = model["intervals"]
    if not regions:
        # Nothing was learned as "normal", so everything is an anomaly.
        return np.full(z.shape[0], -1, dtype=int)

    lows = np.array([lo for lo, _ in regions], dtype=np.float32)
    highs = np.array([hi for _, hi in regions], dtype=np.float32)

    # Index of the last interval whose start is <= z; -1 means z lies below
    # every interval start.
    candidate = np.searchsorted(lows, z, side="right") - 1

    # Clip so the lookup into `highs` is always valid; positions that were -1
    # are rejected by the `candidate >= 0` term anyway.
    safe = np.clip(candidate, 0, None)
    inside = (candidate >= 0) & (z <= highs[safe])

    # 1 = Normal, -1 = Anomaly
    labels = np.full(z.shape[0], -1, dtype=int)
    labels[inside] = 1
    return labels

# -----------------------------
# Main
# -----------------------------
# One model per residence: fit on the residence's first file, then label
# every file of that residence (the training file included).
for residence in RESIDENCES:
    pattern = os.path.join(MERGED_DIR, f"{residence}*.csv")
    files = glob.glob(pattern)
    files.sort()
    if not files:
        # No merged data for this residence; move on to the next one.
        continue

    # Train on the first file in sorted order (or choose a dedicated training file)
    train_file = files[0]
    df_train = normalize_df(pd.read_csv(train_file))
    train_power = df_train["active_power"].values
    model, train_time, mem_used = fit_dbscan_1d(
        train_power, eps=DBSCAN_EPS, min_samples=DBSCAN_MIN_SAMPLES
    )

    # Memory delta is unavailable when psutil is missing.
    mem_str = "N/A" if mem_used is None else f"{mem_used:.1f} MB"
    print(f"{residence}: Train time={train_time:.2f}s, MemΔ={mem_str}, intervals={len(model['intervals'])}")

    # Apply the trained model to every file of this residence
    for f in files:
        df = normalize_df(pd.read_csv(f))
        preds = predict_dbscan_1d(model, df["active_power"].values)
        is_anomaly = preds == -1
        df["prediction_anomaly"] = np.where(is_anomaly, "Anomaly", "Normal")

        # Output keeps the input's base name with a _DBSCAN suffix.
        out_name = f"{Path(f).stem}_DBSCAN.csv"
        out_path = os.path.join(OUT_DIR, out_name)
        df.to_csv(out_path, index=False)
        print(f"Wrote {out_name}")
