In [None]:
##### ANOMALY DETECTION WITH DIFFUSION - RESIDUAL / SPECTRAL #####
#
### Algorithm 1: Training (Diffusion-Based Denoising Model)
# Load StepChange training signal for a given residence and appliance
# Sample a diffusion timestep and inject Gaussian noise into each window
# Train a UNet1D model to predict the injected noise
# Optimize mean-squared error between true and predicted noise
# Repeat for all epochs and save trained model
#
### Algorithm 2: Inference (Residual-Spectral Anomaly Detection)
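# Pass each non-overlapping window through the trained model to obtain its reconstruction
# Compute time-domain and frequency-domain (log-magnitude FFT) reconstruction errors per window
# Normalize both errors with robust (median/MAD) z-scoring and average them into an anomaly score
# Flag rows as anomalous when their window's score exceeds a percentile threshold of the training scores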
#
#

In [None]:
# ============================================================
# IMPORTS
# ============================================================
import os
import time
import psutil
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from scipy.fft import fft
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix
)

# ============================================================
# LOGGER
# ============================================================
def log(msg):
    """Print `msg` prefixed with the current local time as ``[HH:MM:SS]``.

    Output is flushed immediately so progress lines show up in order even
    when stdout is buffered.
    """
    stamp = time.strftime("%H:%M:%S")
    line = f"[{stamp}] {msg}"
    print(line, flush=True)

# ============================================================
# CONFIGURATION
# ============================================================
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Samples per window: default length in create_windows and the factor used
# to expand one window score back to per-row scores.
WINDOW_SIZE = 24
# Mini-batch size of the training DataLoader.
BATCH_SIZE = 64
# Number of full passes over the training windows in train_model.
EPOCHS = 30
# Learning rate handed to the Adam optimizer in train_model.
LR = 1e-4

# Root folder of the experiment data.
# NOTE(review): this is a Google Drive mount path — presumably run from Colab; confirm.
BASE = "/content/drive/MyDrive/Paper02_14Datasets"
# Input CSVs, one "<residence>_<appliance>_15minutes_<anomaly>_MERGED.csv" per combination.
MERGED_DIR = f"{BASE}/MERGED"
# Destination of the per-file prediction CSVs.
OUT_DIR = f"{BASE}/ANOMALY_DIFFUSION_RESIDUALSPECTRAL"
# Destination of the per-residence metric summary CSVs.
SUMMARY_DIR = f"{OUT_DIR}/Percentiles_Summary"

# Create the output folders up front; exist_ok=True makes re-runs harmless.
os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(SUMMARY_DIR, exist_ok=True)

# ============================================================
# RESIDENCES / APPLIANCES / ANOMALIES
# ============================================================
# House numbers used from each public dataset. Dict insertion order is the
# order in which residences are processed.
_HOUSE_IDS_BY_DATASET = {
    "AMPds2": (1,),
    "GREEND": (0, 1, 3),
    "UKDALE": (1, 2, 5),
    "REFIT": (1, 2, 3, 5, 7, 9, 15),
}

# Residence identifiers exactly as they appear in the CSV file names,
# e.g. "REFIT_House07".
RESIDENCES = [
    f"{dataset}_House{house_id:02d}"
    for dataset, house_ids in _HOUSE_IDS_BY_DATASET.items()
    for house_id in house_ids
]

# Appliances evaluated for every residence.
APPLIANCES = ["Fridge", "WashingMachine", "Dishwasher"]

# Anomaly types; each one names a separate inference file.
ANOMALIES = [
    "StepChange", "MultiStepChange",
    "Repeating", "Mirror",
    "StuckMAX", "StuckMIN",
    "PowerCycling",
]

# Percentiles of the training scores that are tried as decision thresholds.
THRESHOLDS = [95]

# ============================================================
# DATASET AND WINDOWS
# ============================================================
class PowerDataset(Dataset):
    """Serve pre-cut windows to a DataLoader as float32 tensors.

    Each item gets a leading channel axis, so a window of length W is
    returned with shape (1, W).
    """

    def __init__(self, series):
        # Indexable collection of windows; kept by reference, not copied.
        self.series = series

    def __len__(self):
        # One item per stored window.
        return len(self.series)

    def __getitem__(self, idx):
        window = torch.tensor(self.series[idx], dtype=torch.float32)
        # Indexing with None prepends the channel dimension.
        return window[None]

def create_windows(signal, window=WINDOW_SIZE):
    """
    Split `signal` into consecutive, non-overlapping windows of length `window`.

    If the signal length is not divisible by `window`, the leftover tail is
    dropped (the caller pads the per-row scores afterwards).

    Parameters
    ----------
    signal : array-like
        Samples along the first axis (a NumPy array, list or pandas Series).
    window : int
        Number of samples per window; must be >= 1.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(signal) // window, window, ...), independent of
        the input (a copy). When the signal is shorter than one window the
        result has zero rows but keeps its window axis, i.e. shape
        (0, window).

    Raises
    ------
    ValueError
        If `window` is smaller than 1.
    """
    if window < 1:
        raise ValueError(f"window must be a positive integer, got {window}")

    values = np.asarray(signal)
    n_windows = len(values) // window

    # One reshape replaces the per-window Python loop; trailing axes (if any)
    # are carried through unchanged.
    usable = values[:n_windows * window]
    return usable.reshape(n_windows, window, *values.shape[1:]).copy()

# ============================================================
# SIMPLE UNET1D
# ============================================================
class UNet1D(nn.Module):
    """Minimal 1-D convolutional network with a residual connection.

    Despite the name there is no down/up-sampling path: one 3-tap
    convolution lifts the single input channel to 16 channels, a ReLU is
    applied, and a second 3-tap convolution maps back to one channel.
    Padding of 1 keeps the sequence length unchanged, so the output has the
    same shape as the input.
    """

    def __init__(self):
        super().__init__()
        conv_kwargs = dict(kernel_size=3, padding=1)
        self.enc = nn.Conv1d(1, 16, **conv_kwargs)
        self.dec = nn.Conv1d(16, 1, **conv_kwargs)
        self.relu = nn.ReLU()

    def forward(self, x):
        hidden = self.relu(self.enc(x))
        # Residual (skip) connection: the input is added back onto the
        # decoder output.
        return self.dec(hidden) + x

# ============================================================
# DDPM
# ============================================================
class DDPM:
    """Forward (noising) process of a denoising diffusion model.

    Holds a linear beta schedule from 1e-4 to 0.02 over `T` steps, together
    with alpha = 1 - beta and alpha_bar, the cumulative product of alpha.
    All schedule tensors are placed on DEVICE.
    """

    def __init__(self, T=1000):
        self.T = T
        self.beta = torch.linspace(1e-4, 0.02, T).to(DEVICE)
        self.alpha = 1.0 - self.beta
        self.alpha_bar = torch.cumprod(self.alpha, dim=0)

    def add_noise(self, x0, t):
        """Sample x_t from x_0 at timestep(s) `t`.

        Parameters
        ----------
        x0 : torch.Tensor
            Clean batch with the batch dimension first; any number of
            trailing dimensions is accepted.
        t : torch.Tensor
            Integer timestep per batch element, used to index `alpha_bar`.

        Returns
        -------
        (xt, eps)
            The noised batch and the standard-normal noise that was mixed
            in; both have the shape of `x0`.
        """
        eps = torch.randn_like(x0)

        # Shape alpha_bar[t] as (batch, 1, ..., 1) so it broadcasts over every
        # non-batch axis of x0, whatever its rank. A fixed (-1, 1, 1) view
        # would silently broadcast to the wrong shape for non-3-D inputs.
        broadcast_shape = (-1,) + (1,) * (x0.dim() - 1)
        a_bar = self.alpha_bar[t].view(broadcast_shape)

        # Forward diffusion equation:
        #   x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps
        xt = torch.sqrt(a_bar) * x0 + torch.sqrt(1 - a_bar) * eps
        return xt, eps

# ============================================================
# TRAINING
# ============================================================
def train_model(train_loader):
    """Train a fresh UNet1D to predict the noise added by the DDPM forward process.

    For every batch a random timestep is drawn per sample, the batch is
    noised with `DDPM.add_noise`, and the network output on the noised batch
    is regressed (mean-squared error) onto the injected noise. Runs for
    EPOCHS passes with Adam at learning rate LR.

    Parameters
    ----------
    train_loader : iterable of torch.Tensor
        Yields batches of windows; each batch is moved to DEVICE.

    Returns
    -------
    (model, elapsed)
        The trained model (still in train mode) and the wall-clock training
        time in seconds.
    """
    log("Initializing UNet1D diffusion model")
    model = UNet1D().to(DEVICE)
    ddpm = DDPM()
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)

    model.train()
    started_at = time.time()

    for epoch_no in range(1, EPOCHS + 1):
        log(f"Epoch {epoch_no}/{EPOCHS} started")

        for batch in train_loader:
            batch = batch.to(DEVICE)
            # One uniformly drawn diffusion step per sample in the batch.
            timesteps = torch.randint(0, ddpm.T, (batch.shape[0],)).to(DEVICE)

            ## EQUATION - Forward Diffusion
            noisy, noise = ddpm.add_noise(batch, timesteps)

            optimizer.zero_grad()
            predicted_noise = model(noisy)
            loss = nn.functional.mse_loss(predicted_noise, noise)
            loss.backward()
            optimizer.step()

        log(f"Epoch {epoch_no} completed")

    elapsed = time.time() - started_at
    log(f"Training finished in {elapsed:.2f} seconds")
    return model, elapsed

# ============================================================
# ANOMALY SCORE
# ============================================================
def compute_scores(x, x_hat):
    """Combine time- and frequency-domain reconstruction errors into one score per window.

    Parameters
    ----------
    x : array-like, shape (n_windows, window)
        Original windows.
    x_hat : array-like
        Reconstructed windows. Must hold the same number of elements as `x`;
        it is reshaped to `x.shape`, so an input that lost a singleton axis
        (e.g. through a bare ``squeeze()``) is still aligned window by
        window instead of being silently broadcast.

    Returns
    -------
    numpy.ndarray, shape (n_windows,)
        Mean of the robust z-scores of the two per-window errors. The
        z-scores are computed from the median/MAD of the windows passed in
        this call.

    Raises
    ------
    ValueError
        If `x_hat` cannot be reshaped to the shape of `x`.
    """
    x = np.asarray(x)
    x_hat = np.asarray(x_hat).reshape(x.shape)

    # Nothing to score; also avoids a median-of-empty warning below.
    if x.shape[0] == 0:
        return np.empty(0, dtype=float)

    # Time-domain reconstruction error (per window)
    mae_t = np.mean(np.abs(x - x_hat), axis=1)

    # Frequency-domain reconstruction error (per window): compare the
    # log-compressed FFT magnitudes along the window axis.
    rx = np.log1p(np.abs(fft(x, axis=-1)))
    rx_hat = np.log1p(np.abs(fft(x_hat, axis=-1)))
    mae_f = np.mean(np.abs(rx - rx_hat), axis=1)

    # Robust z-score using MAD; the small constant keeps the division
    # defined when the MAD is zero.
    def z_robust(v):
        med = np.median(v)
        mad = np.median(np.abs(v - med)) + 1e-6
        return (v - med) / mad

    ## EQUATION - Score Calculation based on mae_t and mae_f
    score = 0.5 * (z_robust(mae_t) + z_robust(mae_f))
    return score

# ============================================================
# MAIN PIPELINE
# ============================================================
# ------------------------------------------------------------
# For every residence and appliance:
#   1. train a model on the StepChange file,
#   2. score the training windows and derive the percentile threshold(s),
#   3. score every anomaly file, expand window scores to per-row scores,
#      classify, save the per-row predictions and collect metrics,
# then write one summary CSV per residence.
#
# NOTE(review): train_model optimises the network to predict the injected
# noise, yet below its output on clean windows is used as a reconstruction
# of the input -- confirm this is the intended scoring scheme.
# NOTE(review): compute_scores z-normalises each array with that array's own
# median/MAD, so the training threshold and the inference scores live on
# independently normalised scales -- confirm this is intended.
# ------------------------------------------------------------
log("Starting diffusion anomaly detection pipeline")

for res in RESIDENCES:
    stats = []

    for appliance in APPLIANCES:

        # ------------------ TRAINING (always StepChange) ---------------------
        train_path = f"{MERGED_DIR}/{res}_{appliance}_15minutes_StepChange_MERGED.csv"
        if not os.path.exists(train_path):
            log(f"⚠️ Missing training file, skipping: {train_path}")
            continue

        log(f"Loading training data: {train_path}")
        df_train = pd.read_csv(train_path)

        if "active_power" not in df_train.columns:
            log(f"⚠️ Missing 'active_power' column in training file, skipping: {train_path}")
            continue

        signal_train = df_train["active_power"].values
        windows_train = create_windows(signal_train)

        if len(windows_train) == 0:
            log(f"⚠️ No training windows produced, skipping: {train_path}")
            continue

        train_ds = PowerDataset(windows_train)
        train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)

        model, train_time = train_model(train_loader)
        # Resident memory sampled once training has finished, so the figure
        # reflects the training run (a snapshot, not a true peak).
        train_peak_mb = psutil.Process().memory_info().rss / (1024 ** 2)

        # Compute training scores (used to set percentile threshold)
        model.eval()
        with torch.no_grad():
            # squeeze(axis=1) removes only the channel axis, so a single
            # window still comes back as a 2-D (1, WINDOW_SIZE) array.
            recon_train = model(
                torch.tensor(windows_train).float().unsqueeze(1).to(DEVICE)
            ).cpu().numpy().squeeze(axis=1)

        scores_train = compute_scores(windows_train, recon_train)

        # Thresholds depend only on the training scores, so compute them once
        # per appliance instead of once per anomaly file.
        thresholds_by_pct = {
            pct: np.percentile(scores_train, pct) for pct in THRESHOLDS
        }

        # ------------------ INFERENCE ---------------------
        for anomaly in ANOMALIES:
            infer_path = f"{MERGED_DIR}/{res}_{appliance}_15minutes_{anomaly}_MERGED.csv"
            if not os.path.exists(infer_path):
                log(f"⚠️ Missing inference file, skipping: {infer_path}")
                continue

            log(f"Inference for: {res} | {appliance} | {anomaly}")
            df = pd.read_csv(infer_path)

            if "active_power" not in df.columns or "ground_truth_anomaly" not in df.columns:
                log(f"⚠️ Missing required columns in inference file, skipping: {infer_path}")
                continue

            windows = create_windows(df["active_power"].values)
            if len(windows) == 0:
                log(f"⚠️ No inference windows produced, skipping: {infer_path}")
                continue

            start_inf = time.time()
            with torch.no_grad():
                recon = model(
                    torch.tensor(windows).float().unsqueeze(1).to(DEVICE)
                ).cpu().numpy().squeeze(axis=1)

            scores = compute_scores(windows, recon)
            # Stop the clock here so the reported time covers the forward pass
            # and scoring only, not CSV writing or metric computation.
            inference_time = time.time() - start_inf

            # ------------------------------------------------------------
            # Expand window scores to row scores AND force equal length
            # ------------------------------------------------------------
            scores_row = np.repeat(scores, WINDOW_SIZE)

            if len(scores_row) < len(df):
                # pad missing tail rows using last score
                scores_row = np.pad(scores_row, (0, len(df) - len(scores_row)), mode="edge")
            else:
                scores_row = scores_row[:len(df)]

            # Normalize ground truth (robust to spacing/case)
            gt = df["ground_truth_anomaly"].astype(str).str.strip().str.lower()
            y_true = (gt == "anomaly").astype(int).to_numpy()

            # Actual counts
            ActualNormal = int(np.sum(y_true == 0))
            ActualAnomaly = int(np.sum(y_true == 1))

            # ------------------ LOOP THROUGH THRESHOLDS ---------------------
            for pct in THRESHOLDS:
                threshold = thresholds_by_pct[pct]

                preds = np.where(scores_row > threshold, "anomaly", "normal")
                y_pred = (preds == "anomaly").astype(int)

                # ============================================================
                # ✅ SAVE PER-INFERENCE FILE WITH PREDICTIONS
                # ============================================================
                df_out = df.copy()

                # Keep pipeline robust (in case some files are missing columns)
                if "timestamp" not in df_out.columns:
                    df_out["timestamp"] = np.arange(len(df_out))
                if "ground_truth_appliance" not in df_out.columns:
                    df_out["ground_truth_appliance"] = ""

                df_out["prediction_anomaly"] = preds  # "anomaly" / "normal"

                # Keep ONLY requested columns and order
                df_out = df_out[[
                    "timestamp",
                    "active_power",
                    "ground_truth_anomaly",
                    "ground_truth_appliance",
                    "prediction_anomaly"
                ]]

                # With a single threshold the file name is unchanged; with
                # several, tag the percentile so later thresholds do not
                # overwrite the predictions of earlier ones.
                pct_suffix = f"_P{pct}" if len(THRESHOLDS) > 1 else ""
                out_pred_csv = (
                    f"{OUT_DIR}/{res}_{appliance}_15minutes_{anomaly}_MERGED_DIFFUSION_RESIDUALSPECTRAL{pct_suffix}.csv"
                )

                df_out.to_csv(out_pred_csv, index=False)
                log(f"✅ Saved predictions file: {out_pred_csv}")

                # confusion_matrix can return 1x1 if only one class exists, so fix labels
                cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
                tn, fp, fn, tp = cm.ravel()

                normal_pct = 100.0 * float(np.mean(y_pred == 0))
                anomaly_pct = 100.0 * float(np.mean(y_pred == 1))

                inf_peak_mb = psutil.Process().memory_info().rss / (1024 ** 2)

                stats.append({
                    "Residence": res,
                    "Appliance": appliance,
                    "Anomaly": anomaly,
                    "ThresholdPct": pct,

                    # Requested metrics
                    "Accuracy": float(accuracy_score(y_true, y_pred)),
                    "Precision": float(precision_score(y_true, y_pred, zero_division=0)),
                    "Recall": float(recall_score(y_true, y_pred, zero_division=0)),
                    "F1-Score": float(f1_score(y_true, y_pred, zero_division=0)),
                    "Normal_pct": float(normal_pct),
                    "Anomaly_pct": float(anomaly_pct),
                    "Total": int(len(y_true)),
                    "TP": int(tp), "TN": int(tn), "FP": int(fp), "FN": int(fn),
                    "ActualNormal": int(ActualNormal),
                    "ActualAnomaly": int(ActualAnomaly),

                    # Extra diagnostics (kept minimal)
                    "TrainTimeSec": float(train_time),
                    "InferenceTimeSec": float(inference_time),
                    "TrainPeakMB": float(train_peak_mb),
                    "InferencePeakMB": float(inf_peak_mb),
                })

    # ============================================================
    # SAVE PER-RESIDENCE OUTLINE CSV
    # ============================================================
    if stats:
        out_csv = f"{SUMMARY_DIR}/{res}_ANOMALY_DIFFUSION_OUTLINE.csv"
        pd.DataFrame(stats).to_csv(out_csv, index=False)
        log(f"✅ Saved: {out_csv}")
    else:
        log(f"⚠️ No stats collected for {res}; nothing saved.")

log("✅ Pipeline completed successfully")
