In [None]:
##### ANOMALY DETECTION WITH 3-SIGMA #####
#
# This code performs Anomaly Detection with 3-SIGMA algorithm.
# Essentially, this translates to anomalies being greater/less than "Mean +/- 3xStandardDeviation"
#
### Training Process (per residence)
# 1. Select the training file
#    - Use: {residence}_Fridge_15minutes_StepChange_MERGED.csv
# 2. Load and clean the training data
#    - Read the CSV
#    - Convert active_power to numeric (invalid values become missing)
#    - Parse timestamp if present and sort rows chronologically (if parsing works)
#    - Normalize ground_truth_anomaly
# 3. Create the train split
#    - Take the first 80% of rows as the training portion (df_train)
# 4. Compute the 3-sigma thresholds from training active power
#    - Remove missing and infinite active_power values
#    - Compute the training mean and standard deviation
#    - If the standard deviation is zero, replace it with a tiny value to avoid degenerate thresholds
#    - Compute lower and upper using k = 3.0
# 5. Store training artifacts (in memory)
#    - Keep mu, lower, upper for this residence
#    - Record training time and peak memory (for reporting)
#
#
### Inference Algorithm (per residence - using the whole dataset)
# 1. Collect all files for the residence
#    - Match: {residence}*.csv in the MERGED directory
# 2. For each file
#    1. Load and clean the file
#       - Read the CSV
#       - Convert active_power to numeric
#       - Normalize timestamp formatting if present
#       - Normalize ground_truth_anomaly if present
#    2. Predict anomalies using the trained thresholds
#       - Label each row as:
#         - "Anomaly" if active_power is below lower or above upper threshold
#         - "Normal" otherwise
#       - Save the predictions as a new column: prediction_anomaly
#    3. Save the per-file prediction output
#       - Write the full CSV (original columns + prediction_anomaly) to:
#         - {original_filename}_3SIGMA.csv in the output directory
#    4. Compute metrics (only if ground truth exists)
#       - TP/TN/FP/FN, Accuracy, Precision, Recall, F1-Score, Normal_%, Anomaly_%
# 3. Finalize residence summary
#    - Record time and memory
#    - Save the residence summary (metrics/time/memory) CSV to the summary directory

In [None]:
# -----------------------------
# Config - Path / Parameters
# -----------------------------
import os
import glob
import time
import math
import tracemalloc
import pandas as pd
import numpy as np

# Residences to process. Each name is substituted into the {residence}
# placeholder of the input patterns below and is also used to build the
# per-residence summary filename.
RESIDENCES = [
    "REFIT_House01",
    "REFIT_House02",
    "REFIT_House03",
    "REFIT_House05",
    "REFIT_House07",
    "REFIT_House09",
    "REFIT_House15",
    "UKDALE_House01",
    "UKDALE_House02",
    "UKDALE_House05",
    "AMPds2_House01",
    "GREEND_House00",
    "GREEND_House01",
    "GREEND_House03"
]

# Single training file per residence; the thresholds are fitted on it.
INPUT_TRAIN_PATTERN = "/content/drive/MyDrive/Paper02_14Datasets/MERGED/{residence}_Fridge_15minutes_StepChange_MERGED.csv"
# Glob pattern matching every file of a residence; all matches get predictions.
INPUT_ALL_PATTERN   = "/content/drive/MyDrive/Paper02_14Datasets/MERGED/{residence}*.csv"

# Directory receiving the per-file prediction CSVs.
OUTPUT_DIR_PRED     = "/content/drive/MyDrive/Paper02_14Datasets/ANOMALY_3SIGMA"
# Directory receiving the per-residence summary CSVs.
# NOTE(review): the folder is called "Percentiles_Summary" although this script
# implements the 3-sigma model - possibly carried over from another notebook; confirm.
OUTPUT_DIR_SUMMARY  = os.path.join(OUTPUT_DIR_PRED, "Percentiles_Summary")

# Tag appended to output filenames and printed in the training log line.
MODEL_NAME          = "3SIGMA"
# Multiplier k in the thresholds mean +/- k * standard deviation.
SIGMA_K             = 3.0  # classic 3-sigma

# Create both output directories up front (no error if they already exist).
os.makedirs(OUTPUT_DIR_PRED, exist_ok=True)
os.makedirs(OUTPUT_DIR_SUMMARY, exist_ok=True)

# -----------------------------
# Helpers
# -----------------------------
# Ensure that the columns have the correct formatting.
def read_csv_safe(path: str) -> pd.DataFrame:
    """Load a CSV and coerce its well-known columns into a consistent format.

    Each column is only touched when it is present in the file:

    * ``timestamp`` is parsed and re-rendered as ``%Y-%m-%d %H:%M:%S`` text;
      entries that cannot be parsed become missing. If the conversion raises,
      the column is left exactly as read.
    * ``active_power`` is converted to numeric; invalid entries become NaN.
    * ``ground_truth_anomaly`` is stringified and stripped of surrounding
      whitespace; every value other than the exact label ``"Anomaly"`` is
      relabelled ``"Normal"``.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The loaded and normalized DataFrame.
    """
    frame = pd.read_csv(path)

    if "timestamp" in frame.columns:
        try:
            parsed = pd.to_datetime(frame["timestamp"], errors="coerce")
            frame["timestamp"] = parsed.dt.strftime("%Y-%m-%d %H:%M:%S")
        except Exception:
            # Best effort only: keep the raw timestamp column on any failure.
            pass

    if "active_power" in frame.columns:
        frame["active_power"] = pd.to_numeric(frame["active_power"], errors="coerce")

    if "ground_truth_anomaly" in frame.columns:
        labels = frame["ground_truth_anomaly"].astype(str).str.strip()
        # Keep exact "Anomaly" labels, collapse everything else to "Normal".
        frame["ground_truth_anomaly"] = labels.where(labels.eq("Anomaly"), "Normal")

    return frame

# Determine the mu (mean), lower/upper thresholds (mu +/- 3xStandardDeviation)
def compute_train_thresholds(train_series: pd.Series, k: float = 3.0):
    """Compute the mean and the ``mean +/- k * sigma`` thresholds of a series.

    Non-numeric, missing and infinite values are discarded before the
    statistics are computed. The population standard deviation (``ddof=0``)
    is used.

    Args:
        train_series: Training values (typically active power).
        k: Number of standard deviations defining the normal band.

    Returns:
        A tuple ``(mu, lower, upper)``. When no usable value remains the
        result is ``(0.0, -inf, inf)``, i.e. a band nothing can fall outside.
    """
    # Smallest spread accepted; anything below is treated as "no spread".
    min_sigma = 1e-9

    values = (
        pd.to_numeric(train_series, errors="coerce")
        .replace([np.inf, -np.inf], np.nan)
        .dropna()
    )
    if values.empty:
        return 0.0, -np.inf, np.inf

    mu = float(values.mean())
    sigma = float(values.std(ddof=0))
    # Floor the spread instead of testing math.isclose(sigma, 0.0): with the
    # default abs_tol=0.0 that test only matches an exact zero, so a sigma
    # that is pure floating-point noise would yield a degenerate band.
    if sigma < min_sigma:
        sigma = min_sigma

    return mu, mu - k * sigma, mu + k * sigma

# Predict Anomaly with threshold - Anomaly if above/below threshold
def predict_with_threshold(df: pd.DataFrame, lower: float, upper: float) -> pd.Series:
    """Label each row by whether its active power leaves the [lower, upper] band.

    Args:
        df: Frame containing an ``active_power`` column.
        lower: Lower threshold; values strictly below it are anomalies.
        upper: Upper threshold; values strictly above it are anomalies.

    Returns:
        A Series named ``prediction_anomaly`` sharing ``df``'s index, holding
        ``"Anomaly"`` or ``"Normal"``. Values that cannot be converted to a
        number compare as neither below nor above and are labelled ``"Normal"``.
    """
    power = pd.to_numeric(df["active_power"], errors="coerce")
    too_low = power < lower
    too_high = power > upper
    labels = np.where(too_low | too_high, "Anomaly", "Normal")
    return pd.Series(labels, index=df.index, name="prediction_anomaly")

# Calculate all the Metrics
def safe_metrics(y_true: pd.Series, y_pred: pd.Series):
    """Build confusion-matrix counts and derived scores for two label series.

    Missing labels on either side are treated as ``"Normal"``. Only the exact
    labels ``"Anomaly"`` and ``"Normal"`` take part in the TP/TN/FP/FN counts;
    ``Total`` is the length of ``y_true``. Every ratio falls back to ``0.0``
    when its denominator is zero.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        A dict with the keys ``Total``, ``TP``, ``TN``, ``FP``, ``FN``,
        ``ActualNormal``, ``ActualAnomaly``, ``Accuracy``, ``Precision``,
        ``Recall``, ``F1-Score``, ``Normal_%`` (share of actual normals
        predicted normal) and ``Anomaly_%`` (share of actual anomalies
        predicted anomalous).
    """
    truth = y_true.fillna("Normal").astype(str)
    guess = y_pred.fillna("Normal").astype(str)

    truth_anom, truth_norm = truth.eq("Anomaly"), truth.eq("Normal")
    guess_anom, guess_norm = guess.eq("Anomaly"), guess.eq("Normal")

    def overlap(left, right) -> int:
        # Number of rows where both masks are set.
        return int((left & right).sum())

    TP = overlap(truth_anom, guess_anom)
    TN = overlap(truth_norm, guess_norm)
    FP = overlap(truth_norm, guess_anom)
    FN = overlap(truth_anom, guess_norm)

    total = int(len(truth))
    actual_norm = int(truth_norm.sum())
    actual_anom = int(truth_anom.sum())

    if total:
        accuracy = (TP + TN) / total
    else:
        accuracy = 0.0

    if TP + FP:
        precision = TP / (TP + FP)
    else:
        precision = 0.0

    if TP + FN:
        recall = TP / (TP + FN)
    else:
        recall = 0.0

    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    if actual_norm:
        normal_pct = TN / actual_norm * 100.0
    else:
        normal_pct = 0.0

    if actual_anom:
        anomaly_pct = TP / actual_anom * 100.0
    else:
        anomaly_pct = 0.0

    return {
        "Total": total,
        "TP": TP,
        "TN": TN,
        "FP": FP,
        "FN": FN,
        "ActualNormal": actual_norm,
        "ActualAnomaly": actual_anom,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
        "Normal_%": normal_pct,
        "Anomaly_%": anomaly_pct,
    }

# Calculate megabytes
def mb(bytes_val: int) -> float:
    """Convert a byte count to mebibytes (1024 * 1024 bytes), rounded to 3 decimals."""
    bytes_per_mb = 1024 * 1024
    megabytes = bytes_val / bytes_per_mb
    return round(megabytes, 3)

# Save and overwrite CSV
def save_csv_overwrite(df: pd.DataFrame, path: str):
    """Write ``df`` to ``path`` as CSV without the index, replacing any existing file.

    Missing parent directories are created first.

    Args:
        df: Frame to persist.
        path: Destination file. May be a bare filename, in which case the
            file is written relative to the current working directory.
    """
    parent = os.path.dirname(path)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    df.to_csv(path, index=False)

# -----------------------------
# Loop through residences
# -----------------------------
# For every residence: fit the 3-sigma thresholds on the first 80% of its
# training file, label every matching file with them, write the per-file
# predictions and finally one summary CSV with metrics, timings and memory.
for residence in RESIDENCES:
    print(f"\n==== Processing {residence} ====")

    # -----------------------------
    # TRAINING Process
    # -----------------------------
    # Read the dataset
    train_path = INPUT_TRAIN_PATTERN.format(residence=residence)
    if not os.path.exists(train_path):
        print(f"[WARN] Training file not found: {train_path}. Skipping residence.")
        continue

    df_train_full = read_csv_safe(train_path)
    # Without the power column no thresholds can be fitted; skip this
    # residence instead of aborting the whole run with a KeyError.
    if "active_power" not in df_train_full.columns:
        print(f"[WARN] Training file has no 'active_power' column: {train_path}. Skipping residence.")
        continue

    # Order rows chronologically so the 80% split is a split in time.
    if "timestamp" in df_train_full.columns:
        try:
            df_train_full["_ts_sort"] = pd.to_datetime(df_train_full["timestamp"], errors="coerce")
            df_train_full = df_train_full.sort_values("_ts_sort").drop(columns=["_ts_sort"])
        except Exception as e:
            # Best effort: fall back to the order of the file, but say so.
            print(f"[WARN] Could not sort training data by timestamp ({e}); keeping file order.")

    # Obtain the first 80% for training
    n = len(df_train_full)
    split_idx = int(n * 0.8)
    df_train = df_train_full.iloc[:split_idx].copy()

    # Calculate mu/lower/upper thresholds along with memory and time
    tracemalloc.start()
    t0 = time.perf_counter()
    mu, lower, upper = compute_train_thresholds(df_train["active_power"], k=SIGMA_K)
    train_time_sec = time.perf_counter() - t0
    _, train_peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    train_peak_mb = mb(train_peak)

    print(f" Trained {MODEL_NAME}: mu={mu:.6f}, lower={lower:.6f}, upper={upper:.6f}")
    print(f" TrainingTimeSec={train_time_sec:.3f}, TrainPeakMB={train_peak_mb}")

    # -----------------------------
    # INFERENCE Process
    # -----------------------------
    pattern = INPUT_ALL_PATTERN.format(residence=residence)
    all_files = sorted(glob.glob(pattern))
    if not all_files:
        print(f"[WARN] No files found for pattern: {pattern}")
        continue

    summary_rows = []
    tracemalloc.start()
    t_inf0 = time.perf_counter()

    # Loop through all files
    for in_path in all_files:
        try:
            base = os.path.basename(in_path)
            out_path = os.path.join(OUTPUT_DIR_PRED, f"{os.path.splitext(base)[0]}_{MODEL_NAME}.csv")

            df = read_csv_safe(in_path)

            # Predict anomaly if above/below threshold
            df["prediction_anomaly"] = predict_with_threshold(df, lower, upper)
            save_csv_overwrite(df, out_path)

            # Calculate the metrics
            if "ground_truth_anomaly" in df.columns:
                m = safe_metrics(df["ground_truth_anomaly"], df["prediction_anomaly"])
            else:
                # No ground truth: counts are zero and scores are undefined.
                m = {
                    "Total": len(df), "TP": 0, "TN": 0, "FP": 0, "FN": 0,
                    "ActualNormal": 0, "ActualAnomaly": 0,
                    "Accuracy": np.nan, "Precision": np.nan, "Recall": np.nan, "F1-Score": np.nan,
                    "Normal_%": np.nan, "Anomaly_%": np.nan
                }

            # Timing/memory fields that are only known after the loop are
            # filled in below, once inference for the residence is complete.
            summary_rows.append({
                "Filename": base,
                "Accuracy": m["Accuracy"],
                "Precision": m["Precision"],
                "Recall": m["Recall"],
                "F1-Score": m["F1-Score"],
                "Normal_%": m["Normal_%"],
                "Anomaly_%": m["Anomaly_%"],
                "Total": m["Total"],
                "TP": m["TP"], "TN": m["TN"], "FP": m["FP"], "FN": m["FN"],
                "ActualNormal": m["ActualNormal"], "ActualAnomaly": m["ActualAnomaly"],
                "TrainingTimeSec": None,
                "InferenceTimeSec": None,
                "TrainPeakMB": train_peak_mb,
                "InferencePeakMB": None,
            })

            print(f"  ✔ Predicted & saved: {base}  |  Acc={m['Accuracy']:.4f}  N%={m['Normal_%']:.2f}  A%={m['Anomaly_%']:.2f}")

        except Exception as e:
            # One broken file must not stop the remaining files.
            print(f"  [ERROR] {in_path}: {e}")

    inference_time_sec = time.perf_counter() - t_inf0
    _, inf_peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    inference_peak_mb = mb(inf_peak)

    for row in summary_rows:
        row["TrainingTimeSec"] = round(train_time_sec, 6)
        row["InferenceTimeSec"] = round(inference_time_sec, 6)
        row["InferencePeakMB"] = inference_peak_mb

    col_order = [
        "Filename",
        "Accuracy", "Precision", "Recall", "F1-Score",
        "Normal_%", "Anomaly_%", "Total",
        "TP", "TN", "FP", "FN", "ActualNormal", "ActualAnomaly",
        "TrainingTimeSec", "InferenceTimeSec", "TrainPeakMB", "InferencePeakMB"
    ]
    # Passing the columns to the constructor fixes their order and still
    # yields a header-only frame when every file failed; selecting
    # summary_df[col_order] on an empty frame would raise KeyError.
    summary_df = pd.DataFrame(summary_rows, columns=col_order)

    summary_path = os.path.join(OUTPUT_DIR_SUMMARY, f"{residence}_ANOMALY_{MODEL_NAME}_OUTLINE.csv")
    save_csv_overwrite(summary_df, summary_path)

    print(f" InferenceTimeSec={inference_time_sec:.3f}, InferencePeakMB={inference_peak_mb}")
    print(f" ✔ Summary saved: {summary_path}")

print("\nDone.")
