In [None]:
##### OUTLINE OF THIS FILE #####
### This file presents the final statistics,
### i.e., those that deal with the ALL results

#### COMBINE - Combine the NILM and ANOMALY files ####

#### BASELINE RESULTS - Anomaly_Version/Accuracy/Precision/Recall/F1_Score/Normal_%/Anomaly_% ####
#### NILM STATISTICS ####

#### TIMING AND MEMORY ####
#### TIMING AND MEMORY - AGGREGATE ####

#### BOXPLOT - Distribution of Percentage Anomaly per Anomaly Type ####
#### BOXPLOT - Distribution of Percentage Anomaly per Anomaly Type - "Median" ####

In [None]:
#### COMBINE - Combine the NILM and ANOMALY files ####
###
### Combines ANOMALY with NILM based on "timestamp"
###
### Output merged file has:
###      - timestamp, active_power, ground_truth_appliance, prediction_appliance, ground_truth_anomaly, prediction_anomaly
###
### FOLDERS ARE (the suffix is the detector/model name, e.g. DIFFUSION_RESIDUALSPECTRAL):
###    ANOMALY_DIR = os.path.join(BASE, "ANOMALY_{Model}")
###    NILM_DIR    = os.path.join(BASE, "NILM_version_20260207")
###    OUT_DIR     = os.path.join(BASE, "COMBINED_{Model}")
###
import os
import sys
import pandas as pd

# ---- Tunable parameters ----
# Houses to process. Each entry is the leading part of the input/output file names
# built in merge_one().
RESIDENCES = [
    "REFIT_House01", "REFIT_House02", "REFIT_House03", "REFIT_House05",
    "REFIT_House07", "REFIT_House09", "REFIT_House15",
    "UKDALE_House01", "UKDALE_House02", "UKDALE_House05",
    "AMPds2_House01",
    "GREEND_House00", "GREEND_House01", "GREEND_House03",
]
APPLIANCES = ["Fridge", "WashingMachine", "Dishwasher"]
ANOMALIES = [
    "StepChange", "MultiStepChange", "Mirror", "Repeating",
    "StuckMAX", "StuckMIN", "PowerCycling",
]
# Only one detector is enabled. Alternatives kept for reference:
# "HOLTWINTERS", "3SIGMATHRESHOLD", "DBSCAN", "ISOLATIONFOREST", "OCSVM", "LSTMAE", "COCA", "VAE", "TRANAD"
MODELS = ["DIFFUSION_RESIDUALSPECTRAL"]

# Input/output folders, all located under BASE.
BASE = "/content/drive/MyDrive/Paper02_14Datasets"
ANOMALY_DIR, NILM_DIR, OUT_DIR = (
    os.path.join(BASE, sub_folder)
    for sub_folder in (
        "ANOMALY_DIFFUSION_RESIDUALSPECTRAL",
        "NILM_version_20260207",
        "COMBINED_DIFFUSION_RESIDUALSPECTRAL",
    )
)

# The combined CSVs are written here, so create the folder up front.
os.makedirs(OUT_DIR, exist_ok=True)

def _read_csv_with_dt(path):
    """Read a CSV and parse 'timestamp' if present. Returns DataFrame or None if missing."""
    if not os.path.exists(path):
        print(f"[MISS] {path}")
        return None
    try:
        df = pd.read_csv(path)
    except Exception as e:
        print(f"[ERROR] reading {path}: {e}")
        return None
    # Parse timestamp robustly if present
    if "timestamp" in df.columns:
        try:
            df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce", utc=False)
        except Exception as e:
            print(f"[WARN] couldn't parse timestamp for {path}: {e}")
    else:
        print(f"[WARN] 'timestamp' column missing in {path}")
    return df

def _standardize_columns(df, origin):
    """
    Try to standardize column names so downstream merge succeeds.
    - Accepts common variants and renames to the expected names when possible.
    """
    if df is None:
        return None

    # Build rename map cautiously (only if source columns exist)
    rename_map = {}
    # Prediction anomaly
    for cand in ["prediction_anomaly", "pred_anomaly", "predicted_anomaly", "anomaly_pred", "y_pred_anomaly"]:
        if cand in df.columns:
            rename_map[cand] = "prediction_anomaly"
            break
    # Ground truth anomaly
    for cand in ["ground_truth_anomaly", "gt_anomaly", "true_anomaly", "label_anomaly"]:
        if cand in df.columns:
            rename_map[cand] = "ground_truth_anomaly"
            break
    # Prediction appliance
    for cand in ["prediction_appliance", "pred_appliance", "predicted_appliance", "appliance_pred", "y_pred_appliance"]:
        if cand in df.columns:
            rename_map[cand] = "prediction_appliance"
            break
    # Ground truth appliance
    for cand in ["ground_truth_appliance", "gt_appliance", "true_appliance", "label_appliance"]:
        if cand in df.columns:
            rename_map[cand] = "ground_truth_appliance"
            break
    # Active power
    for cand in ["active_power", "power_active", "P_active", "real_power"]:
        if cand in df.columns:
            rename_map[cand] = "active_power"
            break

    if rename_map:
        df = df.rename(columns=rename_map)

    # Only keep columns we'll need later (plus timestamp)
    keep_cols = [c for c in ["timestamp", "active_power",
                             "ground_truth_appliance", "prediction_appliance",
                             "ground_truth_anomaly", "prediction_anomaly"]
                 if c in df.columns]
    # If keep_cols loses all but timestamp, keep timestamp to allow merge
    if "timestamp" in df.columns and len(keep_cols) == 1:
        keep_cols = ["timestamp"]
    df = df[keep_cols] if keep_cols else df
    return df

def merge_one(residence, appliance, anomaly, model):
    """Merge one NILM file with its anomaly-detection counterpart and write the result.

    Reads
        ANOMALY_DIR/{residence}_{appliance}_15minutes_{anomaly}_MERGED_{model}.csv and
        NILM_DIR/{residence}_{appliance}_15minutes_{anomaly}_NILM.csv,
    inner-joins them on 'timestamp' and writes
        OUT_DIR/{residence}_{appliance}_15minutes_{anomaly}_COMBINED_{model}.csv
    with the columns timestamp, active_power, ground_truth_appliance,
    prediction_appliance, ground_truth_anomaly, prediction_anomaly.

    Args:
        residence: Residence prefix of the file names (e.g. "REFIT_House01").
        appliance: Appliance part of the file names (e.g. "Fridge").
        anomaly: Anomaly-type part of the file names (e.g. "StepChange").
        model: Detector name used as the file-name suffix.

    Returns:
        None. Progress and problems are reported with [OK] / [SKIP] / [ERROR] prints;
        a combination whose inputs are missing or cannot be joined is skipped.
    """
    stem = f"{residence}_{appliance}_15minutes_{anomaly}"
    anomaly_path = os.path.join(ANOMALY_DIR, f"{stem}_MERGED_{model}.csv")
    nilm_path = os.path.join(NILM_DIR, f"{stem}_NILM.csv")
    out_path = os.path.join(OUT_DIR, f"{stem}_COMBINED_{model}.csv")
    combo = f"{residence}/{appliance}/{anomaly}/{model}"

    # Read both inputs (both are attempted so that every missing file is reported).
    df_anom = _read_csv_with_dt(anomaly_path)
    df_nilm = _read_csv_with_dt(nilm_path)
    if df_anom is None or df_nilm is None:
        print(f"[SKIP] Missing input for {combo}")
        return

    # Standardize column names / prune extras
    df_anom = _standardize_columns(df_anom, "ANOMALY")
    df_nilm = _standardize_columns(df_nilm, "NILM")

    if "timestamp" not in df_anom.columns or "timestamp" not in df_nilm.columns:
        print(f"[SKIP] 'timestamp' not in both inputs for {combo}")
        return

    # pandas joins null keys with each other, so rows whose timestamp failed to parse
    # (NaT) on both sides would be cross-multiplied. Drop them before merging.
    n_bad = int(df_anom["timestamp"].isna().sum() + df_nilm["timestamp"].isna().sum())
    if n_bad:
        print(f"[WARN] dropping {n_bad} rows with missing/unparseable timestamp for {combo}")
        df_anom = df_anom.dropna(subset=["timestamp"])
        df_nilm = df_nilm.dropna(subset=["timestamp"])

    # Inner merge on timestamp (only rows present in both files).
    try:
        merged = pd.merge(
            df_nilm, df_anom, on="timestamp", how="inner", suffixes=("_nilm", "_anom")
        )
    except (KeyError, ValueError) as e:
        # ValueError: the two 'timestamp' columns have incompatible dtypes
        # (e.g. one parsed to datetime, the other left as strings).
        print(f"[SKIP] cannot merge on 'timestamp' for {combo}: {e}")
        return

    # When both inputs carry a field, pandas suffixes it. Pick one source per field:
    # the NILM file for power and appliance columns, the ANOMALY file for anomaly columns.
    # A field present on only one side keeps its plain name and is used as-is.
    suffix_priority = {
        "active_power": ("_nilm", "_anom"),
        "ground_truth_appliance": ("_nilm", "_anom"),
        "prediction_appliance": ("_nilm", "_anom"),
        "ground_truth_anomaly": ("_anom", "_nilm"),
        "prediction_anomaly": ("_anom", "_nilm"),
    }
    for field, suffixes in suffix_priority.items():
        candidates = [field + suffix for suffix in suffixes] + [field]
        source = next((c for c in candidates if c in merged.columns), None)
        # A field found in neither input becomes an empty column so every output
        # file has the same shape.
        merged[field] = merged[source] if source is not None else pd.NA

    final_cols = ["timestamp", *suffix_priority]
    merged_final = merged[final_cols].copy()

    # Sort chronologically and write timestamps as "YYYY-MM-DD HH:MM:SS" when they
    # parsed as datetimes; otherwise leave the values untouched.
    if pd.api.types.is_datetime64_any_dtype(merged_final["timestamp"]):
        merged_final = merged_final.sort_values("timestamp")
        merged_final["timestamp"] = merged_final["timestamp"].dt.strftime("%Y-%m-%d %H:%M:%S")

    # Save (overwrite)
    try:
        merged_final.to_csv(out_path, index=False)
        print(f"[OK] {out_path}  rows={len(merged_final)}")
    except Exception as e:
        print(f"[ERROR] writing {out_path}: {e}")

def main():
    """Combine either one CLI-selected combination or every configured combination.

    With exactly four command-line arguments (residence appliance anomaly model) only
    that combination is merged. Otherwise merge_one() is called for the full cartesian
    product of RESIDENCES x APPLIANCES x ANOMALIES x MODELS.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) == 4:
        merge_one(*cli_args)
        return

    # Same iteration order as four nested loops: residence outermost, model innermost.
    combinations = (
        (res, app, anom, mod)
        for res in RESIDENCES
        for app in APPLIANCES
        for anom in ANOMALIES
        for mod in MODELS
    )
    for combination in combinations:
        merge_one(*combination)


if __name__ == "__main__":
    main()


In [None]:
#### BASELINE RESULTS - Anomaly_Version/Accuracy/Precision/Recall/F1_Score/Normal_%/Anomaly_% ####

import os
import pandas as pd
import glob

# ==========================
# Configuration
# ==========================
BASE_DIR = "/content/drive/MyDrive/Paper02_14Datasets"
OUT_FILE = os.path.join(BASE_DIR, "STATISTICS", "Baseline_Stats.csv")

# Detectors to summarise. Each one is read from
# BASE_DIR/ANOMALY_<name>/Percentiles_Summary/*.csv
ANOMALY_VERSIONS = [
    "3SIGMA",
    "AE",
    "COCA",
    "DBSCAN",
    "ISOLATIONFOREST",
    "DIFFUSION_RESIDUALSPECTRAL"
    # "DIFFUSION_NORESIDUALNOSPECTRAL",
]

# Column spellings found in the summary files -> name used in the output.
# (Normal_pct / Anomaly_pct is the DIFFUSION_RESIDUALSPECTRAL naming.)
column_aliases = {
    "F1-Score": "F1_Score",
    "F1 Score": "F1_Score",
    "Normal_pct": "Normal_%",
    "Anomaly_pct": "Anomaly_%",
}
metric_columns = ["Accuracy", "Precision", "Recall", "F1_Score", "Normal_%", "Anomaly_%"]

# ==========================
# Processing
# ==========================
all_results = []

for version in ANOMALY_VERSIONS:
    folder = os.path.join(BASE_DIR, f"ANOMALY_{version}", "Percentiles_Summary")
    files = glob.glob(os.path.join(folder, "*.csv"))

    if not files:
        print(f"⚠️ No files found for {version}")
        continue

    # Metric rows of every readable summary file of this detector.
    dfs = []
    for f in files:
        try:
            df = pd.read_csv(f).rename(columns=column_aliases)
            # Raises KeyError (-> file skipped) when a metric column is absent.
            dfs.append(df[metric_columns])
        except Exception as e:
            print(f"⚠️ Skipping file {f} due to error: {e}")

    if not dfs:
        continue

    # One output row per detector: the mean of each metric over all collected rows.
    merged = pd.concat(dfs, ignore_index=True)
    mean_values = merged.mean()
    mean_values["Anomaly_Version"] = version
    all_results.append(mean_values)

# ==========================
# Save output
# ==========================
if not all_results:
    print("⚠️ No valid results found.")
else:
    result_df = pd.DataFrame(all_results)[["Anomaly_Version"] + metric_columns]

    os.makedirs(os.path.dirname(OUT_FILE), exist_ok=True)
    result_df.to_csv(OUT_FILE, index=False)
    print(f"✅ Saved summary to {OUT_FILE}")



In [None]:
#### NILM STATISTICS ####
import os
import pandas as pd

# ================================================================
# Configuration
# ================================================================
BASE_DIR = "/content/drive/MyDrive/Paper02_14Datasets/NILM"
OUT_FILE = "/content/drive/MyDrive/Paper02_14Datasets/STATISTICS/NILM_Stats.csv"

RESIDENCES = [
    "AMPds2_House01",
    "GREEND_House00", "GREEND_House01", "GREEND_House03",
    "UKDALE_House01", "UKDALE_House02", "UKDALE_House05",
    "REFIT_House01", "REFIT_House02", "REFIT_House03",
    "REFIT_House05", "REFIT_House07", "REFIT_House09", "REFIT_House15"
]

# Metrics averaged per residence (column names are compared after lower-casing).
cols = ["accuracy", "precision", "recall", "f1_score", "auc", "poc"]


def build_nilm_summary(records, metrics=("accuracy", "precision", "recall", "f1_score", "auc", "poc")):
    """Turn per-residence average dicts into the output table.

    Args:
        records: List of dicts, each holding "residence" plus any subset of ``metrics``.
        metrics: Metric columns of the output, in output order.

    Returns:
        DataFrame with columns ["residence", *metrics], rounded to 4 decimals.
        A metric that appears in none of the records becomes an all-NaN column;
        selecting it with ``df[[...]]`` would raise KeyError instead.
    """
    return pd.DataFrame(records).reindex(columns=["residence", *metrics]).round(4)


# ================================================================
# Compute averages
# ================================================================
records = []
for res in RESIDENCES:
    file_path = os.path.join(BASE_DIR, f"{res}_NILM_Results.csv")
    if not os.path.exists(file_path):
        print(f"⚠️ Missing file: {file_path}")
        continue

    df = pd.read_csv(file_path)

    # Normalize column names so that e.g. " Accuracy" matches "accuracy"
    df.columns = [c.strip().lower() for c in df.columns]

    # Only the metrics this file actually contains are averaged
    available = [c for c in cols if c in df.columns]

    # One record per residence: the mean of each available metric over the file's rows
    averages = {c: df[c].mean() for c in available}
    averages["residence"] = res
    records.append(averages)

# ================================================================
# Save results
# ================================================================
if records:
    result_df = build_nilm_summary(records, cols)
    os.makedirs(os.path.dirname(OUT_FILE), exist_ok=True)
    result_df.to_csv(OUT_FILE, index=False)
    print(f"✅ Saved: {OUT_FILE}")
else:
    print("❌ No valid results found.")


In [None]:
 #### TIMING AND MEMORY - TrainingTimeSec/InferenceTimeSec/TrainPeakMB/InferencePeakMB ####
import os
import pandas as pd

# --- Configuration ---
BASE = "/content/drive/MyDrive/Paper02_14Datasets"
OUT_FILE = f"{BASE}/STATISTICS/Timing_and_Memory.csv"
os.makedirs(os.path.dirname(OUT_FILE), exist_ok=True)

anomaly_versions = [
    "3SIGMA", "AE", "COCA", "DBSCAN", "ISOLATIONFOREST",
    "DIFFUSION_RESIDUALSPECTRAL"
]

residences = [
    "AMPds2_House01",
    "GREEND_House00", "GREEND_House01", "GREEND_House03",
    "UKDALE_House01", "UKDALE_House02", "UKDALE_House05",
    "REFIT_House01", "REFIT_House02", "REFIT_House03", "REFIT_House05",
    "REFIT_House07", "REFIT_House09", "REFIT_House15",
]

# Columns read from every *_OUTLINE.csv and written to the summary.
timing_memory_columns = ["TrainingTimeSec", "InferenceTimeSec", "TrainPeakMB", "InferencePeakMB"]

records = []

# --- Collect timing and memory info ---
for res in residences:
    for version in anomaly_versions:
        # DIFFUSION_RESIDUALSPECTRAL is looked up (and reported) under the short name
        # "DIFFUSION".
        # NOTE(review): the baseline-statistics cell reads this detector from
        # ANOMALY_DIFFUSION_RESIDUALSPECTRAL/Percentiles_Summary, while this cell looks
        # in ANOMALY_DIFFUSION/Percentiles_Summary. Confirm which folder holds the
        # *_OUTLINE.csv files.
        anom = "DIFFUSION" if version == "DIFFUSION_RESIDUALSPECTRAL" else version
        file_path = f"{BASE}/ANOMALY_{anom}/Percentiles_Summary/{res}_ANOMALY_{anom}_OUTLINE.csv"
        if not os.path.exists(file_path):
            print(f"⚠️ Missing file: {file_path}")
            continue

        try:
            df = pd.read_csv(file_path)
            # Compute mean across appliances if multiple rows exist
            timing_memory = df[timing_memory_columns].mean()
            records.append({
                "residence": res,
                "anomaly_version": anom,
                **{col: timing_memory[col] for col in timing_memory_columns},
            })
        except Exception as e:
            print(f"❌ Error reading {file_path}: {e}")

# --- Save to CSV ---
# Explicit columns keep the header intact even when no input file could be read;
# otherwise the CSV would lack the columns the aggregate cell groups by.
out_df = pd.DataFrame(records, columns=["residence", "anomaly_version", *timing_memory_columns])
out_df.to_csv(OUT_FILE, index=False)
print(f"✅ Saved summary to: {OUT_FILE}")
print(out_df.head())

In [None]:
#### TIMING AND MEMORY - AGGREGATE ####
import pandas as pd

# ==============================
# Configuration
# ==============================
# Per-residence summary written by the "TIMING AND MEMORY" cell, and the
# per-detector aggregate produced here.
input_file = "/content/drive/MyDrive/Paper02_14Datasets/STATISTICS/Timing_and_Memory.csv"
output_file = "/content/drive/MyDrive/Paper02_14Datasets/STATISTICS/Timing_and_Memory_aggregate.csv"

# ==============================
# Load Data
# ==============================
df = pd.read_csv(input_file)

# ==============================
# Compute Averages by Anomaly Version
# ==============================
# One row per anomaly_version holding the mean of each cost column over all residences.
cost_columns = ["TrainingTimeSec", "InferenceTimeSec", "TrainPeakMB", "InferencePeakMB"]
grouped = df.groupby("anomaly_version", as_index=False)
agg_df = grouped[cost_columns].agg("mean")

# ==============================
# Save to CSV
# ==============================
agg_df.to_csv(output_file, index=False)

print(f"✅ Aggregated file saved to: {output_file}")
print(agg_df)
