In [None]:
##### ANOMALY DETECTION WITH ISOLATION FOREST #####
#
# "Anomaly detection using random trees to isolate outliers."
# "It isolates anomalies faster (lower tree levels) because they require fewer random splits than normal points."
#
#
### Training algorithm (per residence)
# 1. Locate training file: {residence}_Fridge_15minutes_StepChange_MERGED.csv
# 2. Load + normalize
#    - Read CSV into a dataframe.
#    - Ensure timestamp exists, parse to datetime, sort by time.
#    - Ensure active_power is numeric, replace inf with NaN.
#    - Fill gaps using backward fill, then forward fill (no gap-size limit is applied), then drop any remaining NaNs.
# 3. Time-ordered split (80/20)
#    - Take the first 80% of rows as the training portion.
#    - Take the last 20% of rows as the held-out test portion.
# 4. Fit Isolation Forest on training active power
#    - Use only the active_power column.
#    - Train IsolationForest(n_estimators=100, contamination="auto", random_state=42, n_jobs=-1).
# 5. Track training efficiency - time / memory
#
#
### Inference algorithm (per residence, per file)
# 1. Collect all files to score
#    - Find every file matching MERGED/{residence}*.csv.
# 2. For each file
#    1. Load + normalize
#       - Read CSV.
#       - Parse/sort timestamp.
#       - Clean active_power (numeric, remove inf, fill gaps, drop leftover NaNs).
#    2. Predict anomaly labels
#       - Run model.predict() on active_power for every row.
#       - Convert predictions to:
#         - "Anomaly" when model returns -1
#         - "Normal" when model returns 1
#       - Store in a new column: prediction_anomaly.
# 3. Track inference efficiency - time/memory
# 4. Save scored CSV
#    - Write to ANOMALY_ISOLATIONFOREST/ with suffix _ISOLATIONFOREST.csv.
# 5. Compute per-file metrics
#    - Convert ground truth to binary anomaly vs normal.
#    - Compare against prediction_anomaly.
#    - Compute accuracy/precision/recall/F1, confusion matrix, and per-class “hit rates” (Normal% and Anomaly_%).
# 6. Save per-residence summary
#

In [None]:
# -----------------------------
# Configuration - Paths & Parameters
# -----------------------------
import os
import glob
import time
import math
import json
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

import tracemalloc
try:
    import resource  # Unix
    HAVE_RESOURCE = True
except Exception:
    HAVE_RESOURCE = False
try:
    import psutil    # Fallback
    HAVE_PSUTIL = True
except Exception:
    HAVE_PSUTIL = False

# NOTE: this silences every warning category for the whole process, including
# any warnings emitted by pandas or scikit-learn while this script runs.
warnings.filterwarnings("ignore")


# Root folder of the datasets; inputs are read from MERGED_DIR, scored CSVs are
# written to OUT_DIR and the per-residence summary CSVs to SUMMARY_DIR.
BASE = "/content/drive/MyDrive/Paper02_14Datasets"
MERGED_DIR = f"{BASE}/MERGED"
OUT_DIR = f"{BASE}/ANOMALY_ISOLATIONFOREST"
SUMMARY_DIR = f"{OUT_DIR}/Percentiles_Summary"
# Output folders are created eagerly, as soon as this cell runs.
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)
Path(SUMMARY_DIR).mkdir(parents=True, exist_ok=True)

# Residences to process. Each name is used both to build the training file name
# (see TRAIN_FILE_TEMPLATE) and as the prefix of the glob that selects files to score.
RESIDENCES = [
              "REFIT_House01",
              "REFIT_House02",
              "REFIT_House03",
              "REFIT_House05",
              "REFIT_House07",
              "REFIT_House09",
              "REFIT_House15",
              "UKDALE_House01",
              "UKDALE_House02",
              "UKDALE_House05",
              "AMPds2_House01",
              "GREEND_House00",
              "GREEND_House01",
              "GREEND_House03"
          ]
TRAIN_FILE_TEMPLATE = "{residence}_Fridge_15minutes_StepChange_MERGED.csv"  # training-only file per residence
RANDOM_STATE = 42  # seed passed to IsolationForest for repeatable results

# -----------------------------
# Helpers
# -----------------------------
# Make directories to store content
def safe_make_dirs_for(file_path: str):
    """Create the directory that will contain ``file_path`` if it is missing.

    Only the parent directory (and any missing ancestors) is created; the
    file itself is never touched. Already-existing directories are accepted.
    """
    parent_dir = Path(os.path.dirname(file_path))
    parent_dir.mkdir(parents=True, exist_ok=True)

# Normalize: existence/cleaning/timestamp & sort
def normalize_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned, time-ordered copy of ``df``.

    Steps:
      * require the ``timestamp`` and ``active_power`` columns;
      * parse ``timestamp`` to datetime (unparseable values become NaT) and
        sort by it with a stable sort, so rows sharing a timestamp keep their
        original file order;
      * coerce ``active_power`` to numeric, turn +/-inf into NaN, fill gaps by
        backward fill then forward fill, and drop rows that are still NaN
        (which only happens when the column holds no valid value at all).

    The input frame is not modified.

    Raises:
        ValueError: if a required column is missing.
    """
    expected_cols = ["timestamp", "active_power"]
    for col in expected_cols:
        if col not in df.columns:
            raise ValueError(f"Missing required column '{col}' in input.")

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()
    df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce", utc=False)
    # mergesort is stable: duplicate timestamps are not shuffled, which keeps
    # the gap filling below (and the later time-ordered split) deterministic
    # with respect to the input row order.
    df = df.sort_values("timestamp", kind="mergesort").reset_index(drop=True)

    # Clean active_power
    power = pd.to_numeric(df["active_power"], errors="coerce")
    power = power.replace([np.inf, -np.inf], np.nan)
    # Backward fill first, then forward fill for a trailing gap. No limit is
    # set, so gaps of any length are filled.
    df["active_power"] = power.bfill().ffill()
    df = df.dropna(subset=["active_power"])
    return df

# Return ground_truth_anomaly as binary (1=Anomaly, 0=Normal)
def y_from_ground_truth(df: pd.DataFrame):
    """Extract the ground-truth labels of ``df`` as a binary array.

    Returns:
        ``(labels, True)`` where ``labels`` is an integer NumPy array with
        1 = Anomaly and 0 = Normal, or ``(None, False)`` when ``df`` has no
        ``ground_truth_anomaly`` column.

    Mapping rules:
      * strings: ``"anomaly"`` (case-insensitive, surrounding whitespace
        ignored) is 1, every other string is 0;
      * numbers: any nonzero value is 1, zero is 0;
      * missing values (NaN, None) and anything that cannot be converted to a
        number are 0.
    """
    if "ground_truth_anomaly" not in df.columns:
        return None, False

    def to_binary(value):
        if isinstance(value, str):
            # Every string other than "anomaly" counts as Normal.
            return 1 if value.strip().lower() == "anomaly" else 0
        try:
            number = float(value)
        except (TypeError, ValueError, OverflowError):
            return 0
        # NaN compares unequal to 0, so without this check a missing label
        # would be counted as an anomaly.
        if math.isnan(number):
            return 0
        return 1 if number != 0 else 0

    labels = df["ground_truth_anomaly"].map(to_binary).astype(int)
    return labels.values, True

# Converts to binary labels (0 or 1) and readable labels ("Normal" or "Anomaly")
def preds_to_labels(preds_1normal_minus1anom):
    """Translate raw model predictions into binary and text labels.

    The input uses -1 for an outlier; every other value is treated as normal.

    Returns:
        A pair ``(binary, text)`` of arrays: ``binary`` holds 1 for an anomaly
        and 0 otherwise, ``text`` holds "Anomaly" / "Normal" accordingly.
    """
    is_outlier = preds_1normal_minus1anom == -1
    binary_flags = np.where(is_outlier, 1, 0)
    text_labels = np.where(binary_flags == 1, "Anomaly", "Normal")
    return binary_flags, text_labels

# Compute metrics: Accuracy/Precision/Recall/F1-Score/TN/FP/FN/TP/ActualNormal/ActualAnomaly/Normal%/Anomaly_%/Total(total anomaly)
def compute_metrics(y_true_bin, y_pred_bin):
    """Compute classification metrics for binary anomaly labels.

    Args:
        y_true_bin: ground-truth labels, 1 = Anomaly and 0 = Normal.
        y_pred_bin: predicted labels using the same encoding.
            Both may be any array-like; they are converted to NumPy arrays.

    Returns:
        dict with Accuracy, Precision, Recall and F1-Score (anomaly is the
        positive class), the confusion-matrix counts TN/FP/FN/TP, the number
        of actual normal/anomalous rows, the per-class hit rates in percent
        ("Normal%" = TN / ActualNormal, "Anomaly_%" = TP / ActualAnomaly,
        0.0 when the class is absent) and the total row count.
    """
    y_true = np.asarray(y_true_bin)
    y_pred = np.asarray(y_pred_bin)

    acc = accuracy_score(y_true, y_pred)
    try:
        # zero_division=0 yields 0 (instead of a warning) when a metric's
        # denominator is empty, e.g. when no anomaly was predicted.
        prec = precision_score(y_true, y_pred, zero_division=0)
        rec = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)
    except ValueError:
        # Best-effort fallback when sklearn rejects the label set.
        prec = rec = f1 = 0.0

    # Passing labels=[0, 1] fixes the matrix to 2x2 (rows = actual,
    # columns = predicted; order Normal, Anomaly) even if a class is absent.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    actual_normal = int((y_true == 0).sum())
    actual_anomaly = int((y_true == 1).sum())
    normal_pct = (tn / actual_normal * 100.0) if actual_normal > 0 else 0.0
    anomaly_pct = (tp / actual_anomaly * 100.0) if actual_anomaly > 0 else 0.0

    return {
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1-Score": f1,
        "TN": int(tn),
        "FP": int(fp),
        "FN": int(fn),
        "TP": int(tp),
        "ActualNormal": actual_normal,
        "ActualAnomaly": actual_anomaly,
        "Normal%": normal_pct,
        "Anomaly_%": anomaly_pct,
        "Total": int(len(y_true)),
    }

# Memory functions
def start_mem_trace():
    """Begin tracing Python memory allocations with tracemalloc.

    One stack frame is recorded per allocation, which is tracemalloc's default.
    """
    tracemalloc.start(1)
def stop_mem_trace_mb():
    """Stop tracemalloc tracing and return the traced peak in megabytes.

    The peak is the highest amount of traced memory observed since tracing
    was started, converted from bytes using 1 MB = 1024 * 1024 bytes.
    """
    bytes_per_mb = 1024 * 1024
    _, peak_bytes = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    return peak_bytes / bytes_per_mb
def alt_peak_mb():
    """Best-effort process memory figure in megabytes, or None if unavailable.

    Prefers ``resource.getrusage`` (the peak resident set size of the whole
    process since it started, not of a single measured section) and falls
    back to psutil's current resident set size. Returns None when neither
    module could be imported.
    """
    if HAVE_RESOURCE:
        import sys

        peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        # ru_maxrss is reported in bytes on macOS and in kilobytes on Linux.
        # Decide by platform: guessing from the magnitude misreads a macOS
        # process below ~1 GB as kilobytes and inflates it 1024-fold.
        if sys.platform == "darwin":
            return peak / (1024 * 1024)
        return peak / 1024.0
    if HAVE_PSUTIL:
        proc = psutil.Process(os.getpid())
        rss_bytes = getattr(proc.memory_info(), "rss", 0)
        return rss_bytes / (1024 * 1024)
    return None

# Train the Isolation Forest.
# contamination="auto" -> no outlier fraction is supplied; scikit-learn documents
# that the decision threshold is then determined as in the original Isolation
# Forest paper. RANDOM_STATE makes the fitted trees repeatable.
def fit_isolation_forest(train_active_power: np.ndarray):
    """Fit an IsolationForest on a 1-D array of active-power readings.

    Args:
        train_active_power: 1-D array of readings; it is reshaped into a
            single-feature column before fitting.

    Returns:
        ``(model, train_time_sec, train_peak_mb)``: the fitted model, the
        wall-clock fit time in seconds (tracing is started inside the timed
        region, so its overhead is included) and the peak traced Python
        memory in MB.
    """
    model = IsolationForest(
        n_estimators=100,
        contamination="auto",
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )
    # Train time + mem
    start = time.perf_counter()
    start_mem_trace()
    try:
        model.fit(train_active_power.reshape(-1, 1))
    finally:
        # Always stop tracing, even if fit() raises; otherwise tracemalloc
        # would stay enabled for the rest of the run.
        train_peak_mb = stop_mem_trace_mb()
    train_time_sec = time.perf_counter() - start

    return model, train_time_sec, train_peak_mb

# Make prediction from the IsolationForest and also record memory/time
def predict_with_timing(model, x_array: np.ndarray):
    """Run ``model.predict`` on a 1-D array and measure time and memory.

    Args:
        model: fitted estimator exposing ``predict``.
        x_array: 1-D array of readings; it is reshaped into a single-feature
            column before prediction.

    Returns:
        ``(preds, infer_time_sec, infer_peak_mb)``: the raw predictions
        (1 = inlier, -1 = outlier), the wall-clock prediction time in seconds
        (tracing overhead included) and the peak traced Python memory in MB.
    """
    start = time.perf_counter()
    start_mem_trace()
    try:
        preds = model.predict(x_array.reshape(-1, 1))  # 1=inlier, -1=outlier
    finally:
        # Always stop tracing, even if predict() raises.
        infer_peak_mb = stop_mem_trace_mb()
    infer_time_sec = time.perf_counter() - start

    return preds, infer_time_sec, infer_peak_mb

# Make predictions (with predict_with_timing) and records the metrics
def evaluate_on_test(df_test: pd.DataFrame, model):
    """Score the held-out frame and return its metrics.

    When ``df_test`` has no ground-truth column, no prediction is run and a
    placeholder dict is returned in which every metric is NaN and "Total" is
    the number of rows. Otherwise the dict from ``compute_metrics`` is
    returned, extended with "InferenceTimeSec" and "InferencePeakMB".
    """
    y_true_bin, has_gt = y_from_ground_truth(df_test)

    if not has_gt:
        # Nothing to compare against: report NaN for every metric.
        unavailable = (
            "Accuracy", "Precision", "Recall", "F1-Score",
            "TN", "FP", "FN", "TP",
            "ActualNormal", "ActualAnomaly",
            "Normal%", "Anomaly_%",
        )
        placeholder = dict.fromkeys(unavailable, np.nan)
        placeholder["Total"] = len(df_test)
        placeholder["InferenceTimeSec"] = np.nan
        placeholder["InferencePeakMB"] = np.nan
        return placeholder

    power = df_test["active_power"].values.astype(float)
    preds_raw, infer_time, infer_peak = predict_with_timing(model, power)
    y_pred_bin, _ = preds_to_labels(preds_raw)

    metrics = compute_metrics(y_true_bin, y_pred_bin)
    metrics["InferenceTimeSec"] = float(infer_time)
    metrics["InferencePeakMB"] = float(infer_peak)
    return metrics

# Make predictions and save
def add_predictions_and_save(model, in_path: str, residence: str):
    """Score one CSV file, write the labelled copy, and return its metrics.

    The file is loaded and normalized, every row is labelled "Anomaly" or
    "Normal" in a new ``prediction_anomaly`` column, and the result is written
    to ``OUT_DIR`` under the input's base name with ``_ISOLATIONFOREST.csv``
    appended. ``residence`` is accepted for the caller's convenience but is
    not used here.

    Returns:
        ``(out_path, metrics)``. ``metrics`` comes from ``compute_metrics``
        when ground truth is present; otherwise every metric is NaN and
        "Total" is the row count. Inference time and peak memory for this
        file are always attached.
    """
    frame = normalize_df(pd.read_csv(in_path))

    # Predict (full file) with timing
    power = frame["active_power"].values.astype(float)
    preds_raw, infer_time, infer_peak = predict_with_timing(model, power)
    pred_flags, pred_text = preds_to_labels(preds_raw)
    frame["prediction_anomaly"] = pred_text

    # MERGED/{residence}*.csv -> ANOMALY_ISOLATIONFOREST/{residence}*_ISOLATIONFOREST.csv
    out_path = os.path.join(OUT_DIR, f"{Path(in_path).stem}_ISOLATIONFOREST.csv")
    safe_make_dirs_for(out_path)
    frame.to_csv(out_path, index=False)

    # Per-file metrics, only when ground truth exists
    y_true_bin, has_gt = y_from_ground_truth(frame)
    if has_gt:
        metrics = compute_metrics(y_true_bin, pred_flags)
    else:
        unavailable = (
            "Accuracy", "Precision", "Recall", "F1-Score",
            "TN", "FP", "FN", "TP",
            "ActualNormal", "ActualAnomaly",
            "Normal%", "Anomaly_%",
        )
        metrics = dict.fromkeys(unavailable, np.nan)
        metrics["Total"] = len(frame)

    # Attach inference timing/memory for this file
    metrics["InferenceTimeSec"] = float(infer_time)
    metrics["InferencePeakMB"] = float(infer_peak)

    return out_path, metrics

# -----------------------------
# Main - Loop through residences
# -----------------------------
def _zero_if_nan(value):
    """Return 0 for a float NaN placeholder, otherwise the value unchanged."""
    return 0 if isinstance(value, float) and math.isnan(value) else value


# Count columns that are reported as 0 when no ground truth was available.
_COUNT_COLUMNS = ("TP", "TN", "FP", "FN", "ActualNormal", "ActualAnomaly")

# Maps each processed residence to the path of its summary ("outline") CSV.
all_residence_reports = {}

for residence in RESIDENCES:
    print(f"\n=== Residence: {residence} ===")
    train_file = os.path.join(MERGED_DIR, TRAIN_FILE_TEMPLATE.format(residence=residence))
    if not os.path.exists(train_file):
        print(f"!! Training file not found: {train_file}. Skipping this residence.")
        continue

    # Load and clean training file
    df_train_full = normalize_df(pd.read_csv(train_file))

    # Split 80/20 by time order
    n = len(df_train_full)
    split_idx = int(0.8 * n)
    if split_idx == 0:
        # With fewer than 2 usable rows the training slice is empty; fitting
        # would raise and abort the remaining residences, so skip instead.
        print(f"!! Not enough usable rows ({n}) in {train_file}. Skipping this residence.")
        continue
    df_train = df_train_full.iloc[:split_idx].copy()
    df_test = df_train_full.iloc[split_idx:].copy()

    # Train ONLY on active_power (assumed all normal)
    model, train_time_sec, train_peak_mb = fit_isolation_forest(df_train["active_power"].values.astype(float))

    # Evaluate on the held-out 20% (metrics are NaN when no ground truth is available)
    test_metrics = evaluate_on_test(df_test, model)
    print(
        "Held-out 20%: "
        f"Accuracy={test_metrics['Accuracy']:.4f}, "
        f"Precision={test_metrics['Precision']:.4f}, "
        f"Recall={test_metrics['Recall']:.4f}, "
        f"F1-Score={test_metrics['F1-Score']:.4f}"
    )

    # Scan every {residence}*.csv in MERGED and create predictions
    pattern = os.path.join(MERGED_DIR, f"{residence}*.csv")
    files = sorted(glob.glob(pattern))
    if not files:
        print(f"No files found for pattern: {pattern}")
        continue

    rows = []
    for f in files:
        out_path, file_metrics = add_predictions_and_save(model, f, residence)
        # Compose summary row (key order defines the CSV column order)
        row = {
            "Filename": os.path.basename(f),
            "Accuracy": file_metrics["Accuracy"],
            "Precision": file_metrics["Precision"],
            "Recall": file_metrics["Recall"],
            "F1-Score": file_metrics["F1-Score"],
            "Normal%": file_metrics["Normal%"],
            "Anomaly_%": file_metrics["Anomaly_%"],
            "TrainingTimeSec": float(train_time_sec),
            "InferenceTimeSec": float(file_metrics["InferenceTimeSec"]),
            "TrainPeakMB": float(train_peak_mb),
            "InferencePeakMB": float(file_metrics["InferencePeakMB"]),
            "Total": file_metrics["Total"],
        }
        for name in _COUNT_COLUMNS:
            row[name] = _zero_if_nan(file_metrics[name])
        rows.append(row)
        print(f"Wrote predictions: {out_path}")

    # Save outline CSV for this residence
    outline_df = pd.DataFrame(rows)
    outline_path = os.path.join(SUMMARY_DIR, f"{residence}_ANOMALY_ISOLATIONFOREST_OUTLINE.csv")
    safe_make_dirs_for(outline_path)
    outline_df.to_csv(outline_path, index=False)
    all_residence_reports[residence] = outline_path
    print(f"Summary saved: {outline_path}")

# Final hint line so you can quickly find outputs
print("\nDone. Per-residence outline CSVs:")
for k, v in all_residence_reports.items():
    print(f" - {k}: {v}")
print(f"\nPredicted CSVs live under: {OUT_DIR}")
