In [None]:
##### OUTLINE OF FILE #####
###
### Basically, this code creates all the datasets to be used in DANDY.
###
### This file processes the original data (REFIT/UKDALE/AMPds2/GREEND)
### It creates the Centroids file
### It also creates the anomalies and merges to get the final file
### This final file has the following:
### 'timestamp', 'active_power', 'ground_truth_anomaly', 'ground_truth_appliance'
###
###
##### SECTIONS OF CODE IN THIS FILE #####
###
#### REFIT DATASET - Create Dataset and combine for 15 minutes ####
#### UKDALE DATASET - Create Dataset and combine for 15 minutes ####
#### AMPds2 Dataset - Create Dataset and combine for 15 minutes ####
#### GREEND Dataset - Create Dataset and combine for 15 minutes ####
###
#### CENTROIDS  - Average Power Consumption for appliances within each dataset-house combination ####
#### ANOMALIES - Create anomalies (7 types for 7 days) plus ground_truth_anomaly ####
#### MERGE - combines anomalies and normal data plus sets ground_truth_appliance ####
###
###

In [None]:
#### REFIT DATASET - Create Dataset and combine for 15 minutes ####
#
# REFIT Dataset: https://pureportal.strath.ac.uk/en/datasets/refit-electrical-load-measurements-cleaned/
#
# The dataset is from : /content/drive/MyDrive/Paper02_14Datasets/REFIT_ORIGINALS
# The files are: {residence}.csv
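# residence is: "House01", "House02", "House03", "House05", "House07", "House09", "House15"
# appliance is: "Fridge", "WashingMachine", "Dishwasher"
#
# For residence "House01", Fridge = "1", WashingMachine = "5", Dishwasher = "6"
# For residence "House02", Fridge = "1", WashingMachine = "2", Dishwasher = "3"
# For residence "House03", Fridge = "2", WashingMachine = "6", Dishwasher = "5"
# For residence "House05", Fridge = "1", WashingMachine = "3", Dishwasher = "4"
# For residence "House07", Fridge = "1", WashingMachine = "5", Dishwasher = "6"
# For residence "House09", Fridge = "1", WashingMachine = "3", Dishwasher = "4"
# For residence "House15", Fridge = "1", WashingMachine = "3", Dishwasher = "4"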
#
# Create files:
# /content/drive/MyDrive/Paper02_14Datasets/REFIT_ORIGINALS/REFIT_{residence}_{appliance}.csv
#
# For each appliance, please extract from {residence}.csv the columns "Time" and "Appliance{appliance}"
# Rename "Time" to "timestamp", and "Appliance{appliance}" to "active_power"
# Save to: /content/drive/MyDrive/Paper02_14Datasets/REFIT_ORIGINALS/REFIT_{residence}_{appliance}.csv
#
# Aggregate "active_power" for every 1 minute.
# Get the distribution of this "active_power" and zero the bottom 90th percentile.
# Next, please add the "active_power" for every 15 minutes.
#
# Save to: /content/drive/MyDrive/Paper02_14Datasets/ORIGINALS_15MINUTES/REFIT_{residence}_{appliance}_15minutes.csv
#
# For each {residence}: please print the date range: "First Date" and the "Last Date"

import os
import pandas as pd

# ---------------------------
# Paths
# ---------------------------
# Raw REFIT house CSVs are read from BASE; 15-minute aggregates are written
# to OUT_15, which is created here if it does not exist yet.
BASE = "/content/drive/MyDrive/Paper02_14Datasets/REFIT_ORIGINALS"
OUT_15 = "/content/drive/MyDrive/Paper02_14Datasets/ORIGINALS_15MINUTES"
os.makedirs(OUT_15, exist_ok=True)

# ---------------------------
# Residence → Appliance channel IDs
# Each ID is the N of the "Appliance{N}" column in the house CSV.
# Channels are listed per house in the order of _REFIT_APPLIANCE_ORDER.
# ---------------------------
_REFIT_APPLIANCE_ORDER = ("Fridge", "WashingMachine", "Dishwasher")
house_appliances = {
    house_name: dict(zip(_REFIT_APPLIANCE_ORDER, channel_ids))
    for house_name, channel_ids in (
        ("House01", ("1", "5", "6")),
        ("House02", ("1", "2", "3")),
        ("House03", ("2", "6", "5")),
        ("House05", ("1", "3", "4")),
        ("House07", ("1", "5", "6")),
        ("House09", ("1", "3", "4")),
        ("House15", ("1", "3", "4")),
    )
}

# ---------------------------
# Tries two date formats, keeps the one that works.
#
# Formats are:
# Month-first (default) → MM/DD/YYYY (e.g., 12/31/2023)
# Day-first → DD/MM/YYYY (e.g., 31/12/2023)
# ---------------------------
def parse_time(series):
    """Parse a column of date strings, falling back to day-first if that helps.

    The default pandas interpretation is tried first. Only when it leaves at
    least one value unparsed is a second, ``dayfirst=True`` pass made; the
    day-first result is used solely when it parses strictly more values.

    Args:
        series: Values accepted by ``pd.to_datetime`` (typically strings).

    Returns:
        The parsed datetimes, with ``NaT`` wherever a value could not be read.
    """
    default_parse = pd.to_datetime(series, errors="coerce", utc=False)
    if not default_parse.isna().any():
        # Everything parsed on the first attempt: no reason to second-guess it.
        return default_parse

    dayfirst_parse = pd.to_datetime(series, errors="coerce", dayfirst=True, utc=False)
    if dayfirst_parse.notna().sum() > default_parse.notna().sum():
        return dayfirst_parse
    return default_parse

# ---------------------------
# Loops one residence at a time.
# Safely loads house data, parses timestamps, prepares date tracking.
# Then processes data.
# ---------------------------
# Process every configured REFIT house. Each house CSV is read once and then
# sliced per appliance channel. Missing files, a missing "Time" column,
# unparseable timestamps and missing channel columns are reported and skipped
# rather than aborting the whole run.
for residence, appmap in house_appliances.items():
    # Load once per residence
    input_csv = f"{BASE}/{residence}.csv"
    if not os.path.exists(input_csv):
        print(f"[WARN] Missing file: {input_csv}")
        continue
    df_src = pd.read_csv(input_csv)

    # Parse Time → timestamp
    if "Time" not in df_src.columns:
        print(f"[ERROR] 'Time' column not found in {input_csv}. Columns: {list(df_src.columns)[:10]}...")
        continue
    df_src["timestamp"] = parse_time(df_src["Time"])
    if df_src["timestamp"].isna().all():
        print(f"[ERROR] Could not parse any timestamps in {input_csv}.")
        continue

    # Track per-residence date range across appliances
    # (stays None when no appliance of this house produced a 15-minute file)
    residence_first = None
    residence_last = None

    # ---------------------------
    # For each appliance in the residence, determine the 15 minute active power
    # ---------------------------
    for appliance, app_id in appmap.items():
        col = f"Appliance{app_id}"
        if col not in df_src.columns:
            print(f"[WARN] {residence} - {appliance}: column '{col}' not found. Skipping.")
            continue

        # ---------------------------
        # 1) Extract & Save per-appliance ORIGINAL
        # ---------------------------
        df_ap = df_src[["timestamp", col]].copy()
        # Rename selected Appliance{id} column to active_power
        df_ap.rename(columns={col: "active_power"}, inplace=True)

        # Clean & sort: rows without a usable timestamp are dropped;
        # non-numeric power readings are coerced and then stored as 0.
        df_ap = df_ap.dropna(subset=["timestamp"]).sort_values("timestamp")
        df_ap["active_power"] = pd.to_numeric(df_ap["active_power"], errors="coerce").fillna(0)

        # The per-appliance extract is written next to the raw house files (BASE).
        out_appliance_csv = f"{BASE}/REFIT_{residence}_{appliance}.csv"
        df_ap.to_csv(out_appliance_csv, index=False)

        # ---------------------------
        # 2) 1-minute aggregation (sum)
        # ---------------------------
        df_min = df_ap.set_index("timestamp").sort_index()
        df_1min = df_min.resample("1min").sum(numeric_only=True)

        # ---------------------------
        # 3) Zero bottom 90% by value (values < 90th percentile → 0)
        # ---------------------------
        if df_1min.empty:
            print(f"[WARN] {residence} - {appliance}: No data after 1-minute resample.")
            continue

        # NOTE(review): the percentile is taken over every 1-minute bin produced
        # by the resample above; with the pandas default (min_count=0) a minute
        # without samples sums to 0 and so also counts toward the distribution.
        p90 = df_1min["active_power"].quantile(0.9)
        # .where keeps values that satisfy the condition and replaces the rest with 0.
        df_1min["active_power"] = df_1min["active_power"].where(df_1min["active_power"] >= p90, 0)

        # ---------------------------
        # 4) 15-minute aggregation (sum)
        # ---------------------------
        df_15 = df_1min.resample("15min").sum(numeric_only=True)

        # ---------------------------
        # 5) Save 15-minute file
        # ---------------------------
        out_15_csv = f"{OUT_15}/REFIT_{residence}_{appliance}_15minutes.csv"
        # The DatetimeIndex is written as the first column, labelled "timestamp".
        df_15.to_csv(out_15_csv, index_label="timestamp")

        # Update per-residence date range (widened across all appliances)
        if not df_15.empty:
            first_dt = df_15.index.min()
            last_dt  = df_15.index.max()
            residence_first = first_dt if residence_first is None else min(residence_first, first_dt)
            residence_last  = last_dt  if residence_last  is None else max(residence_last,  last_dt)

    # ---------------------------
    # 6) Print per-residence date range summary
    # ---------------------------
    if residence_first is None or residence_last is None:
        print(f"{residence}: First Date = N/A, Last Date = N/A (no 15-minute files written)")
    else:
        print(f"{residence}: First Date = {residence_first}, Last Date = {residence_last}")


In [None]:
#### UKDALE DATASET - Create Dataset and combine for 15 minutes ####
#
# UKDALE dataset: https://ukerc.rl.ac.uk/cgi-bin/dataDiscover.pl?Action=detail&dataid=7d78f943-f9fe-413b-af52-1816f9d968b0
#
# The dataset is from : /content/drive/MyDrive/Paper02_14Datasets/UKDALE_ORIGINALS
# residence is: "House01", "House02", "House05"
# appliance is: "Fridge", "WashingMachine", "Dishwasher"
# The files are: UKDALE_{residence}_{appliance}.dat
#
# For these files, the first column is "timestamp" and the second is "active_power"
#
# Aggregate "active_power" for every 1 minute.
# Get the distribution of this "active_power" and zero the bottom 90th percentile.
# Next, please add the "active_power" for every 15 minutes.
#
# Save to: /content/drive/MyDrive/Paper02_14Datasets/ORIGINALS_15MINUTES/UKDALE_{residence}_{appliance}_15minutes.csv
#
# For each {residence}: please print the date range: "First Date" and the "Last Date"

import os
import pandas as pd

# ----------------------------
# Paths & Config
# ----------------------------
# Folder holding the UKDALE_{residence}_{appliance}.dat inputs.
input_dir = "/content/drive/MyDrive/Paper02_14Datasets/UKDALE_ORIGINALS"
# Shared folder for the 15-minute aggregates; created here if missing.
output_dir = "/content/drive/MyDrive/Paper02_14Datasets/ORIGINALS_15MINUTES"
os.makedirs(output_dir, exist_ok=True)

# Every residence/appliance pair below is looked up as one .dat file.
residences = ["House01", "House02", "House05"]
appliances = ["Fridge", "WashingMachine", "Dishwasher"]

# Track overall first/last timestamps per residence
# (None until the first 15-minute index for that residence is recorded).
residence_ranges = {r: {"first": None, "last": None} for r in residences}

# ----------------------------
# Helper to update residence range
# ----------------------------
def _update_range(residence, idx):
    """Widen the recorded first/last timestamps of ``residence`` to cover ``idx``.

    Updates the module-level ``residence_ranges`` entry in place. An empty
    index leaves the entry untouched.

    Args:
        residence: Key into ``residence_ranges``.
        idx: Index whose ``min()``/``max()`` give the new candidate bounds.
    """
    if idx.size == 0:
        return

    earliest, latest = idx.min(), idx.max()
    bounds = residence_ranges[residence]
    known_first, known_last = bounds["first"], bounds["last"]

    bounds["first"] = earliest if known_first is None else min(known_first, earliest)
    bounds["last"] = latest if known_last is None else max(known_last, latest)

# ----------------------------
# Processing Loop -  per residence and per appliance
# ----------------------------
# For every residence/appliance pair: read the .dat file, aggregate to
# 1-minute sums, zero everything below the 90th percentile, aggregate to
# 15-minute sums, save the CSV and record the covered date range.
# Pairs without an input file are reported and skipped.
for residence in residences:
    for appliance in appliances:
        filename = f"UKDALE_{residence}_{appliance}.dat"
        filepath = os.path.join(input_dir, filename)

        if not os.path.exists(filepath):
            print(f"❌ File not found: {filepath}")
            continue

        # Load 2-column whitespace-delimited (timestamp, active_power)
        # header=None tells pandas the file has no header row, so the first
        # line is read as data and the column names come from `names`.
        df = pd.read_csv(
            filepath,
            sep=r"\s+",
            header=None,
            names=["timestamp", "active_power"],
            dtype={"timestamp": "int64", "active_power": "float64"},
            engine="python",
        )

        # Convert UNIX seconds to datetime index; sort & combine any duplicate timestamps
        df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s")
        df = df.sort_values("timestamp").set_index("timestamp")
        # If duplicate timestamps exist, sum them before resampling
        df = df.groupby(level=0)["active_power"].sum().to_frame()

        # ----------------------------
        # For each appliance in the residence, determine the 15 minute active power
        # ----------------------------
        # 1) Resample to 1-minute sums
        df_1min = df.resample("1min").sum()

        # 2) Zero the bottom 90th percentile (keep top 10% by value)
        # Values exactly equal to the threshold are kept (strict "<" below).
        threshold = df_1min["active_power"].quantile(0.9)
        df_1min.loc[df_1min["active_power"] < threshold, "active_power"] = 0.0

        # 3) Resample to 15-minute sums
        df_15min = df_1min.resample("15min").sum()

        # Save to CSV with timestamp as first column
        outname = f"UKDALE_{residence}_{appliance}_15minutes.csv"
        outpath = os.path.join(output_dir, outname)
        df_15min.to_csv(outpath, index_label="timestamp")

        # Update the overall residence date range
        _update_range(residence, df_15min.index)

        print(f"✅ Saved: {outpath}  |  Rows: {len(df_15min)}")

# ----------------------------
# Print overall date ranges per residence
# ----------------------------
print("\n=== Date Ranges by Residence (across all three appliances) ===")
for residence in residences:
    first = residence_ranges[residence]["first"]
    last = residence_ranges[residence]["last"]
    if first is None or last is None:
        # Nothing was recorded for this residence (every input file was missing).
        print(f"{residence}: First Date = N/A, Last Date = N/A (no files found)")
    else:
        # Printed using the default string form of the stored timestamps
        print(f"{residence}: First Date = {first}, Last Date = {last}")


In [None]:
#### AMPds2 Dataset - Create Dataset and combine for 15 minutes ####
#
# AMPds2 dataset: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/FIE0S4
#
# The dataset is from : /content/drive/MyDrive/Paper02_14Datasets/AMPds2_ORIGINALS
# residence is: "House01"
# appliance is: "Fridge", "WashingMachine", "Dishwasher"
# The files are: AMPds2_{residence}_{appliance}.csv
#
# For these files, the columns to use are "unix_ts" (the timestamp) and "P" (the power reading)
#
# Change the column names to: "unix_ts" is "timestamp" and "P" is "active_power"
#
# Aggregate "active_power" for every 1 minute.
# Get the distribution of this "active_power" and zero the bottom 90th percentile.
# Next, please add the "active_power" for every 15 minutes.
#
# Save only "timestamp" and "active_power" to: /content/drive/MyDrive/Paper02_14Datasets/ORIGINALS_15MINUTES/AMPds2_{residence}_{appliance}_15minutes.csv
#
# For each {residence}: please print the date range: "First Date" and the "Last Date"
#

import pandas as pd
import glob
import os

# ----------------------------
# Paths & Config
# ----------------------------
# Folder holding the AMPds2_{residence}_{appliance}.csv inputs, and the shared
# folder for the 15-minute aggregates (created here if missing).
input_dir = "/content/drive/MyDrive/Paper02_14Datasets/AMPds2_ORIGINALS"
output_dir = "/content/drive/MyDrive/Paper02_14Datasets/ORIGINALS_15MINUTES"
os.makedirs(output_dir, exist_ok=True)

residence = "House01"
appliances = ["Fridge", "WashingMachine", "Dishwasher"]

# ----------------------------
# Loop each appliance
#
# A missing input file or a file without the expected columns is reported and
# skipped, in the same way the REFIT and UKDALE sections handle missing
# inputs, so one absent appliance does not abort the remaining ones.
# ----------------------------
for appliance in appliances:
    file_path = f"{input_dir}/AMPds2_{residence}_{appliance}.csv"

    if not os.path.exists(file_path):
        print(f"[WARN] Missing file: {file_path}")
        continue

    # --- Load data ---
    df = pd.read_csv(file_path)

    missing_cols = [c for c in ("unix_ts", "P") if c not in df.columns]
    if missing_cols:
        print(f"[ERROR] {file_path}: missing column(s) {missing_cols}. Skipping.")
        continue

    # Use only unix_ts and P, rename columns
    df = df[["unix_ts", "P"]].rename(columns={"unix_ts": "timestamp", "P": "active_power"})

    # Convert timestamp (UNIX seconds) to datetime and use it as a sorted index
    df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s")
    df = df.set_index("timestamp").sort_index()

    # ----------------------------
    # For each appliance in the residence, determine the 15 minute active power
    # ----------------------------
    # --- Step 1: resample to 1-minute sums ---
    df_1min = df.resample("1min").sum()

    # --- Step 2: zero out bottom 90th percentile of active_power distribution ---
    # Values exactly equal to the threshold are kept (strict "<" below).
    thresh = df_1min["active_power"].quantile(0.90)
    df_1min.loc[df_1min["active_power"] < thresh, "active_power"] = 0

    # --- Step 3: resample to 15-minute sums ---
    df_15min = df_1min.resample("15min").sum()

    # --- Save only timestamp and active_power ---
    out_path = f"{output_dir}/AMPds2_{residence}_{appliance}_15minutes.csv"
    df_15min.reset_index()[["timestamp", "active_power"]].to_csv(out_path, index=False)

    # --- Print date range (one line per appliance) ---
    first_date = df_15min.index.min()
    last_date = df_15min.index.max()
    print(f"{residence} - {appliance}: First Date = {first_date}, Last Date = {last_date}")


In [None]:
#### GREEND Dataset - Create Dataset and combine for 15 minutes ####
#
# GREEND dataset: https://www.kaggle.com/datasets/p111110/greend-energy-dataset
#
# The files are: /content/drive/MyDrive/Paper02_14Datasets/GREEND_ORIGINALS/{residence}/dataset_{year}-{month}-{day}.csv
# residence is: "House00", "House01", "House03"
# appliance is: "Fridge", "WashingMachine", "Dishwasher"
# year is from "2014", "2015"
# month is from "01" to "12"
# day is from "01" to "31"
#
# merge these files together in order of dates ({year}-{month}-{day})
#
# For "House00":
# keep only the columns: "timestamp", "000D6F0002907C89" renamed as "Fridge", "000D6F0002907BC8" renamed as "WashingMachine", "000D6F0002908150", renamed as "Dishwasher"
#
# For "House01":
# keep only the columns: "timestamp", "000D6F00036BB04C" renamed as "Fridge", "000D6F0003562C48" renamed as "WashingMachine", "000D6F00029C2BD7", renamed as "Dishwasher"
#
# For "House03":
# keep only the columns: "timestamp", "000D6F000356174D" renamed as "Fridge", "000D6F0003561FFD" renamed as "WashingMachine", "000D6F0003561747", renamed as "Dishwasher"
#
# create "/content/drive/MyDrive/Paper02_14Datasets/GREEND_ORIGINALS/GREEND_House01_Fridge.csv"
# with only "timestamp" and "Fridge"
# rename "Fridge" to "active_power"
#
# create "/content/drive/MyDrive/Paper02_14Datasets/GREEND_ORIGINALS/GREEND_House01_WashingMachine.csv"
# with only "timestamp" and "WashingMachine"
# rename "WashingMachine" to "active_power"
#
# create "/content/drive/MyDrive/Paper02_14Datasets/GREEND_ORIGINALS/GREEND_House01_Dishwasher.csv"
# with only "timestamp" and "Dishwasher"
# rename "Dishwasher" to "active_power"
#
# Now, for each of these file: "/content/drive/MyDrive/Paper02_14Datasets/GREEND_ORIGINALS/GREEND_{residence}_{appliance}.csv"
#
# Aggregate "active_power" for every 1 minute.
# Get the distribution of this "active_power" and zero the bottom 90th percentile.
# Next, please add the "active_power" for every 15 minutes.
#
# Save only "timestamp" and "active_power" to: /content/drive/MyDrive/Paper02_14Datasets/ORIGINALS_15MINUTES/GREEND_{residence}_{appliance}_15minutes.csv
#
# For each {residence}: please print the date range: "First Date" and the "Last Date"
#

import os, glob
import pandas as pd

# ----------------------------
# Paths & Config
# ----------------------------
# Root of the GREEND inputs: one sub-folder per residence holding the daily
# dataset_*.csv files. The per-appliance extracts are also written here.
# NOTE(review): BASE is also assigned in the REFIT section of this file;
# running this section rebinds that module-level name.
BASE = "/content/drive/MyDrive/Paper02_14Datasets/GREEND_ORIGINALS"
# Shared folder for the 15-minute aggregates; created here if missing.
OUT15 = "/content/drive/MyDrive/Paper02_14Datasets/ORIGINALS_15MINUTES"
os.makedirs(OUT15, exist_ok=True)

# Per residence: plug ID (column name in the daily CSVs) → appliance name.
ID_MAPS = {
    "House00": {"000D6F0002907C89": "Fridge", "000D6F0002907BC8": "WashingMachine", "000D6F0002908150": "Dishwasher"},
    "House01": {"000D6F00036BB04C": "Fridge", "000D6F0003562C48": "WashingMachine", "000D6F00029C2BD7": "Dishwasher"},
    "House03": {"000D6F000356174D": "Fridge", "000D6F0003561FFD": "WashingMachine", "000D6F0003561747": "Dishwasher"},
}
# Residences to process; each must have an entry in ID_MAPS.
RESIDENCES = ["House00", "House01", "House03"]

# ----------------------------
# Timestamp parsing: try UNIX seconds, then milliseconds, then free-form date strings
# ----------------------------
def parse_timestamp(s: pd.Series) -> pd.Series:
    """Convert a timestamp column to datetimes, trying several encodings.

    The values are first coerced to numbers and read as UNIX seconds; if that
    yields nothing but ``NaT`` they are read as UNIX milliseconds; if that
    also yields nothing, the original values are parsed as date strings.

    Args:
        s: Raw timestamp column (numbers or strings such as "1386374400").

    Returns:
        The parsed datetimes, with ``NaT`` for values that could not be read.
    """
    # Numeric cast first: avoids a FutureWarning and accepts digit-only strings.
    as_number = pd.to_numeric(s, errors="coerce")

    for epoch_unit in ("s", "ms"):
        candidate = pd.to_datetime(as_number, unit=epoch_unit, errors="coerce")
        if not candidate.isna().all():
            return candidate

    # Neither epoch interpretation produced a value: treat input as date text.
    return pd.to_datetime(s, errors="coerce")

# ----------------------------
# Loads CSV flexibly, cleans headers, ignores broken rows.
# ----------------------------
def read_csv_relaxed(fp: str) -> pd.DataFrame | None:
    try:
        df = pd.read_csv(
            fp,
            sep=None, engine="python",      # autodetect delimiter
            on_bad_lines="skip",
            encoding="utf-8", encoding_errors="replace",
        )
        # normalize column names (strip spaces/quotes)
        df.columns = df.columns.astype(str).str.strip().str.replace('"', '', regex=False)
        return df
    except Exception as e:
        print(f"  ! Skipping file (cannot parse): {fp} ({e})")
        return None

# ----------------------------
# Loops through residences
#
# For each residence, it loads many daily CSV files and keeps only the needed columns.
# merges them into one clean time-ordered table,
# renames plug IDs to appliance names, and forces appliance values to be numeric.
# ----------------------------
# Per residence: merge all daily files, extract each appliance into its own
# "originals" CSV, then build and save the 15-minute aggregate.
for res in RESIDENCES:
    id_map = ID_MAPS[res]
    want_cols = ["timestamp"] + list(id_map.keys())

    # 1) Collect & sort daily files (YYYY-MM-DD order via filename)
    files = sorted(glob.glob(f"{BASE}/{res}/dataset_*.csv"))
    if not files:
        print(f"[{res}] No files found.")
        continue

    merged = []
    for fp in files:
        df = read_csv_relaxed(fp)
        if df is None:
            # Unreadable file: read_csv_relaxed has already printed the reason.
            continue

        # keep only needed columns that exist
        keep = [c for c in want_cols if c in df.columns]
        if "timestamp" not in keep:
            # try to find timestamp by case-insensitive match
            maybe_ts = [c for c in df.columns if c.lower().strip() == "timestamp"]
            if maybe_ts:
                df = df.rename(columns={maybe_ts[0]: "timestamp"})
                keep = ["timestamp"] + [c for c in id_map.keys() if c in df.columns]
            else:
                # No timestamp column at all: the file is skipped without a message.
                continue

        df = df[keep]
        merged.append(df)

    if not merged:
        print(f"[{res}] No usable rows.")
        continue

    # Plug columns absent from some daily files become NaN in the merged frame.
    df = pd.concat(merged, ignore_index=True)

    # 2) Parse timestamp, sort
    df["timestamp"] = parse_timestamp(df["timestamp"])
    df = df.dropna(subset=["timestamp"]).sort_values("timestamp").reset_index(drop=True)

    # 3) Rename plug IDs → appliance names
    rename_map = {k: v for k, v in id_map.items() if k in df.columns}
    df = df.rename(columns=rename_map)

    # Force appliance columns to numeric
    # (thousands separators "," are removed first; anything still
    # non-numeric becomes NaN)
    for col in rename_map.values():
        if col in df.columns:
            df[col] = df[col].astype(str).str.replace(",", "", regex=False).str.strip()
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # ----------------------------
    # For each appliance in the residence, determine the 15 minute active power
    # ----------------------------
    for appliance in rename_map.values():
        if appliance not in df.columns:
            continue

        # (1) Write originals
        out_orig = f"{BASE}/GREEND_{res}_{appliance}.csv"
        df_orig = df[["timestamp", appliance]].rename(columns={appliance: "active_power"})
        df_orig.to_csv(out_orig, index=False)

        # (2) 1-min sum
        # min_count=1 leaves minutes without any valid reading as NaN, so they
        # do not take part in the percentile computed in step (3).
        ser = df_orig.set_index("timestamp")["active_power"]
        ser = pd.to_numeric(ser, errors="coerce")
        s_1m = ser.resample("1min").sum(min_count=1)

        # (3) Zero bottom 90% (keep top 10%)
        # The threshold falls back to 0.0 when there is no valid data.
        if s_1m.notna().any():
            thr = s_1m.quantile(0.9)
            if pd.isna(thr):
                thr = 0.0
        else:
            thr = 0.0
        # Empty minutes are set to 0 only after the threshold has been computed.
        s_1m = s_1m.fillna(0)
        s_1m = s_1m.where(s_1m >= float(thr), 0.0)

        # (4) 15-min sum and save
        s_15m = s_1m.resample("15min").sum().reset_index()
        s_15m.columns = ["timestamp", "active_power"]

        out_15 = f"{OUT15}/GREEND_{res}_{appliance}_15minutes.csv"
        s_15m.to_csv(out_15, index=False)

    # 5) Print date range per residence
    # (taken from the merged raw timestamps, not from the 15-minute output)
    first, last = df["timestamp"].min(), df["timestamp"].max()
    print(f"{res}: First Date = {first}, Last Date = {last}")



In [None]:
#### CENTROIDS  - Average Power Consumption for appliances within each dataset-house combination ####
#
# Calculates the average active power consumption for specified appliances
# within each provided dataset_house combination, and saves the results
# to new CSV files.
#
# The appliances are: Fridge, WashingMachine, Dishwasher, and Nothing
#
#

import pandas as pd
import os

def calculate_appliance_centroids(
    combinations,
    appliances,
    input_dir="/content/drive/MyDrive/Paper02_14Datasets/ORIGINALS_15MINUTES",
    output_dir="/content/drive/MyDrive/Paper02_14Datasets/CENTROIDS",
):
    """Write one centroid CSV per (dataset, house) combination.

    For every combination, the mean of the ``active_power`` column of each
    ``{dataset}_{house}_{appliance}_15minutes.csv`` file found in
    ``input_dir`` is collected. A ``'Nothing'`` row with power 0.0 is then
    appended, and the rows are saved to
    ``{output_dir}/{dataset}_{house}_centroids.csv`` with the columns
    ``combination`` and ``active_power``.

    Missing, empty or unreadable appliance files are reported and skipped.
    When a combination has no usable appliance file at all, no centroid file
    is written for it.

    Args:
        combinations: Iterable of ``(dataset, house)`` pairs.
        appliances: Iterable of appliance names to look up per pair.
        input_dir: Folder holding the 15-minute appliance CSVs.
        output_dir: Folder for the centroid CSVs; created when missing.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    print("Starting calculation of appliance centroids for specified combinations...")

    for dataset, house in combinations:
        print(f"\nProcessing {dataset} - {house}...")
        # One {'combination', 'active_power'} row per appliance with usable data
        centroids_data = []

        for appliance in appliances:
            input_filename = os.path.join(
                input_dir, f"{dataset}_{house}_{appliance}_15minutes.csv"
            )

            if not os.path.exists(input_filename):
                print(f"  File not found: {input_filename}. Skipping.")
                continue

            print(f"  Reading {input_filename}...")
            try:
                df = pd.read_csv(input_filename)

                if 'active_power' not in df.columns:
                    print(f"    Warning: 'active_power' column not found in {input_filename}. Skipping.")
                    continue

                average_active_power = df['active_power'].mean()
                print(f"    Average active_power for {appliance}: {average_active_power:.2f}")
                centroids_data.append({
                    'combination': appliance,
                    'active_power': average_active_power
                })
            except pd.errors.EmptyDataError:
                print(f"    Warning: {input_filename} is empty. Skipping.")
            except Exception as e:
                # Best-effort: one bad appliance file must not stop the others.
                print(f"    Error reading {input_filename}: {e}. Skipping.")

        # Decide BEFORE adding the constant 'Nothing' row; otherwise the list
        # is never empty and this "no data" case could never be detected.
        if not centroids_data:
            print(f"No valid appliance data found for {dataset} - {house}. No centroid file created.")
            continue

        # --- Add 'Nothing' entry with active_power of 0 ---
        centroids_data.append({
            'combination': 'Nothing',
            'active_power': 0.0
        })
        print("  Added 'Nothing' entry with active_power: 0.0")

        # The output folder is not created anywhere else; make sure it exists.
        os.makedirs(output_dir, exist_ok=True)
        output_filename = os.path.join(output_dir, f"{dataset}_{house}_centroids.csv")
        pd.DataFrame(centroids_data).to_csv(output_filename, index=False)
        print(f"Successfully created {output_filename}")

    print("\nCentroid calculation complete.")

if __name__ == "__main__":
    # --- Configuration ---
    # Houses to process, grouped by dataset. The flattened list below keeps
    # this order: datasets top to bottom, houses left to right.
    _houses_by_dataset = {
        "REFIT": ["House01", "House02", "House03", "House05", "House07", "House09", "House15"],
        "UKDALE": ["House01", "House02", "House05"],
        "AMPds2": ["House01"],
        "GREEND": ["House00", "House01", "House03"],
    }
    VALID_COMBINATIONS = [
        (dataset_name, house_name)
        for dataset_name, house_names in _houses_by_dataset.items()
        for house_name in house_names
    ]
    APPLIANCES = ["Fridge", "WashingMachine", "Dishwasher"]

    # Compute and save the centroid files for every combination
    calculate_appliance_centroids(VALID_COMBINATIONS, APPLIANCES)



In [None]:
#### ANOMALIES - Create anomalies (7 types for 7 days) plus ground_truth_anomaly ####
#
#
# Anomalies are: StepChange, MultiStepChange, Mirror, Repeating, StuckMAX, StuckMIN, PowerCycling
# Use percentiles for values
#

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

# ----------------------------
# Paths & Config
# ----------------------------
# Folder of the 15-minute input files, plus two sub-folders of it for the
# anomalous data and for plots. All three paths end with a trailing slash.
BASE_INPUT_DIR = "/content/drive/MyDrive/Paper02_14Datasets/ORIGINALS_15MINUTES/"
BASE_OUTPUT_ANOMALOUS_DIR = "/content/drive/MyDrive/Paper02_14Datasets/ORIGINALS_15MINUTES/anomalous_data/"
# NOTE(review): the original note here said plots are displayed but not saved
# in this folder; the folder is nevertheless created below.
BASE_OUTPUT_PLOTS_DIR = "/content/drive/MyDrive/Paper02_14Datasets/ORIGINALS_15MINUTES/anomaly_plots/"

# Create both output folders up front (no error if they already exist).
os.makedirs(BASE_OUTPUT_ANOMALOUS_DIR, exist_ok=True)
os.makedirs(BASE_OUTPUT_PLOTS_DIR, exist_ok=True)

datasets = ["REFIT", "UKDALE", "AMPds2", "GREEND"]

# Houses per dataset.
# NOTE(review): "refitt_houses" (double "t") holds the REFIT houses; the
# spelling is left unchanged because other code may refer to this name.
refitt_houses = ["House01", "House02", "House03", "House05", "House07", "House09", "House15"]
ukdale_houses = ["House01", "House02", "House05"]
ampds2_houses = ["House01"]
greend_houses = ["House00", "House01", "House03"]

appliances = ["Fridge", "WashingMachine", "Dishwasher"]

# The seven anomaly kinds (get_anomaly_dates returns seven dates per house).
anomaly_types = [
    "StepChange",
    "MultiStepChange",
    "Mirror",
    "Repeating",
    "StuckMAX",
    "StuckMIN",
    "PowerCycling"
]

# Anomaly days per (dataset, house), as ISO date strings, seven per entry.
# The REFIT House01 entry was originally annotated Monday through Sunday,
# in that order.
_ANOMALY_DATE_STRINGS = {
    ("REFIT", "House01"): (
        "2015-03-09", "2015-03-24", "2015-04-15", "2015-05-07",
        "2015-05-29", "2015-06-20", "2015-07-05",
    ),
    ("REFIT", "House02"): (
        "2015-01-26", "2015-02-17", "2015-03-04", "2015-03-26",
        "2015-04-17", "2015-05-09", "2015-05-24",
    ),
    ("REFIT", "House03"): (
        "2015-02-02", "2015-02-17", "2015-03-11", "2015-04-02",
        "2015-04-24", "2015-05-16", "2015-05-31",
    ),
    ("REFIT", "House05"): (
        "2015-03-02", "2015-03-24", "2015-04-08", "2015-04-30",
        "2015-05-22", "2015-06-13", "2015-07-05",
    ),
    ("REFIT", "House07"): (
        "2015-03-09", "2015-03-31", "2015-04-15", "2015-05-07",
        "2015-05-29", "2015-06-20", "2015-07-05",
    ),
    ("REFIT", "House09"): (
        "2015-03-23", "2015-04-07", "2015-04-22", "2015-05-14",
        "2015-05-29", "2015-06-20", "2015-07-05",
    ),
    ("REFIT", "House15"): (
        "2015-03-16", "2015-04-07", "2015-04-22", "2015-05-14",
        "2015-05-29", "2015-06-20", "2015-07-05",
    ),
    ("UKDALE", "House01"): (
        "2016-06-06", "2016-07-26", "2016-09-21", "2016-11-17",
        "2017-01-06", "2017-03-04", "2017-04-23",
    ),
    ("UKDALE", "House02"): (
        "2013-09-16", "2013-09-17", "2013-09-18", "2013-09-26",
        "2013-10-04", "2013-10-05", "2013-10-06",
    ),
    ("UKDALE", "House05"): (
        "2014-10-20", "2014-10-21", "2014-10-29", "2014-10-30",
        "2014-11-07", "2014-11-08", "2014-11-09",
    ),
    ("AMPds2", "House01"): (
        "2013-11-11", "2013-12-03", "2013-12-25", "2014-01-16",
        "2014-02-14", "2014-03-08", "2014-03-30",
    ),
    ("GREEND", "House00"): (
        "2014-08-18", "2014-08-26", "2014-09-03", "2014-09-11",
        "2014-09-26", "2014-10-04", "2014-10-12",
    ),
    ("GREEND", "House01"): (
        "2014-09-15", "2014-09-23", "2014-10-01", "2014-10-09",
        "2014-10-17", "2014-10-25", "2014-10-26",
    ),
    ("GREEND", "House03"): (
        "2014-09-22", "2014-09-23", "2014-10-01", "2014-10-09",
        "2014-10-17", "2014-10-25", "2014-10-26",
    ),
}


def get_anomaly_dates(dataset, house):
    """Return the anomaly dates configured for one dataset/house pair.

    Args:
        dataset: Dataset name, e.g. "REFIT" (matched exactly, case-sensitive).
        house: House name, e.g. "House01" (matched exactly, case-sensitive).

    Returns:
        A new list of ``pd.Timestamp`` values in the configured order, or an
        empty list when the pair has no configured dates.
    """
    date_strings = _ANOMALY_DATE_STRINGS.get((dataset, house), ())
    return [pd.to_datetime(day) for day in date_strings]

# ----------------------------
# Anomaly Generation Helpers
# ----------------------------
# Given a list, determine the value for a specified percentile.
def get_percentile_value(ap_list, percentile):
    """Pick the element of a list that sits at a given percentile rank.

    The rank is ``int(len(ap_list) * percentile / 100)``, capped at the last
    position so that ``percentile=100`` yields the final element. The list is
    not sorted here; it is read in the order it is given.

    Args:
        ap_list: List of values to pick from.
        percentile: Percentile rank, e.g. 15 or 98.

    Returns:
        The selected element, or 0 when ``ap_list`` is empty.
    """
    if not ap_list:
        return 0
    size = len(ap_list)
    rank = int(size * percentile / 100)
    if rank >= size:
        # A rank past the end (e.g. percentile=100) maps to the last element.
        rank = size - 1
    return ap_list[rank]

# Determine if a day is flat by standard deviation of active_power being less than a threshold
# A standard deviation less than 1 means the values are tightly clustered around the average, with very little variation from it.
def is_day_flat(df_day_segment, threshold=1.0):
    """Tell whether a day's 'active_power' shows (almost) no variation.

    Args:
        df_day_segment: DataFrame with an 'active_power' column.
        threshold: Standard-deviation limit below which the day counts as flat.

    Returns:
        True when the frame is empty or has fewer than two rows; otherwise
        whether the standard deviation (as computed by ``np.std``) of
        'active_power' is strictly below ``threshold``.
    """
    too_short_to_vary = df_day_segment.empty or len(df_day_segment) < 2
    if too_short_to_vary:
        return True
    spread = np.std(df_day_segment['active_power'].values)
    return spread < threshold

# Searches backwards from given day, for the nearest non-flat day.
def find_non_flat_day_data(df_full, current_anomaly_date, threshold=1.0):
    """Walk backwards day by day to the nearest earlier day that is not flat.

    The search starts at the day before ``current_anomaly_date`` and stops
    once it would go past the first day present in ``df_full``.

    Args:
        df_full: DataFrame with 'timestamp' and 'active_power' columns.
        current_anomaly_date: Timestamp of the day that needs a pattern.
        threshold: Flatness threshold forwarded to ``is_day_flat``.

    Returns:
        A DataFrame indexed by timestamp holding the 'active_power' of the
        first non-flat day found, or an empty DataFrame with that column when
        no such day exists.
    """
    by_time = df_full.copy().set_index('timestamp').sort_index()
    one_day = pd.Timedelta(days=1)
    earliest_day = by_time.index.min().floor('D')

    probe = current_anomaly_date - one_day
    while probe >= earliest_day:
        day_start = probe.floor('D')
        day_end = day_start + one_day - pd.Timedelta(seconds=1)
        candidate = by_time.loc[day_start:day_end].copy()

        # Days without any rows are skipped without consulting is_day_flat.
        has_rows = not candidate.empty
        if has_rows and not is_day_flat(candidate, threshold):
            print(f"Found non-flat day for pattern: {day_start.date()}")
            return candidate[['active_power']]

        probe -= one_day

    print(f"Warning: No non-flat day found in historical data for date {current_anomaly_date.date()} for this appliance.")
    return pd.DataFrame(columns=['active_power'], index=pd.to_datetime([]))

# ----------------------------
# Anomaly Definitions
# ----------------------------
# StepChange: first half 15th percentile, second half 98th percentile
def apply_step_change(df_day, ap_list):
    """Build a StepChange anomaly: low level for the first half, high for the rest.

    Args:
        df_day: One day of data with an 'active_power' column.
        ap_list: Power levels from which the 15th and 98th percentile values
            are taken via ``get_percentile_value``.

    Returns:
        A modified copy of ``df_day``; an unmodified copy when ``ap_list`` is
        empty.
    """
    result = df_day.copy()
    if len(ap_list) == 0:
        print("Warning: active_power_list is empty. Cannot apply step change.")
        return result

    low_level = get_percentile_value(ap_list, 15)
    high_level = get_percentile_value(ap_list, 98)

    power_col = result.columns.get_loc('active_power')
    split_at = len(result) // 2  # with an odd row count the second half is longer
    result.iloc[:split_at, power_col] = low_level
    result.iloc[split_at:, power_col] = high_level
    return result

# MultiStepChange: first third 15th, middle third 75th, last third 98th percentile
def apply_multi_step_change(df_day, ap_list):
    """Build a MultiStepChange anomaly with three rising plateaus.

    The first third of the rows is set to the 15th-percentile value, the
    middle third to the 75th and the last third to the 98th, each taken from
    ``ap_list`` via ``get_percentile_value``.

    Args:
        df_day: One day of data with an 'active_power' column.
        ap_list: Power levels used for the percentile lookups.

    Returns:
        A modified copy of ``df_day``; an unmodified copy when ``ap_list`` is
        empty.
    """
    result = df_day.copy()
    if len(ap_list) == 0:
        print("Warning: active_power_list is empty. Cannot apply multi-step change.")
        return result

    low_level = get_percentile_value(ap_list, 15)
    mid_level = get_percentile_value(ap_list, 75)
    high_level = get_percentile_value(ap_list, 98)

    n_rows = len(result)
    first_cut = n_rows // 3
    second_cut = 2 * n_rows // 3
    power_col = result.columns.get_loc('active_power')

    plateaus = (
        (slice(None, first_cut), low_level),
        (slice(first_cut, second_cut), mid_level),
        (slice(second_cut, None), high_level),
    )
    for rows, level in plateaus:
        result.iloc[rows, power_col] = level
    return result

# Mirror:
# Flips active power horizontally.
# If current day is flat, uses provided non-flat source data (tiled if needed).
# Ensures minimum value >= 15th percentile.
def apply_mirror(df_day, ap_list, df_source_for_pattern):
    """Build a Mirror anomaly: the day's power profile reversed in time.

    When the day itself is flat (per ``is_day_flat``) and a non-empty
    reference frame is supplied, the reference values are mirrored instead,
    truncated or cyclically repeated to the length of ``df_day``. Values below
    the 15th-percentile level of ``ap_list`` are then raised to that level.

    Args:
        df_day: One day of data with an 'active_power' column. The log
            messages call ``.date()`` on the first index label, so the index
            is expected to hold timestamps.
        ap_list: Power levels used for the percentile floor; the floor is
            skipped when this is empty.
        df_source_for_pattern: DataFrame with an 'active_power' column taken
            from a non-flat day, or an empty DataFrame.

    Returns:
        A modified copy of ``df_day``. Neither input frame is modified.
    """
    df_anomalous = df_day.copy()
    if df_anomalous.empty:
        # Nothing to mirror, and df_day.index[0] below would raise IndexError.
        return df_anomalous

    day_label = df_day.index[0].date()
    n_rows = len(df_anomalous)
    day_is_flat = is_day_flat(df_anomalous)

    source_power_values = df_anomalous['active_power'].to_numpy()
    if day_is_flat and not df_source_for_pattern.empty:
        print(f"Using non-flat source data for Mirror anomaly on {day_label}")
        # np.resize keeps the first n_rows values of a longer pattern and
        # cycles a shorter one until n_rows values are available.
        source_power_values = np.resize(
            df_source_for_pattern['active_power'].to_numpy(), n_rows
        )
    elif day_is_flat:
        print(f"Mirror anomaly on {day_label}: Current day is flat and no non-flat historical data found. Applying horizontal flip to current flat data.")
    else:
        print(f"Applying Mirror anomaly to non-flat current day data on {day_label}")

    # Reverse into a fresh array. A reversed slice is only a view, so flooring
    # it in place could write through to the frame the values came from (and
    # fails outright when pandas hands out read-only arrays).
    mirrored_values = source_power_values[::-1].copy()

    if ap_list is not None and len(ap_list) > 0:
        percentile_15 = get_percentile_value(ap_list, 15)
        mirrored_values = np.maximum(mirrored_values, percentile_15)
        print(f"Ensured all mirrored values are at least {percentile_15:.2f} (15th percentile).")

    df_anomalous.loc[:, 'active_power'] = mirrored_values
    return df_anomalous

# Repeating:
# Finds a 'bump' (max avg 2-hour window) and repeats it 6 times.
# If no bump available, uses source or simple [15th, 98th] pattern.
# Ensures minimum value >= 15th percentile.
def apply_repeating(df_day, ap_list, df_source_for_pattern):
    """Build a Repeating anomaly: one high-power "bump" repeated across the day.

    The day is divided into 6 equal segments. The bump is the contiguous
    window (at most 8 rows, and never longer than a segment) with the highest
    mean power. It is cycled to fill one segment, and that segment is cycled
    to fill the whole day.

    Where the bump comes from:
      * a non-flat day: the day itself;
      * a flat day with a non-empty ``df_source_for_pattern``: that source;
      * a flat day without a source: the simple [15th, 98th] percentile
        on/off pattern.

    Finally every value below the 15th-percentile level of ``ap_list`` is
    raised to that level.

    Args:
        df_day: One day of data with an 'active_power' column. The log
            messages call ``.date()`` on the first index label, so the index
            is expected to hold timestamps.
        ap_list: Power levels used for the percentile lookups.
        df_source_for_pattern: DataFrame with an 'active_power' column taken
            from a non-flat day, or an empty DataFrame.

    Returns:
        A modified copy of ``df_day``; an unmodified copy when it is empty.
    """
    # Work on a copy so the caller's data is not modified
    df_anomalous = df_day.copy()

    # Guard: empty day -> nothing to modify
    if len(df_anomalous) == 0:
        print("Warning: df_anomalous is empty. Cannot apply repeating anomaly.")
        return df_anomalous

    total_intervals = len(df_anomalous)
    num_repetitions = 6

    # Two-level low/high pattern used whenever no real bump is available
    simple_pattern = np.array([
        get_percentile_value(ap_list, 15),
        get_percentile_value(ap_list, 98),
    ])

    segment_length = total_intervals // num_repetitions
    if segment_length == 0:
        # Fewer rows than repetitions: alternate low/high row by row instead
        print("Warning: Segment length is 0. Cannot repeat the bump. Day too short or too many repetitions.")
        df_anomalous.loc[:, 'active_power'] = simple_pattern[np.arange(total_intervals) % 2]
        return df_anomalous

    # The bump spans at most 8 rows and never more than one segment
    bump_duration_intervals = min(8, segment_length)

    # Decide which signal to search for the bump
    day_is_flat = is_day_flat(df_anomalous)
    if day_is_flat and not df_source_for_pattern.empty:
        print(f"Using non-flat source data for Repeating anomaly on {df_day.index[0].date()}")
        bump_source = df_source_for_pattern['active_power']
    elif day_is_flat:
        print(f"Repeating anomaly on {df_day.index[0].date()}: Current day is flat and no non-flat historical data found. Falling back to simple pattern.")
        # Repeating a flat signal would leave the day unchanged, so do not
        # search it; the on/off fallback below takes over.
        bump_source = None
    else:
        print(f"Applying Repeating anomaly to non-flat current day data on {df_day.index[0].date()}")
        bump_source = df_anomalous['active_power']

    if bump_source is None:
        bump_pattern = np.array([])
    else:
        bump_pattern = _find_bump_pattern(bump_source, bump_duration_intervals)

    if len(bump_pattern) == 0:
        print("Warning: No suitable bump pattern found (even with fallback). Using simple on-off pattern.")
        bump_pattern = simple_pattern

    # One repetition block: the bump cycled until a whole segment is filled
    repetition_block = np.resize(bump_pattern, segment_length).astype(float)

    # Enforce the minimum power floor
    if ap_list is not None and len(ap_list) > 0:
        percentile_15 = get_percentile_value(ap_list, 15)
        repetition_block = np.maximum(repetition_block, percentile_15)
        print(f"Ensured all values in repetition block are at least {percentile_15:.2f} (15th percentile).")

    # Cycle the block across the whole day. np.resize always returns exactly
    # total_intervals values, including when the leftover rows after 6 full
    # blocks outnumber one block (e.g. an 8-row day with 1-row segments).
    df_anomalous.loc[:, 'active_power'] = np.resize(repetition_block, total_intervals)

    return df_anomalous


def _find_bump_pattern(power, window):
    """Return the ``window``-row stretch of ``power`` with the highest mean.

    Ties go to the earliest stretch. A series shorter than ``window`` is
    returned whole. When no stretch has a mean above -1, the first one is
    returned.
    """
    if len(power) < window:
        return power.values

    best_start = 0
    best_mean = -1
    for start in range(len(power) - window + 1):
        window_mean = power.iloc[start:start + window].mean()
        if window_mean > best_mean:
            best_mean = window_mean
            best_start = start
    return power.iloc[best_start:best_start + window].values

# StuckMAX: sets active power to 98th percentile
def apply_stuck_max(df_day, ap_list):
    """Build a StuckMAX anomaly: the whole day pinned at the 98th-percentile value.

    Args:
        df_day: One day of data with an 'active_power' column.
        ap_list: Power levels used for the percentile lookup.

    Returns:
        A modified copy of ``df_day``; an unmodified copy when ``ap_list`` is
        empty.
    """
    result = df_day.copy()
    if len(ap_list) != 0:
        result.loc[:, 'active_power'] = get_percentile_value(ap_list, 98)
    else:
        print("Warning: active_power_list is empty. Cannot apply stuck max.")
    return result

# StuckMIN: sets active power to 15th percentile.
def apply_stuck_min(df_day, ap_list):
    """Build a StuckMIN anomaly: the whole day pinned at the 15th-percentile value.

    Args:
        df_day: One day of data with an 'active_power' column.
        ap_list: Power levels used for the percentile lookup.

    Returns:
        A modified copy of ``df_day``; an unmodified copy when ``ap_list`` is
        empty.
    """
    result = df_day.copy()
    if len(ap_list) != 0:
        result.loc[:, 'active_power'] = get_percentile_value(ap_list, 15)
    else:
        print("Warning: active_power_list is empty. Cannot apply stuck min.")
    return result

# PowerCycling: alternates between 15th and 98th percentile 10 times.
def apply_power_cycling(df_day, ap_list):
    """Build a PowerCycling anomaly: the day alternates between low and high.

    The rows are cut into consecutive segments of
    ``round(len(df_day) / 20)`` rows (20 = 10 cycles x 2 levels). Segments
    alternate between the 15th- and 98th-percentile values of ``ap_list``,
    starting with the low level. Because the segment length is rounded, the
    last segment may be shorter than the others.

    Args:
        df_day: One day of data with an 'active_power' column.
        ap_list: Power levels used for the percentile lookups.

    Returns:
        A modified copy of ``df_day``; an unmodified copy when ``ap_list`` is
        empty or the day is too short to form a segment.
    """
    df_anomalous = df_day.copy()
    if len(ap_list) == 0:
        print("Warning: active_power_list is empty. Cannot apply power cycling.")
        return df_anomalous

    percentile_15 = get_percentile_value(ap_list, 15)
    percentile_98 = get_percentile_value(ap_list, 98)

    num_cycles = 10
    total_segments = num_cycles * 2  # one low and one high segment per cycle

    segment_intervals = round(len(df_anomalous) / total_segments)
    if segment_intervals == 0:
        print("Warning: Segment interval for power cycling is 0. Day too short for 10 cycles.")
        return df_anomalous

    # Even-numbered segments take the low level, odd-numbered the high level
    segment_number = np.arange(len(df_anomalous)) // segment_intervals
    cycled_values = np.where(segment_number % 2 == 0, percentile_15, percentile_98)

    power_col = df_anomalous.columns.get_loc('active_power')
    df_anomalous.iloc[:, power_col] = cycled_values
    return df_anomalous

# ----------------------------
# Main Loop
# Loop through datasets and houses within the datasets.
# ----------------------------
# Generators that only need the day's data and the appliance's power levels.
simple_anomaly_generators = {
    "StepChange": apply_step_change,
    "MultiStepChange": apply_multi_step_change,
    "StuckMAX": apply_stuck_max,
    "StuckMIN": apply_stuck_min,
    "PowerCycling": apply_power_cycling,
}

# Generators that additionally take a non-flat reference day as pattern source.
pattern_anomaly_generators = {
    "Mirror": apply_mirror,
    "Repeating": apply_repeating,
}

for dataset in datasets:
    # Determine houses based on dataset
    if dataset == "REFIT":
        current_houses = refitt_houses
    elif dataset == "UKDALE":
        current_houses = ukdale_houses
    elif dataset == "AMPds2":
        current_houses = ampds2_houses
    elif dataset == "GREEND":
        current_houses = greend_houses
    else:
        continue  # Skip if dataset not defined

    for house in current_houses:
        for appliance in appliances:
            file_path = os.path.join(BASE_INPUT_DIR, f"{dataset}_{house}_{appliance}_15minutes.csv")

            if not os.path.exists(file_path):
                print(f"File not found: {file_path}. Skipping.")
                continue

            print(f"Processing {dataset}, {house}, {appliance}...")
            df_original_full = pd.read_csv(file_path)
            df_original_full['timestamp'] = pd.to_datetime(df_original_full['timestamp'])
            # Non-numeric readings are coerced to NaN and then replaced by 0,
            # so the column is fully numeric from here on.
            df_original_full['active_power'] = pd.to_numeric(
                df_original_full['active_power'], errors='coerce'
            ).fillna(0)

            # Sorted distinct power levels; the generators read their
            # percentile values from this list.
            active_power_list = sorted(df_original_full['active_power'].unique().tolist())
            print(f"Unique active power values for {appliance} in {house} ({dataset}):")
            print(active_power_list)

            # Get anomaly dates for the current dataset and house
            anomaly_dates = get_anomaly_dates(dataset, house)

            if not anomaly_dates:
                print(f"No anomaly dates defined for {dataset} {house}. Skipping.")
                continue

            # One output file per anomaly type, each starting from the original data
            for anomaly_type in anomaly_types:
                df_modified_full = df_original_full.copy()
                df_modified_full['ground_truth_anomaly'] = "Normal"

                for anomaly_date in anomaly_dates:
                    # The anomaly covers the whole calendar day of anomaly_date
                    anomaly_day_start = anomaly_date.floor('D')
                    anomaly_day_end = anomaly_day_start + pd.Timedelta(days=1) - pd.Timedelta(seconds=1)

                    # df_modified_full is a copy of df_original_full with the
                    # same index and timestamps, so this one mask selects the
                    # day's rows in both frames.
                    day_mask = (
                        (df_original_full['timestamp'] >= anomaly_day_start) &
                        (df_original_full['timestamp'] <= anomaly_day_end)
                    )

                    # Original day slice (input to the generators and to the plot)
                    df_day_original = df_original_full[day_mask].copy()

                    if df_day_original.empty:
                        print(f"No data for anomaly date {anomaly_date.date()} in {dataset}_{house}_{appliance}. Skipping anomaly generation for this date for {anomaly_type}.")
                        continue

                    anomaly_indices = df_day_original.index

                    # The generators expect the day's power indexed by timestamp
                    active_power_day_segment = df_day_original[['timestamp', 'active_power']].set_index('timestamp')

                    if anomaly_type in pattern_anomaly_generators:
                        # Only these types need a reference day; the lookup
                        # copies and sorts the full frame, so skip it otherwise.
                        non_flat_source_data = find_non_flat_day_data(df_original_full, anomaly_date)
                        df_anomalous_day_slice = pattern_anomaly_generators[anomaly_type](
                            active_power_day_segment, active_power_list, non_flat_source_data
                        )
                    elif anomaly_type in simple_anomaly_generators:
                        df_anomalous_day_slice = simple_anomaly_generators[anomaly_type](
                            active_power_day_segment, active_power_list
                        )
                    else:
                        print(f"Anomaly type {anomaly_type} not implemented or failed for {anomaly_date.date()}.")
                        continue

                    # Write the anomalous day back and label its rows
                    df_modified_full.loc[anomaly_indices, 'active_power'] = df_anomalous_day_slice['active_power'].values
                    df_modified_full.loc[anomaly_indices, 'ground_truth_anomaly'] = "Anomaly"

                    # Plotting for display only
                    plt.figure(figsize=(15, 7))
                    plt.plot(df_day_original['timestamp'], df_day_original['active_power'],
                             label='Original Active Power', color='blue', alpha=0.7)
                    plt.plot(df_anomalous_day_slice.index, df_anomalous_day_slice['active_power'],
                             label=f'Anomalous Active Power ({anomaly_type})', color='red', linestyle='--', alpha=0.8)

                    plt.title(f"{dataset} {house} {appliance}: {anomaly_type} Anomaly on {anomaly_date.date()}")
                    plt.xlabel("Timestamp")
                    plt.ylabel("Active Power")
                    plt.legend()
                    plt.grid(True)
                    plt.tight_layout()
                    plt.show()
                    plt.close()
                    print(f"Displayed plot for {dataset}_{house}_{appliance}_{anomaly_type}_{anomaly_date.strftime('%Y-%m-%d')}.")

                # --- Save the data for this specific anomaly type ---
                output_anomalous_filename = f"{dataset}_{house}_{appliance}_15minutes_{anomaly_type}.csv"
                output_anomalous_filepath = os.path.join(BASE_OUTPUT_ANOMALOUS_DIR, output_anomalous_filename)

                df_modified_full[['timestamp', 'active_power', 'ground_truth_anomaly']].to_csv(
                    output_anomalous_filepath, index=False
                )
                print(f"Saved anomalous data for {anomaly_type}: {output_anomalous_filepath}")

print("Anomaly generation complete!")


In [None]:
#### MERGE - combines anomalies and normal data plus sets ground_truth_appliance ####
#
# Loops through specified dataset, house, appliance, and anomaly combinations.
# For each anomalous file, it merges the data with normal appliance data from
# other appliances in the same house. It then calculates the total active power
# and determines the 'ground_truth_appliance' string based on active appliances.
# The resulting merged data is saved to a new CSV file.
#
# Essentially, it uses the current anomaly file (with specified appliance)
# and adds the non-anomaly (other appliances).
#

# Imports
import pandas as pd
import os

def process_anomalous_files():
    """Merge each single-appliance anomaly file with the house's other appliances.

    For every dataset / house / appliance / anomaly-type combination whose
    anomalous CSV exists, the anomalous appliance's power is combined with the
    normal power of the remaining appliances of the same house:

    * ``timestamp`` and ``ground_truth_anomaly`` come from the anomalous file;
    * ``active_power`` is the sum over all appliances at that timestamp;
    * ``ground_truth_appliance`` joins, with '+', the appliances whose power
      is above zero at that timestamp, or is 'Nothing' when none is.

    An appliance whose normal file could not be loaded contributes zero power.
    A missing anomalous file is skipped; any other error is printed and the
    batch continues with the next combination. Results are written as
    ``*_MERGED.csv`` files into the MERGED directory.
    """
    # ----------------------------
    # Paths & Config
    # ----------------------------
    dataset_list = ["REFIT", "UKDALE", "AMPds2", "GREEND"]
    # NOTE(review): the REFIT notes at the top of the file list House08, which
    # is absent here (House07 and House15 are present instead) -- confirm.
    house_list = ["House00", "House01", "House02", "House03", "House05", "House07", "House09", "House15"]
    appliance_list = ["Fridge", "WashingMachine", "Dishwasher"]
    anomaly_list = ["StepChange", "MultiStepChange", "Mirror", "Repeating", "StuckMAX", "StuckMIN", "PowerCycling"]

    normal_dir = "/content/drive/MyDrive/Paper02_14Datasets/ORIGINALS_15MINUTES"
    anomalous_dir = f"{normal_dir}/anomalous_data"
    merged_output_dir = "/content/drive/MyDrive/Paper02_14Datasets/MERGED"
    os.makedirs(merged_output_dir, exist_ok=True)
    print(f"Ensured output directory exists: {merged_output_dir}")

    # ----------------------------
    # Loop through the datasets
    # ----------------------------
    for dataset in dataset_list:
        for house in house_list:
            print(f"\nProcessing {dataset} - {house}...")

            normal_appliance_data = _load_normal_appliance_data(
                normal_dir, dataset, house, appliance_list
            )

            for anomaly_appliance in appliance_list:
                for anomaly_type in anomaly_list:
                    anomalous_file_path = f"{anomalous_dir}/{dataset}_{house}_{anomaly_appliance}_15minutes_{anomaly_type}.csv"
                    merged_output_file_path = os.path.join(merged_output_dir, f"{dataset}_{house}_{anomaly_appliance}_15minutes_{anomaly_type}_MERGED.csv")

                    try:
                        anomalous_df = pd.read_csv(anomalous_file_path)
                        anomalous_df['timestamp'] = pd.to_datetime(anomalous_df['timestamp'])
                        if 'ground_truth_anomaly' not in anomalous_df.columns:
                            # NOTE(review): the generator cell writes the
                            # strings "Normal"/"Anomaly"; this boolean default
                            # uses a different vocabulary -- confirm intent.
                            anomalous_df['ground_truth_anomaly'] = True

                        print(f"  Loaded anomalous data: {anomalous_file_path}")

                        # The anomalous file defines the rows of the output
                        base_df = anomalous_df[['timestamp', 'ground_truth_anomaly']].copy()

                        # Per appliance: one power value per row of base_df
                        aligned_power = {}
                        for current_appliance in appliance_list:
                            if current_appliance == anomaly_appliance:
                                # Same rows as base_df, so no merge is needed
                                aligned_power[current_appliance] = (
                                    anomalous_df['active_power'].fillna(0.0).to_numpy()
                                )
                            elif normal_appliance_data.get(current_appliance) is not None:
                                aligned_power[current_appliance] = _align_power_to_timestamps(
                                    base_df[['timestamp']], normal_appliance_data[current_appliance]
                                )
                            else:
                                aligned_power[current_appliance] = pd.Series(
                                    0.0, index=base_df.index
                                ).to_numpy()
                                print(f"    Note: Using zero power for {current_appliance} in this merge scenario due to missing normal data for {house}.")

                        # Sum in appliance order, starting from 0.0
                        total_power = 0.0
                        for current_appliance in appliance_list:
                            total_power = total_power + aligned_power[current_appliance]

                        base_df['active_power'] = total_power
                        base_df['ground_truth_appliance'] = _label_active_appliances(
                            appliance_list, aligned_power
                        )

                        final_output_df = base_df[['timestamp', 'active_power', 'ground_truth_anomaly', 'ground_truth_appliance']]
                        final_output_df.to_csv(merged_output_file_path, index=False)
                        print(f"  Successfully created merged file: {merged_output_file_path}")

                    except FileNotFoundError:
                        print(f"  Anomalous file not found: {anomalous_file_path}. Skipping merge for this anomaly combination.")
                    except Exception as e:
                        # Batch boundary: report the failure and keep going
                        print(f"  Error processing {anomalous_file_path} for {dataset}_{house}_{anomaly_appliance}_{anomaly_type}: {e}")


def _load_normal_appliance_data(normal_dir, dataset, house, appliance_list):
    """Load each appliance's normal 15-minute CSV for one house.

    Returns a dict keyed by appliance name; the value is the loaded DataFrame
    (with 'timestamp' parsed to datetimes) or None when loading failed.
    """
    normal_appliance_data = {}
    for appliance in appliance_list:
        normal_file_path = f"{normal_dir}/{dataset}_{house}_{appliance}_15minutes.csv"
        try:
            df = pd.read_csv(normal_file_path)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
        except FileNotFoundError:
            print(f"  Warning: Normal file not found: {normal_file_path}. This appliance's normal data will be considered zero for merges in this house.")
            normal_appliance_data[appliance] = None
        except Exception as e:
            print(f"  Error loading normal file {normal_file_path}: {e}")
            normal_appliance_data[appliance] = None
        else:
            normal_appliance_data[appliance] = df
            print(f"  Loaded normal data for {appliance}: {normal_file_path}")
    return normal_appliance_data


def _align_power_to_timestamps(timestamp_frame, source_df):
    """Return source_df's power as an array with one value per row of timestamp_frame.

    Rows whose timestamp has no match in ``source_df`` get 0.0. Duplicate
    timestamps in ``source_df`` are reduced to their first occurrence so the
    left merge cannot add rows and shift the positional alignment.
    """
    unique_source = source_df[['timestamp', 'active_power']].drop_duplicates(subset='timestamp')
    aligned = pd.merge(timestamp_frame, unique_source, on='timestamp', how='left')
    return aligned['active_power'].fillna(0.0).to_numpy()


def _label_active_appliances(appliance_list, aligned_power):
    """Build the per-row label naming the appliances whose power is above zero.

    Names follow the order of ``appliance_list`` and are joined with '+';
    rows without any active appliance are labelled 'Nothing'.
    """
    active_flags = [aligned_power[appliance] > 0 for appliance in appliance_list]
    return [
        '+'.join(
            appliance for appliance, is_on in zip(appliance_list, row_flags) if is_on
        ) or 'Nothing'
        for row_flags in zip(*active_flags)
    ]

# Run the merge only when this code is executed as the main program; the
# guard is false when the file is imported, so nothing runs in that case.
if __name__ == "__main__":
    process_anomalous_files()
