In [None]:
##### OUTLINE OF FILE #####
### Essentially, this code produces the "prediction_appliance" column

#### NILM CLASSIFICATION - Runs the Autoencoder/CosineSimilarity/DecisionTree to determine "prediction_appliance" ####
#### CLEAN SPACES - ground_truth_appliance/prediction_appliance may contain inconsistent spaces ####

#### Per residence:
#### timestamp, active_power, ground_truth_appliance, ground_truth_anomaly, and prediction_appliance
#### Summary:
#### current_iteration, file_name, appliance, random_seed, epochs,
#### batch_size, accuracy, precision, recall, f1_score, auc, poc

In [None]:
### NILM CLASSIFICATION ###
#
# The purpose of this script is to predict which appliances are turned on, based on active_power
# We find the statistics of when an appliance is present in the combination
#
# The Algorithms are as follows
#
# TRAINING ALGORITHM (per residence)
# 1. Fit scaler and train autoencoder
# 2. Encode data using trained encoder
# 3. Compute cosine similarity to centroids
# 4. Build per-appliance binary labels
# 5. Train decision trees once
# 6. Save trained models and trees
#
# INFERENCE ALGORITHM (per residence)
# 1. Load encoder, scaler, decision trees
# 2. Encode test data with encoder
# 3. Compute cosine similarity features
# 4. Predict appliances using decision trees
# 5. Form appliance-combination predictions
# 6. Save predictions and evaluation metrics

In [None]:
# ======================================================================
# IMPORTS
# ======================================================================
import os
import warnings
import joblib
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.tree import DecisionTreeClassifier

# Silence Python-level UserWarnings raised from tensorflow modules.
warnings.filterwarnings('ignore', category=UserWarning, module='tensorflow')
# NOTE(review): this is set AFTER `import tensorflow` above; TF_CPP_MIN_LOG_LEVEL
# is presumably only honoured when set before the import - confirm and move if so.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # suppress TF C++ logs


# ======================================================================
# CONFIGURATION (EDIT THESE TO MATCH YOUR FOLDERS / EXPERIMENT SETUP)
# ======================================================================

# Residences (houses) that are processed independently, one after another.
# IMPORTANT: each residence trains its *own* scaler, autoencoder, and trees.
RESIDENCES = [
    "REFIT_House01",
    "REFIT_House02",
    "REFIT_House03",
    "REFIT_House05",
    "REFIT_House07",
    "REFIT_House09",
    "REFIT_House15",
    "UKDALE_House01",
    "UKDALE_House02",
    "UKDALE_House05",
    "AMPds2_House01",
    "GREEND_House00",
    "GREEND_House01",
    "GREEND_House03",
]

# Base directory structure (Google Drive paths as mounted in Colab)
BASE_DIR     = "/content/drive/MyDrive/Paper02_14Datasets"
MERGED_DIR   = f"{BASE_DIR}/MERGED"         # labelled merged datasets (inputs) live here
CENTROIDS_DIR= f"{BASE_DIR}/CENTROIDS"      # centroid CSVs (one per residence) live here
NILM_OUT_DIR = f"{BASE_DIR}/NILM_version_20260207"  # outputs (models/preds/metrics)

# Ensure output folder exists (runs as soon as this cell is executed)
os.makedirs(NILM_OUT_DIR, exist_ok=True)

# Appliances to detect as ON/OFF at each time step.
# The pipeline trains ONE binary DecisionTree per appliance, and the same
# names are used to build the MERGED input file names.
APPLIANCES = ["Fridge", "WashingMachine", "Dishwasher"]

# Anomaly types used to build the MERGED input file names.
# Note: anomalies are not predicted here; they just define which input files we run.
ANOMALIES = [
    "StepChange", "MultiStepChange", "Mirror",
    "Repeating", "StuckMAX", "StuckMIN", "PowerCycling"
]

# Autoencoder training hyperparameters (also written into the metrics rows)
EPOCHS      = 5
BATCH_SIZE  = 8

# Fixed seed for reproducibility (autoencoder initialisation and decision trees)
RANDOM_SEED = 42


# ======================================================================
# TIMESTAMP PARSING HELPER IN ORDER TO HAVE CONSISTENCY ACROSS DATASETS
# ======================================================================
def _parse_timestamp_series(ts: pd.Series) -> pd.Series:
    # Attempt direct parsing
    s = pd.to_datetime(ts, errors="coerce")
    if s.notna().mean() > 0.9:
        return s

    # If numeric timestamps (epoch) try seconds then milliseconds
    if pd.api.types.is_numeric_dtype(ts):
        s = pd.to_datetime(ts, unit="s", errors="coerce")
        if s.notna().mean() > 0.9:
            return s

        s = pd.to_datetime(ts, unit="ms", errors="coerce")
        if s.notna().mean() > 0.9:
            return s

    # If still bad, return whatever we got (mostly NaT)
    return s


# ======================================================================
# FIRST CALENDAR MONTH SPLIT RATIO - Based on the first date
# ======================================================================
def compute_first_month_split(file_path: str) -> float:
    """Return the fraction of rows that fall in the first calendar month of a file.

    The month window starts at the earliest valid timestamp and ends one
    calendar month later (exclusive). Rows with unparseable timestamps are
    ignored for both the numerator and the denominator.

    Args:
        file_path: CSV file containing a ``timestamp`` column.

    Returns:
        A ratio strictly between 0 and 1. Falls back to 0.1 when the file
        cannot be read, has no valid timestamps, or yields a non-positive
        ratio; a ratio of 1.0 or more is capped at 0.99.
    """
    try:
        stamp_frame = pd.read_csv(file_path, usecols=['timestamp'])
    except Exception as e:
        print(f"Warning: could not read timestamps from {file_path}: {e}")
        return 0.1  # safe fallback if file missing/bad

    valid_stamps = _parse_timestamp_series(stamp_frame['timestamp']).dropna()
    if valid_stamps.empty:
        print(f"Warning: no valid timestamps in {file_path}; using default split=0.1")
        return 0.1

    # Half-open window [first timestamp, first timestamp + 1 month)
    window_start = valid_stamps.min()
    window_end = window_start + pd.DateOffset(months=1)
    inside_window = (valid_stamps >= window_start) & (valid_stamps < window_end)

    n_total = len(valid_stamps)
    if n_total > 0:
        ratio = float(inside_window.sum()) / float(n_total)
    else:
        ratio = 0.1

    # Guardrails: never hand back an empty or an all-consuming training split
    if not (np.isfinite(ratio) and ratio > 0):
        return 0.1
    return 0.99 if ratio >= 1.0 else ratio


def get_training_split(file_path: str) -> float:
    """Return the first-month training ratio for ``file_path``, never raising.

    Delegates to ``compute_first_month_split``; any exception is reported and
    replaced by the default ratio of 0.1.
    """
    fallback_ratio = 0.1
    try:
        ratio = compute_first_month_split(file_path)
    except Exception as e:
        print(f"Warning: could not compute split ratio for {file_path}: {e}")
        return fallback_ratio
    return ratio


# ======================================================================
# DATA LOADING + PREPROCESSING
# ======================================================================
def load_and_preprocess_data(file_path, centroids_path, scaler=None):
    """Load one MERGED file, normalise active power and split it by time.

    The split point is the first-calendar-month ratio from
    ``get_training_split``: rows before it are training rows, the rest are
    test rows.

    Args:
        file_path: MERGED CSV with ``active_power`` and
            ``ground_truth_appliance`` columns (``timestamp`` and
            ``ground_truth_anomaly`` are loaded too when present).
        centroids_path: CSV with the centroids of the residence.
        scaler: Optional already-fitted scaler. When given it is used as-is
            (transform only). When omitted, a new ``MinMaxScaler`` is fitted
            on the TRAINING rows only, so test rows never influence the
            scaling; test values may therefore fall outside [0, 1].

    Returns:
        ``(train_data, test_data, test_ground_truth_appliance, centroids_df,
        scaler, test_df)``. All six are ``None`` when the file cannot be read,
        is empty, or lacks a required column. ``centroids_df`` alone is
        ``None`` when only the centroid file cannot be read. ``test_df`` holds
        the raw (un-normalised) test rows.
    """
    print(f"Loading data from {file_path}...")

    nothing_loaded = (None, None, None, None, None, None)

    # Columns that should exist in the MERGED files
    cols_to_load = ['timestamp', 'active_power', 'ground_truth_appliance', 'ground_truth_anomaly']

    try:
        # The callable form of usecols silently ignores absent columns
        df = pd.read_csv(file_path, usecols=lambda c: c in cols_to_load)
    except FileNotFoundError:
        print(f"Warning: File not found: {file_path}. Skipping.")
        return nothing_loaded
    except Exception as e:
        print(f"Warning: Failed to read {file_path}: {e}. Skipping.")
        return nothing_loaded

    # Because absent columns are ignored above, check the ones dereferenced
    # below explicitly instead of failing later with a KeyError.
    missing_cols = [c for c in ('active_power', 'ground_truth_appliance') if c not in df.columns]
    if missing_cols:
        print(f"Warning: {file_path} is missing required columns {missing_cols}. Skipping.")
        return nothing_loaded
    if df.empty:
        print(f"Warning: {file_path} contains no rows. Skipping.")
        return nothing_loaded

    # If there are missing values, fill numeric columns with their median
    if df.isnull().values.any():
        print("Warning: Missing values detected. Filling with column medians where applicable.")
        df = df.fillna(df.median(numeric_only=True))

    active_power = df["active_power"].values.reshape(-1, 1)
    ground_truth_appliance = df["ground_truth_appliance"].astype(str).values

    # Time-based split index from the first-month ratio
    split_percentage = get_training_split(file_path)
    split_idx = int(split_percentage * len(active_power))

    if scaler is None:
        scaler = MinMaxScaler(feature_range=(0, 1))
        # Fit on the training rows only to avoid leaking test statistics;
        # fall back to the whole file when the training slice is empty.
        fit_rows = active_power[:split_idx] if split_idx > 0 else active_power
        scaler.fit(fit_rows)
    active_power_normalized = scaler.transform(active_power)

    # Split into train/test
    train_data = active_power_normalized[:split_idx]
    test_data  = active_power_normalized[split_idx:]
    test_ground_truth_appliance = ground_truth_appliance[split_idx:]

    # Keep the raw test rows (for writing predictions with timestamps)
    test_df = df.iloc[split_idx:].reset_index(drop=True)

    # Load centroids (per residence)
    print(f"Loading centroids from {centroids_path}...")
    try:
        centroids_df = pd.read_csv(centroids_path)
    except Exception as e:
        print(f"Warning: could not load centroids file {centroids_path}: {e}")
        # Return without centroids if missing
        return train_data, test_data, test_ground_truth_appliance, None, scaler, test_df

    # Fill missing numeric centroid values too
    if centroids_df.isnull().values.any():
        centroids_df = centroids_df.fillna(centroids_df.median(numeric_only=True))

    return train_data, test_data, test_ground_truth_appliance, centroids_df, scaler, test_df


# ======================================================================
# AUTOENCODER: BUILD + TRAIN
# It is a simple Autoencoder with MSE reconstruction loss
# Train the autoencoder
# Save the autoencoder and the encoder
# ======================================================================
def build_and_train_autoencoder(train_data, encoding_dim=10, max_epochs=5, batch_size=8):
    """Build a small dense autoencoder and train it to reconstruct ``train_data``.

    Architecture: input -> Dense(16) -> Dense(encoding_dim) -> Dense(16) ->
    Dense(input_dim, sigmoid), trained with Adam on MSE. 10% of ``train_data``
    is held out by Keras for validation.

    Args:
        train_data: 2-D array of training samples (rows) by features (columns).
        encoding_dim: Width of the bottleneck layer.
        max_epochs: Number of training epochs.
        batch_size: Mini-batch size.

    Returns:
        ``(encoder, autoencoder)``: the encoder maps inputs to the bottleneck
        activations and shares its layers with the trained autoencoder.
    """
    # Seed TensorFlow and NumPy so weight init and shuffling are repeatable
    tf.random.set_seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    n_features = train_data.shape[1]
    relu_he = dict(activation='relu', kernel_initializer='he_uniform')

    # Layers are created in this exact order; callers that rebuild the encoder
    # from a saved autoencoder rely on the bottleneck being layers[2].
    ae_input = Input(shape=(n_features,))
    enc_hidden = Dense(16, **relu_he)(ae_input)
    bottleneck = Dense(encoding_dim, **relu_he)(enc_hidden)
    dec_hidden = Dense(16, **relu_he)(bottleneck)
    reconstruction = Dense(n_features, activation='sigmoid')(dec_hidden)

    autoencoder = Model(ae_input, reconstruction)  # x -> reconstructed x
    encoder = Model(ae_input, bottleneck)          # x -> embedding

    autoencoder.compile(optimizer='adam', loss='mse')
    autoencoder.summary()

    fit_options = dict(
        epochs=max_epochs,
        batch_size=batch_size,
        shuffle=True,
        validation_split=0.1,
        verbose=1,
    )

    print("\nTraining autoencoder...")
    autoencoder.fit(train_data, train_data, **fit_options)
    print("Training complete.")

    return encoder, autoencoder


# ======================================================================
# FEATURE BUILDERS: CENTROIDS + TEST ENCODING
# ======================================================================
def process_centroids(centroids_df, encoder, scaler):
    """Project the residence centroids into the encoder's embedding space.

    Args:
        centroids_df: Frame with ``active_power`` and ``combination`` columns,
            or ``None``.
        encoder: Model exposing ``predict`` and ``output_shape``.
        scaler: Fitted scaler exposing ``transform``; must be the same scaler
            that normalised the data the encoder was trained on.

    Returns:
        ``(encoded_centroids, labels)``. With no centroids, an empty
        ``(0, embedding_width)`` array and an empty label array.
    """
    nothing_to_encode = centroids_df is None or centroids_df.empty
    if nothing_to_encode:
        embedding_width = encoder.output_shape[-1]
        return np.empty((0, embedding_width)), np.array([])

    # Column vector of raw centroid power, scaled exactly like the samples
    power_column = centroids_df["active_power"].values.reshape(-1, 1)
    scaled_power = scaler.transform(power_column)

    embedded = encoder.predict(scaled_power, verbose=0)
    labels = centroids_df["combination"].astype(str).values

    return embedded, labels


def encode_test_data(test_data, encoder):
    """Encode samples into the embedding space.

    Returns an empty ``(0, embedding_width)`` array when ``test_data`` is
    ``None`` or has no rows; otherwise the encoder output, predicted in
    batches of 512.
    """
    has_rows = test_data is not None and len(test_data) > 0
    if has_rows:
        return encoder.predict(test_data, batch_size=512, verbose=0)

    embedding_width = encoder.output_shape[-1]
    return np.empty((0, embedding_width))


def vectorized_is_appliance_in_combination(combinations, appliance):
    """Flag which combination labels mention ``appliance``.

    Matching is a plain substring test on the string form of each label.

    Args:
        combinations: Sequence of combination labels, e.g. "Fridge + Dishwasher".
        appliance: Appliance name to look for.

    Returns:
        Boolean array, True where ``appliance`` occurs in the label.
    """
    labels = np.asarray(combinations, dtype=str)
    first_hit = np.char.find(labels, appliance)  # -1 means "not found"
    return first_hit != -1


# ======================================================================
# TRAIN DECISION TREES ONCE (PER RESIDENCE)
# ======================================================================
def build_tree_training_set(residence: str, encoder, scaler, centroids_df):
    """Assemble the decision-tree training set for one residence.

    For every existing MERGED file of the residence (all anomaly x appliance
    name combinations), the first-month rows are scaled with ``scaler``,
    encoded with ``encoder`` and compared to the encoded centroids by cosine
    similarity. Labels are derived from ``ground_truth_appliance``.

    Args:
        residence: Residence name used to build the MERGED file names.
        encoder: Trained encoder model.
        scaler: Scaler fitted for this residence.
        centroids_df: Centroid frame of the residence.

    Returns:
        ``(X_tree, y_tree)``: ``X_tree`` is the stacked similarity matrix
        (samples x centroids); ``y_tree`` maps each appliance to its 0/1 label
        vector. When no file contributes rows, ``X_tree`` has shape (0, 0) and
        every label vector is empty.
    """
    # Centroids are identical for every file, so embed them a single time
    centroid_embeddings, _ = process_centroids(centroids_df, encoder, scaler)

    wanted_columns = [
        'timestamp', 'active_power', 'ground_truth_appliance', 'ground_truth_anomaly'
    ]

    # Every file name the residence could have, anomaly-major like the inference loop
    candidate_paths = [
        f"{MERGED_DIR}/{residence}_{appliance_loop}_15minutes_{anomaly}_MERGED.csv"
        for anomaly in ANOMALIES
        for appliance_loop in APPLIANCES
    ]

    feature_blocks = []
    label_blocks = {appl: [] for appl in APPLIANCES}

    for fp in candidate_paths:
        if not os.path.exists(fp):
            continue

        # Raw values are read here so the residence scaler can be applied
        try:
            df_raw = pd.read_csv(fp, usecols=wanted_columns)
        except Exception as e:
            print(f"Warning: could not read {fp}: {e}")
            continue

        if df_raw.isnull().values.any():
            df_raw = df_raw.fillna(df_raw.median(numeric_only=True))

        # Number of leading rows that belong to the first month
        n_train_rows = int(get_training_split(fp) * len(df_raw))
        if n_train_rows <= 0:
            continue

        first_month = df_raw.iloc[:n_train_rows]
        power_column = first_month['active_power'].values.reshape(-1, 1)
        truth_labels = first_month['ground_truth_appliance'].astype(str).values

        # Scale with the residence scaler, then embed
        embedded = encoder.predict(scaler.transform(power_column), batch_size=512, verbose=0)

        if centroid_embeddings.size:
            similarity = cosine_similarity(embedded, centroid_embeddings)
        else:
            similarity = np.zeros((len(embedded), 0))  # no centroids -> no feature columns
        feature_blocks.append(similarity)

        # One binary target vector per appliance (substring match on the label)
        for appl in APPLIANCES:
            label_blocks[appl].append(
                vectorized_is_appliance_in_combination(truth_labels, appl).astype(int)
            )

    if not feature_blocks:
        return np.zeros((0, 0)), {appl: np.array([]) for appl in APPLIANCES}

    X_tree = np.vstack(feature_blocks)
    y_tree = {}
    for appl in APPLIANCES:
        chunks = label_blocks[appl]
        y_tree[appl] = np.concatenate(chunks) if len(chunks) else np.array([])

    return X_tree, y_tree


def train_decision_trees_once(X_tree, y_tree_dict):
    """Fit one binary DecisionTree per appliance on the similarity features.

    Args:
        X_tree: Similarity-to-centroids feature matrix.
        y_tree_dict: Mapping appliance -> 0/1 label vector.

    Returns:
        Mapping appliance -> fitted classifier, or ``None`` for an appliance
        with no features, no labels, or only a single class in its labels.
    """
    fitted = {}

    for appl in APPLIANCES:
        targets = y_tree_dict.get(appl, np.array([]))

        # A tree needs features, labels and at least two distinct classes
        trainable = (
            X_tree.size != 0
            and targets.size != 0
            and len(np.unique(targets)) > 1
        )
        if not trainable:
            fitted[appl] = None
            continue

        fitted[appl] = DecisionTreeClassifier(random_state=RANDOM_SEED).fit(X_tree, targets)

    return fitted


def predict_with_decision_tree(similarities_matrix, classifiers, appliances):
    """Predict the appliance combination that is ON for every sample.

    Each appliance's binary classifier is evaluated on the similarity
    features; the appliances predicted as 1 are joined with " + "
    (e.g. "Fridge + Dishwasher"), or "Nothing" when none is predicted.

    Args:
        similarities_matrix: Array of shape (n_samples, n_centroids).
        classifiers: Mapping appliance -> fitted classifier or ``None``.
            A missing/``None`` classifier means "never ON".
        appliances: Appliance names, in the order used inside the label.

    Returns:
        String array with one label per row of ``similarities_matrix``
        (empty when there are no rows).
    """
    similarities_matrix = np.asarray(similarities_matrix)
    num_samples = similarities_matrix.shape[0] if similarities_matrix.ndim else 0

    if num_samples == 0:
        return np.array([])

    # Rows but no feature columns (no centroids): the trees cannot be applied,
    # yet callers assign the result per row, so one label per sample is needed.
    if similarities_matrix.ndim == 2 and similarities_matrix.shape[1] == 0:
        return np.array(["Nothing"] * num_samples)

    # Per-appliance ON mask
    is_on = {}
    for appliance in appliances:
        clf = classifiers.get(appliance)
        if clf is None:
            is_on[appliance] = np.zeros(num_samples, dtype=bool)
        else:
            is_on[appliance] = np.asarray(clf.predict(similarities_matrix)) == 1

    # Combine into the final multi-label string per sample
    predictions = [
        " + ".join(a for a in appliances if is_on[a][i]) or "Nothing"
        for i in range(num_samples)
    ]
    return np.array(predictions)


# ======================================================================
# METRICS - current_iteration, file_name, appliance, random_seed, epochs,
#           batch_size, accuracy, precision, recall, f1_score, auc, poc
# ======================================================================
def calculate_metrics(test_ground_truth, predictions, appliances, file_name):
    """Compute one row of binary classification metrics per appliance.

    Ground truth and predictions are combination labels; both are reduced to
    0/1 per appliance by substring matching before scoring.

    Args:
        test_ground_truth: Ground-truth combination labels.
        predictions: Predicted combination labels (same length).
        appliances: Appliance names to score.
        file_name: Stored verbatim in each row.

    Returns:
        List of dicts with the keys current_iteration, file_name, appliance,
        random_seed, epochs, batch_size, accuracy, precision, recall,
        f1_score, auc, poc (in that order).
    """
    def _safe_auc(truth, guess):
        # ROC AUC is undefined unless both classes occur in the ground truth
        if len(np.unique(truth)) <= 1:
            return np.nan
        try:
            return roc_auc_score(truth, guess)
        except ValueError:
            return np.nan

    def _score(appliance):
        truth = vectorized_is_appliance_in_combination(test_ground_truth, appliance).astype(int)
        guess = vectorized_is_appliance_in_combination(predictions, appliance).astype(int)
        auc_value = _safe_auc(truth, guess)

        return {
            'current_iteration': 1,
            'file_name': file_name,
            'appliance': appliance,
            'random_seed': RANDOM_SEED,
            'epochs': EPOCHS,
            'batch_size': BATCH_SIZE,
            'accuracy': accuracy_score(truth, guess),
            'precision': precision_score(truth, guess, zero_division=0),
            'recall': recall_score(truth, guess, zero_division=0),
            'f1_score': f1_score(truth, guess, zero_division=0),
            'auc': auc_value,
            'poc': np.mean(truth == guess)  # same value as accuracy for 0/1 labels
        }

    return [_score(appliance) for appliance in appliances]


# ======================================================================
# PER-RESIDENCE: TRAINING + INFERENCE
# ======================================================================
def run_for_residence(residence: str):
    """Train and evaluate the NILM pipeline for a single residence.

    TRAINING
      - Fit the scaler and train the autoencoder on the first-month portion
        of the residence's Fridge/StepChange MERGED file.
      - Build cosine-similarity features with the encoder and train one
        DecisionTree per appliance; save autoencoder, scaler and trees.

    INFERENCE
      - Reload the saved artifacts.
      - For every existing MERGED file: take the test portion, normalise it
        with the RESIDENCE scaler, encode it, compute similarities to the
        encoded centroids, predict the appliance combination, write a
        per-file prediction CSV and append metrics to the results CSV.

    Args:
        residence: Residence name used to build all input and output paths.

    Returns:
        None. The residence is skipped (with a message) when training data
        or centroids are unavailable or the saved artifacts cannot be loaded.
    """
    print("\n" + "="*80)
    print(f"Processing residence: {residence}")
    print("="*80)

    # Output file storing per-file per-appliance metrics
    RESULTS_FILE = f"{NILM_OUT_DIR}/{residence}_NILM_Results.csv"

    # Residence-specific centroid file
    CENTROIDS_FILE = f"{CENTROIDS_DIR}/{residence}_centroids.csv"

    # The ONE training file used to fit the scaler + train the autoencoder
    TRAINING_FILE = f"{MERGED_DIR}/{residence}_Fridge_15minutes_StepChange_MERGED.csv"

    # Saved artifacts (per residence)
    SAVED_MODEL_PATH  = f"{NILM_OUT_DIR}/{residence}_autoencoder.keras"
    SAVED_SCALER_PATH = f"{NILM_OUT_DIR}/{residence}_scaler.save"
    SAVED_TREES_PATH  = f"{NILM_OUT_DIR}/{residence}_trees.save"

    # ==================================================================
    # TRAINING
    # ==================================================================
    print("--- TRAINING: Autoencoder + Trees ---")

    # Load training file (this fits the residence scaler on TRAINING_FILE)
    train_data, _, _, centroids_df, scaler_trainfile, _ = load_and_preprocess_data(
        TRAINING_FILE, CENTROIDS_FILE
    )

    # Guard checks: if no training data or no centroids, skip this residence
    if train_data is None or len(train_data) == 0:
        print(f"Error: No training data loaded from {TRAINING_FILE}. Skipping residence {residence}.")
        return
    if centroids_df is None or centroids_df.empty:
        print(f"Error: No centroids loaded from {CENTROIDS_FILE}. Skipping residence {residence}.")
        return

    # Train Autoencoder (train_data is already normalised by scaler_trainfile)
    encoder, autoencoder = build_and_train_autoencoder(
        train_data,
        max_epochs=EPOCHS,
        batch_size=BATCH_SIZE
    )

    # Save the trained autoencoder + scaler
    autoencoder.save(SAVED_MODEL_PATH)
    joblib.dump(scaler_trainfile, SAVED_SCALER_PATH)
    print(f"Autoencoder saved to {SAVED_MODEL_PATH}")
    print(f"Scaler saved to {SAVED_SCALER_PATH}")

    # Build training set for DecisionTrees (first-month portions of all files)
    X_tree, y_tree = build_tree_training_set(
        residence=residence,
        encoder=encoder,
        scaler=scaler_trainfile,
        centroids_df=centroids_df
    )

    # Train trees once and save them
    classifiers = train_decision_trees_once(X_tree, y_tree)
    joblib.dump(classifiers, SAVED_TREES_PATH)
    print(f"Decision trees saved to {SAVED_TREES_PATH}")

    # ==================================================================
    # INFERENCE
    # ==================================================================
    print("\n--- INFERENCE: Predict on All Files (NO FITTING) ---")

    # Load saved artifacts
    try:
        loaded_autoencoder = load_model(SAVED_MODEL_PATH)

        # Rebuild encoder model:
        # - layers[0] is the input, layers[1] the first Dense, and layers[2]
        #   the second Dense of the encoder path (the embedding output)
        encoder = Model(
            inputs=loaded_autoencoder.input,
            outputs=loaded_autoencoder.layers[2].output
        )

        scaler      = joblib.load(SAVED_SCALER_PATH)
        classifiers = joblib.load(SAVED_TREES_PATH)

    except Exception as e:
        print(f"Error loading artifacts for {residence}: {e}")
        return

    # Create results file if not exists.
    # NOTE(review): rows are appended below, so re-running a residence adds
    # duplicate rows to an existing results file - confirm this is intended.
    header = [
        'current_iteration', 'file_name', 'appliance', 'random_seed', 'epochs', 'batch_size',
        'accuracy', 'precision', 'recall', 'f1_score', 'auc', 'poc'
    ]
    if not os.path.exists(RESULTS_FILE):
        pd.DataFrame(columns=header).to_csv(RESULTS_FILE, index=False)

    # The centroids are the same for every file of the residence, so encode
    # them once with the residence encoder + residence scaler.
    centroids_encoded, _ = process_centroids(centroids_df, encoder, scaler)

    # Loop over all anomaly files and appliances (just to enumerate filenames)
    for anomaly in ANOMALIES:
        for appliance_loop in APPLIANCES:

            file_name = f"{MERGED_DIR}/{residence}_{appliance_loop}_15minutes_{anomaly}_MERGED.csv"
            print(f"\n--- Processing: {file_name} ---")

            # Skip missing input files
            if not os.path.exists(file_name):
                print(f"Skipping missing file: {file_name}")
                continue

            # Only the raw test rows and their labels are taken from the
            # loader; its normalised arrays come from a scaler fitted on this
            # very file and must not be fed to the residence encoder.
            _, _, test_ground_truth, _, _, test_df_full = load_and_preprocess_data(
                file_name, CENTROIDS_FILE
            )
            if test_df_full is None or len(test_df_full) == 0:
                print(f"Skipping {file_name}: no test data.")
                continue

            # Normalise with the RESIDENCE scaler - the same scaling used for
            # the centroids and for the decision-tree training features.
            test_data = scaler.transform(
                test_df_full["active_power"].values.reshape(-1, 1)
            )
            test_encoded = encode_test_data(test_data, encoder)

            # Build similarity feature matrix (test samples x centroid count)
            if centroids_encoded.size:
                similarities_matrix = cosine_similarity(test_encoded, centroids_encoded)
            else:
                similarities_matrix = np.zeros((len(test_encoded), 0))

            # PREDICT ONLY (NO TRAINING HERE)
            predictions = predict_with_decision_tree(
                similarities_matrix,
                classifiers,
                APPLIANCES
            )

            # Save per-file predictions CSV with timestamp + GT + prediction
            output_df = test_df_full[['timestamp', 'active_power', 'ground_truth_appliance', 'ground_truth_anomaly']].copy()
            output_df['prediction_appliance'] = predictions

            output_filename = f"{NILM_OUT_DIR}/{residence}_{appliance_loop}_15minutes_{anomaly}_NILM.csv"
            output_df.to_csv(output_filename, index=False)
            print(f"Predictions saved to {output_filename}")

            # Compute and append metrics
            iteration_results = calculate_metrics(
                test_ground_truth,
                predictions,
                APPLIANCES,
                file_name
            )
            results_df = pd.DataFrame(iteration_results)
            results_df.to_csv(RESULTS_FILE, mode='a', header=False, index=False)
            print(f"Metrics appended to {RESULTS_FILE}")

            # Free TF graph memory between loops (helps in Colab)
            tf.keras.backend.clear_session()

    print(f"\n--- Completed residence: {residence} ---")


# ======================================================================
# MAIN EXECUTION
# ======================================================================
def main():
    """Run training and inference for every residence listed in RESIDENCES."""
    # Residences are independent: each one trains and evaluates its own models
    for house in RESIDENCES:
        run_for_residence(house)

    print("\n\n=== All residences processed. ===")


# Start the pipeline only when this code runs as the main module
if __name__ == "__main__":
    main()


In [None]:
#### CLEAN SPACES - ground_truth_appliance/prediction_appliance may contain inconsistent spaces ####
#### This cell strips every " " from ground_truth_appliance and prediction_appliance in the NILM output CSVs ####
import os
import glob
import pandas as pd

# Path to NILM directory
NILM_DIR = "/content/drive/MyDrive/Paper02_14Datasets/NILM_version_20260207"

# Label columns that are normalised when present in a file
LABEL_COLUMNS = ("ground_truth_appliance", "prediction_appliance")


def strip_label_spaces(csv_path, columns=LABEL_COLUMNS):
    """Remove all spaces from the label columns of one CSV file, in place.

    Missing labels stay missing (they are not turned into the text "nan"),
    and the file is only rewritten when at least one label actually changed,
    so files without label columns (e.g. the metrics summaries) are untouched.

    Args:
        csv_path: CSV file to clean.
        columns: Names of the label columns to clean when present.

    Returns:
        True when the file was modified and saved, False otherwise.
    """
    df = pd.read_csv(csv_path)
    changed = False

    for col in columns:
        if col not in df.columns:
            continue

        as_text = df[col].astype(str)
        # regex=False: replace the literal space on every pandas version
        stripped = as_text.str.replace(" ", "", regex=False)
        if stripped.equals(as_text):
            continue  # nothing to strip in this column

        # Re-mask originally missing entries so they are written back as empty
        df[col] = stripped.where(df[col].notna())
        changed = True

    if changed:
        df.to_csv(csv_path, index=False)
    return changed


# Find all CSV files in the directory
csv_files = glob.glob(os.path.join(NILM_DIR, "*.csv"))

for file in csv_files:
    try:
        if strip_label_spaces(file):
            print(f"✅ Cleaned and saved: {os.path.basename(file)}")
        else:
            print(f"No changes needed: {os.path.basename(file)}")

    except Exception as e:
        print(f"⚠️ Skipped {os.path.basename(file)} due to error: {e}")
