In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from nilm_pre_processing import NilmPreProcessing
from tqdm.notebook import tqdm

In [2]:
# Instantiate the project-local preprocessing helper. In this notebook it is
# only used for its count_labels() method, which is called on the "Label"
# column of each split further down.
nilm_pp = NilmPreProcessing()

In [8]:
# Build the train/test split from the labelled per-device Excel segments.
#
# Layout read from disk: <base_path>/<device directory>/<segment>.xlsx
# Within every segment (sorted by timestamp) the rows are walked in blocks of
# BLOCK_SIZE: the last row of each complete block goes to the test split and
# every other row goes to the train split (a 9:1 interleaved split for 10).
base_path = "/opt/nilm-shared-data/nilm_device_detection/iawe/labeled_segments"
FEATURE_COLUMNS = ["unix_ts", "Irms", "P", "AvgPowerFactor", "Q", "S", "Label"]
BLOCK_SIZE = 10

# Collect the per-file pieces and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all previous rows on
# every iteration.
train_parts, test_parts = [], []

for device_dir in tqdm(os.listdir(base_path)):
    dir_path = os.path.join(base_path, device_dir)
    # Stray files next to the device directories would make os.listdir() raise.
    if not os.path.isdir(dir_path):
        continue
    print(f"[+] Working on {dir_path}")

    for file in tqdm(os.listdir(dir_path)):
        if not file.endswith(".xlsx"):
            continue

        # Read the data and keep only the columns used downstream
        df = pd.read_excel(os.path.join(dir_path, file))
        df = df[FEATURE_COLUMNS]

        # A segment shorter than one full block cannot contribute a test row.
        if len(df) < BLOCK_SIZE:
            continue
        df = df.sort_values("unix_ts").reset_index(drop=True)

        # Positions 9, 19, 29, ... (for BLOCK_SIZE == 10) are held out for
        # testing; a trailing partial block contributes train rows only.
        is_test_row = (np.arange(len(df)) % BLOCK_SIZE) == (BLOCK_SIZE - 1)
        current_train_df = df.loc[~is_test_row]
        current_test_df = df.loc[is_test_row]

        # If either side of the split is empty, skip the file
        if current_train_df.empty or current_test_df.empty:
            continue
        train_parts.append(current_train_df)
        test_parts.append(current_test_df)

# Stay None (as before) when no usable segment was found, since pd.concat([]) raises.
train_df = pd.concat(train_parts, ignore_index=True) if train_parts else None
test_df = pd.concat(test_parts, ignore_index=True) if test_parts else None

  0%|          | 0/8 [00:00<?, ?it/s]

[+] Working on /opt/nilm-shared-data/nilm_device_detection/iawe/labeled_segments/television


  0%|          | 0/1 [00:00<?, ?it/s]

[+] Working on /opt/nilm-shared-data/nilm_device_detection/iawe/labeled_segments/air_conditioner_1


  0%|          | 0/12 [00:00<?, ?it/s]

[+] Working on /opt/nilm-shared-data/nilm_device_detection/iawe/labeled_segments/clothes_iron


  0%|          | 0/35 [00:00<?, ?it/s]

[+] Working on /opt/nilm-shared-data/nilm_device_detection/iawe/labeled_segments/computer


  0%|          | 0/103 [00:00<?, ?it/s]

[+] Working on /opt/nilm-shared-data/nilm_device_detection/iawe/labeled_segments/washing_machine


  0%|          | 0/20 [00:00<?, ?it/s]

[+] Working on /opt/nilm-shared-data/nilm_device_detection/iawe/labeled_segments/air_conditioner_2


  0%|          | 0/19 [00:00<?, ?it/s]

[+] Working on /opt/nilm-shared-data/nilm_device_detection/iawe/labeled_segments/wet_appliance


  0%|          | 0/255 [00:00<?, ?it/s]

[+] Working on /opt/nilm-shared-data/nilm_device_detection/iawe/labeled_segments/fridges


  0%|          | 0/264 [00:00<?, ?it/s]

In [9]:
# Label distribution of the training split; the cell output below is the
# returned mapping of label -> number of rows.
nilm_pp.count_labels(train_df["Label"])

{'ac1_s1': 25042,
 'ac1_s2': 11897,
 'ac1_s3': 209843,
 'ac2_s1': 100894,
 'ac2_s2': 11857,
 'ac2_s3': 163647,
 'clothes_iron': 1505,
 'computer': 1104845,
 'fridges_s1': 1707387,
 'television': 254748,
 'washing_machine_s1': 12141,
 'wet_appliance_s1': 10997}

In [10]:
# Label distribution of the test split; the cell output below is the
# returned mapping of label -> number of rows.
nilm_pp.count_labels(test_df["Label"])

{'ac1_s1': 2776,
 'ac1_s2': 1330,
 'ac1_s3': 23309,
 'ac2_s1': 11215,
 'ac2_s2': 1329,
 'ac2_s3': 18159,
 'clothes_iron': 153,
 'computer': 122706,
 'fridges_s1': 189584,
 'television': 28305,
 'washing_machine_s1': 1338,
 'wet_appliance_s1': 1087}

In [48]:
# Reload both splits from the persisted CSV files. This rebinds train_df and
# test_df, replacing the frames assembled from the Excel segments earlier in
# the notebook.
# NOTE(review): these are the same two paths that the last cell of this
# notebook writes, so on a fresh run the files must already exist on disk.
train_df, test_df = (
    pd.read_csv(split_csv)
    for split_csv in (
        "/opt/nilm-shared-data/nilm_device_detection/iawe/train_test_data/iawe_train.csv",
        "/opt/nilm-shared-data/nilm_device_detection/iawe/train_test_data/iawe_test.csv",
    )
)

In [49]:
# Turn +Inf / -Inf into NaN and then discard every row that has a NaN in any
# column. Rows that already contained NaN are therefore removed too, not only
# the Inf ones. The index is not reset, so the surviving rows keep their
# original (now gappy) index labels.
non_finite = [np.inf, -np.inf]

train_df = train_df.replace(non_finite, np.nan)
train_df = train_df.dropna()

test_df = test_df.replace(non_finite, np.nan)
test_df = test_df.dropna()

In [50]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Use the shorter "MeanPF" name for the power-factor column in both splits
column_renames = {"AvgPowerFactor": "MeanPF"}
train_df = train_df.rename(columns=column_renames)
test_df = test_df.rename(columns=column_renames)

# Fit the label -> integer mapping on the training labels only, then apply
# that same mapping to both splits so the integer codes agree.
# NOTE(review): the test labels are only transformed, never fitted; confirm
# every test label also occurs in the training split.
label_encoder = LabelEncoder().fit(train_df["Label"])
train_df["Label"] = label_encoder.transform(train_df["Label"])
test_df["Label"] = label_encoder.transform(test_df["Label"])

In [None]:
# Show the fitted class list and persist it, so the integer codes can be
# mapped back to label names outside this notebook.
# NOTE(review): if classes_ is an object-dtype array of Python strings,
# np.save pickles it and np.load will need allow_pickle=True; confirm.
classes_path = "/opt/nilm-shared-data/nilm_device_detection/iawe/utils/label_encoder_classes.npy"
fitted_classes = label_encoder.classes_
print(f"[+] Label Encoder Classes: {fitted_classes}")
np.save(classes_path, fitted_classes)

In [39]:
# Standardise the numeric feature columns. The scaler is fitted on the
# training split only and then applied unchanged to the test split, so no
# test statistics leak into the scaling parameters.
feature_cols = ['Irms', 'P', 'MeanPF', 'Q', 'S']
scaler = StandardScaler()

scaled_train_df = pd.DataFrame(
    scaler.fit_transform(train_df[feature_cols]),
    columns=feature_cols,
)
scaled_test_df = pd.DataFrame(
    scaler.transform(test_df[feature_cols]),
    columns=feature_cols,
)

# Carry the non-scaled columns over BY POSITION. The scaled frames have a
# fresh 0..n-1 index, whereas train_df/test_df keep gaps in their index after
# the earlier dropna(); assigning the Series directly would align on index
# labels, pairing labels/timestamps with the wrong feature rows and leaving
# NaN where an index label is missing.
for passthrough_col in ("Label", "unix_ts"):
    scaled_train_df[passthrough_col] = train_df[passthrough_col].to_numpy()
    scaled_test_df[passthrough_col] = test_df[passthrough_col].to_numpy()

In [43]:
import joblib

# Persist the fitted scaler so the identical transformation can be re-applied
# outside this notebook. The call is left as the cell's last expression, so
# its return value is displayed as the cell output.
scaler_path = "/opt/nilm-shared-data/nilm_device_detection/iawe/utils/scaler.joblib"
joblib.dump(scaler, scaler_path)

['/opt/nilm-shared-data/nilm_device_detection/iawe/utils/scaler.joblib']

In [53]:
# Write the current train/test frames to CSV without the index column.
# Only two files are produced (train and test), although the log line also
# mentions "val".
# NOTE(review): this writes train_df/test_df, not scaled_train_df/
# scaled_test_df, and the target paths are the same files the reload cell
# reads; confirm that overwriting them with the current frames is intended.
saved_path = "/opt/nilm-shared-data/nilm_device_detection/iawe/train_test_data"
print(f"[+] Saving train, val and test dataframes: {saved_path}")
for split_name, split_frame in (("train", train_df), ("test", test_df)):
    split_frame.to_csv(f"{saved_path}/iawe_{split_name}.csv", index=False)

[+] Saving train, val and test dataframes: /opt/nilm-shared-data/nilm_device_detection/iawe/train_test_data
