In [2]:
import pandas as pd
import polars as pl
import numpy as np
import os
from sklearn.model_selection import train_test_split
from nilm_pre_processing import NilmPreProcessing
from tqdm.notebook import tqdm

In [3]:
nilm_pp = NilmPreProcessing()

In [15]:
label_data_dir = "/opt/nilm-shared-data/nilm_device_detection/RAE/transform_data/labeled_data"
train_df, test_df = None, None
file_skips = 0
for appliance in tqdm(os.listdir(label_data_dir), desc="Overall Progress, file skips {}".format(file_skips)):
    dir_path = os.path.join(label_data_dir, appliance)
    for files in tqdm(os.listdir(dir_path), desc="Processing Appliance: {}".format(appliance)):
        if files.endswith(".xlsx"):
            # Read the data
            df = pd.read_excel(os.path.join(dir_path, files))
            df.rename(columns={"pf": "MeanPF", "label": "Label"}, inplace=True)
            df = df[["unix_ts", "Irms", "P", "MeanPF", "Q", "S", "Label"]]
            if len(df) < 10:
                file_skips += 1
                continue
            df = df.sort_values("unix_ts")
            df = df.reset_index(drop=True)
            df = df.dropna()
            
            # Preprocess the data
            train_arr, test_arr = [], [] 
            for i in range(0, len(df), 10):
                train_arr.extend(df.iloc[i:i+9].to_dict('records'))
                if i + 9 < len(df):
                    test_arr.append(df.iloc[i+9].to_dict())
            current_train_df = pd.DataFrame(train_arr)
            current_test_df = pd.DataFrame(test_arr)
            
            # If the current dataframe is empty, skip it
            if current_train_df.empty or current_test_df.empty:
                continue
            else:
                if train_df is None:
                    train_df = current_train_df
                    test_df = current_test_df
                else:
                    train_df = pd.concat([train_df, current_train_df], ignore_index=True)
                    test_df = pd.concat([test_df, current_test_df], ignore_index=True)

Overall Progress, file skips: 0:   0%|          | 0/20 [00:00<?, ?it/s]

Processing Appliance: home_office:   0%|          | 0/1088 [00:00<?, ?it/s]

Processing Appliance: bathroom:   0%|          | 0/2651 [00:00<?, ?it/s]

Processing Appliance: basement_blue_plug:   0%|          | 0/1190 [00:00<?, ?it/s]

Processing Appliance: garage_sub_panel:   0%|          | 0/6 [00:00<?, ?it/s]

Processing Appliance: lp16:   0%|          | 0/298 [00:00<?, ?it/s]

Processing Appliance: clothes_dryer:   0%|          | 0/44 [00:00<?, ?it/s]

Processing Appliance: misc_plug:   0%|          | 0/209 [00:00<?, ?it/s]

Processing Appliance: heat_pump:   0%|          | 0/419 [00:00<?, ?it/s]

Processing Appliance: lp20:   0%|          | 0/372 [00:00<?, ?it/s]

Processing Appliance: rental_suite_sub_panel:   0%|          | 0/3439 [00:00<?, ?it/s]

Processing Appliance: oven:   0%|          | 0/6 [00:00<?, ?it/s]

Processing Appliance: clothes_washer:   0%|          | 0/534 [00:00<?, ?it/s]

Processing Appliance: furniture_and_hot_water_unit:   0%|          | 0/6217 [00:00<?, ?it/s]

Processing Appliance: lp3:   0%|          | 0/70 [00:00<?, ?it/s]

Processing Appliance: bedroom_plug:   0%|          | 0/3837 [00:00<?, ?it/s]

Processing Appliance: kitchen_counter_plug:   0%|          | 0/202 [00:00<?, ?it/s]

Processing Appliance: upstair_plugs:   0%|          | 0/520 [00:00<?, ?it/s]

Processing Appliance: kitchen_dishwasher:   0%|          | 0/211 [00:00<?, ?it/s]

Processing Appliance: fridge:   0%|          | 0/3144 [00:00<?, ?it/s]

Processing Appliance: basement_plug_and_light:   0%|          | 0/3490 [00:00<?, ?it/s]

In [16]:
train_df.head()

Unnamed: 0,unix_ts,Irms,P,MeanPF,Q,S,Label
0,1462064304,0.9,78,0.702703,5,111,home_office
1,1462064305,1.7,179,0.864734,25,207,home_office
2,1462064306,1.7,180,0.857143,25,210,home_office
3,1462064307,1.8,194,0.889908,27,218,home_office
4,1462064308,1.8,197,0.895455,27,220,home_office


In [17]:
from utils import count_labels

In [18]:
count_labels(train_df["Label"].to_numpy())

{'basement_blue_plugs': 301912,
 'bathrooms': 1369855,
 'clothes_dryer_s1': 30427,
 'clothes_dryer_s2': 54706,
 'clothes_washer': 44271,
 'fridge_s1': 2029157,
 'fridge_s2': 20017,
 'furnace_and_hot_water_unit': 4108399,
 'garage_sub_panel_s1': 500,
 'garage_sub_panel_s2': 356,
 'heat_pump': 499987,
 'home_office': 1737034,
 'kitchen_counter_plugs': 37190,
 'kitchen_dishwasher': 143726,
 'kitchen_oven_s1': 16285,
 'kitchen_oven_s2': 5361,
 'kitchen_oven_s3': 2741,
 'lp16_s1': 633971,
 'lp20_s1': 689155,
 'lp20_s2': 9176,
 'lp3_s1': 273427,
 'lp3_s2': 20606,
 'misc_plugs': 189084,
 'rental_suite_sub_panel': 6119286,
 'upstairs_bedroom_AFCI_arc-fault_plugs': 2258671,
 'upstairs_plug_and_lights': 1462691}

In [20]:
len(train_df), len(test_df)

(22057991, 2434943)

In [19]:
train_df.to_csv("/opt/nilm-shared-data/nilm_device_detection/RAE/train_test_data/rae/train.csv", index=False)
test_df.to_csv("/opt/nilm-shared-data/nilm_device_detection/RAE/train_test_data/rae/test.csv", index=False)