## Import Necessary Libraries

In [1]:
## Import necessary libraries
import pandas as pd
import numpy as np
import random 
from urllib.parse import quote, unquote
from datetime import timedelta

## Import libraries for the model 
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm.notebook import trange
import statistics
from sklearn.metrics import classification_report

## Set path for saving model training results  
import os
os.makedirs('./result', exist_ok=True)

## Set Cuda for computation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

## Set random seed
def set_seed(seed_val):
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)
    
# Set seed 
seed_val = 77
set_seed(seed_val)

cuda


## Selecting Data Columns
* Tag names are loaded in sequential order.
* The process of selecting the required tag names from the tag name list.

In [2]:
# Function to display tag names
def show_column(URL):
    
    # Load tag name data
    df = pd.read_csv(URL)
    
    # Convert to list format
    df = df.values.reshape(-1)
    
    return df.tolist()

In [3]:
## Set parameters for displaying tag names
table = 'home'

NAME_URL = f'http://127.0.0.1:5654/db/tql/datahub/api/v1/get-tag-names.tql?table={table}'

## Generate tag name list 
name = show_column(NAME_URL)

In [4]:
name

['TAG-Barn [kW]',
 'TAG-Dishwasher [kW]',
 'TAG-Fridge [kW]',
 'TAG-Furnace 1 [kW]',
 'TAG-Furnace 2 [kW]',
 'TAG-Garage door [kW]',
 'TAG-Home office [kW]',
 'TAG-House overall [kW]',
 'TAG-Kitchen 12 [kW]',
 'TAG-Kitchen 14 [kW]',
 'TAG-Kitchen 38 [kW]',
 'TAG-Living room [kW]',
 'TAG-Microwave [kW]',
 'TAG-Solar [kW]',
 'TAG-Well [kW]',
 'TAG-Wine cellar [kW]',
 'TAG-apparentTemperature',
 'TAG-dewPoint',
 'TAG-gen [kW]',
 'TAG-humidity',
 'TAG-precipIntensity',
 'TAG-precipProbability',
 'TAG-pressure',
 'TAG-temperature',
 'TAG-use [kW]',
 'TAG-visibility',
 'TAG-windBearing',
 'TAG-windSpeed']

## Converting TAG Name Format
* After checking all the Tag Names from the Smart home dataset in the previous step, extract only the columns to be used and convert them into parameter format.
* Use tag names TAG-windBearing, TAG-windSpeed

In [5]:
# Set the desired tag names
tags = name[-2:]

# Wrap each item in the list with single quotes and separate with commas
tags_ = ",".join(f"'{tag}'" for tag in tags)

# Check the selected tag names
print(tags_)

'TAG-windBearing','TAG-windSpeed'


## Load Smart Home Dataset
* Load the data using the Tag Names.

In [16]:
# Data load function
# '1D': Daily interval (1 day)
# '1H': Hourly interval (1 hour)
# '1T' or 'min': Minute interval (1 minute)
# '1S': Second interval (1 second)
def data_load(table, name, start_time_, end_time_, timeformat, resample_time):
    
    # Load data 
    df = pd.read_csv(f'http://127.0.0.1:5654/db/tql/datahub/api/v1/select-rawdata.tql?table={table}&name={name}&start={start_time_}&end={end_time_}&timeformat={timeformat}')
    
    # Convert to data grouped by the same time
    df = df.pivot_table(index='TIME', columns='NAME', values='VALUE', aggfunc='first').reset_index()
    
    # Set time index
    df = df.set_index(pd.to_datetime(df['TIME']))
    df = df.drop(['TIME'], axis=1)
    
    # Resampling with 1-second intervals
    # Can be modified to desired intervals such as day, hour, minute, etc.
    df = df.resample(f'{resample_time}').mean()
    
    return df

In [17]:
# Data time loading function
def time_data_load(table, name, start_time, end_time, timeformat):
    
    target = 'time'
    
    # Load the data  
    df = pd.read_csv(f"http://127.0.0.1:5654/db/tql/datahub/api/v1/select-rawdata.tql?target={target}&table={table}&name={name}&start={start_time}&end={end_time}&timeformat={timeformat}")
    
    # Create a dummy value column for resampling
    df['value'] = 0
    
    # Perform resampling
    df['time'] = pd.to_datetime(df['time'])
    df.set_index('time', inplace=True)
    df = df.resample('1s').mean()
    
    # Remove missing values
    df = df.dropna()
    
    # Remove the dummy value column
    df = df.drop(['value'], axis=1)
    
    return df

In [36]:
# Time update function
# Update start and end times based on batch size
# window_size: The period over which data is collected in batches
# step_size: The interval between the data points
def update_time(time_df, start_time, batch_size, window_size, step_size):
    
    # Calculate how many data points need to be loaded
    time = (batch_size * step_size)+ window_size - step_size - 1
    
    # Check the index number of the current time
    # If not found, set to the first index as there is no data for the current time
    try:
        index_now = time_df.index.get_loc(start_time)
    except KeyError:
        index_now = 0
    
    # Set the end time for the batch data based on the current time 
    end_time_ = str(time_df.index[index_now + time])
    
    # Set the index number for the next start time
    index_next = index_now + time - abs(window_size - step_size - 1)
    
    # Set the next start time
    next_start_time_ = str(time_df.index[index_next])
    
    # URL encoding
    start_time_ = quote(start_time)
    end_time_ = quote(end_time_)
    next_start_time_ = quote(next_start_time_)
    
    return start_time_, end_time_, next_start_time_, index_next

In [37]:
# Function to calculate the maximum and minimum values for selected tag names
def set_minmax_value(table, name, start_time_train, end_time_train):
    
    # URL encoding
    start = quote(start_time_train)
    end = quote(end_time_train)
    
    # Load Min, Max data
    df_ = pd.read_csv(f'http://127.0.0.1:5654/db/tql/datahub/api/v1/select-scale.tql?table={table}&name={name}&start={start}&end={end}')
    
    # Set Min, Max values
    Min = df_.iloc[:,1:-1].T
    Max = df_.iloc[:,2:].T
    
    return Min, Max 

## Data Preprocessing

* 1 MinMax Scaling
* 2 Window sliding

### 1. Min-Max Scaling Setup
* Set up a Min-Max Scaler that uses the maximum and minimum values, as the entire dataset is not loaded due to the process concept.

In [38]:
# Definition of the MinMaxScaler class
class MinMaxScaler_custom:
    def __init__(self):
        self.min_ = None
        self.max_ = None

    # Set scale values based on the specified parameters
    def transform(self, X, min_values, max_values):
        X = np.array(X)
        self.min_ = np.array(min_values)
        self.max_ = np.array(max_values)
        
        if self.min_ is None or self.max_ is None:
            raise ValueError("Min and Max values are not set.")
        
        # Add 1e-6 to prevent the scale value from becoming 0
        scale = (self.max_ - self.min_) + 1e-6
        if np.any(scale == 0):
            raise ValueError("Min and Max values are the same, resulting in a scale of 0.")
        
        return (X - self.min_) / scale
    
    # Normalize data based on calculated scale values
    def fit_transform(self, X, min_values, max_values):
        """Set parameters and then transform X"""
        return self.transform(X, min_values, max_values)

    # Inverse the normalized data back to original values
    def inverse_transform(self, X_scaled):
        """Inverse the transformation and return original values"""
        if self.min_ is None or self.max_ is None:
            raise ValueError("Min and Max values are not set.")
        
        X_scaled = np.array(X_scaled)
        scale = (self.max_ - self.min_) + 1e-6
        
        return X_scaled * scale + self.min_

### 2. Window sliding setup
* Window size: Determines how many time points to group together.
* Step size: The time interval by which the window moves.

In [39]:
def make_window(data, window_size, step_size):
    
    # List to store the results of sliding windows
    windows = []

    # Apply sliding window
    for i in range(0, data.shape[0] - window_size + 1, step_size):
        window = data[i:i + window_size, :]
        windows.append(window)
        
    return windows

## Model Configuration
* Using LSTM AE model.

In [40]:
# LSTM Autoencoder class definition
class LSTMAutoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers):
        super(LSTMAutoencoder, self).__init__()
        
        # Encoder LSTM
        self.encoder_lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.encoder_fc = nn.Linear(hidden_dim, 2*hidden_dim)
        
        # Decoder LSTM
        self.decoder_fc = nn.Linear(2*hidden_dim, hidden_dim)
        self.decoder_lstm = nn.LSTM(hidden_dim, input_dim, num_layers, batch_first=True)

    def forward(self, x):
        # Encoder part
        _, (h, _) = self.encoder_lstm(x)
        latent = self.encoder_fc(h[-1])
        
        # Decoder part
        hidden = self.decoder_fc(latent).unsqueeze(0).repeat(x.size(1), 1, 1).permute(1, 0, 2)
        output, _ = self.decoder_lstm(hidden)
        
        return output

In [41]:
# Model configuration parameters

# number of input data columns
input_dim = len(tags)

# LSMT hidden state size
hidden_dim = 2*len(tags)

# layer size
num_layers = 3

# Learning rate 
learning_rate = 0.01

# Model configuration
model = LSTMAutoencoder(input_dim, hidden_dim, num_layers).to(device)

# Configure loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Check the model architecture
print(model)

LSTMAutoencoder(
  (encoder_lstm): LSTM(2, 4, num_layers=3, batch_first=True)
  (encoder_fc): Linear(in_features=4, out_features=8, bias=True)
  (decoder_fc): Linear(in_features=8, out_features=4, bias=True)
  (decoder_lstm): LSTM(4, 2, num_layers=3, batch_first=True)
)


## Model Training

* Save the model with the Best Loss based on the training data during training.

In [44]:
# Model training function
def train(table, name, timeformat, model, batch_size, window_size, step_size, epochs, Min, Max, scaler, time_df_train):
    
    # Initialize training loss
    train_loss = []
    
    # Initialize Best Loss value 
    best_Loss=np.inf
    
    for epoch in epochs:
         
        model.train()
    
        running_loss = 0.0
        total_step = 0
        
        # Set initial start time
        start_time_ = str(time_df_train.index[0])
        
        # Set end time
        end_time_train = str(time_df_train.index[-1])

        # Use a while loop to call data  
        while start_time_ < end_time_train:
            
            # Set the time for loading data based on the batch size
            start_time_, end_time_, next_start_time_, index_next= update_time(time_df_train, start_time_, batch_size, window_size, step_size)
            
            # Load batch data 
            data = data_load(table, name, start_time_, end_time_, timeformat, resample_time="1s")
            
            # Apply MinMax scaler
            scaled_data = scaler.fit_transform(data, Min, Max)
            
            # Set window 
            windows = make_window(scaled_data, window_size, step_size)
            
            # Print if the loaded data is empty 
            if len(scaled_data) == 0:
                print("No data available.")
            
            # Input the data into the model when it accumulates to the batch size
            if len(windows) == batch_size:
                
                # Check total batch count 
                total_step = total_step + 1
                
                # Convert data to numpy arrays
                input_data = np.array(windows)

                # Convert data to Tensor
                input_data = torch.tensor(input_data, dtype=torch.float32).to(device).float()
 
                # Optimize the optimizer
                optimizer.zero_grad()
                
                # Input to the model
                outputs = model(input_data)
                
                # Calculate loss
                loss = criterion(outputs, input_data)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
                
                # Reset batch data
                windows = []
            
            # Set the next start time   
            start_time_ = unquote(next_start_time_)
            
            # Prevent fetching beyond the last time
            if index_next + (batch_size * step_size)+ window_size - step_size - 1 >= len(time_df_train):
                break
            
        if total_step > 0:
            train_loss.append(running_loss / total_step)
            print(f'\ntrain loss: {np.mean(train_loss)}')
        
        # best model save     
        if best_Loss > np.mean(train_loss):
            best_Loss = np.mean(train_loss)
            torch.save(model, f'./result/Smart_home_LSTM_AE_New_Batch.pt')
            print('Save the best model.') 
            
        epochs.set_postfix_str(f"epoch = {epoch}, best_Loss = {best_Loss}")
               
    return model

In [45]:
########################################### Training Parameter Settings ################################################
# Set tag table name
table = 'home'
# Set tag name
name = quote(tags_, safe=":/")
# Set the start time for the train data
start_time_train = '2016-01-01 14:00:00'
# Set the end time for the train data
end_time_train = '2016-01-01 15:00:00'
# Set time format 
timeformat = 'Default'
# Set batch size
batch_size = 32
# Set window size
window_size = 3
# Set step size -> it must be less than or equal to the window size
step_size = 1
# Set number of epochs
epochs = trange(100, desc='training')
# Set Min, Max value 
Min, Max = set_minmax_value(table, name, start_time_train, end_time_train)
# Set Min-Max scaler
scaler = MinMaxScaler_custom()
# Load training time list 
time_df_train = time_data_load(table, name, quote(start_time_train), quote(end_time_train), timeformat)

########################################### Proceed with training ################################################
train(table, name, timeformat, model, batch_size, window_size, step_size, epochs, Min, Max, scaler, time_df_train)

training:   0%|          | 0/100 [00:00<?, ?it/s]


train loss: 0.06853104370280302
Save the best model.

train loss: 0.07128580011807831

train loss: 0.07216161777025609

train loss: 0.07226099306886422

train loss: 0.0719388827928924

train loss: 0.0715274648341403

train loss: 0.07115997141843355

train loss: 0.07085420131319811

train loss: 0.0706027934074449

train loss: 0.0703952223592101

train loss: 0.07022234958744125

train loss: 0.07007698614385542

train loss: 0.0699536116404528

train loss: 0.069847991950118

train loss: 0.06975685262919537

train loss: 0.06967763928621806

train loss: 0.06960833308333068

train loss: 0.0695473242156743

train loss: 0.06949331391264342

train loss: 0.06944524595502508

train loss: 0.0694022541122272

train loss: 0.06936362115854312

train loss: 0.06932875077861654

train loss: 0.06929714246988235

train loss: 0.06926837529159717

train loss: 0.06924209155015258

train loss: 0.06921798678394268

train loss: 0.0691958007696392

train loss: 0.06917530949614034

train loss: 0.06915631982481588

LSTMAutoencoder(
  (encoder_lstm): LSTM(2, 4, num_layers=3, batch_first=True)
  (encoder_fc): Linear(in_features=4, out_features=8, bias=True)
  (decoder_fc): Linear(in_features=8, out_features=4, bias=True)
  (decoder_lstm): LSTM(4, 2, num_layers=3, batch_first=True)
)

## Threshold Setting
* Calculate the threshold using validation data:
    * 1 Mean + Standard Deviation
    * 2 Maximum Value
    * 3 99% - Standard Deviation

In [46]:
# Model validation function
def threshold_set(table, name, timeformat, model, batch_size, window_size, step_size, option, Min, Max, scaler, time_df_valid):
    
    # Initialize validation loss
    valid_loss = []
    with torch.no_grad():
        
        model.eval()
        
        # Set initial start time
        start_time_ = str(time_df_valid.index[0])
        
        # Set end time
        end_time_val = str(time_df_valid.index[-1])
        
        # Use a while loop to call data
        while start_time_ < end_time_val:
            
            # Set the time for loading data based on the batch size
            start_time_, end_time_, next_start_time_, index_next_ = update_time(time_df_valid, start_time_, batch_size, window_size, step_size)
            
            # Load batch data 
            data = data_load(table, name, start_time_, end_time_, timeformat, resample_time="1s")
            
            # Apply Min Max Scaling
            scaled_data = scaler.fit_transform(data, Min, Max)
            
            # Set window 
            windows = make_window(scaled_data, window_size, step_size)
            
            # Print if the loaded data is empty 
            if len(scaled_data) == 0:
                print("No data available.")
            
            # Input the data into the model when it accumulates to the batch size
            if len(windows) == batch_size:
                
                # Convert data to numpy arrays
                input_data_val = np.array(windows)
                
                # Convert data to Tensor
                input_data_val = torch.tensor(input_data_val, dtype=torch.float32).to(device).float()

                # Input to the model
                outputs_val = model(input_data_val)
                
                # Calculate loss
                loss_val = criterion(outputs_val, input_data_val)
            
                valid_loss.append(loss_val.item())
                
                # Reset batch data
                windows = []
                
            # Set the next start time    
            start_time_ = unquote(next_start_time_)
            
            # Prevent fetching beyond the last time
            if index_next_ + (batch_size * step_size)+ window_size - step_size - 1 >= len(time_df_valid):
                break
            
        # Calculate threshold
        if option == 0:
            # Mean + Standard Deviation
            threshold =  statistics.mean(valid_loss) + statistics.stdev(valid_loss)

        # Calculate threshold
        if option == 1:
            # Maximum Value
            threshold =  max(valid_loss)

        # Calculate threshold
        if option == 2:
            # 99th Percentile - Standard Deviation
            threshold =  np.percentile(valid_loss, 99) - statistics.stdev(valid_loss)
      
    return threshold

In [62]:
########################################### validation Parameter Settings ################################################
# Load the best model
model_ = torch.load(f'./result/Smart_home_LSTM_AE_New_Batch.pt')
# Set the start time for the validation data
start_time_val = '2016-01-01 15:00:00'
# Set the end time for the validation data
end_time_val = '2016-01-01 16:00:00'
# Set the threshold Option
option = 1
# Load validation time list
time_df_valid = time_data_load(table, name, quote(start_time_val), quote(end_time_val), timeformat)

########################################### Proceed with validation ################################################
threshold = threshold_set(table, name, timeformat, model_, batch_size, window_size, step_size, option, Min, Max, scaler, time_df_valid)
print(threshold)

4.180762767791748


## Model Testing

* Proceed with model testing on the test data based on the threshold calculated in the previous step.

In [63]:
# Model testing function
def test(table, name, timeformat, model, batch_size, window_size, step_size, threshold, Min, Max, scaler, time_df_test):
    
    # Initial settings 
    test_loss = []
    with torch.no_grad():
        
        # Set the initial start time
        start_time_ = str(time_df_test.index[0])
        
        # Set the end time
        end_time_test = str(time_df_test.index[-1])
        
        # Use a while loop to call data 
        while start_time_ < end_time_test:
            
            # Set the time for loading data based on the batch size
            start_time_, end_time_, next_start_time_, index_next_ = update_time(time_df_test, start_time_, batch_size, window_size, step_size)

            # Load batch data 
            data = data_load(table, name, start_time_, end_time_, timeformat, resample_time="1s")
            
            # Apply MinMax scaler
            scaled_data = scaler.fit_transform(data, Min, Max)
            
            # Set window 
            windows = make_window(scaled_data, window_size, step_size)
            
            # Print if the loaded data is empty   
            if len(scaled_data) == 0:
                print("No data available.")
            
            # Input the data into the model when it accumulates to the batch size
            if len(windows) == batch_size:
                
                # Convert data to numpy arrays
                input_data_test = np.array(windows)
                
                # Convert data to Tensor
                input_data_test = torch.tensor(input_data_test, dtype=torch.float32).to(device).float()

                # Input to the model
                outputs_test = model(input_data_test)
                
                # Calculate loss
                loss_test = criterion(outputs_test, input_data_test)
            
                test_loss.append(loss_test.item())
                
                # Reset batch data
                windows = []
                
            # Set the next start time   
            start_time_ = unquote(next_start_time_) 
            
            # Prevent fetching beyond the last time
            if index_next_ + (batch_size * step_size)+ window_size - step_size - 1 >= len(time_df_test):
                break
            
    # Generate final results 
    final_df = pd.DataFrame(test_loss, columns=['reconst_score'])
    final_df['label'] = 0

    # Set labels based on each threshold
    final_df['pred_label'] = np.where(final_df['reconst_score']>threshold,1,0)
    
    return final_df

In [64]:
########################################### Test Parameter Settings ################################################
# Set the start time for the test data
start_time_test = '2016-01-01 16:00:00'
# Set the end time for the test data
end_time_test = '2016-01-01 17:00:00'
# Load the test time list
time_df_test = time_data_load(table, name, quote(start_time_test), quote(end_time_test), timeformat)

######################################## Proceed with testing #############################################
result_df = test(table, name, timeformat, model_, batch_size, window_size, step_size, threshold, Min, Max, scaler, time_df_test)

## Model Performance Evaluation
* Evaluate performance based on the F1 Score.
* After evaluating performance across different thresholds, fix the threshold that shows the best performance.

In [56]:
# 0. Threshold Setting using Mean + Standard Deviation
print(classification_report(result_df['label'], result_df['pred_label'],labels=[0]))

              precision    recall  f1-score   support

           0       1.00      0.49      0.66       112

   micro avg       1.00      0.49      0.66       112
   macro avg       1.00      0.49      0.66       112
weighted avg       1.00      0.49      0.66       112



In [65]:
# 1. Threshold Setting using Maximum Value
print(classification_report(result_df['label'], result_df['pred_label'],labels=[0]))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       112

    accuracy                           1.00       112
   macro avg       1.00      1.00      1.00       112
weighted avg       1.00      1.00      1.00       112



In [61]:
# 2. Threshold Setting using 99% - Standard Deviation
print(classification_report(result_df['label'], result_df['pred_label'],labels=[0]))

              precision    recall  f1-score   support

           0       1.00      0.72      0.84       112

   micro avg       1.00      0.72      0.84       112
   macro avg       1.00      0.72      0.84       112
weighted avg       1.00      0.72      0.84       112

