## Import Necessary Libraries

In [1]:
## Import necessary libraries
import pandas as pd
import numpy as np
import random 
from urllib.parse import quote
from sklearn.preprocessing import MinMaxScaler

## Import libraries for the model 
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from tqdm.notebook import trange
import statistics
from sklearn.metrics import classification_report

## Set path for saving model training results  
import os
os.makedirs('./result', exist_ok=True)

## Set Cuda for computation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

## Set random seed
def set_seed(seed_val):
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

# Set seed 
seed_val = 77
set_seed(seed_val)

cuda


## Selecting Data Columns
* Tag names are loaded in sequential order.
* The process of selecting the required tag names from the tag name list.

In [2]:
# Function to display tag names
def show_column(URL):
    
    # Load tag name data
    df = pd.read_csv(URL)
    
    # Convert to list format
    df = df.values.reshape(-1)
    
    return df.tolist()

In [3]:
## Set parameters for displaying tag names
table = 'home'

NAME_URL = f'http://127.0.0.1:5654/db/tql/datahub/api/v1/get_tag_names.tql?table={table}'

## Generate tag name list 
name = show_column(NAME_URL)

In [4]:
name

['TAG-Barn [kW]',
 'TAG-Dishwasher [kW]',
 'TAG-Fridge [kW]',
 'TAG-Furnace 1 [kW]',
 'TAG-Furnace 2 [kW]',
 'TAG-Garage door [kW]',
 'TAG-Home office [kW]',
 'TAG-House overall [kW]',
 'TAG-Kitchen 12 [kW]',
 'TAG-Kitchen 14 [kW]',
 'TAG-Kitchen 38 [kW]',
 'TAG-Living room [kW]',
 'TAG-Microwave [kW]',
 'TAG-Solar [kW]',
 'TAG-Well [kW]',
 'TAG-Wine cellar [kW]',
 'TAG-apparentTemperature',
 'TAG-dewPoint',
 'TAG-gen [kW]',
 'TAG-humidity',
 'TAG-precipIntensity',
 'TAG-precipProbability',
 'TAG-pressure',
 'TAG-temperature',
 'TAG-use [kW]',
 'TAG-visibility',
 'TAG-windBearing',
 'TAG-windSpeed']

## Converting TAG Name Format
* After checking all the Tag Names from the Smart home dataset in the previous step, extract only the columns to be used and convert them into parameter format.
* Use tag names TAG-windBearing, TAG-windSpeed

In [5]:
# Set the desired tag names
tags = name[-2:]

# Wrap each item in the list with single quotes and separate with commas
tags_ = ",".join(f"'{tag}'" for tag in tags)

# Check the selected tag names
print(tags_)

'TAG-windBearing','TAG-windSpeed'


##  Load Smart Home Dataset
* When loading the dataset, load the train, validation, and test datasets separately.
* As an example, use 1 hour of data for each dataset.

In [6]:
# Data loading parameter settings

# Set the tag table name
table = 'home'
# Set the tag names
name = quote(tags_, safe=":/")
# Set the time format  
timeformat = 'Default'

# Set the train data start time
start_time_train = quote('2016-01-01 14:00:00')
# Set the train data end time
end_time_train = quote('2016-01-01 15:00:00')

# Set the validation data start time
start_time_val = quote('2016-01-01 15:00:00')
# Set the validation data end time
end_time_val = quote('2016-01-01 16:00:00')

# Set the test data start time
start_time_test = quote('2016-01-01 16:00:00')
# Set the test data end time
end_time_test = quote('2016-01-01 17:00:00')

In [7]:
# Data load function
# '1D': Daily interval (1 day)
# '1H': Hourly interval (1 hour)
# '1T' or 'min': Minute interval (1 minute)
# '1S': Second interval (1 second)
def data_load(table, name, start_time, end_time, timeformat, resample_time):
    
    # Load data
    df = pd.read_csv(f'http://127.0.0.1:5654/db/tql/datahub/api/v1/select-rawdata.tql?table={table}&name={name}&start={start_time}&end={end_time}&timeformat={timeformat}')
    
    # Convert to data grouped by the same time
    df = df.pivot_table(index='TIME', columns='NAME', values='VALUE', aggfunc='first').reset_index()
    
    # Set time index
    df = df.set_index(pd.to_datetime(df['TIME']))
    df = df.drop(['TIME'], axis=1)
    
    # Resampling with 1-second intervals
    # Can be modified to desired intervals such as day, hour, minute, etc.
    df = df.resample(f'{resample_time}').mean()
    
    return df

In [8]:
# Load training data
train = data_load(table, name, start_time_train, end_time_train, timeformat, "1s")
# Load validation data
valid = data_load(table, name, start_time_val, end_time_val, timeformat, "1s")
# Load test data
test = data_load(table, name, start_time_test, end_time_test, timeformat, "1s")

In [9]:
print(train)
print(valid)
print(test)

NAME                 TAG-windBearing  TAG-windSpeed
TIME                                               
2016-01-01 14:00:00            282.0           9.18
2016-01-01 14:00:01            282.0           9.18
2016-01-01 14:00:02            282.0           9.18
2016-01-01 14:00:03            282.0           9.18
2016-01-01 14:00:04            282.0           9.18
...                              ...            ...
2016-01-01 14:59:56            253.0          11.30
2016-01-01 14:59:57            253.0          11.30
2016-01-01 14:59:58            253.0          11.30
2016-01-01 14:59:59            253.0          11.30
2016-01-01 15:00:00            253.0          11.30

[3601 rows x 2 columns]
NAME                 TAG-windBearing  TAG-windSpeed
TIME                                               
2016-01-01 15:00:00            253.0          11.30
2016-01-01 15:00:01            253.0          11.30
2016-01-01 15:00:02            253.0          11.30
2016-01-01 15:00:03            253.0   

## Data Preprocessing

* 1 MinMax Scaling

In [10]:
# Scaler Setup
scaler = MinMaxScaler()

# Apply Scaler
train_ = scaler.fit_transform(train.values)
valid_ = scaler.transform(valid.values)
test_ = scaler.transform(test.values)

# Set DataFrames
train_scaled = pd.DataFrame(train_ , columns=train.columns)
valid_scaled = pd.DataFrame(valid_ , columns=valid.columns)
test_scaled = pd.DataFrame(test_ , columns=test.columns)

# Reset time index
train_scaled.index = train.index
valid_scaled.index = valid.index
test_scaled.index = test.index

In [11]:
print(train_scaled)
print(valid_scaled)
print(test_scaled)

NAME                 TAG-windBearing  TAG-windSpeed
TIME                                               
2016-01-01 14:00:00         0.966667       0.563459
2016-01-01 14:00:01         0.966667       0.563459
2016-01-01 14:00:02         0.966667       0.563459
2016-01-01 14:00:03         0.966667       0.563459
2016-01-01 14:00:04         0.966667       0.563459
...                              ...            ...
2016-01-01 14:59:56         0.644444       0.859135
2016-01-01 14:59:57         0.644444       0.859135
2016-01-01 14:59:58         0.644444       0.859135
2016-01-01 14:59:59         0.644444       0.859135
2016-01-01 15:00:00         0.644444       0.859135

[3601 rows x 2 columns]
NAME                 TAG-windBearing  TAG-windSpeed
TIME                                               
2016-01-01 15:00:00         0.644444       0.859135
2016-01-01 15:00:01         0.644444       0.859135
2016-01-01 15:00:02         0.644444       0.859135
2016-01-01 15:00:03         0.644444   

## Dataset & Loader Setup

### Window Dataset Configuration
* To train on time series data, you need to set the window size and the sliding step.

* Window size: Determines how many time points to group together.
* Step size: The time interval by which the window moves.

In [12]:
# Sliding Window Dataset setup 
class SlidingWindowDataset(Dataset):
    def __init__(self, data, window_size, step_size):
        self.data = data
        self.window_size = window_size
        self.step_size = step_size
        self.windows = self._create_windows()
    
    # Set up sliding windows
    def _create_windows(self):
        windows = []
        for i in range(0, len(self.data) - self.window_size + 1, self.step_size):
            window = self.data[i:i + self.window_size]
            windows.append(torch.Tensor(window.values))
        return windows
    
    def __len__(self):
        return len(self.windows)
    
    def __getitem__(self, idx):
        return self.windows[idx]

In [13]:
# Sliding window configuration
window_size = 3
step_size = 1 

# Set up datasets  
train_ = SlidingWindowDataset(train_scaled, window_size, step_size)
valid_ = SlidingWindowDataset(valid_scaled, window_size, step_size)
test_ = SlidingWindowDataset(test_scaled, window_size, step_size)

# Set up data loaders
train_dataloader = DataLoader(train_, batch_size=32, shuffle=False)
valid_dataloader = DataLoader(valid_, batch_size=32, shuffle=False)
test_dataloader = DataLoader(test_, batch_size=32, shuffle=False)

In [14]:
# Verify DataLoader application and check the shape of the input data
print(list(train_dataloader)[0].shape)

torch.Size([32, 3, 2])


## Model Configuration
* Using LSTM AE model.

In [15]:
# LSTM Autoencoder class definition
class LSTMAutoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers):
        super(LSTMAutoencoder, self).__init__()
        
        # Encoder LSTM
        self.encoder_lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.encoder_fc = nn.Linear(hidden_dim, 2*hidden_dim)
        
        # Decoder LSTM
        self.decoder_fc = nn.Linear(2*hidden_dim, hidden_dim)
        self.decoder_lstm = nn.LSTM(hidden_dim, input_dim, num_layers, batch_first=True)

    def forward(self, x):
        # Encoder part
        _, (h, _) = self.encoder_lstm(x)
        latent = self.encoder_fc(h[-1])
        
        # Decoder part
        hidden = self.decoder_fc(latent).unsqueeze(0).repeat(x.size(1), 1, 1).permute(1, 0, 2)
        output, _ = self.decoder_lstm(hidden)
        
        return output

In [16]:
# Model configuration parameters

# number of input data columns
# last number in print(list(train_dataloader)[0][0].shape)
input_dim = 2

# LSMT hidden state size
hidden_dim = 4

# layer size
num_layers = 3

# Learning rate 
learning_rate = 0.01

# Model configuration
model = LSTMAutoencoder(input_dim, hidden_dim, num_layers).to(device)

# Configure loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Check the model architecture
print(model)

LSTMAutoencoder(
  (encoder_lstm): LSTM(2, 4, num_layers=3, batch_first=True)
  (encoder_fc): Linear(in_features=4, out_features=8, bias=True)
  (decoder_fc): Linear(in_features=8, out_features=4, bias=True)
  (decoder_lstm): LSTM(4, 2, num_layers=3, batch_first=True)
)


## Model Training

* Save the model with the Best Loss based on the training data during training.

In [17]:
# Initialize loss
train_loss = []
# Initialize total step
total_step = len(train_dataloader)
# Set number of epochs
epoch_in = trange(100, desc='training')
# Initialize best Loss value
best_Loss= np.inf

# Start model training
for epoch in epoch_in:
    model.to(device)
    model.train()
    running_loss = 0.0

    preds_ = []
    targets_ = []

    for batch_idx, train_data in enumerate(train_dataloader):

        inputs = train_data.to(device).float()

        optimizer.zero_grad()

        # Input to the model
        outputs = model(inputs)
        
        # Calculate loss
        loss = criterion(outputs, inputs)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()

    train_loss.append(running_loss/total_step)
    print(f'\ntrain loss: {np.mean(train_loss)}')

    
    if best_Loss > np.mean(train_loss):
        best_Loss = np.mean(train_loss)
        torch.save(model, f'./result/Smart_home_LSTM_AE.pt')
        print('Model saved')
    epoch_in.set_postfix_str(f"epoch = {epoch}, best_Loss = {best_Loss}")

training:   0%|          | 0/100 [00:00<?, ?it/s]


train loss: 0.11583049455245512
Model saved

train loss: 0.09801602310426863
Model saved

train loss: 0.09103955404907134
Model saved

train loss: 0.08712847792261189
Model saved

train loss: 0.08458687174916926
Model saved

train loss: 0.08279100409605894
Model saved

train loss: 0.08144886465213354
Model saved

train loss: 0.08040474738489911
Model saved

train loss: 0.0795677098365008
Model saved

train loss: 0.0788807465470256
Model saved

train loss: 0.0783061602357805
Model saved

train loss: 0.07781798646656198
Model saved

train loss: 0.07739774099236686
Model saved

train loss: 0.0770318977666934
Model saved

train loss: 0.07671032088512011
Model saved

train loss: 0.07642526288923243
Model saved

train loss: 0.07617069679367676
Model saved

train loss: 0.0759418656626687
Model saved

train loss: 0.07573496423397971
Model saved

train loss: 0.07554691351014725
Model saved

train loss: 0.07537519451296698
Model saved

train loss: 0.07521772793324083
Model saved

train loss: 0.

## Threshold Setting
* Calculate the threshold using validation data:
    * 1 Mean + Standard Deviation
    * 2 Maximum Value
    * 3 99% - Standard Deviation

In [18]:
# Load the best model
model_ = torch.load(f'./result/Smart_home_LSTM_AE.pt') 

In [19]:
# Calculate validation data reconstruction loss
valid_loss = []
with torch.no_grad():
    
    for batch_idx, valid_data in enumerate(valid_dataloader):

        inputs_val = valid_data.to(device).float()

        outputs_val = model_(inputs_val)
        loss = criterion(outputs_val, inputs_val)
        
        valid_loss.append(loss.item())
        
# Threshold setting
# The threshold should be adjusted according to your own criteria
threshold_1 =  statistics.mean(valid_loss) + statistics.stdev(valid_loss)
threshold_2 =  max(valid_loss)
threshold_3 =  np.percentile(valid_loss, 99) - statistics.stdev(valid_loss) 

print(threshold_1)
print(threshold_2)
print(threshold_3)

1.2459881743170853
4.045138359069824
3.0967311209932786


## Model Testing

* Proceed with model testing on the test data based on the threshold calculated in the previous step.

In [20]:
# Apply the model to the test data
test_loss = []
with torch.no_grad():
    
    for batch_idx, test_data in enumerate(test_dataloader):

        inputs_test = test_data.to(device).float()

        outputs_test = model_(inputs_test)
        loss = criterion(outputs_test, inputs_test)
        
        test_loss.append(loss.item())
        
# Create a DataFrame for the test results
result = pd.DataFrame(test_loss, columns=['Reconst_Loss'])
# Assume that there are no abnormal data
result['label'] = 0

# Classify normal and abnormal based on each threshold
result['pred_1'] = np.where(result['Reconst_Loss']>threshold_1,1,0)
result['pred_2'] = np.where(result['Reconst_Loss']>threshold_2,1,0)
result['pred_3'] = np.where(result['Reconst_Loss']>threshold_3,1,0)


## Model Performance Evaluation
* Evaluate performance based on the F1 Score.
* After evaluating performance across different thresholds, fix the threshold that shows the best performance.

In [21]:
# 1. Threshold Setting using Mean + Standard Deviation
print(classification_report(result['label'], result['pred_1'],labels=[0]))

              precision    recall  f1-score   support

           0       1.00      0.49      0.65       113

   micro avg       1.00      0.49      0.65       113
   macro avg       1.00      0.49      0.65       113
weighted avg       1.00      0.49      0.65       113



In [22]:
# 2. Threshold Setting using Maximum Value
print(classification_report(result['label'], result['pred_2'],labels=[0]))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       113

   micro avg       1.00      0.99      1.00       113
   macro avg       1.00      0.99      1.00       113
weighted avg       1.00      0.99      1.00       113



In [23]:
# 3. Threshold Setting using 99% - Standard Deviation
print(classification_report(result['label'], result['pred_3'],labels=[0]))

              precision    recall  f1-score   support

           0       1.00      0.73      0.84       113

   micro avg       1.00      0.73      0.84       113
   macro avg       1.00      0.73      0.84       113
weighted avg       1.00      0.73      0.84       113

