## Import Necessary Libraries

In [1]:
## Import necessary libraries
import pandas as pd
import numpy as np
import random 
from urllib.parse import quote, unquote
from datetime import timedelta

## Import libraries for the model
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm.notebook import trange
from sklearn.metrics import f1_score, classification_report

## Set path for saving model training results  
import os
os.makedirs('./result', exist_ok=True)  # exist_ok=True: no error when the directory is already there

## Select the compute device: the CUDA GPU when PyTorch can see one, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Show which device was selected
print(device)

## Set random seed
def set_seed(seed_val):
    """Seed every random number generator this notebook relies on.

    The same `seed_val` is handed, in order, to Python's `random` module,
    NumPy's global generator, PyTorch's CPU generator and all CUDA generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed_val)

# Fixed seed used for this notebook run
seed_val = 77
set_seed(seed_val)

cuda


## Selecting Data Columns
* Tag names are loaded in sequential order.
* The required tag names are then selected from the loaded tag name list.

In [2]:
# Function to display tag names
def show_column(URL):
    """Return the tag names found at `URL` as a flat Python list.

    The CSV at `URL` is read with pandas and all of its cells are flattened,
    row by row, into a single list.
    """
    return pd.read_csv(URL).values.reshape(-1).tolist()

In [3]:
## Set parameters for displaying tag names
# Tag table whose tag names are requested
table = 'pump'

# get-tag-names TQL endpoint on the local server (127.0.0.1:5654), parameterised by `table`
NAME_URL = f'http://127.0.0.1:5654/db/tql/datahub/api/v1/get-tag-names.tql?table={table}'

## Generate tag name list (a flat Python list, see show_column)
name = show_column(NAME_URL)

In [4]:
# Display the tag names returned by show_column
name

['machine_status',
 'sensor_01',
 'sensor_02',
 'sensor_03',
 'sensor_04',
 'sensor_05',
 'sensor_10',
 'sensor_11',
 'sensor_12',
 'sensor_13',
 'sensor_14',
 'sensor_16',
 'sensor_17',
 'sensor_18',
 'sensor_19',
 'sensor_20',
 'sensor_21',
 'sensor_22',
 'sensor_23',
 'sensor_24',
 'sensor_25',
 'sensor_26',
 'sensor_27',
 'sensor_28',
 'sensor_29',
 'sensor_30',
 'sensor_31',
 'sensor_32',
 'sensor_33',
 'sensor_34',
 'sensor_35',
 'sensor_36',
 'sensor_37',
 'sensor_38',
 'sensor_39',
 'sensor_40',
 'sensor_41',
 'sensor_42',
 'sensor_43',
 'sensor_44',
 'sensor_45',
 'sensor_46',
 'sensor_47',
 'sensor_48',
 'sensor_49']

## Converting TAG Name Format
* After checking all the Tag Names from the pump dataset in the previous step, extract only the columns to be used and convert them into parameter format.
* Use all tag names

In [5]:
# Use every tag name that was loaded
tags = name

# Put each tag name in single quotes and join them with commas
quoted_tags = ("'{}'".format(tag) for tag in tags)
tags_ = ",".join(quoted_tags)

# Show the resulting parameter string
print(tags_)

'machine_status','sensor_01','sensor_02','sensor_03','sensor_04','sensor_05','sensor_10','sensor_11','sensor_12','sensor_13','sensor_14','sensor_16','sensor_17','sensor_18','sensor_19','sensor_20','sensor_21','sensor_22','sensor_23','sensor_24','sensor_25','sensor_26','sensor_27','sensor_28','sensor_29','sensor_30','sensor_31','sensor_32','sensor_33','sensor_34','sensor_35','sensor_36','sensor_37','sensor_38','sensor_39','sensor_40','sensor_41','sensor_42','sensor_43','sensor_44','sensor_45','sensor_46','sensor_47','sensor_48','sensor_49'


## Load Pump Sensor Dataset
* Load the data using all tag names

In [6]:
# Data loading function
def data_load(table, name, start_time, end_time, timeformat):
    """Fetch raw rows for the given tags and time window and return them as a wide frame.

    The arguments are inserted verbatim into the select-rawdata query string.
    The response is expected to have TIME, NAME and VALUE columns; it is pivoted
    to one row per TIME and one column per NAME (first value wins on duplicates).

    Returns a DataFrame indexed by TIME (datetime) whose last column, `label`,
    holds the integer-cast `machine_status` values.
    """
    # Long-format rows straight from the server
    raw = pd.read_csv(f'http://127.0.0.1:5654/db/tql/datahub/api/v1/select-rawdata.tql?table={table}&name={name}&start={start_time}&end={end_time}&timeformat={timeformat}')

    # One row per timestamp, one column per tag
    wide = raw.pivot_table(index='TIME', columns='NAME', values='VALUE', aggfunc='first').reset_index()

    # Parse the timestamps and use them as the index
    wide['TIME'] = pd.to_datetime(wide['TIME'])
    wide = wide.set_index('TIME')

    # Take machine_status out and re-attach it as the trailing integer `label` column
    status = wide.pop('machine_status')
    wide['label'] = status.astype(int)

    return wide

In [7]:
# Data time loading function
def time_data_load(table, name, start_time, end_time, timeformat):
    """Return the distinct one-second timestamps that have data in the window.

    Only timestamps are requested (`target=time`). A constant helper column is
    added so the frame can be resampled to one-second bins; bins without any
    row come out as NaN and are dropped. The result is a column-less DataFrame
    whose datetime index lists the populated seconds.
    """
    target = 'time'

    # Timestamps only, as returned by the server
    stamps = pd.read_csv(f"http://127.0.0.1:5654/db/tql/datahub/api/v1/select-rawdata.tql?target={target}&table={table}&name={name}&start={start_time}&end={end_time}&timeformat={timeformat}")

    # Helper column so that resample().mean() has something to aggregate
    stamps['value'] = 0
    stamps['time'] = pd.to_datetime(stamps['time'])

    # One row per populated second; empty bins are removed
    per_second = stamps.set_index('time').resample('1s').mean().dropna()

    # The helper column has served its purpose
    return per_second.drop(columns=['value'])

In [8]:
# Time update function
# Update start and end times based on batch size
def update_time(time_df, start_time, batch_size):
    """Compute the URL-encoded time window of one batch and where the next batch starts.

    Parameters
    ----------
    time_df : DataFrame
        Frame whose index holds the available timestamps in ascending order.
    start_time : str
        Start timestamp of the current batch. When it is not found in the
        index, the window starts at the first index entry.
    batch_size : int
        Number of consecutive index entries that make up one batch.

    Returns
    -------
    (start_time_, end_time_, next_start_time_, index_next)
        The three times are URL-encoded strings. `end_time_` is one second
        after the last timestamp of the batch. `index_next` is the index
        position at which the next batch starts (`index_now + batch_size`).

    When the batch reaches or runs past the end of the index, the window is
    clamped to the last available timestamp and `next_start_time_` is set to
    one second after it. That value sorts after every timestamp in the index,
    so loops of the form `while start < str(time_df.index[-1])` terminate
    instead of this function raising IndexError. `index_next` is left
    unclamped so callers can still compare it against `len(time_df)`.

    Raises
    ------
    IndexError
        If `time_df` has no rows.
    """
    n_rows = len(time_df.index)
    if n_rows == 0:
        raise IndexError("time_df is empty: there is no timestamp to build a batch window from.")

    # Position of the current start time; fall back to the first entry when it is unknown
    try:
        index_now = time_df.index.get_loc(start_time)
    except KeyError:
        index_now = 0

    # Last row of this batch, clamped so a short tail cannot index past the end
    index_last = min(index_now + batch_size - 1, n_rows - 1)
    end_time_ = str(time_df.index[index_last] + timedelta(seconds=1))

    # Position and timestamp at which the following batch starts
    index_next = index_now + batch_size
    if index_next < n_rows:
        next_start_time_ = str(time_df.index[index_next])
    else:
        # Nothing left after this batch: point just past the final timestamp
        next_start_time_ = str(time_df.index[-1] + timedelta(seconds=1))

    # URL encoding
    return quote(start_time), quote(end_time_), quote(next_start_time_), index_next

In [9]:
# Function to calculate the maximum and minimum values for selected tag names
def set_minmax_value(table, name, start_time_train, end_time_train):
    """Fetch scaling bounds for the given tags over the training window.

    The start and end times are URL-encoded here; `table` and `name` are
    inserted into the query string as given. From the response, the columns
    from position 1 up to (not including) the last become `Min`, and the
    columns from position 2 onward become `Max`; both are returned transposed.
    """
    start, end = quote(start_time_train), quote(end_time_train)

    # Scale table as returned by the server
    scale_df = pd.read_csv(f'http://127.0.0.1:5654/db/tql/datahub/api/v1/select-scale.tql?table={table}&name={name}&start={start}&end={end}')

    # Slice by column position, then transpose
    Min = scale_df.iloc[:, 1:-1].transpose()
    Max = scale_df.iloc[:, 2:].transpose()

    return Min, Max

## Data Preprocessing

* 1. Min-Max Scaling

### 1. Min-Max Scaling Setup
* Set up a Min-Max Scaler that takes the minimum and maximum values as inputs, because the data is processed batch by batch and the entire dataset is never loaded at once.

In [10]:
# Definition of the MinMaxScaler class
class MinMaxScaler_custom:
    """Min-max scaler whose bounds are supplied by the caller instead of learned from data.

    The bounds given to `transform` / `fit_transform` are stored on the instance
    (`min_`, `max_`) so that later `transform` calls may omit them and
    `inverse_transform` can undo the scaling.
    """

    def __init__(self):
        # Bounds stay unset until the first transform call that provides them
        self.min_ = None
        self.max_ = None

    def transform(self, X, min_values=None, max_values=None):
        """Scale `X` to `(X - min) / (max - min)`.

        Parameters
        ----------
        X : array-like
            Data to scale; must broadcast against the bounds.
        min_values, max_values : array-like, optional
            Lower and upper bounds. When given they replace the stored bounds;
            when omitted (None) the previously stored bounds are reused.

        Raises
        ------
        ValueError
            If a bound has never been provided, or if any `max - min` is zero.
        """
        # Only overwrite stored bounds with real values. Wrapping None in
        # np.array() would hide the "not set" state from the check below.
        if min_values is not None:
            self.min_ = np.array(min_values)
        if max_values is not None:
            self.max_ = np.array(max_values)

        if self.min_ is None or self.max_ is None:
            raise ValueError("Min and Max values are not set.")

        X = np.array(X)
        scale = self.max_ - self.min_
        if np.any(scale == 0):
            raise ValueError("Min and Max values are the same, resulting in a scale of 0.")

        return (X - self.min_) / scale

    def fit_transform(self, X, min_values, max_values):
        """Store the given bounds and return `X` scaled with them."""
        return self.transform(X, min_values, max_values)

    def inverse_transform(self, X_scaled):
        """Map scaled values back to the original range using the stored bounds.

        Raises
        ------
        ValueError
            If no bounds have been stored yet.
        """
        if self.min_ is None or self.max_ is None:
            raise ValueError("Min and Max values are not set.")

        X_scaled = np.array(X_scaled)
        scale = self.max_ - self.min_

        return X_scaled * scale + self.min_

## Model Configuration
* Using Linear_classifier model

In [11]:
class Linear_classifier(nn.Module):
    """Two-layer feed-forward classifier: Linear -> ReLU -> Linear.

    `input_dim` features are projected to `hidden_dim` units, passed through a
    ReLU, and projected to `output_dim` raw scores (no softmax is applied).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the raw class scores for `x`."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

In [12]:
# Model configuration parameters
# Learning rate
learning_rate = 0.01
# Number of input features: the 45 loaded tags minus machine_status, which is used as the label
input_dim = 44 
# Width of the single hidden layer
hidden_dim = 20 
# Number of output classes (one raw score per class)
output_dim = 2 

# Model configuration
model = Linear_classifier(input_dim, hidden_dim, output_dim).to(device)

# Configure loss function and optimizer
# NOTE: train() reads `criterion` and `optimizer` as notebook globals; they are not passed in as arguments
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Check the model architecture
print(model)

Linear_classifier(
  (fc1): Linear(in_features=44, out_features=20, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=20, out_features=2, bias=True)
)


## Model Training

In [13]:
def train(table, name, timeformat, model, batch_size, epochs, scaler, Min, Max, time_df_train, time_df_valid):
    """Train `model` batch by batch and keep the checkpoint with the best validation macro-F1.

    Each epoch walks the training time index in windows of `batch_size`
    timestamps, loads every window with `data_load`, scales the features and
    performs one optimizer step per full batch. Afterwards the validation time
    index is walked the same way under `torch.no_grad()`; whenever the macro-F1
    improves, the classification report is appended to
    ./result/Pump_New_Batch.txt and the whole model object is saved to
    ./result/Pump_New_Batch.pt.

    Parameters
    ----------
    table, name, timeformat
        Forwarded unchanged to `data_load`.
    model : nn.Module
        Model to train; it is updated in place and returned.
    batch_size : int
        Number of timestamps per window. Windows that load a different number
        of rows are skipped.
    epochs
        Iterable of epoch numbers that also provides `set_postfix_str`
        (a tqdm `trange` in this notebook).
    scaler
        Object with `fit_transform(X, min_values, max_values)`.
    Min, Max : DataFrame
        Scaling bounds; the first column of each is dropped before use
        (presumably the machine_status entry - confirm against set_minmax_value).
    time_df_train, time_df_valid : DataFrame
        Frames whose index lists the timestamps available for training / validation.

    Returns
    -------
    The model as it is after the last epoch (not necessarily the best checkpoint).

    Notes
    -----
    Relies on the notebook globals `optimizer`, `criterion` and `device`.
    """

    # Initialize training loss
    train_loss = []
    # Per-epoch accuracy; collected here but not returned
    train_acc = []

    # Initialize best F1 Score value
    best_f1= -np.inf

    # Start model training
    for epoch in epochs:
        
        model.train()

        running_loss = 0.0
        total_step = 0
        correct = 0
        total=0
        
        # Set initial start time
        start_time_ = str(time_df_train.index[0])

        # Set end time
        end_time_train = str(time_df_train.index[-1])  
        
        # Use a while loop to call data 
        # (timestamps are compared as strings)
        while start_time_ < end_time_train:
            
            # Set the time for loading data based on the batch size
            # (the returned times are URL-encoded)
            start_time_, end_time_, next_start_time_, index_next= update_time(time_df_train, start_time_, batch_size)
            
            # Load batch data 
            data = data_load(table, name, start_time_, end_time_, timeformat)
            
            # Apply Scaler
            # NOTE: the local name `train` shadows this function inside its own body
            train = scaler.fit_transform(data.iloc[:,:-1].values, Min.iloc[:,1:], Max.iloc[:,1:])
            
            # Set each DataFrames
            train = pd.DataFrame(train)  
            train['label'] = data.iloc[:,-1:].values
            
            # Print if the loaded data is empty 
            if len(train) == 0:
                print("No data available.")
                
            # Input the data into the model when it accumulates to the batch size
            if len(train) == batch_size:
                
                # Check total batch count  
                total_step = total_step + 1
                
                # Convert data to numpy arrays
                input_data = np.array(train.iloc[:,:-1])
                label = np.array(train.iloc[:,-1:])

                # Convert data to Tensor
                input_data = torch.tensor(input_data, dtype=torch.float32).to(device).float()
                label = torch.tensor(label).to(device).long().squeeze()

                # Clear the gradients left over from the previous step
                optimizer.zero_grad()
                
                # Input to the model
                outputs = model(input_data)
                
                # Calculate loss
                loss = criterion(outputs.squeeze(1), label)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
                
                # Set label predictions 
                _,pred = torch.max(outputs.squeeze(1), dim=1)
                correct += torch.sum(pred==label).item()
                total += label.size(0)
                
                # Reset batch data
                train = 0
                    
            # Set the next start time   
            # (decoded back to plain text for the string comparison in the loop condition)
            start_time_ = unquote(next_start_time_)

            # Prevent fetching beyond the last time
            # NOTE(review): this guard uses `- 1`, while the validation guard below does not - confirm which is intended
            if index_next + batch_size - 1 >= len(time_df_train):
                break
            
        # NOTE(review): both divisions raise ZeroDivisionError when no full batch was processed in this epoch
        train_acc.append(100 * correct / total)
        train_loss.append(running_loss/total_step)
        # NOTE(review): the printed "train loss" is the mean over all epochs so far, not this epoch's loss
        print(f'\ntrain loss: {np.mean(train_loss)}, train acc: {(100 * correct / total):.4f}')

        # Perform validation at the end of each epoch and save the model with the best performance
        with torch.no_grad():
            
            model.eval()
            
            preds_ = []
            targets_ = []
                
            # Set initial start time
            start_time_v = str(time_df_valid.index[0])
            
            # Set end time
            end_time_valid = str(time_df_valid.index[-1])
            
            # Use a while loop to call data 
            while start_time_v < end_time_valid:
                
                # Set the time for loading data based on the batch size
                start_time_v, end_time_v, next_start_time_v, index_next_v = update_time(time_df_valid, start_time_v, batch_size)
                
                # Load batch data 
                data_v = data_load(table, name, start_time_v, end_time_v, timeformat)
                
                # Apply Scaler
                test = scaler.fit_transform(data_v.iloc[:,:-1].values, Min.iloc[:,1:], Max.iloc[:,1:])
                
                # Set each DataFrames
                test = pd.DataFrame(test)  
                test['label'] = data_v.iloc[:,-1:].values
                
                # Print if the loaded data is empty 
                if len(test) == 0:
                    print("No data available.")
                    
                # Input the data into the model when it accumulates to the batch size
                if len(test) == batch_size:
                    
                    # Convert data to numpy arrays
                    input_data_v = np.array(test.iloc[:,:-1])
                    label_v = np.array(test.iloc[:,-1:])

                    # Convert data to Tensor
                    input_data_v = torch.tensor(input_data_v, dtype=torch.float32).to(device).float()
                    label_v = torch.tensor(label_v).to(device).long().squeeze()
                    
                    # Input to the model
                    outputs_v = model(input_data_v)
                    
                    # Set label predictions 
                    _,pred_v = torch.max(outputs_v.squeeze(1), dim=1)
                    target_v = label_v.view_as(pred_v)
      
                    preds_.append(pred_v)
                    targets_.append(target_v)
                    
                    # Reset batch data
                    test = 0
                    
                # Set the next start time    
                start_time_v = unquote(next_start_time_v)
                
                # Prevent fetching beyond the last time
                if index_next_v + batch_size >= len(time_df_valid):
                    break
                
            # Combine predictions and labels collected from all batches
            # NOTE(review): torch.cat is called on `preds_` even if no full validation batch was collected
            preds_v = torch.cat(preds_).detach().cpu().numpy()
            targets_v = torch.cat(targets_).detach().cpu().numpy()
            
            f1score = f1_score(targets_v, preds_v,  average='macro')
            if best_f1 < f1score:
                best_f1 = f1score
                # Save the best model 
                # The report file is opened in append mode, so earlier entries are kept
                with open("./result/Pump_New_Batch.txt", "a") as text_file:
                    print('epoch=====',epoch, file=text_file)
                    print(classification_report(targets_v, preds_v, digits=4), file=text_file)
                # Saves the whole model object, not only its state_dict
                torch.save(model, f'./result/Pump_New_Batch.pt') 
            epochs.set_postfix_str(f"epoch = {epoch},  f1_score = {f1score}, best_f1 = {best_f1}")
    
    return model

In [14]:
########################################### Training Parameter Settings ################################################
# Set the tag table name
table = 'pump'
# Set the tag names
# NOTE(review): this rebinds `name` from the tag-name list to a URL-encoded string;
# re-running the earlier `tags = name` cell after this one would pick up the string instead of the list
name = quote(tags_, safe=":/")
# Set the time format 
timeformat = 'default'
# Set the data start time
start_time_train = '2018-04-01 00:00:00'
# Set the data end time
end_time_train = '2018-05-16 21:35:00'
# Set batch size
batch_size = 1024
# Set number of epochs (tqdm progress bar over 50 epochs; train() iterates it directly)
epochs = trange(50, desc='training')
# Set Min, Max value (requested for the training window only)
Min, Max = set_minmax_value(table, name, start_time_train, end_time_train)
# Set scalers
scaler = MinMaxScaler_custom()
# Load training time list 
time_df_train = time_data_load(table, name, quote(start_time_train), quote(end_time_train), timeformat)
########################################### validation Parameter Settings ################################################
# Set the start time for the validation data (one minute after the training end time)
start_time_valid = '2018-05-16 21:36:00'
# Set the end time for the validation data
end_time_valid = '2018-06-28 17:44:00'
# Load validation time list
time_df_valid = time_data_load(table, name, quote(start_time_valid), quote(end_time_valid), timeformat)
########################################### Proceed with training ################################################
# `model` is rebound to the model returned by train(), i.e. its state after the last epoch;
# the best-scoring checkpoint is the one written to ./result/Pump_New_Batch.pt
model = train(table, name, timeformat, model, batch_size, epochs, scaler, Min, Max, time_df_train, time_df_valid)

training:   0%|          | 0/50 [00:00<?, ?it/s]


train loss: 0.35056771747963467, train acc: 92.2485


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



train loss: 0.3487319800161779, train acc: 93.8110

train loss: 0.24732729155411123, train acc: 98.0957

train loss: 0.19180987661599613, train acc: 99.1898

train loss: 0.1578121520184405, train acc: 99.2416

train loss: 0.1349038767120431, train acc: 99.2798

train loss: 0.11841244749880493, train acc: 99.2889

train loss: 0.10595870213087244, train acc: 99.2966

train loss: 0.0962023769350937, train acc: 99.3073

train loss: 0.08834125952772398, train acc: 99.3118

train loss: 0.08186073051937992, train acc: 99.3179

train loss: 0.07641937611039078, train acc: 99.3195

train loss: 0.07177855512392445, train acc: 99.3576

train loss: 0.06776840811235761, train acc: 99.3805

train loss: 0.0642656336406904, train acc: 99.4858

train loss: 0.061176571196932816, train acc: 99.5102

train loss: 0.05842968151304785, train acc: 99.5148

train loss: 0.05596924837409651, train acc: 99.5193

train loss: 0.0537508893426157, train acc: 99.5255

train loss: 0.05173884694265962, train acc: 99.530

In [21]:
def test(table, name, timeformat, model, batch_size, scaler,Min, Max, time_df_test):
    """Run `model` over the test period batch by batch and collect labels and predictions.

    The test time index is walked in windows of `batch_size` timestamps. Each
    window is loaded with `data_load`, scaled with `scaler`, and fed to the
    model as a 2-D `(batch_size, n_features)` tensor, the same layout used
    during training and validation. Windows that load a different number of
    rows than `batch_size` are skipped.

    Parameters
    ----------
    table, name, timeformat
        Forwarded unchanged to `data_load`.
    model : nn.Module
        Trained model; it is switched to eval mode.
    batch_size : int
        Number of timestamps per window.
    scaler
        Object with `fit_transform(X, min_values, max_values)`.
    Min, Max : DataFrame
        Scaling bounds; the first column of each is dropped before use.
    time_df_test : DataFrame
        Frame whose index lists the timestamps available for testing.

    Returns
    -------
    (targets, preds) : tuple of 1-D numpy arrays
        True labels and predicted class indices of all evaluated batches.
        Both are empty int64 arrays when no full batch could be evaluated.

    Notes
    -----
    Relies on the notebook global `device`.
    """
    model.eval()

    # Per-batch results, concatenated at the end
    preds_t = []
    targets_t = []

    # Set the initial start time
    start_time_t = str(time_df_test.index[0])

    # Set the end time
    end_time_test = str(time_df_test.index[-1])

    # The scaling bounds are the same for every batch, so slice them once
    min_values = Min.iloc[:,1:]
    max_values = Max.iloc[:,1:]

    # Inference only: no autograd graph is needed
    with torch.no_grad():

        # Timestamps are compared as strings
        while start_time_t < end_time_test:

            # Set the time for loading data based on the batch size (returned times are URL-encoded)
            start_time_t, end_time_t, next_start_time_t, index_next_t = update_time(time_df_test, start_time_t, batch_size)

            # Load batch data
            data = data_load(table, name, start_time_t, end_time_t, timeformat)

            # Scale every column except the trailing label column
            features = scaler.fit_transform(data.iloc[:,:-1].values, min_values, max_values)

            # Print if the loaded data is empty
            if len(features) == 0:
                print("No data available.")

            # Only full batches are evaluated
            if len(features) == batch_size:

                # Convert data to Tensor
                input_data_test = torch.tensor(np.asarray(features), dtype=torch.float32).to(device)
                target_t = torch.tensor(data.iloc[:,-1].values).to(device).long()

                # Input to the model
                outputs_t = model(input_data_test)

                # Predicted class = index of the largest score
                _, pred_t = torch.max(outputs_t, dim=1)

                preds_t.append(pred_t)
                targets_t.append(target_t.view_as(pred_t))

            # Set the next start time (decoded for the string comparison above)
            start_time_t = unquote(next_start_time_t)

            # Prevent fetching beyond the last time
            if index_next_t + batch_size  >= len(time_df_test):
                break

    # torch.cat rejects an empty list, so report "nothing evaluated" as empty arrays
    if not preds_t:
        empty = np.array([], dtype=np.int64)
        return empty, empty.copy()

    # Combine predictions and labels collected from all batches
    preds_t = torch.cat(preds_t).detach().cpu().numpy()
    targets_t = torch.cat(targets_t).detach().cpu().numpy()

    return targets_t, preds_t

In [24]:
########################################### Test Parameter Settings ################################################
# Load the best model
# The checkpoint was written with torch.save(model, ...), i.e. it is a fully pickled module:
#  - weights_only=False is required on PyTorch >= 2.6, where torch.load defaults to weights_only=True
#    and refuses to unpickle whole modules (the keyword exists since PyTorch 1.13).
#    Only load checkpoints you created yourself this way; unpickling can execute arbitrary code.
#  - map_location=device lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
model_ = torch.load(f'./result/Pump_New_Batch.pt', map_location=device, weights_only=False)
# Set the start time for the test data (one minute after the validation end time)
start_time_test = '2018-06-28 17:45:00'
# Set the end time for the test data
end_time_test = '2018-08-31 23:59:00'
# Set batch size
batch_size = 1024
# Load the test time list
time_df_test = time_data_load(table, name, quote(start_time_test), quote(end_time_test), timeformat)
######################################## Proceed with testing #############################################
# Reuses the scaler and the training-period Min/Max bounds from the training cell
targets_t, preds_t = test(table, name, timeformat, model_, batch_size, scaler, Min, Max, time_df_test)

## Model Performance Evaluation

In [25]:
# Print per-class precision, recall and F1 score for the test-set predictions
print(classification_report(targets_t, preds_t))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     83651
           1       0.94      0.95      0.94      8509

    accuracy                           0.99     92160
   macro avg       0.97      0.97      0.97     92160
weighted avg       0.99      0.99      0.99     92160

