## Import Necessary Libraries

In [1]:
## Import necessary libraries
import pandas as pd
import numpy as np
import random 
from urllib.parse import quote, unquote
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

## Import libraries for the model
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import trange
from sklearn.metrics import f1_score, classification_report

## Set path for saving model training results  
import os
os.makedirs('./result', exist_ok=True)

## Set Cuda for computation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

## Set random seed
def set_seed(seed_val):
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

# Set seed
seed_val = 77
set_seed(seed_val)

cuda


## Selecting Data Columns
* Tag names are loaded in sequential order.
* The process of selecting the required tag names from the tag name list.

In [2]:
# Function to display tag names
def show_column(URL):
    
    # Load tag name data
    df = pd.read_csv(URL)
    
    # Convert to list format
    df = df.values.reshape(-1)
    
    return df.tolist()

In [3]:
## Set parameters for displaying tag names
table = 'pump'

NAME_URL = f'http://127.0.0.1:5654/db/tql/datahub/api/v1/get_tag_names.tql?table={table}'

## Generate tag name list 
name = show_column(NAME_URL)

In [4]:
name

['machine_status',
 'sensor_01',
 'sensor_02',
 'sensor_03',
 'sensor_04',
 'sensor_05',
 'sensor_10',
 'sensor_11',
 'sensor_12',
 'sensor_13',
 'sensor_14',
 'sensor_16',
 'sensor_17',
 'sensor_18',
 'sensor_19',
 'sensor_20',
 'sensor_21',
 'sensor_22',
 'sensor_23',
 'sensor_24',
 'sensor_25',
 'sensor_26',
 'sensor_27',
 'sensor_28',
 'sensor_29',
 'sensor_30',
 'sensor_31',
 'sensor_32',
 'sensor_33',
 'sensor_34',
 'sensor_35',
 'sensor_36',
 'sensor_37',
 'sensor_38',
 'sensor_39',
 'sensor_40',
 'sensor_41',
 'sensor_42',
 'sensor_43',
 'sensor_44',
 'sensor_45',
 'sensor_46',
 'sensor_47',
 'sensor_48',
 'sensor_49']

## Converting TAG Name Format
* After checking all the Tag Names from the pump dataset in the previous step, extract only the columns to be used and convert them into parameter format.
* Use all tag names

In [5]:
# Set the desired tag names
tags = name

# Wrap each item in the list with single quotes and separate with commas
tags_ = ",".join(f"'{tag}'" for tag in tags)

# Check the selected tag names
print(tags_)

'machine_status','sensor_01','sensor_02','sensor_03','sensor_04','sensor_05','sensor_10','sensor_11','sensor_12','sensor_13','sensor_14','sensor_16','sensor_17','sensor_18','sensor_19','sensor_20','sensor_21','sensor_22','sensor_23','sensor_24','sensor_25','sensor_26','sensor_27','sensor_28','sensor_29','sensor_30','sensor_31','sensor_32','sensor_33','sensor_34','sensor_35','sensor_36','sensor_37','sensor_38','sensor_39','sensor_40','sensor_41','sensor_42','sensor_43','sensor_44','sensor_45','sensor_46','sensor_47','sensor_48','sensor_49'


## Load Pump Sensor Dataset
* Load the data using all tag name

In [6]:
# Data loading parameter settings

# Set the tag table name
table = 'pump'
# Set the tag names
name = quote(tags_, safe=":/")
# Set the time format 
timeformat = 'default'
# Set the data start time
start_time = quote('2018-04-01 00:00:00')
# Set the data end time
end_time = quote('2018-09-01 00:00:00')

In [7]:
# Data loading function
def data_load(table, name, start_time, end_time, timeformat):
    
    # Load data 
    df = pd.read_csv(f'http://127.0.0.1:5654/db/tql/datahub/api/v1/select-rawdata.tql?table={table}&name={name}&start={start_time}&end={end_time}&timeformat={timeformat}')

    # Convert to data grouped by the time
    df = df.pivot_table(index='TIME', columns='NAME', values='VALUE', aggfunc='first').reset_index()

    # Set TIME column
    df['TIME'] = pd.to_datetime(df['TIME'])

    # Set time index
    df.set_index('TIME', inplace=True)
    
    # Move the machine_status column to the end and rename it to label
    df['machine_status'] = df.pop('machine_status')
    df.rename(columns={'machine_status': 'label'}, inplace=True)
    
    # Convert label column data to integer type
    df['label'] = df['label'].astype(int)
    
    return df

In [8]:
# Load data
df = data_load(table, name, start_time, end_time, timeformat)
df

NAME,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,...,sensor_41,sensor_42,sensor_43,sensor_44,sensor_45,sensor_46,sensor_47,sensor_48,sensor_49,label
TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-04-01 00:00:00,47.09201,53.211800,46.310760,634.375000,76.45975,37.22740,47.52422,31.11716,1.681353,419.5747,...,30.989580,31.770832,41.92708,39.641200,65.68287,50.92593,38.194440,157.9861,67.70834,0
2018-04-01 00:01:00,47.09201,53.211800,46.310760,634.375000,76.45975,37.22740,47.52422,31.11716,1.681353,419.5747,...,30.989580,31.770832,41.92708,39.641200,65.68287,50.92593,38.194440,157.9861,67.70834,0
2018-04-01 00:02:00,47.35243,53.211800,46.397570,638.888900,73.54598,37.86777,48.17723,32.08894,1.708474,420.8480,...,30.468750,31.770830,41.66666,39.351852,65.39352,51.21528,38.194443,155.9606,67.12963,0
2018-04-01 00:03:00,47.09201,53.168400,46.397568,628.125000,76.98898,38.57977,48.65607,31.67221,1.579427,420.7494,...,30.468750,31.510420,40.88541,39.062500,64.81481,51.21528,38.194440,155.9606,66.84028,0
2018-04-01 00:04:00,47.13541,53.211800,46.397568,636.458300,76.58897,39.48939,49.06298,31.95202,1.683831,419.8926,...,30.989580,31.510420,41.40625,38.773150,65.10416,51.79398,38.773150,158.2755,66.55093,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-08-31 23:55:00,47.69965,50.520830,43.142361,634.722229,64.59095,43.17085,54.16052,38.05424,13.265320,420.7993,...,30.468750,30.208330,38.28125,68.287030,52.37268,48.32176,41.087960,212.3843,153.64580,0
2018-08-31 23:56:00,47.69965,50.564240,43.142361,630.902771,65.83363,43.21038,54.52602,38.53485,13.242270,422.1567,...,30.208332,29.947920,38.28125,66.840280,50.63657,48.03241,40.798610,213.8310,156.25000,0
2018-08-31 23:57:00,47.69965,50.520830,43.142361,625.925903,67.29445,43.12836,55.11779,38.52678,13.188660,420.2166,...,29.947920,30.208330,39.06250,65.393520,48.90046,48.03241,40.798610,217.3032,155.38190,0
2018-08-31 23:58:00,47.69965,50.520832,43.142361,635.648100,65.09175,42.35746,55.99321,38.89159,13.173460,420.5700,...,29.947916,30.208332,40.62500,64.236110,47.74306,48.32176,40.509258,222.5116,153.93520,0


In [9]:
# Split the data into train, test sets
train, test = train_test_split(df, test_size=0.7, shuffle=False)
valid, test = train_test_split(test, test_size=0.6, shuffle=False)

train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)
test = test.reset_index(drop=True)

print(train['label'].value_counts())
print(valid['label'].value_counts())
print(test['label'].value_counts())

label
0    62040
1     4056
Name: count, dtype: int64
label
0    59770
1     1919
Name: count, dtype: int64
label
0    84026
1     8509
Name: count, dtype: int64


## Data Preprocessing
* 1 Min-Max Scaling

### 1. Applying Min-Max Scaling

In [10]:
# Scaler Setup
scaler = MinMaxScaler()

# Apply Scaler
train_ = scaler.fit_transform(train.iloc[:,:-1].values)
valid_ = scaler.transform(valid.iloc[:,:-1].values)
test_ = scaler.transform(test.iloc[:,:-1].values)

# Set DataFrames
train_scaled = pd.DataFrame(train_)
valid_scaled = pd.DataFrame(valid_)
test_scaled = pd.DataFrame(test_)

# Add labels
train_scaled['label'] = train['label'].values
valid_scaled['label'] = valid['label'].values
test_scaled['label'] = test['label'].values

print(train_scaled['label'].value_counts())
print(valid_scaled['label'].value_counts())
print(test_scaled['label'].value_counts())

label
0    62040
1     4056
Name: count, dtype: int64
label
0    59770
1     1919
Name: count, dtype: int64
label
0    84026
1     8509
Name: count, dtype: int64


## Dataset & Loader Setup

In [11]:
class Pump_Dataset(Dataset):

    def __init__(self, df):
        self.freq_data = df.iloc[:,:-1]
        self.label = df.iloc[:,-1:].squeeze()

    def __len__(self):
        return len(self.freq_data)

    def __getitem__(self, index):

        input_time_data = self.freq_data.iloc[index,:]
        input_time_data = torch.Tensor(input_time_data).expand(1, input_time_data.shape[0])
        label = self.label[index]

        return input_time_data, label

In [12]:
# Set up datasets  
train_ = Pump_Dataset(train_scaled)
valid_ = Pump_Dataset(valid_scaled)
test_ = Pump_Dataset(test_scaled)

# Set up data loaders
train_dataloader = DataLoader(train_, batch_size=1024, shuffle=False)
valid_dataloader = DataLoader(valid_, batch_size=1024, shuffle=False)
test_dataloader = DataLoader(test_, batch_size=1, shuffle=False)

## Model Configuration
* Using Linear_classifier model

In [13]:
class Linear_classifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Linear_classifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [14]:
# Model configuration parameters
# Learning rate
learning_rate = 0.01
input_dim = 44  
hidden_dim = 20 
output_dim = 2  

# Model configuration
model = Linear_classifier(input_dim, hidden_dim, output_dim).to(device)

# Configure loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Check the model architecture
print(model)

Linear_classifier(
  (fc1): Linear(in_features=44, out_features=20, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=20, out_features=2, bias=True)
)


## Model Training

In [15]:
# Initialize training loss
train_loss = []
# Initialize training accuracy
train_acc = []
# Initialize total step
total_step = len(train_dataloader)
# Set number of epochs
epoch_in = trange(50, desc='training')
# Initialize best F1 Score value
best_f1= 0

# Start model training
for epoch in epoch_in:
    model.train()
    running_loss = 0.0
    correct = 0
    total=0

    preds_ = []
    targets_ = []

    for batch_idx, train_data in enumerate(train_dataloader):

        inputs = train_data[0].to(device).float()
        labels = train_data[1].to(device).long().squeeze()

        optimizer.zero_grad()

        # Input to the model
        outputs = model(inputs)
        
        # Calculate loss
        loss = criterion(outputs.squeeze(1), labels)
        
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        
        # Set label predictions 
        _,pred = torch.max(outputs.squeeze(1), dim=1)
        correct += torch.sum(pred==labels).item()
        total += labels.size(0)
        
    train_acc.append(100 * correct / total)
    train_loss.append(running_loss/total_step)
    print(f'\ntrain loss: {np.mean(train_loss)}, train acc: {(100 * correct / total):.4f}')
    
    # Perform validation at the end of each epoch and save the model with the best performance
    with torch.no_grad():
        model.eval()
        
        for batch_idx, valid_data in enumerate(valid_dataloader):

            inputs_v = valid_data[0].to(device).float()
            labels_v = valid_data[1].to(device).long().squeeze() 
            
            outputs_v = model(inputs_v)
            
            # Set label predictions
            _,pred_v = torch.max(outputs_v.squeeze(1), dim=1)
            target_v = labels_v.view_as(pred_v)
            
            preds_.append(pred_v)
            targets_.append(target_v)
            
        # Combine predictions and labels collected from all batches
        preds_ = torch.cat(preds_).detach().cpu().numpy()
        targets_ = torch.cat(targets_).detach().cpu().numpy()
        
        f1score = f1_score(targets_, preds_,  average='macro')
        if best_f1 < f1score:
            best_f1 = f1score
            # Save the best model 
            with open("./result/Pump_General.txt", "a") as text_file:
                print('epoch=====',epoch, file=text_file)
                print(classification_report(targets_, preds_, digits=4), file=text_file)
            torch.save(model, f'./result/Pump_General.pt') 
        epoch_in.set_postfix_str(f"epoch = {epoch},  f1_score = {f1score}, best_f1 = {best_f1}")

training:   0%|          | 0/50 [00:00<?, ?it/s]


train loss: 0.345174629738148, train acc: 92.3142


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



train loss: 0.3441005078194503, train acc: 93.8635

train loss: 0.2440626430092965, train acc: 98.1103

train loss: 0.18924513601734835, train acc: 99.1966

train loss: 0.15569150837618664, train acc: 99.2481

train loss: 0.13307975759512153, train acc: 99.2798

train loss: 0.11680572741508503, train acc: 99.2950

train loss: 0.10451599298107193, train acc: 99.3025

train loss: 0.09488818320206942, train acc: 99.3131

train loss: 0.08713086775053022, train acc: 99.3177

train loss: 0.08073631828077782, train acc: 99.3237

train loss: 0.0753668083899351, train acc: 99.3252

train loss: 0.0707867928465382, train acc: 99.3646

train loss: 0.06682989109707622, train acc: 99.3812

train loss: 0.06337338743855916, train acc: 99.4916

train loss: 0.060325075123413105, train acc: 99.5143

train loss: 0.057614489998614904, train acc: 99.5189

train loss: 0.05518665000191882, train acc: 99.5234

train loss: 0.05299769046074663, train acc: 99.5295

train loss: 0.05101220460083862, train acc: 99.

## Model Testing

In [16]:
# Load the best model
model_ = torch.load(f'./result/Pump_General.pt') 

In [17]:
# Model testing
preds_test = []
target_test = []
with torch.no_grad():
    model_.eval()
    for batch_idx, test_data in enumerate(test_dataloader):
        inputs_t = test_data[0].to(device).float()
        labels_t = test_data[1].to(device).long().squeeze() 
        
        outputs_t = model_(inputs_t)
        
        _,pred_t = torch.max(outputs_t.squeeze(1), dim=1)
        targets_t = labels_t.view_as(pred_t).to(device)

        preds_test.append(pred_t)
        target_test.append(targets_t)
        
    # Combine predictions and labels collected from all batches
    preds_test = torch.cat(preds_test).detach().cpu().numpy()
    target_test = torch.cat(target_test).detach().cpu().numpy()

## Model Performance Evaluation

In [18]:
print(classification_report(target_test, preds_test))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     84026
           1       0.94      0.95      0.94      8509

    accuracy                           0.99     92535
   macro avg       0.97      0.97      0.97     92535
weighted avg       0.99      0.99      0.99     92535

