# Bearing Anomaly Detection

## Import Necessary Libraries

In [1]:
## Import necessary libraries
import pandas as pd
import numpy as np
import random 
from urllib.parse import quote, unquote
from datetime import timedelta
from scipy.fftpack import fft
from sklearn.decomposition import PCA

## Import libraries for the model
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm.notebook import trange
import statistics
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score, classification_report

## Set path for saving model training results
import os
os.makedirs('./result', exist_ok=True)

## Set Cuda for computation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

## Set random seed
def set_seed(seed_val):
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

# Set seed 
seed_val = 77
set_seed(seed_val)

cuda


## Selecting Data Columns
* Tag names are loaded in sequential order.
* The process of selecting the required tag names from the tag name list.

In [2]:
# Function to display tag names
def show_column(URL):
    
    # Load tag name data
    df = pd.read_csv(URL)
    
    # Convert to list format
    df = df.values.reshape(-1)
    
    return df.tolist()

In [3]:
## Set parameters for displaying tag names
table = 'bearing'

NAME_URL = f'http://127.0.0.1:5654/db/tql/datahub/api/v1/get-tag-names.tql?table={table}'

## Generate tag name list
name = show_column(NAME_URL)

In [4]:
name

['s1-c1',
 's1-c2',
 's1-c3',
 's1-c4',
 's1-c5',
 's1-c6',
 's1-c7',
 's1-c8',
 's2-c1',
 's2-c2',
 's2-c3',
 's2-c4',
 's3-c1',
 's3-c2',
 's3-c3',
 's3-c4']

## Converting TAG Name Format
* After checking all the Tag Names from the Nasa bearing dataset in the previous step, extract only the columns to be used and convert them into parameter format.
* Use tag names related to the s1-c5

In [5]:
# Set the desired tag names
tags = ['s1-c5']

# Wrap each item in the list with single quotes and separate with commas
tags_ = ",".join(f"'{tag}'" for tag in tags)

# Check the selected tag names
print(tags_)

's1-c5'


## Loading Nasa Bearing Dataset
* Load the train, validation, and test datasets separately when loading the data.
* The example focuses on anomaly detection for the 3rd bearing -> using 's1-c5' as the Tag Name.
* Label all states except for the faulty condition as normal for the labeling process.

In [6]:
# Set the status of each bearing based on time ranges
B1 ={
    "early" : ["2003-10-22 12:06:24" , "2003-10-23 09:14:13"],
    "suspect" : ["2003-10-23 09:24:13" , "2003-11-08 12:11:44"],
    "normal" : ["2003-11-08 12:21:44" , "2003-11-19 21:06:07"],
    "suspect_1" : ["2003-11-19 21:16:07" , "2003-11-24 20:47:32"],
    "imminent_failure" : ["2003-11-24 20:57:32","2003-11-25 23:39:56"]
}
B2 = {
    "early" : ["2003-10-22 12:06:24" , "2003-11-01 21:41:44"],
    "normal" : ["2003-11-01 21:51:44" , "2003-11-24 01:01:24"],
    "suspect" : ["2003-11-24 01:11:24" , "2003-11-25 10:47:32"],
    "imminient_failure" : ["2003-11-25 10:57:32" , "2003-11-25 23:39:56"]
}

B3 = {
    "early" : ["2003-10-22 12:06:24" , "2003-11-01 21:41:44"],
    "normal" : ["2003-11-01 21:51:44" , "2003-11-22 09:16:56"],
    "suspect" : ["2003-11-22 09:26:56" , "2003-11-25 10:47:32"],
    "Inner_race_failure" : ["2003-11-25 10:57:32" , "2003-11-25 23:39:56"]
}

B4 = {
    "early" : ["2003-10-22 12:06:24" , "2003-10-29 21:39:46"],
    "normal" : ["2003-10-29 21:49:46" , "2003-11-15 05:08:46"],
    "suspect" : ["2003-11-15 05:18:46" , "2003-11-18 19:12:30"],
    "Rolling_element_failure" : ["2003-11-19 09:06:09" , "2003-11-22 17:36:56"],
    "Stage_two_failure" : ["2003-11-22 17:46:56" , "2003-11-25 23:39:56"]
}

In [7]:
# Data loading function
def data_load(table, name, start_time, end_time, timeformat):
    
    # Load data  
    df = pd.read_csv(f'http://127.0.0.1:5654/db/tql/datahub/api/v1/select-rawdata.tql?table={table}&name={name}&start={start_time}&end={end_time}&timeformat={timeformat}')
    
    # Convert to data grouped by the time
    df = df.pivot_table(index='TIME', columns='NAME', values='VALUE', aggfunc='first').reset_index()
    
    # Set TIME column
    df['TIME'] = pd.to_datetime(df['TIME'], format='%Y-%m-%d %H:%M:%S.%f')

    # Group by TIME
    # Round to the nearest second
    filtered_data = df.copy()
    filtered_data.loc[:, 'TIME'] = filtered_data['TIME'].dt.floor('S')
    grouped = filtered_data.groupby('TIME')['s1-c5'].apply(list).reset_index()

    # Split the list into individual columns
    s1_c5_df = pd.DataFrame(grouped['s1-c5'].tolist())

    # Merge with the 'TIME' column
    result_df = pd.concat([grouped[['TIME']], s1_c5_df], axis=1)
    
    # Set labels
    # Assign labels based on abnormal time ranges for each channel data
    result_df['label'] = np.where((result_df['TIME'] >= "2003-11-25 10:57:32") & (result_df['TIME'] <= "2003-11-25 23:39:56"), 1, 0)
    
    result_df = result_df.drop(['TIME'], axis=1)
    
    result_df = result_df.dropna()
    
    return result_df

In [8]:
# Data time loading function
def time_data_load(table, name, start_time, end_time, timeformat):
    
    target = 'time'
    
    # Load the data  
    df = pd.read_csv(f"http://127.0.0.1:5654/db/tql/datahub/api/v1/select-rawdata.tql?target={target}&table={table}&name={name}&start={start_time}&end={end_time}&timeformat={timeformat}")
    
    # Create a dummy value column for resampling
    df['value'] = 0
    
    # Perform resampling
    df['time'] = pd.to_datetime(df['time'])
    df.set_index('time', inplace=True)
    df = df.resample('1s').mean()
    
    # Remove missing values
    df = df.dropna()
    
    # Remove the dummy value column
    df = df.drop(['value'], axis=1)
    
    return df

In [9]:
# Time update function
# Update start and end times based on batch size
# window_size: The period over which data is collected in batches
# step_size: The interval between the data points
def update_time(time_df, start_time, batch_size, window_size, step_size):
    
    # Calculate how many data points need to be loaded
    time = (batch_size * step_size)+ window_size - step_size - 1
    
    # Check the index number of the current time
    # If not found, set to the first index as there is no data for the current time
    try:
        index_now = time_df.index.get_loc(start_time)
    except KeyError:
        index_now = 0
    
    # Set the end time for the batch data based on the current time 
    end_time_ = str(time_df.index[index_now + time] + timedelta(seconds=1))
    
    # Set the index number for the next start time
    index_next = index_now + time - abs(window_size - step_size - 1)
    
    # Set the next start time
    next_start_time_ = str(time_df.index[index_next])
    
    # URL encoding
    start_time_ = quote(start_time)
    end_time_ = quote(end_time_)
    next_start_time_ = quote(next_start_time_)
    
    return start_time_, end_time_, next_start_time_, index_next

In [10]:
# Function to calculate the maximum and minimum values for selected tag names
def set_minmax_value(table, name, start_time_train, end_time_train):
    
    # URL encoding
    start = quote(start_time_train)
    end = quote(end_time_train)
    
    # Load Min, Max data
    df_ = pd.read_csv(f'http://127.0.0.1:5654/db/tql/datahub/api/v1/select-scale.tql?table={table}&name={name}&start={start}&end={end}')
    
    # Set Min, Max values
    Min = df_.iloc[:,1:-1].T
    Max = df_.iloc[:,2:].T
    
    return Min, Max  

## Data Preprocessing

* 1 hanning window
* 2 FFT
* 3 MinMax Scaling
* 4 PCA -> Apply during training
* 5 Window sliding

### 1. Hanning Window Setup

In [11]:
# Hanning window function setup 
def set_hanning_window(sample_rate, df):
    
    # Generate Hanning window
    hanning_window = np.hanning(sample_rate)

    # Apply Hanning window to each row
    df_windowed = df.multiply(hanning_window, axis=1)
    
    return df_windowed

### 2. FFT (Fast Fourier Transform) Setup

In [12]:
# FFT transformation function
def change_fft(sample_rate, df):
    # Total number of samples in the signal
    N = sample_rate
    
    fft_results = np.zeros((df.shape[0], N // 2 + 1), dtype=float)
    
    # Apply FFT to each row
    for i in range(df.shape[0]):
        
        # Calculate FFT for each row
        yf = fft(df.iloc[i].values)
        
        # Compute the absolute value of the FFT results and normalize (only the meaningful part)
        fft_results[i] = 2.0 / N * np.abs(yf[:N // 2 + 1])
    
    # Convert FFT results to a DataFrame
    fft_df = pd.DataFrame(fft_results)
    
    return fft_df

### 3. Min-Max Scaling Setup
* Set up a Min-Max Scaler that uses the maximum and minimum values, as the entire dataset is not loaded due to the process concept.

In [13]:
# Definition of the MinMaxScaler class
class MinMaxScaler_custom:
    def __init__(self):
        self.min_ = None
        self.max_ = None

    # Set scale values based on the specified parameters
    def transform(self, X, min_values, max_values):
        X = np.array(X)
        self.min_ = np.array(min_values)
        self.max_ = np.array(max_values)
        
        if self.min_ is None or self.max_ is None:
            raise ValueError("Min and Max values are not set.")
        
        # Add 1e-6 to prevent the scale value from becoming 0
        scale = (self.max_ - self.min_) + 1e-6
        if np.any(scale == 0):
            raise ValueError("Min and Max values are the same, resulting in a scale of 0.")
        
        return (X - self.min_) / scale
    
    # Normalize data based on calculated scale values
    def fit_transform(self, X, min_values, max_values):
        """Set parameters and then transform X"""
        return self.transform(X, min_values, max_values)

    # Inverse the normalized data back to original values
    def inverse_transform(self, X_scaled):
        """Inverse the transformation and return original values"""
        if self.min_ is None or self.max_ is None:
            raise ValueError("Min and Max values are not set.")
        
        X_scaled = np.array(X_scaled)
        scale = (self.max_ - self.min_) + 1e-6
        
        return X_scaled * scale + self.min_

### 5. Window sliding setup
* Window size: Determines how many time points to group together.
* Step size: The time interval by which the window moves.

In [14]:
def make_window(data, window_size, step_size):
    
    # List to store the results of sliding windows
    windows = []
    labels = []
    
    # Apply sliding window
    for i in range(0, data.shape[0] - window_size + 1, step_size):
        window = data.iloc[i:i + window_size, :-1].values
        label_array = data.iloc[i:i + window_size, -1].values
        
        # Set the label to 1 if there is any abnormal value in the label array
        if (label_array == 1).any():
            label = 1  
        else:
            label = 0
        
        windows.append(window)
        labels.append([label])
        
    return windows, labels

## Model Configuration
* Using LSTM AE model.

In [15]:
# LSTM Autoencoder class definition
class LSTMAutoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers):
        super(LSTMAutoencoder, self).__init__()
        
        # Encoder LSTM
        self.encoder_lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.encoder_fc = nn.Linear(hidden_dim, 2*hidden_dim)
        
        # Decoder LSTM
        self.decoder_fc = nn.Linear(2*hidden_dim, hidden_dim)
        self.decoder_lstm = nn.LSTM(hidden_dim, input_dim, num_layers, batch_first=True)

    def forward(self, x):
        # Encoder part
        _, (h, _) = self.encoder_lstm(x)
        latent = self.encoder_fc(h[-1])
        
        # Decoder part
        hidden = self.decoder_fc(latent).unsqueeze(0).repeat(x.size(1), 1, 1).permute(1, 0, 2)
        output, _ = self.decoder_lstm(hidden)
        
        return output

In [16]:
# Model configuration parameters

# number of input data columns
input_dim = 31

# LSMT hidden state size
hidden_dim = 15

# layer size
num_layers = 3

# Learning rate
learning_rate = 0.01

# Model configuration
model = LSTMAutoencoder(input_dim, hidden_dim, num_layers).to(device)

# Configure loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Check the model architecture
print(model)

LSTMAutoencoder(
  (encoder_lstm): LSTM(31, 15, num_layers=3, batch_first=True)
  (encoder_fc): Linear(in_features=15, out_features=30, bias=True)
  (decoder_fc): Linear(in_features=30, out_features=15, bias=True)
  (decoder_lstm): LSTM(15, 31, num_layers=3, batch_first=True)
)


## Model Training

* Save the model with the Best Loss based on the training data during training.

In [17]:
# Model training function
def train(table, name, timeformat, model, batch_size, window_size, step_size, epochs, sample_rate, Min, Max, scaler, pca, time_df_train):
        
    # Initialize training loss
    train_loss = []

    # Initialize Best Loss value
    best_Loss=np.inf

    for epoch in epochs:
        
        model.train()

        running_loss = 0.0
        total_step = 0
        
        # Set initial start time
        start_time_ = str(time_df_train.index[0])
        
        # Set end time
        end_time_train = str(time_df_train.index[-1])

        # Use a while loop to call data 
        while start_time_ < end_time_train:
            
            # Set the time for loading data based on the batch size
            start_time_, end_time_, next_start_time_, index_next= update_time(time_df_train, start_time_, batch_size, window_size, step_size)
            
            # Load batch data 
            data = data_load(table, name, start_time_, end_time_, timeformat)
            
            # Apply hanning window
            data_ = set_hanning_window(sample_rate, data.iloc[:,:-1])
            
            # Apply FFT
            data_  = change_fft(sample_rate, data_ )
            
            # Apply MinMax scaler
            data_ = scaler.fit_transform(data_, Min, Max)
            
            # Apply PCA
            data_ = pca.fit_transform(data_)
            
            # Setting up DataFrame + label
            data_ = pd.DataFrame(data_)
            data_['label'] = data['label'].values

            # Set window 
            windows, labels = make_window(data_ , window_size, step_size)
            
            # Print if the loaded data is empty 
            if len(data_) == 0:
                print("No data available.")
            
            # Input the data into the model when it accumulates to the batch size
            if len(windows) == batch_size:
                
                # Check total batch count  
                total_step = total_step + 1
                
                # Convert data to numpy arrays
                input_data = np.array(windows)

                # Convert data to Tensor
                input_data = torch.tensor(input_data, dtype=torch.float32).to(device).float()

                # Optimize the optimizer
                optimizer.zero_grad()
                
                # Input to the model
                outputs = model(input_data)
                
                # Calculate loss
                loss = criterion(outputs, input_data)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
                
                # Reset batch data
                windows = []
            
            # Set the next start time   
            start_time_ = unquote(next_start_time_)
            
            # Prevent fetching beyond the last time
            if index_next + (batch_size * step_size)+ window_size - step_size - 1 >= len(time_df_train):
                break
        
        if total_step > 0:
            train_loss.append(running_loss / total_step)
            print(f'\ntrain loss: {np.mean(train_loss)}')
            
        # best model save     
        if best_Loss > np.mean(train_loss):
            best_Loss = np.mean(train_loss)
            torch.save(model, f'./result/Nasa_Bearing_LSTM_AE_New_batch.pt')
            print('Save the best model.') 
            
        epochs.set_postfix_str(f"epoch = {epoch}, best_Loss = {best_Loss}")
               
    return model

In [18]:
########################################### Training Parameter Settings ################################################
# Set tag table name
table = 'bearing'
# Set tag name
name = quote(tags_, safe=":/")
# Set the start time for the train data
start_time_train = '2003-10-22 12:06:24'
# Set the end time for the train data
end_time_train = '2003-11-22 00:00:00'
# Set time format
timeformat = quote('2006-01-02 15:04:05.000000')
# Set batch size
batch_size = 32
# Set window size
window_size = 3
# Set step size -> it must be less than or equal to the window size
step_size = 1
# Set number of epochs
epochs = trange(5, desc='training')
# Set Maximum and Minimum Values 
Min, Max = set_minmax_value(table, name, start_time_train, end_time_train)
# Set sample rate
sample_rate = 20480
# Set Min-Max scaler
scaler = MinMaxScaler_custom()
# Set PCA
# Select principal components that explain 95% of the variance
pca = PCA(n_components=31)
# Load training time list 
time_df_train = time_data_load(table, name, quote(start_time_train), quote(end_time_train), timeformat)

########################################### Proceed with training ################################################
train(table, name, timeformat, model, batch_size, window_size, step_size, epochs, sample_rate, Min, Max, scaler, pca, time_df_train)

training:   0%|          | 0/5 [00:00<?, ?it/s]


train loss: 0.0001948600776426314
Save the best model.

train loss: 0.00010036500810252214
Save the best model.

train loss: 6.882995255440574e-05
Save the best model.

train loss: 5.305689636732798e-05
Save the best model.

train loss: 4.359030993631071e-05
Save the best model.


LSTMAutoencoder(
  (encoder_lstm): LSTM(31, 15, num_layers=3, batch_first=True)
  (encoder_fc): Linear(in_features=15, out_features=30, bias=True)
  (decoder_fc): Linear(in_features=30, out_features=15, bias=True)
  (decoder_lstm): LSTM(15, 31, num_layers=3, batch_first=True)
)

## Threshold Setting
* Calculate the threshold using validation data
  * Max + K × Standard Deviation

In [19]:
# Model validation function
def threshold_set(table, name, timeformat, model, batch_size, window_size, step_size, k, sample_rate, Min, Max, scaler,time_df_valid):
    
    # Initialize validation loss
    valid_loss = []
    with torch.no_grad():
        
        model.eval()
         
        # Set initial start time
        start_time_v = str(time_df_valid.index[0])
        
        # Set end time
        end_time_valid = str(time_df_valid.index[-1])
        
        # Use a while loop to call data 
        while start_time_v < end_time_valid:
            
            # Set the time for loading data based on the batch size
            start_time_v, end_time_v, next_start_time_v, index_next_v = update_time(time_df_valid, start_time_v, batch_size, window_size, step_size)
            
            # Load batch data 
            data_v = data_load(table, name, start_time_v, end_time_v, timeformat)
            
            # Apply hanning window
            data_ = set_hanning_window(sample_rate, data_v.iloc[:,:-1])
            
            # Apply FFT
            data_  = change_fft(sample_rate, data_ )
            
            # Apply MinMax scaler 
            data_ = scaler.fit_transform(data_, Min, Max)
            
            # Apply PCA
            data_ = pca.fit_transform(data_)
            
            # Setting up DataFrame + label
            data_ = pd.DataFrame(data_)
            data_['label'] = data_v['label'].values

            # Set window 
            windows_v, labels_v = make_window(data_ , window_size, step_size)
            
            # Print if the loaded data is empty 
            if len(data_) == 0:
                print("No data available.")
            
            # Input the data into the model when it accumulates to the batch size
            if len(windows_v) == batch_size:
                
                # Convert data to numpy arrays
                input_data_val = np.array(windows_v)
                
                # Convert data to Tensor
                input_data_val = torch.tensor(input_data_val, dtype=torch.float32).to(device).float()

                # Input to the model
                outputs_val = model(input_data_val)
                
                # Calculate loss
                loss_val = criterion(outputs_val, input_data_val)
            
                valid_loss.append(loss_val.item())
                
                # Reset batch data
                windows_v = []
                
            # Set the next start time  
            start_time_v = unquote(next_start_time_v)
            
            # Prevent fetching beyond the last time
            if index_next_v + (batch_size * step_size)+ window_size - step_size - 1 >= len(time_df_valid):
                break
            
            
        # Calculate the threshold
        threshold =  max(valid_loss) + k * statistics.stdev(valid_loss)
      
                    
    return threshold

In [20]:
########################################### validation Parameter Settings ################################################
# Load the best model
model_ = torch.load(f'./result/Nasa_Bearing_LSTM_AE_New_batch.pt') 
# Set the start time for the validation data
start_time_valid = '2003-11-22 00:00:00'
# Set the end time for the validation data
end_time_valid = '2003-11-23 13:00:00'
# Set the threshold parameters
k = 10
# Load validation time list
time_df_valid = time_data_load(table, name, quote(start_time_valid), quote(end_time_valid), timeformat)

########################################### Proceed with validation ################################################
threshold = threshold_set(table, name, timeformat, model_, batch_size, window_size, step_size, k, sample_rate, Min, Max, scaler, time_df_valid)
print(threshold)

1.2756313173981703e-05


## Model Testing

In [21]:
# Model testing function
def test(table, name, timeformat, model, batch_size, window_size, step_size, threshold, sample_rate, Min, Max, scaler, pca, time_df_test):
    
    # Initial settings 
    test_loss = []
    labels = []
    
    with torch.no_grad():
        
        model.eval()
        
        # Set the initial start time
        start_time_t = str(time_df_test.index[0])
        
        # Set the end time
        end_time_test = str(time_df_test.index[-1])
        
        # Use a while loop to call data   
        while start_time_t < end_time_test:
            
            # Set the time for loading data based on the batch size
            start_time_t, end_time_t, next_start_time_t, index_next_t = update_time(time_df_test, start_time_t, batch_size, window_size, step_size)
            
            # Load batch data
            data_t = data_load(table, name, start_time_t, end_time_t, timeformat)
            
            # Apply hanning window
            data_ = set_hanning_window(sample_rate, data_t.iloc[:,:-1])
            
            # Apply FFT 
            data_  = change_fft(sample_rate, data_ )
            
            # Apply MinMax scaler 
            data_ = scaler.fit_transform(data_, Min, Max)
            
            # Apply PCA
            data_ = pca.fit_transform(data_)
            
            # Setting up DataFrame + label
            data_ = pd.DataFrame(data_)
            data_['label'] = data_t['label'].values

            # Set window 
            windows_t, labels_t = make_window(data_ , window_size, step_size)
            
            # Print if the loaded data is empty  
            if len(data_) == 0:
                print("No data available.")
            
            # Input the data into the model when it accumulates to the batch size
            if len(windows_t) == batch_size:
                
                # Convert data to numpy arrays
                input_data_test = np.array(windows_t)
                input_data_label = np.array(labels_t)
                
                # Convert data to Tensor
                input_data_test = torch.tensor(input_data_test, dtype=torch.float32).to(device).float()
                input_data_label = torch.tensor(input_data_label, dtype=torch.float32).to(device).long()
                
                # Create DataLoader
                dataset = TensorDataset(input_data_test, input_data_label)
                data_loader = DataLoader(dataset, batch_size=1, shuffle=False)
                
                for batch_input, batch_label in data_loader:

                    # Input to the model
                    outputs_test = model(batch_input)
                    
                    # Calculate loss
                    loss_test = criterion(outputs_test, batch_input)
                    
                    test_loss.append(loss_test.item())
                    labels.append(batch_label.item())
                
                # Reset batch data
                windows_t = []
                
            # Set the next start time   
            start_time_t = unquote(next_start_time_t) 
            
            # Prevent fetching beyond the last time
            if index_next_t + (batch_size * step_size)+ window_size - step_size - 1 >= len(time_df_test):
                break
            
    # Generate final results
    final_df = pd.DataFrame(test_loss, columns=['reconst_score'])
    final_df['label'] = labels

    # Set labels based on each threshold
    final_df['pred_label'] = np.where(final_df['reconst_score'] > threshold, 1, 0)
    
    return final_df

In [22]:
########################################### Test Parameter Settings ################################################

# Set the start time for the test data
start_time_test = '2003-11-23 13:00:00'
# Set the end time for the test data
end_time_test = '2003-11-26 00:00:00'
# Load the test time list
time_df_test = time_data_load(table, name, quote(start_time_test), quote(end_time_test), timeformat)

######################################## Proceed with testing #############################################
final_df = test(table, name, timeformat, model_, batch_size, window_size, step_size, threshold, sample_rate, Min, Max, scaler, pca, time_df_test)

## Model Performance Evaluation

In [23]:
# Print F1 Score based on testing data
print(classification_report(final_df['label'], final_df['pred_label']))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       178
           1       0.93      1.00      0.97        14

    accuracy                           0.99       192
   macro avg       0.97      1.00      0.98       192
weighted avg       1.00      0.99      0.99       192



## Overfitting Check

* Calculate the F1 score using not only the test data but also the train and validation datasets.
* If the results for the train and validation are not satisfactory, it can be determined that the model is overfitting.

In [24]:
# Conduct testing on the training data
final_df_train = test(table, name, timeformat, model_, batch_size, window_size, step_size, threshold, sample_rate, Min, Max, scaler, pca, time_df_train)

In [25]:
# Print F1 Score based on training data
print(classification_report(final_df_train['label'], final_df_train['pred_label'],labels=[0]))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1728

    accuracy                           1.00      1728
   macro avg       1.00      1.00      1.00      1728
weighted avg       1.00      1.00      1.00      1728



In [26]:
# Conduct testing on the validation data
final_df_valid = test(table, name, timeformat, model_, batch_size, window_size, step_size, threshold, sample_rate, Min, Max, scaler, pca, time_df_valid)

In [27]:
# Print F1 Score based on validation data
print(classification_report(final_df_valid['label'], final_df_valid['pred_label'],labels=[0]))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       192

    accuracy                           1.00       192
   macro avg       1.00      1.00      1.00       192
weighted avg       1.00      1.00      1.00       192

