# Rotating Shaft Anomaly Detection

## Import Necessary Libraries

In [1]:
## Import necessary libraries
import pandas as pd
import numpy as np
import random 
from urllib.parse import quote, unquote
from sklearn.preprocessing import MinMaxScaler
from datetime import timedelta
from scipy.fftpack import fft
from sklearn.decomposition import PCA

## Import libraries for the model
import torch
import torch.nn as nn
import xgboost as xgb
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report

## Create the output directory for trained models (no error if it already exists)
import os
os.makedirs('./result', exist_ok=True)

## Pick the torch device: CUDA when available, otherwise CPU
## NOTE(review): in the cells visible here `device` is only printed, and the XGBoost
## models are created with default arguments — confirm whether it is still needed.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

## Set random seed
def set_seed(seed_val):
    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) with `seed_val`."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed_val)

# Fix the seed used by the notebook
seed_val = 77
set_seed(seed_val)

cuda


## Selecting Data Columns
* Tag names are loaded in sequential order.
* The required tag names are then selected from this tag name list.

In [2]:
# Function to display tag names
def show_column(URL):
    """Read the CSV at `URL` and return all of its cells as one flat Python list.

    The CSV is parsed with `pd.read_csv` (so the first row is treated as the
    header) and the remaining cells are flattened row by row.
    """
    tag_table = pd.read_csv(URL)
    return tag_table.values.ravel().tolist()

In [3]:
## Table whose tag names are listed
table = 'vibe_unbal'

## Endpoint on the local server (127.0.0.1:5654) that returns the tag names of `table` as CSV
NAME_URL = f'http://127.0.0.1:5654/db/tql/datahub/api/v1/get-tag-names.tql?table={table}'

## Flat list of tag names; later cells slice it by position (name[:6], name[6:12])
name = show_column(NAME_URL)

In [4]:
# Display the loaded tag names (six tags per numeric prefix: 0_*, 1_*, ...)
name

['0_Measured_RPM',
 '0_V_in',
 '0_Vibration_1',
 '0_Vibration_2',
 '0_Vibration_3',
 '0_unbalance_Factor',
 '1_Measured_RPM',
 '1_V_in',
 '1_Vibration_1',
 '1_Vibration_2',
 '1_Vibration_3',
 '1_unbalance_Factor',
 '2_Measured_RPM',
 '2_V_in',
 '2_Vibration_1',
 '2_Vibration_2',
 '2_Vibration_3',
 '2_unbalance_Factor',
 '3_Measured_RPM',
 '3_V_in',
 '3_Vibration_1',
 '3_Vibration_2',
 '3_Vibration_3',
 '3_unbalance_Factor',
 '4_Measured_RPM',
 '4_V_in',
 '4_Vibration_1',
 '4_Vibration_2',
 '4_Vibration_3',
 '4_unbalance_Factor',
 '5_Measured_RPM',
 '5_V_in',
 '5_Vibration_1',
 '5_Vibration_2',
 '5_Vibration_3',
 '5_unbalance_Factor',
 '6_Measured_RPM',
 '6_V_in',
 '6_Vibration_1',
 '6_Vibration_2',
 '6_Vibration_3',
 '6_unbalance_Factor',
 '7_Measured_RPM',
 '7_V_in',
 '7_Vibration_1',
 '7_Vibration_2',
 '7_Vibration_3',
 '7_unbalance_Factor',
 '8_Measured_RPM',
 '8_V_in',
 '8_Vibration_1',
 '8_Vibration_2',
 '8_Vibration_3',
 '8_unbalance_Factor',
 '9_Measured_RPM',
 '9_V_in',
 '9_Vib

## Converting TAG Name Format
* Having listed all tag names of the Vibration Unbalance dataset in the previous step, extract only the tags to be used and convert them into the format expected by the query parameter.
* The tag groups prefixed `0_` and `1_` are used for classification.

In [5]:
# Build the query strings for the two tag groups: the first six tag names (0_*)
# and the next six (1_*), each name wrapped in single quotes and joined with commas
tags_0 = ",".join(map("'{}'".format, name[:6]))
tags_1 = ",".join(map("'{}'".format, name[6:12]))

# Show both strings, one per line
print(tags_0, tags_1, sep="\n")

'0_Measured_RPM','0_V_in','0_Vibration_1','0_Vibration_2','0_Vibration_3','0_unbalance_Factor'
'1_Measured_RPM','1_V_in','1_Vibration_1','1_Vibration_2','1_Vibration_3','1_unbalance_Factor'


## Load Vibration Unbalance Dataset
* Load the data using the Tag Names.

In [6]:
# Data loading function
# Preprocess for each vibration
# Rotation speed, voltage, and unbalance factor are combined into each vibration DataFrame
def data_load(table, name, start_time, end_time, timeformat):
    """Load one tag group and build one per-second DataFrame per vibration channel.

    The raw rows (TIME, NAME, VALUE) are fetched from the local select-rawdata
    endpoint and pivoted to one column per tag. After the pivot the columns are
    addressed by position: 1 and 2 are taken as rotation speed and voltage, the
    last one as the unbalance factor, and 3..5 as the three vibration channels.

    Only seconds that contain the most common number of samples are kept, so
    that every row has the same number of vibration samples.

    Parameters
    ----------
    table, name, start_time, end_time, timeformat : str
        Inserted verbatim into the request URL (callers pass URL-encoded values).

    Returns
    -------
    list of pandas.DataFrame
        Three frames (one per vibration channel). Each row is one second and
        holds: the per-second mean of the two non-vibration tags (numeric
        prefix stripped from the column names), the vibration samples of that
        second in integer-named columns 0..k-1, and `label`
        (1 when the per-second mean unbalance factor is non-zero, else 0).
    """
    # Load data
    df = pd.read_csv(f'http://127.0.0.1:5654/db/tql/datahub/api/v1/select-rawdata.tql?table={table}&name={name}&start={start_time}&end={end_time}&timeformat={timeformat}')

    # Convert to data grouped by time: one row per timestamp, one column per tag
    df = df.pivot_table(index='TIME', columns='NAME', values='VALUE', aggfunc='first').reset_index()

    # Parse the timestamps once and derive the second every sample belongs to
    timestamps = pd.to_datetime(df['TIME'], format='%Y-%m-%d %H:%M:%S.%f')
    seconds = timestamps.dt.floor('s')

    # Rotation speed, voltage and unbalance factor, averaged per second
    df_non_vibe = df.iloc[:, [1, 2, -1]].copy()
    df_non_vibe['TIME'] = timestamps
    df_non_vibe = df_non_vibe.set_index('TIME').resample('1s').mean().reset_index()

    # Keep only seconds holding the most common number of samples.
    # This does not depend on the vibration channel, so it is computed once.
    counts = seconds.groupby(seconds).size()
    most_common_count = counts.mode()[0]
    complete_seconds = counts.index[counts == most_common_count]
    keep = seconds.isin(complete_seconds)

    # Set up a list for vibration data
    vibe = []

    # Process each vibration column
    for i in range(3):

        vibe_col = df.columns[3 + i]

        # Collect the samples of every complete second into one list per second
        samples = pd.DataFrame({'TIME': seconds[keep], vibe_col: df.loc[keep, vibe_col]})
        grouped = samples.groupby('TIME')[vibe_col].apply(list).reset_index()

        # Split the list into individual columns (0..k-1)
        df_vibe_1 = pd.DataFrame(grouped[vibe_col].tolist())

        # Attach the per-second metadata by timestamp. A positional concat would
        # shift RPM/voltage/label onto the wrong second whenever a dropped
        # second is not the last one.
        meta = grouped[['TIME']].merge(df_non_vibe, on='TIME', how='left')
        result_df = pd.concat([meta, df_vibe_1], axis=1)

        # Remove rows with missing values, then the TIME column
        result_df = result_df.dropna().drop(columns=['TIME'])

        # Remove numbers and underscores from the beginning of the first three column names
        new_columns = result_df.columns[:3].str.replace(r'^\d+_', '', regex=True)
        result_df.columns = new_columns.tolist() + result_df.columns[3:].tolist()

        # set label
        result_df['label'] = (result_df['unbalance_Factor'] != 0.0).astype(int)

        # drop unbalance_Factor column
        result_df = result_df.drop(columns=['unbalance_Factor'])

        # Save to the list
        vibe.append(result_df)

    return vibe

In [None]:
# Data time loading function
def time_data_load(table, name, start_time, end_time, timeformat):
    """Return a column-less DataFrame indexed by every second that has data.

    Only the TIME column is requested from the select-rawdata endpoint. A
    constant helper column is added so the frame can be resampled to one-second
    bins; bins without any row become NaN and are dropped, after which the
    helper column is removed again.
    """
    target = 'TIME'

    # Fetch the timestamps
    raw = pd.read_csv(f"http://127.0.0.1:5654/db/tql/datahub/api/v1/select-rawdata.tql?target={target}&table={table}&name={name}&start={start_time}&end={end_time}&timeformat={timeformat}")

    # Helper column + parsed timestamps as the index
    stamped = raw.assign(value=0, TIME=pd.to_datetime(raw['TIME'])).set_index('TIME')

    # One row per second; seconds without samples yield NaN and are discarded
    per_second = stamped.resample('1s').mean().dropna()

    return per_second.drop(columns=['value'])

In [8]:
# Time update function
# Update start and end times based on batch size
def update_time(time_df, start_time, batch_size):
    """Compute the URL-encoded time window of the batch starting at `start_time`.

    Parameters
    ----------
    time_df : DataFrame whose index holds the available timestamps.
    start_time : str, looked up in `time_df.index`; when it is not found the
        window starts at position 0.
    batch_size : int, number of index entries covered by one batch.

    Returns
    -------
    (start, end, next_start, index_next)
        `start` is the quoted `start_time`; `end` is one second after the
        entry at position `index_now + batch_size - 1`; `next_start` is that
        same entry (so it is also the last entry of the current window);
        `index_next` is its position. An IndexError is raised when that
        position lies beyond the end of the index.
    """
    timeline = time_df.index
    span = batch_size - 1

    # Position of the requested start; fall back to the beginning when unknown
    try:
        index_now = timeline.get_loc(start_time)
    except KeyError:
        index_now = 0

    # Last entry of this batch, which is also where the next batch starts
    index_next = index_now + span
    batch_last = timeline[index_next]

    # The window end is one second past the last entry
    window_end = str(batch_last + timedelta(seconds=1))

    # URL encoding
    return quote(start_time), quote(window_end), quote(str(batch_last)), index_next

In [9]:
# Function to calculate the maximum and minimum values for selected tag names
def set_minmax_value(table, name, start_time_train, end_time_train):
    """Fetch the scale table for the given tags and return its (Min, Max) parts.

    The start and end times are URL-encoded before being placed in the request.
    `Min` is the transposed slice of columns from position 1 up to (excluding)
    the last column; `Max` is the transposed slice from position 2 onward.
    NOTE(review): this presumably expects a three-column (name, min, max)
    response — confirm against the endpoint.
    """
    # URL encoding
    start, end = quote(start_time_train), quote(end_time_train)

    # Load Min, Max data
    scale_df = pd.read_csv(f'http://127.0.0.1:5654/db/tql/datahub/api/v1/select-scale.tql?table={table}&name={name}&start={start}&end={end}')

    # Split into the two transposed slices
    lower = scale_df.iloc[:, 1:-1].transpose()
    upper = scale_df.iloc[:, 2:].transpose()

    return lower, upper

## Data Preprocessing

1. Hanning window
2. FFT
3. Min-Max scaling (applied during training)
4. PCA (applied during training)

### 1. Hanning Window Setup

In [10]:
# Hanning window function setup 
def set_hanning_window(sample_rate, df):
    """Multiply every row of `df` element-wise by a Hanning window.

    The window has `sample_rate` points and is aligned with the columns of
    `df`, so `df` must have `sample_rate` columns.
    """
    window = np.hanning(sample_rate)
    return df.mul(window, axis='columns')

### 2. FFT (Fast Fourier Transform) Setup

In [11]:
# FFT transformation function
def change_fft(sample_rate, df):
    """Return the scaled one-sided FFT magnitude of every row of `df`.

    For each row the FFT is computed, the first `sample_rate // 2 + 1` bins
    are kept, and their absolute values are multiplied by `2 / sample_rate`.
    The result is a DataFrame with one row per input row and integer column
    names 0..sample_rate // 2.
    """
    # Total number of samples in the signal
    N = sample_rate
    n_bins = N // 2 + 1
    scale = 2.0 / N

    spectra = np.zeros((df.shape[0], n_bins), dtype=float)

    # Transform row by row
    for row_pos, (_, row) in enumerate(df.iterrows()):
        spectrum = fft(row.values)
        spectra[row_pos] = scale * np.abs(spectrum[:n_bins])

    return pd.DataFrame(spectra)

## Model Configuration
* Using XGBoost model
* Train three XGBoost models on three vibration datasets, then ensemble the results to make a final prediction

In [12]:
# One independent XGBoost classifier (default parameters) per vibration channel
model1, model2, model3 = (xgb.XGBClassifier() for _ in range(3))

## Model Training
* The three models (one per vibration channel) are trained in the same loop, batch by batch.

In [13]:
def train(table, name_normal, name_abnomal, timeformat, model1, model2, model3, batch_size, sample_rate, scaler1, scaler2, scaler3, pca1, pca2 , pca3, time_df_train):
    """Fit one scaler, PCA and model per vibration channel, batch by batch.

    For every time window of `batch_size` seconds the normal and abnormal tag
    groups are loaded, concatenated per channel and shuffled. Each channel is
    then Hanning-windowed, FFT-transformed, joined with its first two columns,
    scaled, PCA-reduced and passed to the channel's model.

    Only complete batches (`batch_size * 2` rows) are used. The scaler and PCA
    are fitted on exactly the batches the model is fitted on, so the three
    returned objects of a channel always describe the same feature space.

    NOTE(review): `fit_transform` / `fit` are called on every batch without any
    warm-start argument, so each batch replaces the previous fit and the
    returned objects reflect the last complete batch only. Confirm whether
    cumulative training over all batches was intended.

    Parameters
    ----------
    table, name_normal, name_abnomal, timeformat : str
        Forwarded to `data_load` (URL-encoded values).
    model1..3, scaler1..3, pca1..3 : estimators for channel 1..3.
    batch_size : int, seconds per batch and per tag group.
    sample_rate : int, Hanning window length / FFT size.
    time_df_train : DataFrame indexed by the available training timestamps.

    Returns
    -------
    tuple
        (model1, model2, model3, scaler1, scaler2, scaler3, pca1, pca2, pca3).
        The models are also saved as JSON files under ./result.
    """
    models = (model1, model2, model3)
    scalers = (scaler1, scaler2, scaler3)
    pcas = (pca1, pca2, pca3)

    # Set initial start time
    start_time_ = str(time_df_train.index[0])

    # Set end time
    end_time_train = str(time_df_train.index[-1])

    # Initialize tqdm with the total number of iterations (or time steps)
    total_steps = (time_df_train.index[-1] - time_df_train.index[0]).total_seconds() // batch_size

    # Use 'with' statement to ensure proper closing of the tqdm object
    with tqdm(total=int(total_steps), desc="Processing Data") as progress_bar:

        # Use a while loop to call data
        while start_time_ < end_time_train:

            # Set the time for loading data based on the batch size
            start_time_, end_time_, next_start_time_, index_next = update_time(time_df_train, start_time_, batch_size)

            # Load batch data
            data_normal = data_load(table, name_normal, start_time_, end_time_, timeformat)
            data_abnomal = data_load(table, name_abnomal, start_time_, end_time_, timeformat)

            # Combine normal and abnormal rows per vibration channel and shuffle them
            batches = [
                pd.concat([normal, abnormal], axis=0).sample(frac=1, random_state=77).reset_index(drop=True)
                for normal, abnormal in zip(data_normal, data_abnomal)
            ]
            n_rows = len(batches[0])

            # Check the batch before any estimator sees it: fitting on an empty
            # batch would raise, and fitting the scaler/PCA on an incomplete batch
            # would leave them out of sync with the model
            if n_rows == 0:
                print("No data available.")

            elif n_rows == batch_size * 2:

                for df_vibe, scaler, pca, model in zip(batches, scalers, pcas, models):

                    # Hanning window + FFT on the vibration samples
                    # (every column except the first two and the label)
                    windowed = set_hanning_window(sample_rate, df_vibe.iloc[:, 2:-1])
                    spectrum = change_fft(sample_rate, windowed)

                    # First two columns + spectrum -> scale -> PCA
                    features = pd.concat([df_vibe.iloc[:, :2], spectrum], axis=1).values
                    features = pca.fit_transform(scaler.fit_transform(features))

                    # Train the channel's model on 1-D labels
                    model.fit(features, df_vibe['label'].values)

            # Set the next start time
            start_time_ = unquote(next_start_time_)

            # Update tqdm progress bar (the final batch is counted as well)
            progress_bar.update(1)

            # Prevent fetching beyond the last time
            if index_next + batch_size - 1 >= len(time_df_train):
                break

        # Save each model
        model1.save_model(f'./result/vibe_unval_XGBoost_New_Batch_1.json')
        model2.save_model(f'./result/vibe_unval_XGBoost_New_Batch_2.json')
        model3.save_model(f'./result/vibe_unval_XGBoost_New_Batch_3.json')

    return model1, model2, model3, scaler1, scaler2, scaler3, pca1, pca2 , pca3

In [14]:
########################################### Training Parameter Settings ################################################
# Set the tag table name
table = 'vibe_unbal'
# Set the tag names (URL-encoded, ':' and '/' left as-is)
# tags_0 is passed as the "normal" group and tags_1 as the "abnormal" group;
# the actual label is derived in data_load from unbalance_Factor != 0
name_normal = quote(tags_0, safe=":/")
name_abnomal = quote(tags_1, safe=":/")
# Set the start time for the train data
start_time_train = '2024-10-07 00:00:00'
# Set the end time for the train data
end_time_train = '2024-10-07 01:20:00'
# Set time format
# NOTE(review): looks like a Go-style reference-time layout — confirm against the server API
timeformat = quote('2006-01-02 15:04:05.000000')
# Set batch size: seconds loaded per batch and per tag group
# (train only fits when a batch holds batch_size * 2 rows)
batch_size = 32
# Set sample rate: used as Hanning window length and FFT size, so it must match
# the number of vibration samples in each one-second row
sample_rate = 4096
# Set each Min-Max scaler (one per vibration channel)
scaler1 = MinMaxScaler()
scaler2 = MinMaxScaler()
scaler3 = MinMaxScaler()
# Set PCA (one per vibration channel)
# Select principal components that explain 95% of the variance
pca1 = PCA(n_components=0.95)
pca2 = PCA(n_components=0.95)
pca3 = PCA(n_components=0.95)
# Load training time list (index of seconds that contain data)
time_df_train = time_data_load(table, name_normal, quote(start_time_train), quote(end_time_train), timeformat)

########################################### Proceed with training ################################################
model1, model2, model3, scaler1, scaler2, scaler3, pca1, pca2 , pca3 = train(table, name_normal, name_abnomal, timeformat, model1, model2, model3, batch_size, sample_rate, scaler1, scaler2, scaler3, pca1, pca2 , pca3, time_df_train)

Processing Data:   0%|          | 0/150 [00:00<?, ?it/s]

In [15]:
def test(table, name_normal, name_abnomal, timeformat, model1, model2, model3, batch_size, sample_rate, scaler1, scaler2, scaler3, pca1, pca2 , pca3, time_df_test):
    """Predict every complete test batch with the three-channel ensemble.

    For every time window of `batch_size` seconds the normal and abnormal tag
    groups are loaded and concatenated per channel (not shuffled). Each channel
    is Hanning-windowed, FFT-transformed, joined with its first two columns and
    passed through the already fitted scaler and PCA. The class probabilities
    of the three models are averaged and the arg-max is the prediction.

    Only complete batches (`batch_size * 2` rows) are evaluated; other batches
    are skipped before any transform is applied.

    Parameters
    ----------
    table, name_normal, name_abnomal, timeformat : str
        Forwarded to `data_load` (URL-encoded values).
    model1..3, scaler1..3, pca1..3 : fitted estimators for channel 1..3.
    batch_size : int, seconds per batch and per tag group.
    sample_rate : int, Hanning window length / FFT size.
    time_df_test : DataFrame indexed by the available test timestamps.

    Returns
    -------
    pandas.DataFrame
        Columns `label` (taken from channel 1) and `pred`; empty when no
        complete batch was found.
    """
    models = (model1, model2, model3)
    scalers = (scaler1, scaler2, scaler3)
    pcas = (pca1, pca2, pca3)

    # Set pred & label list
    pred = []
    label = []

    # Set the initial start time
    start_time_t = str(time_df_test.index[0])

    # Set the end time
    end_time_test = str(time_df_test.index[-1])

    # Initialize tqdm with the total number of iterations (or time steps)
    total_steps = (time_df_test.index[-1] - time_df_test.index[0]).total_seconds() // batch_size

    # Use 'with' statement to ensure proper closing of the tqdm object
    with tqdm(total=int(total_steps), desc="Processing Data") as progress_bar:

        # Use a while loop to call data
        while start_time_t < end_time_test:

            # Set the time for loading data based on the batch size
            start_time_t, end_time_t, next_start_time_t, index_next_t = update_time(time_df_test, start_time_t, batch_size)

            # Load batch data
            data_normal = data_load(table, name_normal, start_time_t, end_time_t, timeformat)
            data_abnomal = data_load(table, name_abnomal, start_time_t, end_time_t, timeformat)

            # Combine normal and abnormal rows per vibration channel
            batches = [
                pd.concat([normal, abnormal], axis=0).reset_index(drop=True)
                for normal, abnormal in zip(data_normal, data_abnomal)
            ]
            n_rows = len(batches[0])

            # Check the batch before transforming it
            if n_rows == 0:
                print("No data available.")

            elif n_rows == batch_size * 2:

                probas = []
                for df_vibe, scaler, pca, model in zip(batches, scalers, pcas, models):

                    # Hanning window + FFT on the vibration samples
                    # (every column except the first two and the label)
                    windowed = set_hanning_window(sample_rate, df_vibe.iloc[:, 2:-1])
                    spectrum = change_fft(sample_rate, windowed)

                    # First two columns + spectrum -> fitted scaler -> fitted PCA
                    features = pd.concat([df_vibe.iloc[:, :2], spectrum], axis=1).values
                    features = pca.transform(scaler.transform(features))

                    probas.append(model.predict_proba(features))

                # Average the predicted probabilities
                final_pred_probs = sum(probas) / len(probas)

                # Make final predictions based on the averaged probabilities
                pred.append(final_pred_probs.argmax(axis=1))
                label.append(batches[0]['label'].values)

            # Set the next start time
            start_time_t = unquote(next_start_time_t)

            # Update tqdm progress bar (the final batch is counted as well)
            progress_bar.update(1)

            # Prevent fetching beyond the last time
            if index_next_t + batch_size - 1 >= len(time_df_test):
                break

    # np.concatenate raises on an empty list, so return an empty result instead
    if not pred:
        return pd.DataFrame({'label': [], 'pred': []})

    # Generate final results
    final_df = pd.DataFrame(np.concatenate(label), columns=['label'])
    final_df['pred'] = np.concatenate(pred)

    return final_df

In [16]:
########################################### Test Parameter Settings ################################################

# Set the start time for the test data
# NOTE(review): this equals end_time_train ('2024-10-07 01:20:00'), so the boundary
# second may be part of both ranges — confirm how the server treats the end bound
start_time_test = '2024-10-07 01:20:00'
# Set the end time for the test data
end_time_test = '2024-10-07 02:00:00'
# Load the test time list (index of seconds that contain data)
time_df_test = time_data_load(table, name_normal, quote(start_time_test), quote(end_time_test), timeformat)

######################################## Proceed with testing #############################################
# Reuses the models, scalers and PCAs returned by train()
final_df = test(table, name_normal, name_abnomal, timeformat, model1, model2, model3, batch_size, sample_rate, scaler1, scaler2, scaler3, pca1, pca2 , pca3, time_df_test)

Processing Data:   0%|          | 0/50 [00:00<?, ?it/s]

## Model Performance Evaluation

In [17]:
# Per-class precision, recall and F1 score of the ensemble on the test data
print(classification_report(y_true=final_df['label'], y_pred=final_df['pred']))

              precision    recall  f1-score   support

           0       0.98      0.85      0.91      1632
           1       0.87      0.98      0.92      1632

    accuracy                           0.92      3264
   macro avg       0.93      0.92      0.92      3264
weighted avg       0.93      0.92      0.92      3264

