# Commercial Vehicles Sensor Classification

## Import Necessary Libraries

In [1]:
## Import necessary libraries
import pandas as pd
import numpy as np
import random 
from urllib.parse import quote
from datetime import timedelta, datetime

## Import necessary API
import sys
sys.path.append('../../../../')
from api.v2.util.data_load import data_load
from api.v2.model.Transformer import Transformer_Encoder
from api.v2.util.set_minmax import set_minmax_value
from api.v2.Preprocessing.MinMaxScaler import MinMaxScaler

## Import libraries for the model
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import trange
from sklearn.metrics import f1_score, classification_report

## Set path for saving model training results 
import os
os.makedirs('./result', exist_ok=True)

## Set Cuda for computation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

## Set random seed
def set_seed(seed_val):
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

# Set seed
seed_val = 77
set_seed(seed_val)

cuda


## Base Parameter Setting
* Set parameters based on the information identified during EDA (Exploratory Data Analysis).

In [2]:
# Set Commercial Vehicles Sensor Data Parameter

# Set MachBase Neo URL address
URL = 'http://127.0.0.1:5654'
# Set Tag Table Name
table = 'commercial_vehicles'
# Select Tag Name -> Can Check Tag Names Using command 'show_column(URL, table)'
# Set Austria Tag Name 
tags = ['gFx', 'gFy', 'gFz', 'label', 'speed', 'wx', 'wy', 'wz']
# Wrap each item in the list with single quotes and separate with commas
tags_ = ",".join(f"'{tag}'" for tag in tags)
# Set Tag Name
name = quote(tags_, safe=":/")
# Set resample Option -> D(day), H(hour), T(minute), S(second)
resample_freq = None
# Set Start time
start_time = '2025-01-01 00:00:00'
# Set End time 
end_time = '2025-01-01 04:44:00'
# Set TimeFormat - > 'default' or quote('2006-01-02 15:04:05.000000')(Divided down to the nanosecond)
timeformat = quote('2006-01-02 15:04:05.000000')

## Model Configuration
* Using Transformer model.

In [3]:
# Hyperparameter settings
input_dim = 7         # Number of features in the time series
model_dim = 64        # Dimension of the model
num_heads = 2         # Number of attention heads
num_layers = 3        # Number of Transformer layers
output_dim = 6        # Dimension of the predicted labels
dropout = 0.1         # Dropout rate

# Model configuration
model = Transformer_Encoder(input_dim, model_dim, num_heads, num_layers, output_dim, dropout).to(device)

# Loss function and optimizer configuration
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

print(model)

Transformer_Encoder(
  (input_embedding): Linear(in_features=7, out_features=64, bias=True)
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
    )
    (linear1): Linear(in_features=64, out_features=512, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=512, out_features=64, bias=True)
    (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
        )
        (linear1): Linear(in_features=64, out_features=512

## Dataset & Loader Setup

In [4]:
class Vehicles_Dataset(Dataset):

    def __init__(self, df):
        self.freq_data = df.iloc[:,:-1]
        self.label = df.iloc[:,-1:].values

    def __len__(self):
        return len(self.freq_data)

    def __getitem__(self, index):

        input_time_data = self.freq_data.iloc[index,:]
        input_time_data = torch.Tensor(input_time_data).expand(1, input_time_data.shape[0])
        label = self.label[index]

        return input_time_data, label

## Model Training

In [5]:
# Model training function
def train(epochs, start_time_train, end_time_train, unit, Fetch_size, URL, table, name, timeformat, resample_freq, scaler, Min, Max, batch_size):
    
    # Initialize training loss
    train_loss = []
    # Initialize training accuracy
    train_acc = []
    
    # Initialize best F1 Score value
    best_f1= 0
    
    # Start model training
    for epoch in epochs:
        
        # Set the model to training mode
        model.train()
        correct = 0
        total=0
        preds_ = []
        targets_ = []
        
        # Initialize loss and total step
        running_loss = 0.0
        total_step = 0
        
        # Set initial Time
        args = {unit: Fetch_size}
        start_time = start_time_train
        end_time = str(datetime.strptime(start_time_train, "%Y-%m-%d %H:%M:%S") + timedelta(**args))
        end_time_train_ = str(datetime.strptime(end_time_train, "%Y-%m-%d %H:%M:%S") + timedelta(seconds=1))
        
        # Set flag
        flag = False

        while end_time < end_time_train_:
            
            # Load batch data
            data = data_load(URL, table, name, start_time, end_time, timeformat, resample_freq)

            # Move the 'label' column to the last position
            data = data.reindex(columns=[col for col in data.columns if col != 'label'] + ['label'])

            # Convert the 'label' column to integer type
            data['label'] = data['label'].astype(int)
            
            # Remove labels 0 and 5, as they have no meaning as labels
            data = data[(data['label'] != 0) & (data['label'] != 5)]

            # Adjust the label values to start from 0
            data['label'] = data['label'] - 1

            # Apply MinMaxscaler
            data_scaled = scaler.fit_transform(data.iloc[:,:-1].values, Min.drop(columns=[0]).values, Max.drop(columns=[0]).values)
            
            # Set up the DataFrame
            data_ = pd.DataFrame(data_scaled)
            data_['label'] = data['label'].values
            
            # Drop NaN values
            data = data_.dropna()

            # Set up dataset & Loader
            train_ = Vehicles_Dataset(data)

            train_dataloader = DataLoader(train_, batch_size, shuffle=True)

            # Print if the loaded data is empty
            if len(data) != 0:
                
                for batch_idx, (data, target) in enumerate(train_dataloader):
                    
                    # Check total batch count
                    total_step += 1
                    
                    data = data.to(device).float()
                    target = target.to(device).long().squeeze()
                    
                    optimizer.zero_grad()
                
                    # Input to the model
                    outputs = model(data)
                    outputs = outputs.squeeze()
                    
                    # Calculate loss
                    loss = criterion(outputs, target)
                    loss.backward()
                    optimizer.step()
                    
                    running_loss += loss.item()
                    
                    # Set label predictions 
                    _,pred = torch.max(outputs, dim=1)
                    target_ = target.view_as(pred)
                    correct += torch.sum(pred==target).item()
                    total += target.size(0)
                    
                    preds_.append(pred)
                    targets_.append(target_)
                    
            # Update start_time and end_time for next batch
            start_time = end_time
            end_time = str(datetime.strptime(start_time, "%Y-%m-%d %H:%M:%S") + timedelta(**args))
            
            # Select the remaining portions at the end
            if end_time >= end_time_train and not flag:
                
                end_time = end_time_train
                flag = True   

        train_acc.append(100 * correct / total)
        train_loss.append(running_loss/total_step)
        print(f'\ntrain loss: {np.mean(train_loss)}, train acc: {(100 * correct / total):.4f}')

        # Combine predictions and labels collected from all batches
        preds_ = torch.cat(preds_).detach().cpu().numpy()
        targets_ = torch.cat(targets_).detach().cpu().numpy()
        
        f1score = f1_score(targets_, preds_,  average='macro')
        if best_f1 < f1score:
            best_f1 = f1score
            # Save the best model 
            with open("./result/Commercial_Vehicles_Sensor_Buffered.txt", "a") as text_file:
                print('epoch=====',epoch, file=text_file)
                print(classification_report(targets_, preds_, digits=4), file=text_file)
            print('model save')
            torch.save(model, f'./result/Commercial_Vehicles_Sensor_Buffered.pt') 
        epochs.set_postfix_str(f"epoch = {epoch},  f1_score = {f1score}, best_f1 = {best_f1}")
     
    return model

In [6]:
# Set number of epochs
epochs = trange(20, desc='training')
# Unit of time ('days', 'hours', 'minutes', 'seconds')
unit = 'hours'
Fetch_size = 1
# Set Batch Size
batch_size = 64
# Set Train Time
start_time_train = '2025-01-01 00:00:00'
end_time_train = '2025-01-01 04:00:00'
# Set up scalers
scaler = MinMaxScaler()
# Set Min, Max value
Min, Max = set_minmax_value(URL, table, name, start_time_train, end_time_train)

#################################################################Training#############################################################################################
train(epochs, start_time_train, end_time_train, unit, Fetch_size, URL, table, name, timeformat, resample_freq, scaler, Min, Max, batch_size)

training:   0%|          | 0/20 [00:00<?, ?it/s]

  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)



train loss: 0.34564798096300164, train acc: 87.4890


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


model save

train loss: 0.317945278449255, train acc: 89.4683
model save

train loss: 0.30363373976351354, train acc: 89.9874
model save

train loss: 0.29340244446013874, train acc: 90.4671
model save

train loss: 0.2854429211933337, train acc: 90.8409
model save

train loss: 0.2781842412260607, train acc: 91.2633
model save

train loss: 0.2721767114612044, train acc: 91.4765
model save

train loss: 0.2673685350859712, train acc: 91.5174
model save

train loss: 0.26297809996999166, train acc: 91.7152
model save

train loss: 0.2593905063010484, train acc: 91.8376
model save

train loss: 0.25583769931480327, train acc: 92.0282
model save

train loss: 0.252694848827456, train acc: 92.1129
model save

train loss: 0.24935095093606469, train acc: 92.4585
model save

train loss: 0.24625542779483506, train acc: 92.5820
model save

train loss: 0.24353771827194, train acc: 92.5731

train loss: 0.240927953290204, train acc: 92.7146
model save

train loss: 0.23919231125420987, train acc: 92.4076



Transformer_Encoder(
  (input_embedding): Linear(in_features=7, out_features=64, bias=True)
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
    )
    (linear1): Linear(in_features=64, out_features=512, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=512, out_features=64, bias=True)
    (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
        )
        (linear1): Linear(in_features=64, out_features=512

## Model Test

In [7]:
def test(model, start_time_test, end_time_test, unit, Fetch_size, URL, table, name, timeformat, resample_freq, scaler, Min, Max, batch_size):
    
    with torch.no_grad():
                
        model.eval()
        
        # Initial settings 
        output_test = []
        output_target = []
        
        # Set initial Time
        args = {unit: Fetch_size}
        start_time = start_time_test
        end_time = str(datetime.strptime(start_time_test, "%Y-%m-%d %H:%M:%S") + timedelta(**args))
        end_time_test_ = str(datetime.strptime(end_time_test, "%Y-%m-%d %H:%M:%S") + timedelta(minutes=1))
        
        # Set flag
        flag = False
        
        while end_time < end_time_test_:
            
            # Load batch data
            data = data_load(URL, table, name, start_time, end_time, timeformat, resample_freq)

            # Move the 'label' column to the last position
            data = data.reindex(columns=[col for col in data.columns if col != 'label'] + ['label'])

            # Convert the 'label' column to integer type
            data['label'] = data['label'].astype(int)
            
            # Remove labels 0 and 5, as they have no meaning as labels
            data = data[(data['label'] != 0) & (data['label'] != 5)]

            # Adjust the label values to start from 0
            data['label'] = data['label'] - 1

            # Apply MinMaxscaler
            data_scaled = scaler.fit_transform(data.iloc[:,:-1].values, Min.drop(columns=[0]).values, Max.drop(columns=[0]).values)
            
            # Set up the DataFrame
            data_ = pd.DataFrame(data_scaled)
            data_['label'] = data['label'].values
            
            # Drop NaN values
            data = data_.dropna()

            # Set up dataset & Loader
            test_ = Vehicles_Dataset(data)

            test_dataloader = DataLoader(test_, batch_size, shuffle=False)
            
            # Print if the loaded data is empty
            if len(data) != 0:
                
                for batch_idx, (data, target) in enumerate(test_dataloader):
                    
                    data = data.to(device).float()
                    target = target.to(device).long().squeeze()
                
                    # Input to the model
                    outputs = model(data)
                    outputs = outputs.squeeze()
                    
                    # Set label predictions 
                    _,pred = torch.max(outputs, dim=1)
                    target_ = target.view_as(pred)
                    
                    output_test.append(pred)
                    output_target.append(target_)
        
            # Update start_time and end_time for next batch
            start_time = end_time
            end_time = str(datetime.strptime(start_time, "%Y-%m-%d %H:%M:%S") + timedelta(**args))
            
            # Select the remaining portions at the end
            if end_time >= end_time_test and not flag:
                
                end_time = end_time_test
                flag = True   
 
        # Combine tensors into one
        combined_tensor_target = torch.cat(output_target, dim=0)
        combined_tensor_pred = torch.cat(output_test, dim=0)

        # Change to NumPy format
        real_values = combined_tensor_target.cpu().numpy()
        real_pred_values = combined_tensor_pred.cpu().numpy()

    return real_values, real_pred_values  

In [11]:
# Load the best model
model_ = torch.load(f'./result/Commercial_Vehicles_Sensor_Buffered.pt')

# Unit of time ('days', 'hours', 'minutes', 'seconds')
unit = 'minutes'
Fetch_size = 10
# Set Test Time
start_time_test = '2025-01-01 04:00:00'
end_time_test = '2025-01-01 04:44:00'
##################################################################################Test############################################################################################################
real_values, real_pred_values = test(model_, start_time_test, end_time_test, unit, Fetch_size, URL, table, name, timeformat, resample_freq, scaler, Min, Max, batch_size)

## Model Performance Evaluation

In [13]:
print(classification_report(real_values, real_pred_values))

              precision    recall  f1-score   support

           0       0.20      0.48      0.28     25611
           1       0.92      0.92      0.92    127886
           2       0.67      0.45      0.54     85496
           3       0.62      0.37      0.46     20991

    accuracy                           0.68    259984
   macro avg       0.60      0.55      0.55    259984
weighted avg       0.74      0.68      0.70    259984

