This notebook trains a Bi-LSTM model with a self-attention layer on the preprocessed data.

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Input, Dropout, Layer, Concatenate, TimeDistributed, Flatten
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import tensorflow as tf
from tensorflow.keras.utils import Sequence
import tensorflow.keras.backend as K
import os
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# Let printed DataFrames show every column and row, with a 500-character display width
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.width = 500

In [2]:
# Load the training readouts/labels and the validation readouts/labels from CSV
operational_data = pd.read_csv(
    'balanced4000noresample_original_labels_increased_and_selected_and_filled_train_operational_readouts.csv'
)
labels_data = pd.read_csv(
    'balanced4000noresample_original_labels_increased_and_selected_and_filled_train_labels.csv'
)
validation_operational = pd.read_csv('selected_and_filled_validation_operational_readouts.csv')
validation_labels = pd.read_csv('validation_labels.csv')

# One encoder is fitted on the union of training and validation labels so both
# splits share the same class-to-index mapping.
# Note: the training file names its column 'class_labels', the validation file 'class_label'.
all_labels = pd.concat([labels_data['class_labels'], validation_labels['class_label']])
le = LabelEncoder().fit(all_labels)

# Attach the label columns to every readout row, then group the rows per vehicle
merged_data = pd.merge(operational_data, labels_data, on='vehicle_id')
vehicle_groups = merged_data.groupby('vehicle_id')

In [3]:
# Fit one StandardScaler on the stacked rows of all training vehicles.
# For each vehicle the rows are ordered by 'time_step' and the first two and
# last three columns are dropped before stacking.
all_train_data = np.vstack([
    vehicle_rows.sort_values('time_step').iloc[:, 2:-3].values
    for _, vehicle_rows in vehicle_groups
])
scaler = StandardScaler()
scaler.fit(all_train_data)

In [4]:
# Self-Attention Layer
class SelfAttention(Layer):
    """Scaled dot-product self-attention applied across axis 1 of a 3-D input.

    The input is projected to queries and keys of size ``attention_units`` and
    to values of the input's own feature size. Scores are ``Q @ K^T`` divided
    by ``sqrt(attention_units)`` and normalised with a softmax over the last
    axis; the weighted values then pass through one more Dense projection.

    Args:
        attention_units: Width of the query/key projections.
        return_attention: When True, ``call`` returns ``(output, attention_weights)``
            instead of just ``output``.
        **kwargs: Forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, attention_units=128, return_attention=False, **kwargs):
        super().__init__(**kwargs)
        self.attention_units = attention_units
        self.return_attention = return_attention

    def build(self, input_shape):
        """Create the four Dense projections and their weights for ``input_shape``."""
        self.time_steps = input_shape[1]
        self.input_dim = input_shape[2]

        self.query_dense = Dense(self.attention_units)
        self.key_dense = Dense(self.attention_units)
        self.value_dense = Dense(self.input_dim)
        self.context_dense = Dense(self.input_dim)

        # Build the sub-layers here rather than lazily on first call, so that
        # every variable exists once build() returns (needed when weights are
        # restored right after build, e.g. while loading a saved model).
        # All four projections receive a tensor whose last dimension equals
        # input_dim: the first three see the layer input, and context_dense
        # sees the value-weighted context, which value_dense sized to input_dim.
        for projection in (self.query_dense, self.key_dense,
                           self.value_dense, self.context_dense):
            projection.build(input_shape)

        super().build(input_shape)

    def call(self, inputs):
        """Return the attended sequence, plus the attention weights if requested."""
        query = self.query_dense(inputs)
        key = self.key_dense(inputs)
        value = self.value_dense(inputs)

        # Pairwise similarity between positions, scaled by sqrt(attention_units)
        score = tf.matmul(query, key, transpose_b=True)
        score = score / tf.math.sqrt(tf.cast(self.attention_units, tf.float32))

        # Each row of weights sums to 1 over the last axis
        attention_weights = tf.nn.softmax(score, axis=-1)

        context = tf.matmul(attention_weights, value)
        output = self.context_dense(context)

        if self.return_attention:
            return output, attention_weights
        return output

    def compute_output_shape(self, input_shape):
        """Shape of ``call``'s result; derived from ``input_shape`` so it also works before build()."""
        output_shape = (input_shape[0], input_shape[1], input_shape[2])
        if self.return_attention:
            return [output_shape,
                    (input_shape[0], input_shape[1], input_shape[1])]
        return output_shape

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from config."""
        config = super().get_config()
        config.update({
            'attention_units': self.attention_units,
            'return_attention': self.return_attention
        })
        return config

In [5]:
# Padding function for Bi-LSTM's inputs
class DynamicPaddingGenerator(Sequence):
    """Batch generator that pads every series to the longest one in its batch.

    ``groups`` is an iterable of ``(vehicle_id, DataFrame)`` pairs. In training
    mode the label is read from the group's own 'class_labels' column and the
    feature columns are ``iloc[:, 2:-3]``; otherwise the label is looked up in
    the module-level ``validation_labels`` table and the features are
    ``iloc[:, 2:]``. Features are scaled with the already-fitted ``scaler``,
    zero-padded at the end, and labels are one-hot encoded over 5 classes.
    """

    def __init__(self, groups, batch_size, label_encoder, scaler, is_training=True, return_vehicle_ids=False, **kwargs):
        super().__init__(**kwargs)
        # Materialise the groups so they can be indexed by position
        self.groups = list(groups)
        self.n_samples = len(self.groups)
        self.indexes = np.arange(self.n_samples)
        self.batch_size = batch_size
        self.label_encoder = label_encoder
        self.scaler = scaler  # pre-fitted; only transform() is called on it
        self.is_training = is_training
        self.return_vehicle_ids = return_vehicle_ids

    def __len__(self):
        """Number of batches per epoch (the last one may be smaller)."""
        return int(np.ceil(self.n_samples / self.batch_size))

    def on_epoch_end(self):
        """Reshuffle the sample order after each epoch, in training mode only."""
        if not self.is_training:
            return
        np.random.shuffle(self.indexes)

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(X, y)`` or ``(X, y, vehicle_ids)``."""
        start = idx * self.batch_size
        selected = self.indexes[start:start + self.batch_size]
        batch_groups = [self.groups[position] for position in selected]
        max_length = max(len(frame) for _, frame in batch_groups)

        scaled_series = []
        raw_labels = []
        vehicle_ids = []

        for vehicle_id, frame in batch_groups:
            ordered = frame.sort_values('time_step')
            if self.is_training:
                features = ordered.iloc[:, 2:-3].values
                raw_label = frame['class_labels'].iloc[0]
            else:
                features = ordered.iloc[:, 2:].values
                matches = validation_labels['vehicle_id'] == vehicle_id
                raw_label = validation_labels.loc[matches, 'class_label'].values[0]

            # transform only: the scaler was fitted beforehand
            scaled_series.append(self.scaler.transform(features))
            raw_labels.append(raw_label)
            vehicle_ids.append(vehicle_id)

        # Pad the whole batch at once; shorter series get trailing zeros
        X_batch = pad_sequences(scaled_series, maxlen=max_length, padding='post', dtype='float32')
        # Encode all labels of the batch together, then one-hot them
        y_batch = tf.keras.utils.to_categorical(
            self.label_encoder.transform(raw_labels),
            num_classes=5
        )

        if self.return_vehicle_ids:
            return np.array(X_batch), np.array(y_batch), vehicle_ids
        return np.array(X_batch), np.array(y_batch)

In [6]:
def create_bilstm_attention_model(input_shape, return_attention=False):
    """Build the Bi-LSTM + self-attention classifier.

    Architecture: three sequence-returning Bi-LSTM layers (256, 196, 128 units)
    each followed by dropout, a SelfAttention layer whose output is
    concatenated with the Bi-LSTM output, a final Bi-LSTM (96 units) that
    collapses the sequence, three ReLU Dense layers (64, 32, 16) with dropout,
    and a 5-unit linear output layer.

    Args:
        input_shape: Shape passed to ``Input`` (without the batch dimension).
        return_attention: When True the model has a second output carrying the
            attention weights.

    Returns:
        An uncompiled ``Model``.
    """
    inputs = Input(shape=input_shape)

    # Stacked recurrent encoder
    x = inputs
    for lstm_units in (256, 196, 128):
        x = Bidirectional(LSTM(lstm_units, return_sequences=True))(x)
        x = Dropout(0.4)(x)

    # Self-Attention Layer
    attention_result = SelfAttention(
        attention_units=128, return_attention=bool(return_attention)
    )(x)
    if return_attention:
        attention_output, attention_weights = attention_result
    else:
        attention_output = attention_result

    # Combine the Attention output with the Bi-LSTM output
    x = Concatenate()([x, attention_output])
    x = Bidirectional(LSTM(96, return_sequences=False))(x)
    x = Dropout(0.4)(x)

    # Fully connected head
    for dense_units in (64, 32, 16):
        x = Dense(dense_units, activation='relu')(x)
        x = Dropout(0.4)(x)

    # Output layer: linear activation, to be paired with a from_logits=True loss
    outputs = Dense(5, activation=None)(x)

    model_outputs = [outputs, attention_weights] if return_attention else outputs
    return Model(inputs=inputs, outputs=model_outputs)

In [7]:
def custom_cost_metric(y_true, y_pred):
    """Mean misclassification cost of a batch.

    ``y_true`` is reduced to a class index with argmax over axis 1; ``y_pred``
    is passed through a softmax and reduced the same way. Each sample's cost
    is looked up in a fixed 5x5 matrix indexed as ``[true_class, pred_class]``,
    and the mean over the batch is returned.
    """
    # Rows: true class, columns: predicted class
    cost_matrix = tf.constant([
        [0, 7, 8, 9, 10],
        [200, 0, 7, 8, 9],
        [300, 200, 0, 7, 8],
        [400, 300, 200, 0, 7],
        [500, 400, 300, 200, 0]
    ], dtype=tf.float32)

    probabilities = tf.nn.softmax(y_pred)
    true_class = tf.argmax(y_true, axis=1)
    pred_class = tf.argmax(probabilities, axis=1)

    # One (row, column) index pair per sample
    index_pairs = tf.stack([true_class, pred_class], axis=1)
    per_sample_cost = tf.gather_nd(cost_matrix, index_pairs)
    return tf.reduce_mean(per_sample_cost)

In [8]:
class CostAwareCallback(tf.keras.callbacks.Callback):
    """After every epoch, total the misclassification cost on a validation generator.

    The summed cost is written to ``logs['val_cost']`` and the model is saved
    whenever the cost is lower than the best value seen so far.

    Args:
        validation_generator: Indexable batch source supporting ``len()``. The
            first two entries of each item are used as ``(x, y)``; ``y`` is
            reduced with argmax over axis 1.
        cost_matrix: 2-D table indexed as ``cost_matrix[true_class][pred_class]``.
        save_path: File the best-cost model is saved to.
    """

    def __init__(self, validation_generator, cost_matrix, save_path='best_cost_model.keras'):
        super().__init__()
        self.validation_generator = validation_generator
        self.cost_matrix = np.asarray(cost_matrix)
        self.save_path = save_path
        self.best_cost = float('inf')

    def on_epoch_end(self, epoch, logs=None):
        """Compute the total validation cost, record it, and save on improvement."""
        true_labels = []
        pred_labels = []

        for i in range(len(self.validation_generator)):
            batch = self.validation_generator[i]
            # Index instead of unpacking so a trailing third entry is ignored
            x_val, y_val = batch[0], batch[1]
            # predict_on_batch avoids the per-call setup cost of predict()
            # when scoring many small batches in a Python loop.
            logits = self.model.predict_on_batch(x_val)

            true_labels.append(np.argmax(y_val, axis=1))
            # softmax is monotonic, so argmax over the raw outputs is enough
            pred_labels.append(np.argmax(logits, axis=1))

        if true_labels:
            true_idx = np.concatenate(true_labels)
            pred_idx = np.concatenate(pred_labels)
            total_cost = self.cost_matrix[true_idx, pred_idx].sum()
        else:
            # Empty generator: nothing was misclassified
            total_cost = 0

        if logs is not None:
            logs['val_cost'] = total_cost

        if total_cost < self.best_cost:
            self.best_cost = total_cost
            self.model.save(self.save_path)

In [9]:
# Build the train/validation batch generators; both reuse the pre-fitted scaler
batch_size = 64
train_generator = DynamicPaddingGenerator(
    groups=vehicle_groups,
    batch_size=batch_size,
    label_encoder=le,
    scaler=scaler,
    is_training=True,
)
val_generator = DynamicPaddingGenerator(
    groups=validation_operational.groupby('vehicle_id'),
    batch_size=batch_size,
    label_encoder=le,
    scaler=scaler,
    is_training=False,
)

# Read the feature count from one batch; the time dimension is left as None
first_batch_X, _ = train_generator[0]
input_shape = (None, first_batch_X.shape[-1])

bilstm_attention_model = create_bilstm_attention_model(input_shape, return_attention=False)
# The model emits raw scores, hence from_logits=True on the loss
bilstm_attention_model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    metrics=['accuracy', custom_cost_metric],
)

In [10]:
# Print the layer-by-layer summary of the compiled model
bilstm_attention_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, 61)]   0           []                               
                                                                                                  
 bidirectional (Bidirectional)  (None, None, 512)    651264      ['input_1[0][0]']                
                                                                                                  
 dropout (Dropout)              (None, None, 512)    0           ['bidirectional[0][0]']          
                                                                                                  
 bidirectional_1 (Bidirectional  (None, None, 392)   1111712     ['dropout[0][0]']                
 )                                                                                            

In [11]:
# Callbacks
# Directory that receives the per-epoch model files written during training
model_dir = "trained_model_bilstm_attention"
os.makedirs(model_dir, exist_ok=True)
# The {epoch:02d} placeholder is filled in by ModelCheckpoint, giving one file per epoch
model_path_template = os.path.join(model_dir, "bilstm_attention_model_epoch_{epoch:02d}.keras")

callbacks = [
    # Save the full model (not only the weights) at the end of every epoch
    ModelCheckpoint(
        filepath=model_path_template,
        save_freq='epoch',
        save_weights_only=False
    ),
    # Stop after 50 epochs without improvement of the validation cost metric
    # and restore the best weights.
    # NOTE(review): no explicit `mode` is given, so Keras infers the direction
    # from the monitor name -- confirm that "lower is better" is what gets used.
    EarlyStopping(
        monitor='val_custom_cost_metric',
        patience=50,
        restore_best_weights=True
    ),
    # Totals the validation misclassification cost each epoch; the matrix holds
    # the same values as the one inside custom_cost_metric
    CostAwareCallback(
        val_generator,
        np.array([
            [0, 7, 8, 9, 10],
            [200, 0, 7, 8, 9],
            [300, 200, 0, 7, 8],
            [400, 300, 200, 0, 7],
            [500, 400, 300, 200, 0]
        ])
    )
]

# Train the model
# Runs for up to 150 epochs, validating on val_generator after each one
history = bilstm_attention_model.fit(
    train_generator,
    epochs=150,
    validation_data=val_generator,
    callbacks=callbacks,
    verbose=1
)

Epoch 1/150


KeyboardInterrupt: 

In [None]:
# Write the per-epoch metrics recorded by fit() to a CSV inside model_dir
history_path = os.path.join(model_dir, 'training_history.csv')
history_df = pd.DataFrame.from_dict(history.history)
history_df.to_csv(history_path, index=False)

In [None]:
# Store the fitted LabelEncoder inside model_dir
label_encoder_path = os.path.join(model_dir, 'label_encoder.joblib')
joblib.dump(value=le, filename=label_encoder_path)

In [None]:
# Store the fitted StandardScaler inside model_dir
scaler_path = os.path.join(model_dir, 'standard_scaler.joblib')
joblib.dump(value=scaler, filename=scaler_path)

In [None]:
# Visualize train history
# One row per panel: (subplot position, train key, validation key,
#                     train legend label, validation legend label, title, y-axis label)
metric_panels = (
    (1, 'accuracy', 'val_accuracy',
     'Train Accuracy', 'Validation Accuracy', 'Model Accuracy', 'Accuracy'),
    (2, 'loss', 'val_loss',
     'Train Loss', 'Validation Loss', 'Model Loss', 'Loss'),
    (3, 'custom_cost_metric', 'val_custom_cost_metric',
     'Train Cost', 'Validation Cost', 'Custom Cost Metric', 'Cost'),
)

plt.figure(figsize=(12, 8))
for position, train_key, val_key, train_label, val_label, panel_title, y_label in metric_panels:
    plt.subplot(2, 2, position)
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(panel_title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()

# The fourth panel is drawn only when a 'val_cost' series was recorded
if 'val_cost' in history.history:
    plt.subplot(2, 2, 4)
    plt.plot(history.history['val_cost'], label='Validation Total Cost')
    plt.title('Validation Total Cost')
    plt.xlabel('Epoch')
    plt.ylabel('Total Cost')
    plt.legend()

# Save the figure into model_dir and release it
plt.tight_layout()
plt.savefig(os.path.join(model_dir, 'training_history.png'))
plt.close()