Load datasets

In [15]:
import pandas as pd

# dfTrain=pd.read_csv("Train.csv")
df_train_id002=pd.read_csv("train_id0002.csv")
df_train_id003=pd.read_csv("train_id0003.csv")
df_train_id006=pd.read_csv("train_id0006.csv")
dfTest=pd.read_csv("Test.csv")
dfValidation=pd.read_csv("Validation.csv")


epochs=5
iterations=5



Columns to be analyzed

In [16]:
columns=['EngSpeed', 'EngOilPress', 'EngCoolantTemp', 'EngFuelRate', 'BatteryPotential_PowerInput1', 'ActualEngPercentTorque', 'BarometricPress', 'EngPercentLoadAtCurrentSpeed', 'NominalFrictionPercentTorque', 'EngsDesiredOperatingSpeed', 'EngIntakeManifold1Temp', 'EngIntakeManifold1Press']


In [17]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler

def create_autoencoder(input_dim, encoding_dim=128):
    model = keras.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(encoding_dim, activation='tanh'),
        layers.Dense(2, activation='tanh'),
        layers.Dense(encoding_dim, activation='tanh'),
        layers.Dense(input_dim, activation='tanh')
    ])
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [18]:
input_dim = len(columns)

def federated_average(models):
    # Inicializa el modelo global
    global_model = create_autoencoder(input_dim)

    # Extrae pesos de cada modelo local
    local_weights = [model.get_weights() for model in models]

    # Promedia los pesos
    avg_weights = []
    for weights in zip(*local_weights):
        avg_weights.append(np.mean(weights, axis=0))

    # Asigna los pesos promedio al modelo global
    global_model.set_weights(avg_weights)
    return global_model

In [19]:
# Dimensiones
input_dim = len(columns)

global_model = create_autoencoder(input_dim)

for round_num in range(iterations):
    print(f"\n🔁 Federated Iteration {round_num + 1}")
    
    local_models = []

    # Entrenamiento local en cada cliente
    for i, df in enumerate([df_train_id002, df_train_id003, df_train_id006]):
        print(f"  Entrenando cliente {i+1}")
        
        # Clonar el modelo global (estructura) y copiar pesos
        local_model = create_autoencoder(input_dim)
        local_model.set_weights(global_model.get_weights())
        
        # Entrenamiento local
        local_model.fit(df[columns], df[columns],
                        epochs=epochs, batch_size=64, verbose=0)
        
        local_models.append(local_model)

    # Promedio de pesos (FedAvg)
    global_model = federated_average(local_models)

    # Evaluación después de la ronda
    val_loss = global_model.evaluate(dfValidation[columns], dfValidation[columns], verbose=0)
    print(f"  🔍 Validation Loss after Round {round_num + 1}: {val_loss:.6f}")


🔁 Federated Iteration 1
  Entrenando cliente 1
  Entrenando cliente 2
  Entrenando cliente 3
  🔍 Validation Loss after Round 1: 1.543114

🔁 Federated Iteration 2
  Entrenando cliente 1
  Entrenando cliente 2
  Entrenando cliente 3
  🔍 Validation Loss after Round 2: 1.544053

🔁 Federated Iteration 3
  Entrenando cliente 1
  Entrenando cliente 2
  Entrenando cliente 3
  🔍 Validation Loss after Round 3: 1.546190

🔁 Federated Iteration 4
  Entrenando cliente 1
  Entrenando cliente 2
  Entrenando cliente 3
  🔍 Validation Loss after Round 4: 1.554206

🔁 Federated Iteration 5
  Entrenando cliente 1
  Entrenando cliente 2
  Entrenando cliente 3
  🔍 Validation Loss after Round 5: 1.559056


train the model and determine threshold values

In [20]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, precision_score, recall_score, roc_auc_score, average_precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Define the number of input features (columns in the dataset)
input_dim = len(columns)  # Number of features
encoding_dim = 128  # Number of neurons in the encoding layer

# Columns that require modifications based on specific conditions
columns_to_modify = ["EngCoolantTemp", "EngIntakeManifold1Temp"]

# Define the Autoencoder model


# Extract validation data for evaluation
x_val = dfValidation[columns]

# Generate predictions using the trained autoencoder
predictions = global_model.predict(x_val)

# Compute the Mean Squared Error (MSE) for each feature
mse = pd.DataFrame(np.power(x_val - predictions, 2), columns=columns)

# Dictionary to store the best threshold for each feature
best_thresholds = {}

# Iterate over each feature to find the best threshold using F1-score
for column in columns:
    mse_column = mse[column]  # Get MSE values for the column
    best_f1 = 0  # Initialize best F1-score
    best_threshold = 0  # Initialize best threshold value

    # Try different percentile-based thresholds (from 50% to 100%)
    for threshold_value in range(50, 101):
        threshold = np.percentile(mse_column, threshold_value)  # Compute percentile threshold
        anomaly_labels = (mse_column > threshold)  # Identify anomalies
        y_pred_bool = ~anomaly_labels  # Convert to boolean normal/abnormal labels
        
        # Apply additional condition for specific columns
        if column + "_normal" in columns_to_modify:
            y_pred_bool = y_pred_bool | (dfValidation["within_5_minutes"])
        
        y_test = dfValidation[column + "_normal"]  # Get actual labels
        
        # Compute F1-score
        f1 = f1_score(y_test, y_pred_bool)
        
        # Update the best threshold if F1-score improves
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
            best_threshold_value = threshold_value

    # Store the best threshold for this feature
    best_thresholds[column] = best_threshold




Evaluate with test dataset

In [21]:
import numpy as np
import pandas as pd
from sklearn.metrics import (f1_score, confusion_matrix, accuracy_score, 
                             precision_score, recall_score, roc_auc_score, average_precision_score)

# List of columns that need modifications
columns_to_modify = ["EngCoolantTemp_normal", "EngIntakeManifold1Temp_normal"]

# Extract test data based on selected columns
x_test = dfTest[columns]

# Predict results using the autoencoder
results = global_model.predict(x_test)

# Compute Mean Squared Error (MSE) for each column
mse = pd.DataFrame(np.power(x_test - results, 2), columns=columns)

# Iterate over columns to determine anomalies
for column in columns:
    mse_column = mse[column]
    best_threshold = best_thresholds[column]
    anomaly_labels = (mse_column > best_threshold)
    y_pred_bool = ~anomaly_labels
    
    # Modify prediction based on special condition
    if column + "_normal" in columns_to_modify:
        y_pred_bool = y_pred_bool | (dfTest["within_5_minutes"])

    dfTest[column + "_normal_pred"] = y_pred_bool

# Determine final prediction by checking all normal predictions
dfTest['normal_label_pred'] = dfTest[[col + "_normal_pred" for col in columns]].all(axis=1)

# Extract actual and predicted labels for evaluation
y_test_global = dfTest["normal_label"]
y_pred_global = dfTest['normal_label_pred']

# Compute evaluation metrics
confusion = confusion_matrix(y_test_global, y_pred_global)
accuracy = accuracy_score(y_test_global, y_pred_global)
precision = precision_score(y_test_global, y_pred_global)
recall = recall_score(y_test_global, y_pred_global)
f1 = f1_score(y_test_global, y_pred_global)
roc_auc = roc_auc_score(y_test_global, y_pred_global)
pr_auc = average_precision_score(y_test_global, y_pred_global)

# Extract values from confusion matrix
TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]

# Compute additional evaluation metrics
TPR = TP / (TP + FN)  # Sensitivity
FNR = FN / (FN + TP)  # False Negative Rate
TNR = TN / (TN + FP)  # Specificity
FPR = FP / (FP + TN)  # False Positive Rate

# Create DataFrame to store results without additional statistics
df_resultados_globales = pd.DataFrame([{
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
    "F1 Score": f1,
    "ROC AUC": roc_auc,
    "PR AUC": pr_auc,
    "TP": TP,
    "TN": TN,
    "FP": FP,
    "FN": FN,
    "TPR": TPR,
    "FNR": FNR,
    "TNR": TNR,
    "FPR": FPR
}])

# Format percentage-based metrics properly
columns_to_exclude = ["TP", "TN", "FP", "FN"]
for col in df_resultados_globales.columns:
    if col not in columns_to_exclude:
        df_resultados_globales[col] = df_resultados_globales[col].apply(lambda x: f"{x*100:.2f}%")

# Print and save the results
print(df_resultados_globales)
df_resultados_globales.to_csv("results.csv", index=False)


  Accuracy Precision  Recall F1 Score ROC AUC  PR AUC     TP   TN   FP   FN  \
0   94.12%    96.36%  97.18%   96.77%  80.72%  96.20%  10486  712  396  304   

      TPR    FNR     TNR     FPR  
0  97.18%  2.82%  64.26%  35.74%  
