# 🕑 **Preparing Data for 2-TBN: Time-Series Transformation**

**Import useful libraries**

In [1]:
import os
import glob
import pandas as pd

import logging
logging.getLogger().setLevel(logging.CRITICAL)
logging.getLogger("matplotlib").setLevel(logging.WARNING)

import warnings
warnings.filterwarnings('ignore')

from utilities import DataEncoder, DBNDataTransformer, DataVisualizer, DataProcessor

**Define basic folder paths**

In [2]:
# Define folder names
DATA_FOLDER_NAME = r".\data"

DISCRETIZED_ORIGINAL_DATASETS_IMOLA_FOLDER_NAME = os.path.join(DATA_FOLDER_NAME, "discretized-datasets-imola")

CUT_DATASETS_IMOLA_FOLDER_NAME      = DISCRETIZED_ORIGINAL_DATASETS_IMOLA_FOLDER_NAME
KMEANS_DATASETS_IMOLA_FOLDER_NAME   = os.path.join(DISCRETIZED_ORIGINAL_DATASETS_IMOLA_FOLDER_NAME, "kmeans")

# AGGREGATED_DATASETS_IMOLA_FOLDER_NAME = os.path.join(DATA_FOLDER_NAME, "aggregated-datasets-imola")

In [3]:
# Get all CSV dataset files from the specified folder
cut_datasets_imola = glob.glob(os.path.join(CUT_DATASETS_IMOLA_FOLDER_NAME, "*.csv"))
print(f"📂 Found {len(cut_datasets_imola)} datasets in '{CUT_DATASETS_IMOLA_FOLDER_NAME}'")

# Lists to store transformed time-series data
full_time_series_list = []
normal_time_series_list = []
anomalous_time_series_list = []

# Process each dataset
for idx, dataset_path in enumerate(cut_datasets_imola, start=1):
    print(f"\n🔍 [{idx}/{len(cut_datasets_imola)}] Loading dataset: {dataset_path}")
    
    # Load dataset
    df = pd.read_csv(dataset_path)
    print(f"   ✔ Loaded dataset with shape: {df.shape}")

    # Transform full dataset into time-series format
    print("   ⏳ Creating time-series representation (Full dataset)...")
    full_time_series = DBNDataTransformer.create_time_series_dbn_data(df, time_slices=2)
    full_time_series_list.append(full_time_series)
    print(f"   ✔ Full dataset transformed, new shape: {full_time_series.shape}")

    # Extract & transform normal data (InverterFault == 0)
    print("   ⏳ Extracting & transforming NORMAL data...")
    normal_df = df[df['InverterFault'] == 0].drop(columns=['InverterFault'])
    normal_time_series = DBNDataTransformer.create_time_series_dbn_data(normal_df)
    normal_time_series_list.append(normal_time_series)
    print(f"   ✔ Normal dataset transformed, new shape: {normal_time_series.shape}")

    # Extract & transform anomalous data (InverterFault == 1)
    print("   ⏳ Extracting & transforming ANOMALOUS data...")
    anomalous_df = df[df['InverterFault'] == 1].drop(columns=['InverterFault'])
    anomalous_time_series = DBNDataTransformer.create_time_series_dbn_data(anomalous_df)
    anomalous_time_series_list.append(anomalous_time_series)
    if anomalous_df.empty:
        print("   ⚠ Warning: No anomalous data found in this dataset!")
    else:
        anomalous_time_series = DBNDataTransformer.create_time_series_dbn_data(anomalous_df)
        anomalous_time_series_list.append(anomalous_time_series)
        print(f"   ✔ Anomalous dataset transformed, new shape: {anomalous_time_series.shape}")

    print(f"✅ Completed processing {dataset_path}\n")

📂 Found 3 datasets in '.\data\discretized-datasets-imola'

🔍 [1/3] Loading dataset: .\data\discretized-datasets-imola\discr-20241128-imola.csv
   ✔ Loaded dataset with shape: (248448, 14)
   ⏳ Creating time-series representation (Full dataset)...
   ✔ Full dataset transformed, new shape: (248447, 28)
   ⏳ Extracting & transforming NORMAL data...
   ✔ Normal dataset transformed, new shape: (248447, 26)
   ⏳ Extracting & transforming ANOMALOUS data...
✅ Completed processing .\data\discretized-datasets-imola\discr-20241128-imola.csv


🔍 [2/3] Loading dataset: .\data\discretized-datasets-imola\discr-20250113-imola.csv
   ✔ Loaded dataset with shape: (497553, 14)
   ⏳ Creating time-series representation (Full dataset)...
   ✔ Full dataset transformed, new shape: (497552, 28)
   ⏳ Extracting & transforming NORMAL data...
   ✔ Normal dataset transformed, new shape: (439732, 26)
   ⏳ Extracting & transforming ANOMALOUS data...
   ✔ Anomalous dataset transformed, new shape: (57819, 26)
✅ Comple

In [4]:
# Concatenate all processed time-series data
full_time_series = pd.concat(full_time_series_list, ignore_index=True)
normal_time_series = pd.concat(normal_time_series_list, ignore_index=True)
anomalous_time_series = pd.concat(anomalous_time_series_list, ignore_index=True)

In [7]:
# Encode the datasets for DBN
print("\n🔢 Encoding datasets for DBN processing...")
full_dbn_dataset, full_encoding_mappings = DataEncoder.encode_categorical_columns(full_time_series)
print(f"   ✔ Full DBN dataset encoded.")
normal_dbn_dataset, normal_encoding_mappings = DataEncoder.encode_categorical_columns(normal_time_series)
print(f"   ✔ Normal DBN dataset encoded.")
anomalous_dbn_dataset, anomalous_encoding_mappings = DataEncoder.encode_categorical_columns(anomalous_time_series)
print(f"   ✔ Anomalous DBN dataset encoded.")

print(f"✅ Completed datasets encoding.")


🔢 Encoding datasets for DBN processing...
   ✔ Full DBN dataset encoded.
   ✔ Normal DBN dataset encoded.
   ✔ Anomalous DBN dataset encoded.
✅ Completed datasets encoding.


In [8]:
# Merge encoding mappings
encoding_mappings = {**full_encoding_mappings, **normal_encoding_mappings, **anomalous_encoding_mappings}

# Display dataset summary
DataVisualizer.display_dataset_info(full_dbn_dataset, normal_dbn_dataset, anomalous_dbn_dataset, encoding_mappings)

Full Dataset Information:
Shape: (1349722, 28)
Memory usage: 36.04 MB

Sample column names:
  ('BatteryVoltage_V', 0)
  ('BatteryCurrent_A', 0)
  ('BatteryPackTemp_C', 0)
  ('InverterFault', 0)
  ('InverterSpeed_RearLeft_RPM', 0)

Normal Dataset Information:
Shape: (1216018, 26)
Memory usage: 30.15 MB

Sample column names:
  ('BatteryVoltage_V', 0)
  ('BatteryCurrent_A', 0)
  ('BatteryPackTemp_C', 0)
  ('InverterSpeed_RearLeft_RPM', 0)
  ('Inverter_Iq_Ref_RearLeft_A', 0)

Anomalous Dataset Information:
Shape: (267404, 26)
Memory usage: 6.63 MB

Sample column names:
  ('BatteryVoltage_V', 0)
  ('BatteryCurrent_A', 0)
  ('BatteryPackTemp_C', 0)
  ('InverterSpeed_RearLeft_RPM', 0)
  ('Inverter_Iq_Ref_RearLeft_A', 0)

Time slices included: [0, 1]

NaN values in full dataset: 0
NaN values in normal dataset: 0
NaN values in anomalous dataset: 0

Sample of encoding mappings:
  ('BatteryVoltage_V', 0): {'type': 'ordinal', 'categories': ['0_Low', '1_Medium', '2_High'], 'mapping': {'0_Low': 0, '

In [9]:
# Define output folder & ensure it exists
dbn_output_folder = os.path.join(DATA_FOLDER_NAME, 'dbn-datasets-imola')
os.makedirs(dbn_output_folder, exist_ok=True)

# Save datasets to CSV files
output_files = {
    "full": os.path.join(dbn_output_folder, "full-dbn-imola.csv"),
    "normal": os.path.join(dbn_output_folder, "normal-dbn-imola.csv"),
    "anomalous": os.path.join(dbn_output_folder, "anomalous-dbn-imola.csv"),
}

DataProcessor.save_dataset(full_dbn_dataset, output_files["full"], file_format="csv")
DataProcessor.save_dataset(normal_dbn_dataset, output_files["normal"], file_format="csv")
DataProcessor.save_dataset(anomalous_dbn_dataset, output_files["anomalous"], file_format="csv")

# Confirm successful saves
for key, path in output_files.items():
    print(f"✅ {key.capitalize()} dataset saved: {path}")

Dataset with shape (1349722, 28), saved successfully at .\data\dbn-datasets-imola\full-dbn-imola.csv (csv).
Dataset with shape (1216018, 26), saved successfully at .\data\dbn-datasets-imola\normal-dbn-imola.csv (csv).
Dataset with shape (267404, 26), saved successfully at .\data\dbn-datasets-imola\anomalous-dbn-imola.csv (csv).
✅ Full dataset saved: .\data\dbn-datasets-imola\full-dbn-imola.csv
✅ Normal dataset saved: .\data\dbn-datasets-imola\normal-dbn-imola.csv
✅ Anomalous dataset saved: .\data\dbn-datasets-imola\anomalous-dbn-imola.csv
