In [9]:
import sys
import os
import time
import pickle
import pandas as pd
import numpy as np
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
# Local imports
from Utils.utils import MinMaxScale_datasets
from Utils.feature_selection_methods import (
    select_n_features_LC, select_n_features_MI,
    select_n_features_DT, select_n_features_MLPSHAP
)

# -------------------------- Configuration -------------------------- #
farm = 'Kelmarsh'
target_feature = 'Generator bearing rear temperature (°C)'

# -------------------------- Load Data -------------------------- #
with open(f'./1_healthy_datasets/{farm}_HealthyDatasets.pkl', 'rb') as f:
    healthy_datasets = pickle.load(f)

# Normalize datasets (excluding target variable)
healthy_datasets = MinMaxScale_datasets(healthy_datasets, target_feature)

# Select datasets to process
dataset_names = [list(healthy_datasets.keys())[0]]
print('Dataset names:', dataset_names)

# Define feature selection methods
fs_methods = {
    'MLPSHAP': select_n_features_MLPSHAP,
    'LC': select_n_features_LC,
    'MI': select_n_features_MI,
    'DT': select_n_features_DT
}

# Output directory: where cleaned files will be saved
save_dir = './2_ordered_inputs'
os.makedirs(save_dir, exist_ok=True)

# -------------------------- Feature Selection Loop -------------------------- #
chosen_features_dict = {}
start_time = time.time()

for dataset_name in dataset_names:
    print(f'\nProcessing dataset: {dataset_name}')

    # Load split sets
    train_set = healthy_datasets[dataset_name]['train']
    valid_set = healthy_datasets[dataset_name]['valid']
    test_set = healthy_datasets[dataset_name]['test']

    total_features = train_set.shape[1]

    for fs_method, selector in fs_methods.items():
        print(f'  Feature Selection Method: {fs_method}')

        # Copy data
        selector_set = train_set.copy()
        valid_copy = valid_set.copy()

        # Remove rows with NaNs from selector set
        nan_rows = selector_set.isna().any(axis=1)
        selector_set = selector_set.loc[~nan_rows]

        if fs_method == 'MLPSHAP':
            # Perform feature selection (using all features for full ranking)
            chosen_features, weights = selector(selector_set, valid_copy, target_feature, total_features, epochs=200, background_samples = 400)
        else:
            chosen_features, weights = selector(selector_set, target_feature, total_features)          

        # Store selected features
        key = f'{dataset_name}_{fs_method}'
        chosen_features_dict[key] = list(chosen_features)

    # -------------------------- Save Results -------------------------- #
    with open(f'{save_dir}/{farm}_ordered_inputs.pkl', 'wb') as f:
        pickle.dump(chosen_features_dict, f)
    
    print(f'\nFeature selection completed and saved to: {output_path}')


Dataset names: ['T2_DS10']

Processing dataset: T2_DS10
  Feature Selection Method: MLPSHAP
hp_sets [(32, 64, 0.01), (32, 64, 0.001), (32, 128, 0.01), (32, 128, 0.001), (64, 64, 0.01), (64, 64, 0.001), (64, 128, 0.01), (64, 128, 0.001)]
hp_set: 0
Epoch 0
Epoch 50
Epoch 100
Epoch 150
Set 0 ,best_val_loss(MAE): 0.09226505755520377, Took: 2.70mins
best set and model so far (32, 64, 0.01) SimpleMLP(
  (fc1): Linear(in_features=112, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
)
hp_set: 1
Epoch 0
Epoch 50
Epoch 100
Epoch 150
Set 1 ,best_val_loss(MAE): 0.03466146076281898, Took: 2.57mins
best set and model so far (32, 64, 0.001) SimpleMLP(
  (fc1): Linear(in_features=112, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
)
hp_set: 2
Epoch 0
Epoch 50
Epoch 100
Epoch 150
Set 2 ,best_val_loss(MAE): 0.07877486094188052, Took: 2.85mins
best set and model so far (32, 64, 0.001) SimpleML

Using 400 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


Set 7 ,best_val_loss(MAE): 0.020309760164292086, Took: 1.64mins
best set and model so far (64, 64, 0.001) SimpleMLP(
  (fc1): Linear(in_features=112, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
)
Best hp_set: (64, 64, 0.001)
Explainer Type: <class 'shap.explainers._kernel.KernelExplainer'>
Explainer Attributes: {'link': <shap.utils._legacy.IdentityLink object at 0x000001CD0F9AB790>, 'keep_index': False, 'keep_index_ordered': False, 'model': <shap.utils._legacy.Model object at 0x000001CD0CB84CD0>, 'data': <shap.utils._legacy.DenseData object at 0x000001CD18CA8250>, 'N': 400, 'P': 112, 'linkfv': <numpy.vectorize object at 0x000001CD18CF3310>, 'nsamplesAdded': 0, 'nsamplesRun': 0, 'fnull': array([40.04491307]), 'expected_value': array([40.04491307]), 'vector_out': True, 'D': 1}


  0%|          | 0/400 [00:00<?, ?it/s]

  Feature Selection Method: LC
  Feature Selection Method: MI
  Feature Selection Method: DT

Feature selection completed and saved to: ./chosen_features_MLPSHAP_v2/Kelmarsh_Ordered_features_vG_T2_DS10.pkl
