In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import models
from sklearn.preprocessing import StandardScaler
import os
import logging
import joblib # For saving/loading the StandardScaler
from tensorflow.keras.losses import MeanSquaredError
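'''
Assumed layer layout of the trained Sequential model (layer index: layer).
Each activation may be a separate layer or be part of the preceding Dense
layer, so verify the actual indices with model.summary() before relying on
them. NOTE: the run recorded with this script reported 26-dimensional output
from layer 'activation_6' at index 7, which does not match the list below.

Layer 0: Dense(62)

Layer 1: Activation('tanh') (or it's part of the Dense layer)

Layer 2: Dense(170)

Layer 3: Activation('relu') (or it's part of the Dense layer)

Layer 4: Dense(26)

Layer 5: Activation('elu') (or it's part of the Dense layer)

Layer 6: Dense(19)

Layer 7: Activation('elu') (or it's part of the Dense layer)

Layer 8: Dense(1) (output)
'''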
# --- Configure Logging ---
# Root logger: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)

def extract_ann_features(data_file_path, feature_names_file_path,
                         model_path, scaler_path,
                         output_features_csv_path,
                         target_column='FEV1_FVC',
                         feature_layer_index=7,
                         expected_feature_dim=19):
    """
    Extract intermediate-layer activations of a trained ANN and save them to CSV.

    Loads the trained Keras model and the saved scaler, scales the feature
    columns of every complete row in the data file, runs them through the
    model up to the layer selected by ``feature_layer_index``, and writes the
    resulting activations (columns ``ANN_0`` .. ``ANN_{d-1}``) together with
    the ``Filename`` and ``SUBJID`` columns to a CSV file.

    Every failure (missing file, unreadable input, missing column, invalid
    layer index, scaling/prediction error, write error) is logged and makes
    the function return early; no exception is propagated for these cases.

    Args:
        data_file_path (str): Path to the Excel file containing all data.
        feature_names_file_path (str): Path to the Excel file listing feature
            names in a 'FeatureName' column.
        model_path (str): Path to the saved Keras model (.h5 file).
        scaler_path (str): Path to the saved scaler (.pkl file, read with
            joblib).
        output_features_csv_path (str): Path to save the extracted features
            CSV. May be a bare filename (written to the current directory).
        target_column (str): Name of the target column. Only used for data
            cleaning: rows missing the target are dropped, like rows missing
            any feature.
        feature_layer_index (int): Index into ``model.layers`` of the layer
            whose output is extracted. Negative indices count from the end.
            NOTE(review): a recorded run with the default index 7 reported
            26-dimensional output from layer 'activation_6', so the default
            may not select the activation after Dense(19) -- confirm against
            ``model.summary()``.
        expected_feature_dim (int | None): Expected width of the extracted
            features. A warning is logged when the actual width differs;
            pass None to skip the check.

    Returns:
        None. The result is the CSV written to ``output_features_csv_path``.
    """
    # 1. Validate input file paths (first missing file aborts the run).
    required_inputs = (
        (data_file_path,
         f"Error: Data file not found at '{data_file_path}'"),
        (feature_names_file_path,
         f"Error: Feature names file not found at '{feature_names_file_path}'"),
        (model_path,
         f"Error: Trained model not found at '{model_path}'"),
        (scaler_path,
         f"Error: StandardScaler not found at '{scaler_path}'. Please ensure it was saved during training."),
    )
    for path, message in required_inputs:
        if not os.path.exists(path):
            logger.error(message)
            return

    # Ensure output directory exists. A bare filename has an empty dirname,
    # and os.makedirs('') raises FileNotFoundError, so skip it in that case.
    output_dir = os.path.dirname(output_features_csv_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"Ensured output directory exists at: '{output_dir}'")

    # 2. Load feature names
    try:
        feature_df = pd.read_excel(feature_names_file_path)
        feature_columns = feature_df['FeatureName'].tolist()
        logger.info(f"Loaded {len(feature_columns)} feature names from '{feature_names_file_path}'")
    except Exception as e:
        logger.error(f"Error loading feature names from '{feature_names_file_path}': {e}")
        return

    # 3. Load main data
    try:
        data_df = pd.read_excel(data_file_path)
        logger.info(f"Successfully loaded data from '{data_file_path}'")
    except Exception as e:
        logger.error(f"Error reading data file '{data_file_path}': {e}")
        return

    # 4. Prepare data for feature extraction
    model_input_cols = feature_columns + [target_column]

    # Fail with a clear message instead of a KeyError from the column selection.
    missing_cols = [col for col in model_input_cols if col not in data_df.columns]
    if missing_cols:
        logger.error(f"Data file '{data_file_path}' is missing required columns: {missing_cols}")
        return

    required_cols = list(model_input_cols)
    # Identifier columns are optional; they are carried through when present.
    for id_col, label in (('Filename', 'filenames'), ('SUBJID', 'SUBJIDs')):
        if id_col in data_df.columns:
            required_cols.append(id_col)
        else:
            logger.warning(f"No '{id_col}' column found in data.xlsx. Extracted features will not include {label}.")

    initial_rows = data_df.shape[0]
    # Drop rows with a missing feature or target; identifier columns may be NaN.
    working_df = data_df[required_cols].dropna(subset=model_input_cols)
    if working_df.shape[0] < initial_rows:
        logger.warning(f"Dropped {initial_rows - working_df.shape[0]} rows due to missing values in features or target, to match training data preprocessing.")

    if working_df.empty:
        logger.error("No valid data remaining after handling missing values. Cannot extract features.")
        return

    X_data = working_df[feature_columns]

    # Plain arrays/lists (not Series) so the output frame gets a fresh 0..n-1
    # index that lines up with the prediction array after rows were dropped.
    final_filenames = working_df['Filename'].values if 'Filename' in working_df.columns else [None] * len(working_df)
    final_subjid = working_df['SUBJID'].values if 'SUBJID' in working_df.columns else [None] * len(working_df)

    # 5. Load the trained model and scaler
    try:
        trained_model = models.load_model(model_path, custom_objects={"mse": tf.keras.losses.MeanSquaredError()})
        logger.info(f"Successfully loaded trained model from '{model_path}'")
        # NOTE: joblib.load unpickles the file; only load scalers from trusted sources.
        scaler = joblib.load(scaler_path)
        logger.info(f"Successfully loaded StandardScaler from '{scaler_path}'")
    except Exception as e:
        logger.error(f"Error loading model or scaler: {e}")
        return

    # 6. Create a feature extraction model
    n_layers = len(trained_model.layers)
    # Accept the same range Python list indexing does, including negatives.
    if not -n_layers <= feature_layer_index < n_layers:
        logger.error(f"Error: feature_layer_index ({feature_layer_index}) is out of bounds for the model with {len(trained_model.layers)} layers.")
        logger.error(f"Available layers: {[layer.name for layer in trained_model.layers]}")
        return

    feature_layer = trained_model.layers[feature_layer_index]
    feature_extractor_model = models.Model(
        inputs=trained_model.inputs,
        outputs=feature_layer.output
    )

    extracted_feature_dim = feature_extractor_model.output_shape[1]
    logger.info(f"Feature extraction model created. It will extract {extracted_feature_dim}-dimensional features from layer '{feature_layer.name}'.")
    if expected_feature_dim is not None and extracted_feature_dim != expected_feature_dim:
        logger.warning(f"Note: The extracted features have {extracted_feature_dim} dimensions, not {expected_feature_dim}. Please double-check your model architecture and the feature_layer_index.")

    # 7-8. Scale the input data with the loaded scaler, then extract features.
    try:
        X_data_scaled = scaler.transform(X_data)
        logger.info("Input data scaled using the loaded StandardScaler.")

        extracted_features = feature_extractor_model.predict(X_data_scaled)
        logger.info(f"Extracted features for {len(extracted_features)} samples.")
    except Exception as e:
        logger.error(f"Error scaling input data or extracting features: {e}")
        return

    # 9. Prepare DataFrame for saving
    feature_col_names = [f'ANN_{i}' for i in range(extracted_feature_dim)]
    features_df = pd.DataFrame(extracted_features, columns=feature_col_names)

    # Combine with Filename and SUBJID
    output_df = pd.DataFrame({
        'Filename': final_filenames,
        'SUBJID': final_subjid
    })
    output_df = pd.concat([output_df, features_df], axis=1)

    # 10. Save the extracted features to CSV
    try:
        output_df.to_csv(output_features_csv_path, index=False)
        logger.info(f"Extracted features saved to '{output_features_csv_path}'")
    except Exception as e:
        logger.error(f"Error saving extracted features to CSV: {e}")

# --- Usage Example ---
# Runs the extraction once with the project's default locations when this
# file is executed as a script.
if __name__ == "__main__":
    # Inputs: raw data table and the list of feature column names.
    data_file = "data/data.xlsx"
    feature_names_file = "data/ANN_feature_cols.xlsx"
    target_col = "FEV1_FVC"

    # Trained artefacts and the destination for the extracted features.
    model_file = "models/ANN_regressor/ANN_regressor.h5"
    scaler_file = "models/ANN_regressor/scaler.pkl"
    output_features_file = "models/ANN_regressor/ANN_features_19d.csv"

    logger.info("\n--- Starting ANN feature extraction process ---")
    extract_ann_features(
        data_file_path=data_file,
        feature_names_file_path=feature_names_file,
        model_path=model_file,
        scaler_path=scaler_file,
        output_features_csv_path=output_features_file,
        target_column=target_col,
    )
    logger.info("\n--- ANN feature extraction process finished ---")

2025-08-25 14:11:21,955 - INFO - 
--- Starting ANN feature extraction process ---
2025-08-25 14:11:21,955 - INFO - Ensured output directory exists at: 'models/ANN_regressor'
2025-08-25 14:11:22,268 - INFO - Loaded 62 feature names from 'data/ANN_feature_cols.xlsx'
2025-08-25 14:11:22,849 - INFO - Successfully loaded data from 'data/data.xlsx'
2025-08-25 14:11:22,946 - INFO - Successfully loaded trained model from 'models/ANN_regressor/ANN_regressor.h5'
2025-08-25 14:11:22,953 - INFO - Successfully loaded StandardScaler from 'models/ANN_regressor/scaler.pkl'
2025-08-25 14:11:22,955 - INFO - Feature extraction model created. It will extract 26-dimensional features from layer 'activation_6'.
2025-08-25 14:11:22,957 - INFO - Input data scaled using the loaded StandardScaler.




2025-08-25 14:11:23,067 - INFO - Extracted features for 607 samples.
2025-08-25 14:11:23,078 - INFO - Extracted features saved to 'models/ANN_regressor/ANN_features_19d.csv'
2025-08-25 14:11:23,079 - INFO - 
--- ANN feature extraction process finished ---
