In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import os
import logging
import joblib

# --- Configure Logging ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def train_ann_model(data_file_path, feature_names_file_path, target_column='FEV1_FVC', epochs=100, output_dir='models/ANN_regressor'):
    """
    Trains an Artificial Neural Network (ANN) model to predict a target variable
    using features specified in an external file. It also saves predictions and
    performance metrics.

    Args:
        data_file_path (str): Path to the Excel file containing all data (features and target).
        feature_names_file_path (str): Path to the Excel file listing the feature names.
        target_column (str): The name of the column to be predicted.
        epochs (int): Number of training epochs.
        output_dir (str): Directory to save the trained model, predictions, and metrics.
    """
    # 1. Validate input file paths and create output directory
    if not os.path.exists(data_file_path):
        logger.error(f"Error: Data file not found at '{data_file_path}'")
        return
    if not os.path.exists(feature_names_file_path):
        logger.error(f"Error: Feature names file not found at '{feature_names_file_path}'")
        return
    
    os.makedirs(output_dir, exist_ok=True)
    logger.info(f"Ensured output directory exists at: '{output_dir}'")

    # 2. Load feature names
    try:
        feature_df = pd.read_excel(feature_names_file_path)
        feature_columns = feature_df['FeatureName'].tolist()
        logger.info(f"Loaded {len(feature_columns)} feature names from '{feature_names_file_path}'")
    except Exception as e:
        logger.error(f"Error loading feature names from '{feature_names_file_path}': {e}")
        return

    # 3. Load main data
    try:
        data_df = pd.read_excel(data_file_path)
        logger.info(f"Successfully loaded data from '{data_file_path}'")
    except Exception as e:
        logger.error(f"Error reading data file '{data_file_path}': {e}")
        return

    # 4. Prepare features (X) and target (y)
    # Ensure 'Filename' column is available and kept for later mapping
    required_cols = feature_columns + [target_column]
    if 'Filename' in data_df.columns:
        required_cols.append('Filename')
    else:
        logger.warning("No 'Filename' column found in data.csv. Predictions will not include filenames.")

    # Check if all required feature columns exist in the data_df
    missing_features = [col for col in feature_columns if col not in data_df.columns]
    if missing_features:
        logger.error(f"Error: The following feature columns are missing from '{data_file_path}': {missing_features}")
        return
    
    if target_column not in data_df.columns:
        logger.error(f"Error: Target column '{target_column}' not found in '{data_file_path}'.")
        logger.error(f"Available columns: {data_df.columns.tolist()}")
        return

    # Create a working copy of the dataframe with only relevant columns
    working_df = data_df[required_cols].copy()

    # Handle missing values: Drop rows with NaNs in features or target.
    initial_rows = working_df.shape[0]
    working_df.dropna(subset=feature_columns + [target_column], inplace=True)
    if working_df.shape[0] < initial_rows:
        logger.warning(f"Dropped {initial_rows - working_df.shape[0]} rows due to missing values in features or target.")
    
    if working_df.empty:
        logger.error("No valid data remaining after handling missing values. Cannot train model.")
        return

    X = working_df[feature_columns]
    y = working_df[target_column]
    
    # Store filenames for the test set if available
    test_filenames = None
    if 'Filename' in working_df.columns:
        # Split data into training and testing sets, keeping track of original indices
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        test_filenames = working_df.loc[X_test.index, 'Filename']
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    logger.info(f"Data split: Training samples = {len(X_train)}, Testing samples = {len(X_test)}")

    # 5. Standardize features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    logger.info("Features standardized using StandardScaler.")

    joblib.dump(scaler, f"{output_dir}/scaler.pkl")

    # 6. Define the ANN model architecture using the new hyperparameters
    input_shape = [X_train_scaled.shape[1]]

    model = models.Sequential([
        # Layer 0
        layers.Dense(62, 
                     kernel_regularizer=regularizers.l2(2.1811651197905483e-05), 
                     input_shape=input_shape),
        layers.Activation('tanh'),
        layers.Dropout(0.17892153711630857),

        # Layer 1
        layers.Dense(170, 
                     kernel_regularizer=regularizers.l2(0.004620937198731296)),
        layers.Activation('relu'),
        layers.Dropout(0.3520070343362533),

        # Layer 2
        layers.Dense(26, 
                     kernel_regularizer=regularizers.l2(1.809685772545748e-05)),
        layers.Activation('elu'),
        layers.Dropout(0.01699774506122022),

        # Layer 3
        layers.Dense(19, 
                     kernel_regularizer=regularizers.l2(0.00032188995803460494)),
        layers.Activation('elu'),
        layers.Dropout(0.43474227661833453),

        # Output Layer
        layers.Dense(1, activation='linear')
    ])

    logger.info("ANN model architecture defined with new hyperparameters.")
    model.summary(print_fn=logger.info)

    # 7. Compile the model with the new learning rate
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.005965528253236619)
    model.compile(loss='mse', optimizer=optimizer, metrics=['mae'])
    logger.info("Model compiled with Adam optimizer, MSE loss, and MAE metric.")

    # 8. Train the model with the new batch size
    history = model.fit(X_train_scaled, y_train,
                        epochs=epochs,
                        batch_size=32,
                        validation_split=0.1,
                        verbose=1)
    logger.info("Model training complete.")

    # 9. Save the trained model
    model_save_path = os.path.join(output_dir, 'ANN_regressor.h5')
    model.save(model_save_path)
    logger.info(f"Trained model saved to '{model_save_path}'")
    
    return model, scaler

# --- Usage Example ---
if __name__ == "__main__":
    # Define your paths
    data_file = "data/data.xlsx"
    feature_names_file = "data/ANN_feature_cols.xlsx"
    target_column = "FEV1_FVC"
    epochs = 1000 # Number of training epochs
    output_directory = 'models/ANN_regressor' # Output folder for model

    
    logger.info("\n--- Starting ANN model training process ---")
    trained_model, feature_scaler = train_ann_model(data_file, feature_names_file, target_column, epochs, output_directory)
    logger.info("\n--- ANN model training process finished ---")

2025-08-25 14:10:08,679 - INFO - 
--- Starting ANN model training process ---
2025-08-25 14:10:08,680 - INFO - Ensured output directory exists at: 'models/ANN_regressor'
2025-08-25 14:10:08,687 - INFO - Loaded 62 feature names from 'data/ANN_feature_cols.xlsx'
2025-08-25 14:10:09,190 - INFO - Successfully loaded data from 'data/data.xlsx'
2025-08-25 14:10:09,193 - INFO - Data split: Training samples = 485, Testing samples = 122
2025-08-25 14:10:09,196 - INFO - Features standardized using StandardScaler.
2025-08-25 14:10:09,225 - INFO - ANN model architecture defined with new hyperparameters.
2025-08-25 14:10:09,226 - INFO - Model: "sequential_1"
2025-08-25 14:10:09,226 - INFO - _________________________________________________________________
2025-08-25 14:10:09,226 - INFO -  Layer (type)                Output Shape              Param #   
2025-08-25 14:10:09,227 - INFO -  dense_5 (Dense)             (None, 62)                3906      
2025-08-25 14:10:09,228 - INFO -                 

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

2025-08-25 14:10:43,822 - INFO - Model training complete.
  saving_api.save_model(
2025-08-25 14:10:43,988 - INFO - Trained model saved to 'models/ANN_regressor\ANN_regressor.h5'
2025-08-25 14:10:43,988 - INFO - 
--- ANN model training process finished ---
