# Pruning Experiment for LSTM on Energy Prediction Dataset

In [1]:
import torch
import torch.nn as nn
import torch_pruning as tp
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from torch.utils.data import Dataset, DataLoader
from torch import optim
import os
import time
import copy

### 1. LSTM Model Definition

In [None]:
class TimeSeriesLSTM(nn.Module):
    def __init__(self, input_size: int, hidden_size: int, num_layers: int, output_size: int, dropout_rate: float = 0.5):
        super(TimeSeriesLSTM, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        self.lstm.flatten_parameters()
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = out[:, -1, :]
        out = self.dropout(out)
        out = self.fc(out)
        return out

In [None]:
class TimeSeriesLSTM_MoreLayers(nn.Module): # Renamed for clarity
    def __init__(self, input_size, hidden_size, num_layers,
                 intermediate_size_1=32, # Size for the first intermediate layer
                 intermediate_size_2=24, # Size for the second
                 intermediate_size_3=16, # Size for the third
                 output_size=1,
                 dropout_prob=0.2):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True, dropout=dropout_prob if num_layers > 1 else 0)

        # Define the intermediate Linear layers
        # Layer 1
        self.intermediate_fc1 = nn.Linear(hidden_size, intermediate_size_1)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_prob)

        # Layer 2
        self.intermediate_fc2 = nn.Linear(intermediate_size_1, intermediate_size_2)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout_prob)

        # Layer 3
        self.intermediate_fc3 = nn.Linear(intermediate_size_2, intermediate_size_3)
        self.relu3 = nn.ReLU()
        self.dropout3 = nn.Dropout(dropout_prob)

        # Final output layer
        self.fc_final = nn.Linear(intermediate_size_3, output_size)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        last_time_step_out = lstm_out[:, -1, :] # Shape: (batch, hidden_size)

        # Pass through intermediate layers
        x = self.intermediate_fc1(last_time_step_out)
        x = self.relu1(x)
        x = self.dropout1(x) # Shape: (batch, intermediate_size_1)

        x = self.intermediate_fc2(x)
        x = self.relu2(x)
        x = self.dropout2(x) # Shape: (batch, intermediate_size_2)

        x = self.intermediate_fc3(x)
        x = self.relu3(x)
        x = self.dropout3(x) # Shape: (batch, intermediate_size_3)

        final_out = self.fc_final(x) # Shape: (batch, output_size)
        return final_out

#### Seed block

In [21]:
class IntermediateBlock(nn.Module):
    def __init__(self, in_features, out_features, dropout_prob=0.2):
        """
        A block consisting of a Linear layer, ReLU activation, and Dropout.
        Args:
            in_features (int): Number of input features to the Linear layer.
            out_features (int): Number of output features from the Linear layer.
            dropout_prob (float): Dropout probability.
        """
        super(IntermediateBlock, self).__init__()
        self.fc = nn.Linear(in_features, out_features)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x):
        x = self.fc(x)
        x = self.relu(x)
        x = self.dropout(x)
        return x

#### LSTM with Intermidiate Blocks

In [22]:
class TimeSeriesLSTM_WithBlocks(nn.Module):
    def __init__(self, input_size, lstm_hidden_size, num_lstm_layers,
                 block_configs, # List of output sizes for each intermediate block
                 output_size=1,
                 lstm_dropout_prob=0.2, # Dropout for LSTM layers if num_layers > 1
                 block_dropout_prob=0.2): # Dropout within each IntermediateBlock
        """
        LSTM model followed by a sequence of IntermediateBlocks.
        Args:
            input_size (int): Number of features per time step for LSTM.
            lstm_hidden_size (int): Hidden size of the LSTM layer(s).
            num_lstm_layers (int): Number of stacked LSTM layers.
            block_configs (list of int): A list where each integer is the
                                         'out_features' for an IntermediateBlock.
                                         e.g., [48, 32, 24] for three blocks.
            output_size (int): Final output dimension (e.g., 1 for regression).
            lstm_dropout_prob (float): Dropout probability for LSTM (if num_layers > 1).
            block_dropout_prob (float): Dropout probability for each IntermediateBlock.
        """
        super(TimeSeriesLSTM_WithBlocks, self).__init__()
        self.input_size = input_size
        self.lstm_hidden_size = lstm_hidden_size
        self.num_lstm_layers = num_lstm_layers
        self.output_size = output_size

        self.lstm = nn.LSTM(input_size, lstm_hidden_size, num_lstm_layers,
                            batch_first=True,
                            dropout=lstm_dropout_prob if num_lstm_layers > 1 else 0)

        # Create intermediate blocks dynamically
        self.intermediate_blocks = nn.ModuleList()
        current_in_features = lstm_hidden_size # Input to the first block is LSTM's output

        for block_out_features in block_configs:
            self.intermediate_blocks.append(
                IntermediateBlock(current_in_features, block_out_features, block_dropout_prob)
            )
            current_in_features = block_out_features # Output of this block is input to the next

        # Final output layer takes input from the last intermediate block
        self.fc_final = nn.Linear(current_in_features, output_size)

    def forward(self, x):
        # LSTM part
        lstm_out, _ = self.lstm(x)
        # Use the output from the last time step of the LSTM
        x = lstm_out[:, -1, :] # Shape: (batch, lstm_hidden_size)

        # Pass through intermediate blocks
        for block in self.intermediate_blocks:
            x = block(x)
        # After loop, x shape: (batch, block_configs[-1]) if block_configs is not empty
        # or (batch, lstm_hidden_size) if block_configs is empty

        # Final output
        final_out = self.fc_final(x)
        return final_out

### 2. Data Handling for Appliances Energy Dataset

#### --- Data Configuration ---

In [4]:
DATASET_PATH = './data/energydata_complete.csv' # ADJUST PATH AS NEEDED
SEQUENCE_LENGTH = 6 * 12 # Use 12 hours of past data (12 hours * 6 samples/hour)
TARGET_COLUMN = 'Appliances'
# Features to use (excluding target, date, and others)
FEATURE_COLUMNS = [
    'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4', 'RH_4', 'T5', 'RH_5',
    'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9', 'RH_9', 'T_out',
    'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility', 'Tdewpoint'
]

#### --- Helper function to create sequences ---

In [23]:
def create_sequences(input_data, target_data, seq_length):
    sequences = []
    targets = []
    # Stop seq_length steps early to ensure target data is available
    for i in range(len(input_data) - seq_length):
        sequences.append(input_data[i:i + seq_length])
        targets.append(target_data[i + seq_length])
    return np.array(sequences), np.array(targets)

#### --- Custom Dataset ---

In [24]:
class EnergyDataset(Dataset):
    def __init__(self, sequences, targets):
        self.sequences = torch.tensor(sequences, dtype=torch.float32)
        self.targets = torch.tensor(targets, dtype=torch.float32).unsqueeze(1) # Target shape [N, 1]

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.targets[idx]

#### --- Main Data Loading Function ---

In [25]:
def get_energy_data_loaders(
    file_path, # Pass these from config
    feature_cols,
    target_col,
    seq_length,
    batch_size=64,
    test_size_ratio=0.2, # Ratio for test set
    val_size_ratio=0.125 # Ratio for validation set FROM THE REMAINING (0.125 * 0.8 = 0.1 of total)
    ):
    """Loads, preprocesses, scales, and creates sequences for the energy dataset."""
    print(f"Loading dataset from: {file_path}")
    try:
        # 1. Load Data & Initial Processing
        df = pd.read_csv(file_path)
        print(f"Original data shape: {df.shape}")

        # --- Convert 'date' to datetime and sort ---
        if 'date' not in df.columns:
            raise ValueError("'date' column not found in the dataset.")
        df['date'] = pd.to_datetime(df['date'])
        df = df.sort_values('date').reset_index(drop=True) # Ensure sorted and clean index

        # --- Select relevant columns ---
        all_cols_to_keep = list(set(feature_cols + [target_col])) # Use set to avoid duplicates
        missing_cols = [col for col in all_cols_to_keep if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Following columns are missing from the dataset: {missing_cols}")
        df_selected = df[all_cols_to_keep]

        # --- Ensure all features are numeric (important before scaling) ---
        for col in feature_cols:
            if not pd.api.types.is_numeric_dtype(df_selected[col]):
                try:
                    df_selected[col] = pd.to_numeric(df_selected[col], errors='coerce') # Try to convert
                    print(f"Warning: Column '{col}' was non-numeric, converted to numeric. Check for NaNs.")
                except Exception as e:
                    raise ValueError(f"Column '{col}' is non-numeric and could not be converted. Error: {e}")

        # --- Handle NaNs AFTER potential conversion and selection ---
        initial_rows = len(df_selected)
        df_selected = df_selected.dropna()
        if len(df_selected) < initial_rows:
            print(f"Dropped {initial_rows - len(df_selected)} rows due to NaNs.")
        if df_selected.empty:
            raise ValueError("DataFrame is empty after selecting columns and dropping NaNs.")
        print(f"Data shape after selecting columns & cleaning NaNs: {df_selected.shape}")


        # 2. Separate Features and Target
        X = df_selected[feature_cols].values
        y = df_selected[[target_col]].values # Keep as 2D: [N, 1]

        # 3. Splitting (Chronological)
        n_total = len(X)
        n_test = int(n_total * test_size_ratio)
        n_val = int((n_total - n_test) * val_size_ratio) # Val size from remaining train_val data
        n_train = n_total - n_test - n_val

        # Ensure each split has enough data for at least one sequence + target
        min_data_needed = seq_length + 1
        if n_train < min_data_needed:
             raise ValueError(f"Not enough data for training sequences ({n_train} rows < {min_data_needed} needed). Try decreasing seq_length or test/val ratios.")
        if n_val < min_data_needed:
             raise ValueError(f"Not enough data for validation sequences ({n_val} rows < {min_data_needed} needed).")
        if n_test < min_data_needed:
             raise ValueError(f"Not enough data for test sequences ({n_test} rows < {min_data_needed} needed).")

        X_train, y_train = X[:n_train], y[:n_train]
        X_val, y_val = X[n_train : n_train + n_val], y[n_train : n_train + n_val]
        X_test, y_test = X[n_train + n_val:], y[n_train + n_val:]
        print(f"Data split: Train={X_train.shape[0]}, Val={X_val.shape[0]}, Test={X_test.shape[0]}")

        # 4. Scaling
        scaler_features = MinMaxScaler()
        scaler_target = MinMaxScaler()

        # Fit on training data ONLY
        X_train_scaled = scaler_features.fit_transform(X_train)
        y_train_scaled = scaler_target.fit_transform(y_train)

        # Transform validation and test data
        X_val_scaled = scaler_features.transform(X_val)
        y_val_scaled = scaler_target.transform(y_val)
        X_test_scaled = scaler_features.transform(X_test)
        y_test_scaled = scaler_target.transform(y_test)
        print("Features and target scaled using MinMaxScaler (fit on train only).")

        # 5. Create Sequences
        print("Creating sequences...")
        X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train_scaled.flatten(), seq_length)
        X_val_seq, y_val_seq = create_sequences(X_val_scaled, y_val_scaled.flatten(), seq_length)
        X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test_scaled.flatten(), seq_length)

        if len(X_train_seq) == 0 or len(X_val_seq) == 0 or len(X_test_seq) == 0:
            raise ValueError("Sequence creation resulted in one or more empty datasets. Check split sizes and seq_length.")

        # 6. Create Datasets and DataLoaders
        train_dataset = EnergyDataset(X_train_seq, y_train_seq)
        val_dataset = EnergyDataset(X_val_seq, y_val_seq)
        test_dataset = EnergyDataset(X_test_seq, y_test_seq)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True) # Shuffle train
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        input_size = X_train_scaled.shape[1]
        scalers = {'features': scaler_features, 'target': scaler_target}

        print(f"Data loaded successfully:")
        print(f"  Input features per step: {input_size}")
        print(f"  Sequence length: {seq_length}")
        print(f"  Train sequences/batches: {len(train_dataset)} / {len(train_loader)}")
        print(f"  Val sequences/batches: {len(val_dataset)} / {len(val_loader)}")
        print(f"  Test sequences/batches: {len(test_dataset)} / {len(test_loader)}")

        return train_loader, val_loader, test_loader, input_size, seq_length, scalers

    except FileNotFoundError:
        print(f"Error: Dataset file not found at {file_path}")
    except ValueError as ve:
        print(f"ValueError during data processing: {ve}")
    except Exception as e:
        print(f"An unexpected error occurred during data loading: {e}")
        import traceback
        traceback.print_exc()
    return None, None, None, 0, 0, None


### 3. Training Function (Regression)

In [26]:
import torch
import torch.nn as nn
import torch.optim as optim # Explicitly import optim
import time
import os # For saving models
import numpy as np # For inf

# Assume other necessary imports like model definition are present

def train_model_regression(
    model,
    train_loader,
    criterion,
    optimizer,
    device,
    num_epochs,
    val_loader=None,
    model_path_prefix="best_model", # Prefix for saving model files
    grad_clip=None,
    lr_scheduler_config=None, # Dict: {'type': 'ReduceLROnPlateau', 'patience': 5, 'factor': 0.5} or None
    early_stopping_patience=None # Int: e.g., 10, or None to disable
    ):
    """
    Trains a model for a regression task with optional LR scheduling and early stopping.

    Args:
        model (nn.Module): The PyTorch model to train.
        train_loader (DataLoader): DataLoader for the training data.
        criterion (nn.Module): The loss function (e.g., nn.MSELoss).
        optimizer (optim.Optimizer): The optimizer (e.g., Adam).
        device (torch.device): The device to train on ('cuda' or 'cpu').
        num_epochs (int): The total number of epochs to train for.
        val_loader (DataLoader, optional): DataLoader for validation data. If None,
                                           validation and best model saving are skipped.
        model_path_prefix (str, optional): Prefix for saving the best model's state_dict.
                                           e.g., "output_dir/model_name" -> "output_dir/model_name_best_val.pth"
        grad_clip (float, optional): Maximum norm for gradient clipping.
        lr_scheduler_config (dict, optional): Configuration for learning rate scheduler.
            Example: {'type': 'ReduceLROnPlateau', 'patience': 5, 'factor': 0.5, 'min_lr': 1e-7}
                     {'type': 'StepLR', 'step_size': 10, 'gamma': 0.1}
        early_stopping_patience (int, optional): Number of epochs to wait for improvement
                                                 before stopping. If None, disabled.

    Returns:
        tuple: (trained_model, train_losses_history, val_losses_history)
               The model after training (could be last epoch or best if loaded back).
               Lists of training and validation losses per epoch.
    """

    best_val_loss = float('inf')
    epochs_no_improve = 0
    train_losses_history = []
    val_losses_history = []

    # --- Initialize Learning Rate Scheduler ---
    scheduler = None
    if lr_scheduler_config and val_loader: # Scheduler often relies on validation metrics
        scheduler_type = lr_scheduler_config.get('type', 'ReduceLROnPlateau').lower()
        if scheduler_type == 'reducelronplateau':
            scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                optimizer,
                mode='min', # For loss
                factor=lr_scheduler_config.get('factor', 0.1),
                patience=lr_scheduler_config.get('patience', 10),
                verbose=True,
                min_lr=lr_scheduler_config.get('min_lr', 0)
            )
            print(f"Using ReduceLROnPlateau scheduler (factor={scheduler.factor}, patience={scheduler.patience})")
        elif scheduler_type == 'steplr':
            scheduler = optim.lr_scheduler.StepLR(
                optimizer,
                step_size=lr_scheduler_config.get('step_size', 10),
                gamma=lr_scheduler_config.get('gamma', 0.1),
                verbose=True
            )
            print(f"Using StepLR scheduler (step_size={scheduler.step_size}, gamma={scheduler.gamma})")
        # Add other schedulers like CosineAnnealingLR if needed
        else:
            print(f"Warning: Unknown scheduler type '{scheduler_type}'. No scheduler will be used.")


    print(f"Starting training for up to {num_epochs} epochs...")
    for epoch in range(num_epochs):
        model.train() # Set model to training mode
        running_train_loss = 0.0
        epoch_start_time = time.time()

        for i, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()

            if grad_clip:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip)

            optimizer.step()
            running_train_loss += loss.item() * inputs.size(0) # Accumulate loss scaled by batch size

        epoch_train_loss = running_train_loss / len(train_loader.dataset) # Average loss per sample
        train_losses_history.append(epoch_train_loss)
        epoch_time = time.time() - epoch_start_time

        log_msg = f"Epoch {epoch+1}/{num_epochs}: Train Loss={epoch_train_loss:.6f}, Time: {epoch_time:.2f}s"

        current_lr = optimizer.param_groups[0]['lr'] # Get current LR
        log_msg += f", LR: {current_lr:.2e}"


        # --- Validation Step (if val_loader is provided) ---
        epoch_val_loss = None
        if val_loader:
            model.eval() # Set model to evaluation mode
            running_val_loss = 0.0
            with torch.no_grad():
                for inputs_val, labels_val in val_loader:
                    inputs_val, labels_val = inputs_val.to(device), labels_val.to(device)
                    outputs_val = model(inputs_val)
                    val_loss_batch = criterion(outputs_val, labels_val)
                    running_val_loss += val_loss_batch.item() * inputs_val.size(0)

            epoch_val_loss = running_val_loss / len(val_loader.dataset)
            val_losses_history.append(epoch_val_loss)
            log_msg += f", Val Loss={epoch_val_loss:.6f}"

            # --- Learning Rate Scheduler Step (if applicable) ---
            if scheduler:
                if isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau):
                    scheduler.step(epoch_val_loss) # ReduceLROnPlateau needs the metric
                # For schedulers like StepLR, CosineAnnealingLR, call step() without metric
                # elif not isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau):
                #     scheduler.step() # This placement is common for epoch-based schedulers other than ReduceLROnPlateau

            # --- Best Model Saving ---
            if epoch_val_loss < best_val_loss:
                best_val_loss = epoch_val_loss
                epochs_no_improve = 0 # Reset counter
                best_model_path = f"{model_path_prefix}_best_val.pth"
                try:
                    # Ensure the directory exists
                    os.makedirs(os.path.dirname(best_model_path), exist_ok=True)
                    torch.save(model.state_dict(), best_model_path)
                    log_msg += f" (Best Val Loss: {best_val_loss:.6f} -> Model Saved)"
                except Exception as e:
                    log_msg += f" (Error saving best model: {e})"
            else:
                epochs_no_improve += 1
                log_msg += f" (Val Loss did not improve for {epochs_no_improve} epoch(s))"

            # --- Early Stopping Check ---
            if early_stopping_patience and epochs_no_improve >= early_stopping_patience:
                print(f"\nEarly stopping triggered after {epoch+1} epochs due to no improvement in validation loss for {early_stopping_patience} consecutive epochs.")
                break # Exit the training loop

            model.train() # Switch back to training mode for the next epoch
        else: # No validation loader
            val_losses_history.append(np.nan) # Placeholder if no validation
            # For schedulers that don't need validation metric (like StepLR)
            if scheduler and not isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau):
                scheduler.step()


        print(log_msg)
        # End of epoch loop

    print("Training finished.")

    # Optional: Load the best model weights back into the model if validation was performed
    if val_loader and os.path.exists(f"{model_path_prefix}_best_val.pth"):
        print(f"Loading best model weights from {model_path_prefix}_best_val.pth")
        try:
            model.load_state_dict(torch.load(f"{model_path_prefix}_best_val.pth", map_location=device))
        except Exception as e:
            print(f"Error loading best model weights: {e}. Returning model from last epoch.")

    return model, train_losses_history, val_losses_history

### 4. Evaluation Function (Regression)

In [27]:
def calculate_macs_params(model, example_input):
    # Ensure example_input is on the right device
    device = next(model.parameters()).device
    example_input = example_input.to(device)
    # tp.utils.count_ops_and_params can fail with LSTMs sometimes. Use torchinfo as fallback.
    try:
         macs, params = tp.utils.count_ops_and_params(model, example_input)
         return macs, params
    except Exception as e:
        print(f"Warning: torch_pruning MACs calculation failed ({e}). Falling back to torchinfo estimate.")
        try:
             from torchinfo import summary
             # Correct input format for torchinfo might be needed depending on version
             # Try with tuple (common format) or just the tensor
             input_data_shape = example_input.shape
             model_summary = summary(model, input_size=input_data_shape, verbose=0)
             params = model_summary.total_params
             macs = model_summary.total_mult_adds
             print(f"torchinfo estimate: Params={params}, MACs={macs}")
             return macs, params
        except Exception as e2:
            print(f"Warning: torchinfo calculation also failed ({e2}). Returning 0 for MACs/Params.")
            return 0, sum(p.numel() for p in model.parameters()) # Return at least params

def evaluate_model_regression(model, test_loader, example_input, device, scalers=None):
    model.eval()
    macs, params = calculate_macs_params(model, example_input) # Handles potential LSTM issues
    size_mb = params * 4 / 1e6 # Assumes float32

    all_outputs = []
    all_labels = []

    print("Evaluating on test set...")
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            all_outputs.append(outputs.cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    all_outputs = np.concatenate(all_outputs, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    outputs_eval = all_outputs
    labels_eval = all_labels

    # Inverse transform for interpretable metrics
    if scalers and 'target' in scalers:
        try:
            outputs_eval = scalers['target'].inverse_transform(all_outputs)
            labels_eval = scalers['target'].inverse_transform(all_labels)
            print("Metrics calculated on original scale.")
        except Exception as e:
            print(f"Warning: Could not inverse transform. Metrics on scaled data. Error: {e}")
    else:
         print("Warning: Target scaler not provided. Metrics calculated on scaled data.")


    mse = mean_squared_error(labels_eval, outputs_eval)
    mae = mean_absolute_error(labels_eval, outputs_eval)
    r2 = r2_score(labels_eval, outputs_eval)
    rmse = np.sqrt(mse)
    # MAPE calculation - handle potential zeros in labels_eval
    epsilon = 1e-8 # Small value to avoid division by zero
    mape = np.mean(np.abs((labels_eval - outputs_eval) / (labels_eval + epsilon))) * 100


    print(f"Evaluation Metrics: MSE={mse:.4f}, MAE={mae:.4f}, RMSE={rmse:.4f}, R2={r2:.4f}, MAPE={mape:.2f}%")

    return {
        'macs': macs,
        'params': params,
        'size_mb': size_mb,
        'mse': mse,
        'mae': mae,
        'rmse': rmse,
        'r2': r2,
        'mape': mape,
        'performance': mse # Use MSE as the primary performance metric (lower is better)
    }

### 5. Pruning Function (Adapted for LSTM)

In [28]:
def prune_lstm_model_by_threshold(
    model,
    example_input_bs1, # BS=1 for MACs/Params calc & non-grad strategies
    target_macs,
    target_params,
    strategy,
    max_iterations=50,
    step_pruning_ratio=0.1,
    gradient_batch=None, # Dict {'inputs': T, 'labels': T} with BS > 1
    prunable_modules=None # List of specific layers (e.g., [model.fc])
    ):
    device = next(model.parameters()).device

    needs_gradient = isinstance(strategy['importance'], (
        tp.importance.TaylorImportance,
        tp.importance.GroupHessianImportance
    ))
    if needs_gradient:
        if gradient_batch is None: raise ValueError(f"Strategy needs 'gradient_batch'.")
        if gradient_batch['inputs'].shape[0] <= 1: raise ValueError(f"Need BS > 1 in gradient_batch")
        gradient_inputs = gradient_batch['inputs'].to(device)
        gradient_labels = gradient_batch['labels'].to(device)

    print(f"--- Starting Pruning ({strategy['importance'].__class__.__name__}) ---")
    print(f"Target MACs: <= {target_macs:,.0f}, Target Params: <= {target_params:,.0f}")
    print(f"Step Ratio: {step_pruning_ratio:.2f}, Max Iter: {max_iterations}")

    # Determine layers to prune/ignore
    if not prunable_modules:
         prunable_modules = [m for m in model.modules() if isinstance(m, nn.Linear)]
         print(f"Defaulting to pruning nn.Linear layers: {[m.__class__.__name__ for m in prunable_modules]}")
    else:
         print(f"Targeting specific modules for pruning: {[m.__class__.__name__ for m in prunable_modules]}")

    modules_to_ignore = [m for m in model.modules() if isinstance(m, (nn.Linear, nn.Conv2d, nn.LSTM)) and m not in prunable_modules]
    root_types = list(set(type(m) for m in prunable_modules))
    if not root_types:
        print("Warning: No prunable module types identified. Pruning may fail.")
        root_types = [nn.Linear] # Fallback guess


    pruner = strategy['pruner'](
        model,
        example_input_bs1.to(device),
        importance=strategy['importance'],
        pruning_ratio=step_pruning_ratio,
        root_module_types=root_types,
        ignored_layers=modules_to_ignore,
    )

    initial_macs, initial_params = calculate_macs_params(model, example_input_bs1.to(device))
    current_macs, current_params = initial_macs, initial_params
    print(f"Initial State | MACs: {current_macs:,.0f}, Params: {current_params:,.0f}")
    # --- Remove or comment out these two lines ---
    # prunable_layer_names = [layer.__class__.__name__ for layer in pruner.get_pruning_layers()] # <--- ERROR HERE
    # print(f"Detected Prunable Layers by tp: {prunable_layer_names}")
    # --------------------------------------------
    if initial_macs == 0 and initial_params == 0: # Check if initial calc failed
         print("Warning: Initial MACs/Params calculation failed or returned zero. Cannot proceed.")
         return model # Or raise error

    # Existing check (adjust slightly): If no prunable layers are implicitly handled by pruner.step, it will just return empty groups.
    # We don't need the explicit check here anymore. The loop checking `if not pruning_groups:` will handle it.
    # if not prunable_layer_names and initial_macs > 0:
    #     print("Warning: torch-pruning did not detect any prunable layers matching criteria.")


    iteration = 0
    model.eval()
    criterion = nn.MSELoss().to(device) # Loss for gradient calculation

    while (current_macs > target_macs or current_params > target_params) and iteration < max_iterations:
        iteration += 1
        macs_before_step = current_macs
        params_before_step = current_params

        if needs_gradient:
            model.train()
            input_for_grad = gradient_inputs.detach().clone()
            labels_for_grad = gradient_labels.detach().clone()
            try:
                for param in model.parameters(): param.requires_grad_(True)
                outputs = model(input_for_grad)
                loss = criterion(outputs, labels_for_grad)
                model.zero_grad()
                loss.backward()
            except Exception as e:
                print(f"\nError during backward (Iter {iteration}): {e}. Stopping.")
                model.eval() ; break
            finally:
                model.eval() ; model.zero_grad(set_to_none=True)

        try:
            pruning_groups = list(pruner.step(interactive=True))
        except Exception as e:
            print(f"\nError during pruner.step() (Iter {iteration}): {e}. Stopping.")
            break

        if not pruning_groups:
            print(f"Iter {iteration}: No more candidates found by pruner. Stopping.")
            break

        for group in pruning_groups: group.prune()

        current_macs, current_params = calculate_macs_params(model, example_input_bs1.to(device))
        macs_reduced_pct = (macs_before_step - current_macs) / initial_macs * 100 if initial_macs > 0 else 0
        params_reduced_pct = (params_before_step - current_params) / initial_params * 100 if initial_params > 0 else 0

        print(
            f"Iter {iteration: >3}/{max_iterations} | "
            f"MACs: {current_macs:,.0f} ({macs_reduced_pct:+6.1f}% R) | "
            f"Params: {current_params:,.0f} ({params_reduced_pct:+6.1f}% R)"
        )

        if current_macs >= macs_before_step and current_params >= params_before_step:
            if not (current_macs <= target_macs and current_params <= target_params):
                print(f"Iter {iteration}: No reduction. Stopping.") ; break
            else: break # Targets met

    # --- Final Report ---
    print(f"--- Finished Pruning ({strategy['importance'].__class__.__name__}) ---")
    if iteration >= max_iterations: print(f"Warning: Reached max iterations ({max_iterations}).")
    final_macs, final_params = calculate_macs_params(model, example_input_bs1.to(device))
    macs_reduction = (initial_macs - final_macs) / initial_macs * 100 if initial_macs > 0 else 0
    params_reduction = (initial_params - final_params) / initial_params * 100 if initial_params > 0 else 0
    print(f"Initial | MACs: {initial_macs:,.0f}, Params: {initial_params:,.0f}")
    print(f"Final   | MACs: {final_macs:,.0f} (Reduction: {macs_reduction:.2f}%)")
    print(f"        | Params: {final_params:,.0f} (Reduction: {params_reduction:.2f}%)")
    print(f"Target  | MACs <= {target_macs:,.0f}, Params <= {target_params:,.0f}")
    if final_macs > target_macs or final_params > target_params:
         print("Warning: Pruning finished, but target threshold(s) were not fully met.")

    model.eval()
    return model

### 6. Comparison and Plotting Function (Regression)

In [29]:
def compare_results_and_plot_regression(results, metric_key='mse', lower_is_better=True, output_dir='output'):
    if not results: print("No results to plot.") ; return
    valid_results = {k: v for k, v in results.items() if isinstance(v, dict) and all(m in v for m in ['macs', 'params', metric_key])}
    if not valid_results: print("No valid results entries found for plotting.") ; return

    strategy_order = []
    if 'initial' in valid_results: strategy_order.append('initial')
    strategy_order.extend([s for s in valid_results if s != 'initial'])
    if not strategy_order: print("No strategies to plot."); return

    # --- Print Table ---
    metric_name = metric_key.upper()
    print(f"\n=== Pruning Strategy Comparison (Metric: {metric_name}) ===")
    header = f"{'Strategy':<15} | {'MACs (M)':<10} | {'Params (K)':<10} | {'Size (MB)':<10} | {metric_name:<12}"
    print(header); print("-" * len(header))
    for strategy in strategy_order:
        metrics = valid_results[strategy]
        macs_m = metrics['macs']/1e6 if metrics['macs'] is not None else 0
        params_k = metrics['params']/1e3 if metrics['params'] is not None else 0
        print(f"{strategy:<15} | {macs_m:<10.2f} | {params_k:<10.1f} | {metrics.get('size_mb', 0):>10.2f} | {metrics[metric_key]:>12.4f}")

    # --- Generate Bar Charts ---
    os.makedirs(output_dir, exist_ok=True)
    metrics_to_plot = ['macs', 'params', 'size_mb', metric_key]
    base_titles = {'macs': 'MACs', 'params': 'Parameters', 'size_mb': 'Model Size (MB)', metric_key: metric_name}
    plot_titles = {k: f'{v} Comparison (Lower is Better)' for k, v in base_titles.items()}
    if not lower_is_better:
        plot_titles[metric_key] = f'{base_titles[metric_key]} Comparison (Higher is Better)'

    colors = plt.cm.viridis(np.linspace(0, 1, len(strategy_order)))
    initial_metrics = valid_results.get('initial', None)

    for plot_metric in metrics_to_plot:
        if not all(plot_metric in valid_results[s] for s in strategy_order):
             print(f"Skipping plot for {plot_metric} as it's missing from some results.")
             continue
        values = [valid_results[strategy][plot_metric] for strategy in strategy_order]

        plt.figure(figsize=(12, 7))
        bars = plt.bar(strategy_order, values, color=colors)
        plt.ylabel(base_titles[plot_metric])
        plt.title(plot_titles[plot_metric])
        plt.xticks(rotation=45, ha='right')
        plt.grid(axis='y', linestyle='--', alpha=0.7)

        # Add value labels
        max_val = max(values) if values else 0
        for i, bar in enumerate(bars):
            yval = bar.get_height()
            label = ""
            # ... (Use the formatting logic from previous `compare_results_and_plot_regression` function) ...
            if plot_metric == 'macs': label = f'{yval/1e6:.2f}M' if yval > 1e5 else f'{yval:,.0f}' # Adjust format
            elif plot_metric == 'params': label = f'{yval/1e3:.1f}K' if yval > 100 else f'{yval:,.0f}' # Adjust format
            elif plot_metric == 'size_mb': label = f'{yval:.2f}'
            else: label = f'{yval:.4f}' # Regression metric
            plt.text(bar.get_x() + bar.get_width()/2., yval + 0.01 * max_val, label, ha='center', va='bottom', fontsize=8, rotation=0)

        if initial_metrics and plot_metric in initial_metrics:
             initial_value = initial_metrics[plot_metric]
             plt.axhline(y=initial_value, color='r', linestyle='--', label=f'Initial Value')
             plt.legend()

        plt.tight_layout()
        save_path = os.path.join(output_dir, f'lstm_energy_{plot_metric}_comparison.png')
        try:
            plt.savefig(save_path)
        except Exception as e:
             print(f"Error saving plot {save_path}: {e}")
        plt.close()
    print(f"Comparison plots saved to {output_dir}")

In [30]:
import torch
import torch.onnx

# [...] other imports

def save_model_as_onnx(model, example_input, output_path, opset_version=13):
    """Saves the PyTorch model as ONNX."""
    # Ensure model is on the same device as the example input for export
    device = example_input.device
    model.to(device)
    model.eval() # Ensure model is in evaluation mode

    print(f"Attempting to save model to ONNX: {output_path}")
    print(f"Using example input shape: {example_input.shape}")

    try:
        torch.onnx.export(
            model,
            example_input, # Must have the correct shape (batch, seq_len, features)
            output_path,
            export_params=True,       # Store the trained parameter weights inside the model file
            opset_version=opset_version,    # The ONNX version to export the model to
            do_constant_folding=True, # Optional: optimizes the model
            input_names=['input'],    # Specify names for input nodes
            output_names=['output'],  # Specify names for output nodes
            dynamic_axes={            # Allow variable batch size
                'input': {0: 'batch_size'},
                'output': {0: 'batch_size'}
            }
        )
        print(f"✅ Model successfully saved as ONNX to {output_path}")
        return True

    except Exception as e:
        print(f"❌ Failed to save model as ONNX: {e}")
        import traceback
        traceback.print_exc()
        return False

### 7. Main Workflow (LSTM Energy Prediction)

In [33]:
def main_lstm():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    lstm_hidden_size_val = 256
    # --- Configuration ---
    config = {
        'dataset_path': './data/energydata_complete.csv', # ADJUST AS NEEDED
        'target_column_name': 'Appliances',
        'feature_column_names': [
            'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4', 'RH_4', 'T5', 'RH_5',
            'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9', 'RH_9', 'T_out',
            'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility', 'Tdewpoint'
        ], # Your list of features
        'sequence_length': 6 * 12, # 12 hours

        'strategies': {
            'Magnitude_L1': {'pruner': tp.pruner.MagnitudePruner, 'importance': tp.importance.MagnitudeImportance(p=1)},
            'Magnitude_L2': {'pruner': tp.pruner.MagnitudePruner, 'importance': tp.importance.MagnitudeImportance(p=2)},
            'Random': {'pruner': tp.pruner.MagnitudePruner, 'importance': tp.importance.RandomImportance()},
            'Taylor': {'pruner': tp.pruner.MagnitudePruner, 'importance': tp.importance.TaylorImportance()},
            'FPGM': {'pruner': tp.pruner.MagnitudePruner, 'importance': tp.importance.FPGMImportance()},
        },
        'learning_rate_scheduler': {
        'type': 'ReduceLROnPlateau', # or 'StepLR'
        'patience': 10,             # For ReduceLROnPlateau
        'factor': 0.5,              # For ReduceLROnPlateau
        'min_lr': 1e-7,             # For ReduceLROnPlateau
        # 'step_size': 15,          # For StepLR
        # 'gamma': 0.1,             # For StepLR
        },
        'early_stopping_patience': 20,
        # --- LSTM Hyperparameters ---
        'lstm_hidden_size': lstm_hidden_size_val,    # Example, adjust as needed
        'num_lstm_layers': 5,
        'lstm_dropout': 0.5,

        # --- Configuration for 7 Intermediate Blocks ---
        # Defines the output size of each block. The number of elements defines the number of blocks.
        # Example: Tapering down, starting relatively large after LSTM
        'block_output_features': [
            lstm_hidden_size_val // 2,
            lstm_hidden_size_val // 2,
            lstm_hidden_size_val // 3,
            lstm_hidden_size_val // 3,
            lstm_hidden_size_val // 4,
            lstm_hidden_size_val // 4,
            lstm_hidden_size_val // 5
        ],
        # 'block_output_features': [96, 80, 64, 48, 32, 24, 16], # Alternative: More explicit tapering

        'block_dropout': 0.6,       # Dropout for the intermediate blocks

        # --- Training ---
        'train_epochs': 50,         # May need more for a deeper model
        'fine_tune_epochs': 60,     # Also potentially more
        'batch_size': 64,
        'learning_rate_initial': 0.0005, # May need adjustment for deeper model
        'learning_rate_finetune': 0.0002,
        'grad_clip': 1.0,

        # --- Paths & Pruning ---
        'output_dir': './output/lstm_energy_7blocks', # New output directory
        'pruning_max_iterations': 100, # More iterations might be needed for deeper models
        'pruning_step_ratio': 0.15,    # Adjust step ratio
        'pruning_primary_metric': 'mse',

        # --- TARGETS FOR PRUNING ---
        'target_macs_absolute': 3_000_000_000, # 3 Billion MACs
        'target_params_absolute': None,      # Use size_mb for params target, effectively
        'target_size_mb_absolute': 5.0,    # 5 MB target size

        # Sparsity targets (will be overridden if absolute targets are set and initial model is larger)
        'target_macs_sparsity': 0.5,   # Target 50% MAC reduction
        'target_params_sparsity': 0.5, # Target 50% Params reduction
    }
    os.makedirs(config['output_dir'], exist_ok=True)

    # --- Initialize Data ---
    print("Loading and processing data...")
    train_loader, val_loader, test_loader, input_size, actual_seq_length_used, scalers = get_energy_data_loaders(
        file_path=config['dataset_path'],
        feature_cols=config['feature_column_names'],
        target_col=config['target_column_name'],
        seq_length=config['sequence_length'],
        batch_size=config['batch_size']
        # Add test_size_ratio, val_size_ratio here if you want to override defaults in get_energy_data_loaders
    )
    if train_loader is None:
        print("Failed to load data. Exiting.")
        return
    # Update sequence_length in config if it was adjusted by data loader (e.g. due to data size)
    config['sequence_length'] = actual_seq_length_used


    # --- Initialize Model ---
    model = TimeSeriesLSTM_WithBlocks(
        input_size=input_size,
        lstm_hidden_size=config['lstm_hidden_size'],
        num_lstm_layers=config['num_lstm_layers'],
        block_configs=config['block_output_features'],
        output_size=1,
        lstm_dropout_prob=config['lstm_dropout'],
        block_dropout_prob=config['block_dropout']
    )
    model.to(device)
    print("\nModel Architecture:")
    # Using torchinfo for a more detailed summary if available
    try:
        from torchinfo import summary as torchinfo_summary
        torchinfo_summary(model, input_size=(config['batch_size'], config['sequence_length'], input_size), verbose=0)
    except ImportError:
        print(model) # Fallback
    try:
        print(f"Initial model parameter device: {next(model.parameters()).device}")
    except StopIteration: print("Initial model has no parameters.")


    # --- Create Example Inputs ---
    example_input_bs1 = torch.randn(1, config['sequence_length'], input_size).to(device)
    example_gradient_batch = None
    try:
        grad_batch_data_peek = next(iter(train_loader))
        if grad_batch_data_peek[0].shape[0] > 1:
            example_gradient_batch = {'inputs': grad_batch_data_peek[0], 'labels': grad_batch_data_peek[1]}
            print(f"Obtained gradient batch with BS={example_gradient_batch['inputs'].shape[0]}")
        else: print("Warning: First train batch has BS=1 or less. Cannot use for Taylor gradient batch initially if Taylor is the first strategy.")
    except Exception as e: print(f"Could not get gradient batch from DataLoader: {e}")


    # --- Initial Training ---
    initial_model_base_path = os.path.join(config['output_dir'], "lstm_energy_initial")
    initial_model_pth_path = initial_model_base_path + ".pth" # Last epoch
    initial_best_model_pth_path = initial_model_base_path + "_best_val.pth"
    initial_model_onnx_path = initial_model_base_path + "_best_val.onnx"

    if not os.path.exists(initial_best_model_pth_path):
        print("\n--- Initial Training ---")
        optimizer = optim.Adam(model.parameters(), lr=config['learning_rate_initial'], weight_decay=config.get('weight_decay_initial', 0))
        model, train_hist, val_hist = train_model_regression(
            model=model, train_loader=train_loader, criterion=nn.MSELoss().to(device),
            optimizer=optimizer, device=device, num_epochs=config['train_epochs'],
            val_loader=val_loader, model_path_prefix=initial_model_base_path,
            grad_clip=config.get('grad_clip', None),
            lr_scheduler_config=config.get('learning_rate_scheduler', None),
            early_stopping_patience=config.get('early_stopping_patience', None)
        )
        torch.save(model.state_dict(), initial_model_pth_path) # Save last epoch state

        # Determine which model to use for ONNX & as base for pruning
        path_for_onnx_and_pruning_base = initial_model_pth_path # Default to last epoch
        if os.path.exists(initial_best_model_pth_path):
            print(f"Best initial model saved during training: {initial_best_model_pth_path}")
            model.load_state_dict(torch.load(initial_best_model_pth_path, map_location=device))
            path_for_onnx_and_pruning_base = initial_best_model_pth_path
        else:
            print(f"Warning: Best validation model file not found. Using last epoch from {initial_model_pth_path} for ONNX/pruning base.")

        print(f"\nSaving best available initial model (from {os.path.basename(path_for_onnx_and_pruning_base)}) as ONNX...")
        save_model_as_onnx(model, example_input_bs1, initial_model_onnx_path)
    else:
        print(f"\nLoading best initial model from {initial_best_model_pth_path}")
        model.load_state_dict(torch.load(initial_best_model_pth_path, map_location=device))
        if not os.path.exists(initial_model_onnx_path):
            print(f"Saving loaded best initial model ({os.path.basename(initial_best_model_pth_path)}) as ONNX...")
            save_model_as_onnx(model, example_input_bs1, initial_model_onnx_path)
        else:
            print(f"ONNX for initial model already exists: {initial_model_onnx_path}")


    # --- Evaluate Initial Model ---
    results = {}
    print("\n--- Evaluating Initial Model (Post loading best/last initial weights) ---")
    results['initial'] = evaluate_model_regression(model, test_loader, example_input_bs1, device, scalers)
    initial_macs = results['initial']['macs']
    initial_params = results['initial']['params']
    initial_size_mb = results['initial']['size_mb']
    print(f"Initial Model Eval: MACs={initial_macs:,.0f}, Params={initial_params:,.0f}, SizeMB={initial_size_mb:.2f}, MSE={results['initial']['mse']:.4f}")

    # --- Calculate Targets for Pruning (More Robust) ---
    target_macs_value = config['target_macs_absolute']
    if initial_macs is not None and initial_macs < target_macs_value: # If initial is already smaller
        print(f"Initial MACs ({initial_macs:,.0f}) is already less than target_macs_absolute ({target_macs_value:,.0f}). Adjusting target.")
        target_macs_value = initial_macs * (1 - config['target_macs_sparsity']) # Fallback to sparsity reduction

    target_size_mb_config = config.get('target_size_mb_absolute', None)
    if target_size_mb_config is not None:
        target_params_from_size_mb = int((target_size_mb_config * 1024 * 1024) / 4)
    else: # Fallback to sparsity if no size target
        target_params_from_size_mb = initial_params * (1 - config['target_params_sparsity']) if initial_params is not None else float('inf')

    target_params_value = target_params_from_size_mb # Primarily use size_mb target
    if initial_params is not None and initial_params < target_params_value: # If initial is already smaller
         print(f"Initial Params ({initial_params:,.0f}) is already less than target based on size_mb. Adjusting target.")
         target_params_value = initial_params * (1- config['target_params_sparsity']) # Fallback


    print(f"\nFinal Pruning Targets:")
    print(f"  Targeting MACs <= {target_macs_value:,.0f} (Initial: {initial_macs:,.0f})")
    if target_size_mb_config is not None:
        print(f"  Targeting Size <= {target_size_mb_config:.2f} MB (Derived Params Target: {target_params_from_size_mb:,.0f})")
    print(f"  Using Effective Params Target <= {target_params_value:,.0f} (Initial: {initial_params:,.0f})")


    # --- Pruning and Fine-tuning Loop ---
    for strategy_name, strategy_config in config['strategies'].items():
        print(f"\n===== Processing Strategy: {strategy_name} =====")
        # --- Initialize results for this strategy ---
        results[strategy_name] = {
            'macs': initial_macs if initial_macs is not None else 0, # Default to initial if available
            'params': initial_params if initial_params is not None else 0,
            'size_mb': initial_size_mb if initial_size_mb is not None else 0,
            config['pruning_primary_metric']: results.get('initial', {}).get(config['pruning_primary_metric'], float('inf')),
            'error': 'Not run or error occurred'
        }

        model_to_prune = TimeSeriesLSTM_WithBlocks(
             input_size=input_size, lstm_hidden_size=config['lstm_hidden_size'],
             num_lstm_layers=config['num_lstm_layers'], block_configs=config['block_output_features'],
             output_size=1, lstm_dropout_prob=config['lstm_dropout'],
             block_dropout_prob=config['block_dropout']
        ).to(device)

        # Load the *best initial* weights (from initial_best_model_pth_path)
        path_to_load_initial = initial_best_model_pth_path if os.path.exists(initial_best_model_pth_path) else initial_model_pth_path
        try:
            model_to_prune.load_state_dict(torch.load(path_to_load_initial, map_location=device))
            print(f"Loaded initial weights from {os.path.basename(path_to_load_initial)} for strategy {strategy_name}")
        except Exception as e_load:
            print(f"Error loading state dict for strategy {strategy_name} from {path_to_load_initial}: {e_load}")
            results[strategy_name]['error'] = f"Load Initial Error: {e_load}"
            continue
        model_to_prune.eval()

        # Determine gradient batch
        needs_grad = isinstance(strategy_config['importance'], tp.importance.TaylorImportance)
        grad_batch_for_prune = example_gradient_batch if needs_grad else None
        if needs_grad and not grad_batch_for_prune:
            print(f"Skipping {strategy_name}: Taylor/Hessian requires gradient_batch, but none was successfully obtained.")
            results[strategy_name]['error'] = "Taylor requires grad_batch, unavailable"
            continue

        # Define prunable layers from the current model_to_prune instance
        prunable_layers_list = []
        if hasattr(model_to_prune, 'intermediate_blocks') and model_to_prune.intermediate_blocks:
            for i_block in range(len(model_to_prune.intermediate_blocks)):
                # Assuming each block in ModuleList is a Sequential and its first element is Linear
                if isinstance(model_to_prune.intermediate_blocks[i_block], nn.Sequential) and \
                   len(model_to_prune.intermediate_blocks[i_block]) > 0 and \
                   isinstance(model_to_prune.intermediate_blocks[i_block][0], nn.Linear):
                    prunable_layers_list.append(model_to_prune.intermediate_blocks[i_block][0]) # Get the Linear layer
                elif isinstance(model_to_prune.intermediate_blocks[i_block], nn.Linear): # If block itself is Linear
                    prunable_layers_list.append(model_to_prune.intermediate_blocks[i_block])

        if not prunable_layers_list:
            print(f"No prunable layers selected for strategy {strategy_name}. Copying initial results and skipping pruning+finetuning.")
            results[strategy_name] = results['initial'].copy()
            results[strategy_name]['notes'] = "Pruning skipped: No prunable intermediate layers found/selected."
            continue
        print(f"Targeting for pruning in {strategy_name}: {[str(layer) for layer in prunable_layers_list]}")

        # Perform Pruning
        try:
            pruned_model = prune_lstm_model_by_threshold(
                model=model_to_prune, example_input_bs1=example_input_bs1,
                target_macs=target_macs_value, target_params=target_params_value,
                strategy=strategy_config,
                max_iterations=config['pruning_max_iterations'],
                step_pruning_ratio=config['pruning_step_ratio'],
                gradient_batch=grad_batch_for_prune,
                prunable_modules=prunable_layers_list
            )
            pruned_pth_path = os.path.join(config['output_dir'], f"lstm_{strategy_name}_pruned.pth")
            torch.save(pruned_model.state_dict(), pruned_pth_path)
            print(f"Pruned model state saved to {pruned_pth_path}")
        except Exception as e_prune:
            print(f"\nCRITICAL ERROR during PRUNING for strategy {strategy_name}: {e_prune}")
            import traceback; traceback.print_exc()
            results[strategy_name]['error'] = f"Pruning Error: {e_prune}"
            continue

        # Fine-tune
        print(f"\n--- Fine-tuning ({strategy_name}) ---")
        ft_base_path = os.path.join(config['output_dir'], f"lstm_{strategy_name}_ft")
        ft_best_pth_path = ft_base_path + "_best_val.pth"
        optimizer_ft = optim.Adam(pruned_model.parameters(), lr=config['learning_rate_finetune'], weight_decay=config.get('weight_decay_finetune',0))
        try:
            fine_tuned_model, _, _ = train_model_regression(
                model=pruned_model, train_loader=train_loader, criterion=nn.MSELoss().to(device),
                optimizer=optimizer_ft, device=device, num_epochs=config['fine_tune_epochs'],
                val_loader=val_loader, model_path_prefix=ft_base_path,
                grad_clip=config.get('grad_clip', None),
                lr_scheduler_config=config.get('learning_rate_scheduler', None),
                early_stopping_patience=config.get('early_stopping_patience', None)
            )
            # After train_model_regression, fine_tuned_model holds the best val weights if val_loader was used and best model saved/loaded
        except Exception as e_ft:
            print(f"\nCRITICAL ERROR during FINE-TUNING for strategy {strategy_name}: {e_ft}")
            import traceback; traceback.print_exc()
            results[strategy_name]['error'] = f"Fine-tuning Error: {e_ft}"
            continue

        # Evaluate Final Model
        print(f"\n--- Evaluating Fine-tuned Model ({strategy_name}) ---")
        try:
            final_metrics = evaluate_model_regression(fine_tuned_model, test_loader, example_input_bs1, device, scalers)
            results[strategy_name] = final_metrics
        except Exception as e_eval:
            print(f"\nCRITICAL ERROR during EVALUATION for strategy {strategy_name}: {e_eval}")
            import traceback; traceback.print_exc()
            results[strategy_name]['error'] = f"Evaluation Error: {e_eval}"
            continue

        # Save final model .pth and .onnx
        final_model_pth_to_save = ft_best_pth_path if os.path.exists(ft_best_pth_path) else (ft_base_path + ".pth") # Prefer best, fallback to last
        # If fine_tuned_model holds the best already due to loading in train_fn, save it directly:
        # torch.save(fine_tuned_model.state_dict(), final_model_pth_to_save)
        # print(f"Final PyTorch model for strategy {strategy_name} saved based on fine-tuning to {final_model_pth_to_save}")
        # Simpler: always save the state of fine_tuned_model as it should be the best one loaded by train_model_regression
        torch.save(fine_tuned_model.state_dict(), ft_best_pth_path) # Always save best as _best_val.pth
        print(f"Best fine-tuned PyTorch model for {strategy_name} saved to {ft_best_pth_path}")


        strategy_onnx_path = ft_best_pth_path.replace('.pth', '.onnx')
        save_model_as_onnx(fine_tuned_model, example_input_bs1, strategy_onnx_path)

    # --- Final Comparison (Called only ONCE after all strategies attempted) ---
    print("\n===== Final Comparison of All Successfully Processed Strategies =====")
    final_processed_results = {}
    if 'initial' in results and isinstance(results['initial'], dict) and config['pruning_primary_metric'] in results['initial']:
        final_processed_results['initial'] = results['initial']

    for strategy_name_res, metrics_res in results.items():
        if strategy_name_res != 'initial' and isinstance(metrics_res, dict):
            if 'error' not in metrics_res or metrics_res.get('error') == 'Not run or error occurred':
                if all(k in metrics_res for k in ['macs', 'params', config['pruning_primary_metric']]):
                    final_processed_results[strategy_name_res] = metrics_res
                else:
                    print(f"Strategy '{strategy_name_res}' metrics incomplete. Excluding from final plot.")
            else:
                print(f"Strategy '{strategy_name_res}' failed: {metrics_res['error']}. Excluding from final plot.")

    if len(final_processed_results) > 0 : # Check if there's anything to plot
        compare_results_and_plot_regression(
             final_processed_results,
             metric_key=config['pruning_primary_metric'],
             lower_is_better=config.get('pruning_primary_metric_lower_is_better', True),
             output_dir=config['output_dir']
         )
    else:
        print("No successful strategies (nor initial results) to compare for plotting.")

    print("\nWorkflow completed!")

### Run the main function

In [34]:
if __name__ == "__main__":
    main_lstm()

Using device: cuda
Loading and processing data...
Loading dataset from: ./data/energydata_complete.csv
Original data shape: (19735, 29)
Data shape after selecting columns & cleaning NaNs: (19735, 26)
Data split: Train=13815, Val=1973, Test=3947
Features and target scaled using MinMaxScaler (fit on train only).
Creating sequences...
Data loaded successfully:
  Input features per step: 25
  Sequence length: 72
  Train sequences/batches: 13743 / 214
  Val sequences/batches: 1901 / 30
  Test sequences/batches: 3875 / 61

Model Architecture:
Initial model parameter device: cuda:0
Obtained gradient batch with BS=64

--- Initial Training ---
Using ReduceLROnPlateau scheduler (factor=0.5, patience=10)
Starting training for up to 50 epochs...




Epoch 1/50: Train Loss=0.011269, Time: 3.17s, LR: 5.00e-04, Val Loss=0.007128 (Best Val Loss: 0.007128 -> Model Saved)
Epoch 2/50: Train Loss=0.010201, Time: 3.58s, LR: 5.00e-04, Val Loss=0.007119 (Best Val Loss: 0.007119 -> Model Saved)
Epoch 3/50: Train Loss=0.010056, Time: 3.54s, LR: 5.00e-04, Val Loss=0.007113 (Best Val Loss: 0.007113 -> Model Saved)
Epoch 4/50: Train Loss=0.009956, Time: 3.54s, LR: 5.00e-04, Val Loss=0.007114 (Val Loss did not improve for 1 epoch(s))
Epoch 5/50: Train Loss=0.009961, Time: 3.50s, LR: 5.00e-04, Val Loss=0.007149 (Val Loss did not improve for 2 epoch(s))
Epoch 6/50: Train Loss=0.009947, Time: 3.56s, LR: 5.00e-04, Val Loss=0.007131 (Val Loss did not improve for 3 epoch(s))
Epoch 7/50: Train Loss=0.009941, Time: 3.56s, LR: 5.00e-04, Val Loss=0.007147 (Val Loss did not improve for 4 epoch(s))
Epoch 8/50: Train Loss=0.009931, Time: 3.58s, LR: 5.00e-04, Val Loss=0.007130 (Val Loss did not improve for 5 epoch(s))
Epoch 9/50: Train Loss=0.009939, Time: 3.31

  result = _VF.lstm(


Metrics calculated on original scale.
Evaluation Metrics: MSE=7496.0786, MAE=48.9322, RMSE=86.5799, R2=-0.0000, MAPE=55.41%
Initial Model Eval: MACs=4,334,366,119, Params=2,475,850, SizeMB=9.90, MSE=7496.0786

Final Pruning Targets:
  Targeting MACs <= 3,000,000,000 (Initial: 4,334,366,119)
  Targeting Size <= 5.00 MB (Derived Params Target: 1,310,720)
  Using Effective Params Target <= 1,310,720 (Initial: 2,475,850)

===== Processing Strategy: Magnitude_L1 =====
Loaded initial weights from lstm_energy_initial_best_val.pth for strategy Magnitude_L1
No prunable layers selected for strategy Magnitude_L1. Copying initial results and skipping pruning+finetuning.

===== Processing Strategy: Magnitude_L2 =====
Loaded initial weights from lstm_energy_initial_best_val.pth for strategy Magnitude_L2
No prunable layers selected for strategy Magnitude_L2. Copying initial results and skipping pruning+finetuning.

===== Processing Strategy: Random =====
Loaded initial weights from lstm_energy_initia

In [None]:
import pandas as pd

# Path is defined in your code
dataset_path = './data/energydata_complete.csv'
df = pd.read_csv(dataset_path)

# Print the head of the original dataframe
print("Original dataframe shape:", df.shape)
print(df.head())

# If you want to see the target column statistics
print("\nTarget column summary statistics:")
print(df['Appliances'].describe())

# To see the feature columns
print("\nFeatures in the dataset:")
print(df.columns.tolist())

In [None]:
inputs, targets = next(iter(train_loader))

print(f"\nBatch shape - Inputs: {inputs.shape}, Targets: {targets.shape}")
print(f"Input features sample (first sequence):\n{inputs[0]}")
print(f"Target values sample:\n{targets[:5]}")