In [35]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.optimizers import AdamW # Using AdamW for weight decay
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import os # Added for file path handling
import traceback

In [36]:
# Data Loading and Validation
def load_and_validate_data(transformer_predictions_file):
    """
    Load and validate transformer predictions CSV file.

    Args:
        transformer_predictions_file (str): Path to CSV file.

    Returns:
        pandas.DataFrame: Validated dataframe.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        ValueError: If required columns are missing or have wrong types.
        Exception: For other potential loading errors.
    """
    print(f"Attempting to load data from: {transformer_predictions_file}")
    if not os.path.exists(transformer_predictions_file):
        raise FileNotFoundError(f"Error: Input file not found at {transformer_predictions_file}")

    try:
        df = pd.read_csv(transformer_predictions_file)
        print(f"Successfully loaded CSV. Shape: {df.shape}")
        print(f"Columns found: {df.columns.tolist()}")

        # Basic validation checks
        # Ensure these columns exist based on subsequent usage
        required_columns = ['Sequence', 'Step', 'SourceID', 'Predicted_Proportion',
                            'Predicted_Increment', 'Predicted_Cumulative',
                            'GroundTruth_Increment', 'GroundTruth_Cumulative']

        missing_cols = [col for col in required_columns if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns in input CSV: {', '.join(missing_cols)}")
        print("All required columns are present.")

        # Check for essential numeric types
        numeric_cols_to_check = ['Step', 'Predicted_Proportion', 'Predicted_Increment',
                                 'Predicted_Cumulative', 'GroundTruth_Increment', 'GroundTruth_Cumulative']
        for col in numeric_cols_to_check:
             # Attempt to convert to numeric, coercing errors to NaN
             df[col] = pd.to_numeric(df[col], errors='coerce')
             if df[col].isnull().any():
                 print(f"Warning: Column '{col}' contains non-numeric values that were converted to NaN.")
                 # Optionally raise ValueError if NaNs are unacceptable here
                 # raise ValueError(f"Column '{col}' must be entirely numeric.")

        print("Data validation checks passed.")
        return df

    except Exception as e:
        print(f"Data loading error: {e}")
        traceback.print_exc() # Print detailed traceback
        raise

In [37]:
# Proportion Binning
def bin_proportions(proportions, bin_size=0.05):
    """
    Bin proportions and normalize if the sum deviates significantly from 1.0.

    Args:
        proportions (array-like): Original proportion values.
        bin_size (float): Bin resolution.

    Returns:
        numpy.ndarray: Binned and potentially normalized proportions.
    """
    # Ensure input is a numpy array for calculations
    proportions = np.asarray(proportions)
    
    # Avoid division by zero if bin_size is zero
    if bin_size <= 0:
        raise ValueError("bin_size must be positive.")
        
    binned_props = np.round(proportions / bin_size) * bin_size
    total = np.sum(binned_props)

    # Normalize only if the sum is significantly different from 1.0
    # Use a small tolerance (epsilon) for floating point comparison
    epsilon = 1e-6
    if abs(total - 1.0) > 0.1 and abs(total) > epsilon: # Avoid division by near-zero total
        print(f"Warning: Sum of binned proportions ({total:.4f}) deviates > 0.1 from 1.0. Normalizing.")
        binned_props = binned_props / total
    elif abs(total) <= epsilon and len(binned_props) > 0:
         print("Warning: Sum of binned proportions is zero or near-zero. Cannot normalize.")
         # Return original binned props or zeros, depending on desired behavior
         # Returning zeros might be safer if normalization is expected
         # return np.zeros_like(binned_props)

    return binned_props

In [38]:
# Data Processing for LSTM (Minor logging added)
def process_transformer_predictions(transformer_predictions_file):
    """
    Process transformer predictions for LSTM model input.

    Args:
        transformer_predictions_file (str): Path to CSV file.

    Returns:
        dict: Processed data dictionary containing padded sequences, masks, etc.
    """
    df = load_and_validate_data(transformer_predictions_file)
    sequences = df['Sequence'].unique()
    print(f"Found {len(sequences)} unique sequences in the data.")

    X_data, y_increments, y_cumulative, masks = [], [], [], []
    prop_scaler, inc_scaler, cum_scaler = StandardScaler(), StandardScaler(), StandardScaler()
    processed_sequence_ids = [] # Keep track of sequences successfully processed

    for seq_id in sequences:
        seq_data = df[df['Sequence'] == seq_id].sort_values('Step')
        seq_len = len(seq_data)
        if seq_len == 0:
            print(f"Warning: Sequence {seq_id} has no data. Skipping.")
            continue

        try:
            binned_proportions = bin_proportions(seq_data['Predicted_Proportion'].values)
            scaled_props = prop_scaler.fit_transform(binned_proportions.reshape(-1, 1)).flatten()
            # Handle potential NaNs introduced by coerce during loading before scaling
            gt_inc_valid = seq_data['GroundTruth_Increment'].dropna()
            gt_cum_valid = seq_data['GroundTruth_Cumulative'].dropna()
            if gt_inc_valid.empty or gt_cum_valid.empty:
                 print(f"Warning: Sequence {seq_id} has only NaN ground truth values after coercion. Skipping.")
                 continue

            scaled_gt_inc = inc_scaler.fit_transform(gt_inc_valid.values.reshape(-1, 1)).flatten()
            scaled_gt_cum = cum_scaler.fit_transform(gt_cum_valid.values.reshape(-1, 1)).flatten()

            # Ensure lengths match after potential dropna (this indicates an issue if they don't)
            if len(scaled_gt_inc) != seq_len or len(scaled_gt_cum) != seq_len:
                 print(f"Warning: Length mismatch after handling NaNs in sequence {seq_id}. Check input data. Skipping.")
                 # This case needs careful handling - maybe impute NaNs instead of dropping?
                 # For now, skipping the sequence if lengths don't match after dropna.
                 continue


            max_step = seq_data['Step'].max()
            normalized_step = seq_data['Step'].values / max_step if max_step > 0 else np.zeros(seq_len)

            features = np.column_stack([scaled_props, scaled_gt_inc, scaled_gt_cum, normalized_step])
            increments = seq_data['GroundTruth_Increment'].values # Use original values with potential NaNs for target
            cumulative = seq_data['GroundTruth_Cumulative'].values # Use original values with potential NaNs for target

            X_data.append(features)
            y_increments.append(increments)
            y_cumulative.append(cumulative)
            masks.append(np.ones(seq_len))
            processed_sequence_ids.append(seq_id) # Add ID if processed successfully

        except Exception as e:
            print(f"Error processing sequence {seq_id}: {e}. Skipping sequence.")
            traceback.print_exc()

    if not X_data:
         raise ValueError("No valid sequences found after processing. Cannot proceed.")

    num_processed_sequences = len(X_data)
    print(f"Successfully processed {num_processed_sequences} sequences.")

    max_length = max(len(x) for x in X_data)
    num_features = X_data[0].shape[1]
    print(f"Padding sequences to max length: {max_length}")

    # Initialize padded arrays - use np.nan as fill value for targets might be better
    X_padded = np.zeros((num_processed_sequences, max_length, num_features), dtype=np.float32)
    y_increments_padded = np.full((num_processed_sequences, max_length), np.nan, dtype=np.float32)
    y_cumulative_padded = np.full((num_processed_sequences, max_length), np.nan, dtype=np.float32)
    masks_padded = np.zeros((num_processed_sequences, max_length), dtype=np.float32)

    for i in range(num_processed_sequences):
        seq_len = len(X_data[i])
        X_padded[i, :seq_len, :] = X_data[i]
        y_increments_padded[i, :seq_len] = y_increments[i]
        y_cumulative_padded[i, :seq_len] = y_cumulative[i]
        # Mask should be 0 where target is NaN, 1 otherwise
        masks_padded[i, :seq_len] = (~np.isnan(y_increments[i])).astype(np.float32) # Example: mask based on increment NaNs

    # Recalculate total times based on the potentially NaN-filled padded ground truth
    y_total_times = []
    for i in range(num_processed_sequences):
        valid_indices = np.where(masks_padded[i] == 1)[0]
        if len(valid_indices) > 0:
            last_valid_index = valid_indices[-1]
            y_total_times.append(y_cumulative_padded[i, last_valid_index])
        else:
            y_total_times.append(0) # Or np.nan if preferred
    y_total_times = np.array(y_total_times, dtype=np.float32)

    # Replace NaNs in target arrays with 0 for training (loss function needs numbers)
    # The mask will handle ignoring these steps during loss calculation.
    y_increments_padded = np.nan_to_num(y_increments_padded, nan=0.0)
    y_cumulative_padded = np.nan_to_num(y_cumulative_padded, nan=0.0)
    # y_total_times should already be numeric based on calculation above

    print("Data processing and padding complete.")
    return {
        'X': X_padded,
        'y_increments': y_increments_padded,
        'y_cumulative': y_cumulative_padded,
        'y_total_times': y_total_times,
        'masks': masks_padded, # Crucial: Mask reflects original NaNs
        'sequences': processed_sequence_ids, # Use the list of successfully processed IDs
        'df': df
    }

In [39]:
# Custom Masked Loss Function (Updated mask handling logic)
def custom_masked_huber_loss(mask_tensor, delta=1.0):
    """
    Factory function to create a masked Huber loss.
    Mask should be 1 for valid steps, 0 for padded/invalid steps.

    Args:
        mask_tensor (tf.Tensor): The mask indicating valid time steps (batch_size, seq_len).
                                 Passed during training setup.
        delta (float): Huber loss delta parameter.

    Returns:
        Callable: A loss function `masked_huber(y_true, y_pred)`.
    """
    # Convert the persistent mask tensor to float32 once
    mask_float32 = tf.cast(mask_tensor, tf.float32)

    def masked_huber(y_true, y_pred):
        """
        Calculate Huber loss only for valid (masked == 1) time steps.

        Args:
            y_true (tf.Tensor): Ground truth tensor (batch_size, seq_len).
            y_pred (tf.Tensor): Prediction tensor (batch_size, seq_len).

        Returns:
            tf.Tensor: Scalar mean loss over the batch.
        """
        # Slice the persistent mask for the current batch size dynamically
        current_batch_size = tf.shape(y_true)[0]
        # Assume mask_tensor corresponds row-wise to the full dataset
        mask_batch = mask_float32[:current_batch_size]

        # Ensure mask_batch has compatible shape (e.g., (batch_size, seq_len))
        # This might require reshaping or broadcasting depending on exact shapes
        # Assuming y_true, y_pred, mask_batch are all (batch_size, seq_len)

        # Huber loss calculation
        error = y_true - y_pred
        abs_error = tf.abs(error)
        quadratic = tf.minimum(abs_error, delta)
        linear = abs_error - quadratic
        huber_loss = 0.5 * quadratic**2 + delta * linear

        # Apply the mask (element-wise multiplication)
        masked_loss = huber_loss * mask_batch

        # Calculate mean loss per sequence, avoiding division by zero
        # Sum loss over sequence length dimension
        sum_loss_per_sequence = tf.reduce_sum(masked_loss, axis=1)
        # Sum mask over sequence length dimension to get count of valid tokens
        valid_tokens_per_sequence = tf.reduce_sum(mask_batch, axis=1)

        # Avoid division by zero for sequences with no valid tokens
        # Replace zero counts with 1; the corresponding sum_loss will also be 0.
        valid_tokens_safe = tf.where(valid_tokens_per_sequence == 0,
                                     tf.ones_like(valid_tokens_per_sequence),
                                     valid_tokens_per_sequence)

        mean_loss_per_sequence = sum_loss_per_sequence / valid_tokens_safe

        # Return the mean loss across the batch
        return tf.reduce_mean(mean_loss_per_sequence)

    # Assign a name for clarity in logs/history if possible
    masked_huber.__name__ = f'masked_huber_delta_{delta}'
    return masked_huber

In [40]:
# Improved LSTM Model Definition
class ImprovedTimeDiffLSTM(tf.keras.Model):
    """
    LSTM model with enhancements like LayerNorm, Attention, Residuals, and Regularization.
    Predicts time differences, total time, and calculates cumulative times.
    """
    def __init__(self, hidden_units=64, dropout_rate=0.3, num_attention_heads=4, l2_reg=0.001):
        """
        Initialize the model layers.

        Args:
            hidden_units (int): Number of units in the main LSTM layer.
            dropout_rate (float): Dropout rate for LSTM and Attention layers.
            num_attention_heads (int): Number of heads for MultiHeadAttention.
            l2_reg (float): L2 regularization factor for the first LSTM kernel.
        """
        super().__init__() # Use super().__init__() for Python 3 style

        # Input normalization
        self.input_normalization = layers.LayerNormalization(name="InputNorm")

        # --- LSTM Block ---
        # First LSTM layer with regularization and return sequences
        self.lstm_layer1 = layers.LSTM(
            hidden_units,
            return_sequences=True,
            dropout=dropout_rate,
            recurrent_dropout=dropout_rate, # Be cautious with recurrent_dropout on GPU
            kernel_regularizer=tf.keras.regularizers.l2(l2_reg),
            name="LSTM1"
        )
        # Dense layer for residual connection matching dimensions
        # Ensure it outputs `hidden_units` to match lstm_layer1 output
        self.residual_dense = layers.Dense(hidden_units, activation='relu', name="ResidualDense")
        self.add_layer1 = layers.Add(name="AddResidual1") # Explicit Add layer
        self.norm_layer1 = layers.LayerNormalization(name="Norm1") # Normalize after residual

        # Second LSTM layer
        self.lstm_layer2 = layers.LSTM(
            hidden_units // 2, # Reduce dimensionality
            return_sequences=True,
            dropout=dropout_rate,
            recurrent_dropout=dropout_rate,
            name="LSTM2"
        )

        # --- Attention Block ---
        self.attention = layers.MultiHeadAttention(
            num_heads=num_attention_heads,
            key_dim=hidden_units // 2, # Key dim often matches the query/value dim
            dropout=dropout_rate,
            name="MultiHeadAttention"
        )
        self.add_layer2 = layers.Add(name="AddResidual2") # Add attention output to LSTM output
        self.norm_layer2 = layers.LayerNormalization(name="Norm2") # Normalize after attention

        # --- Output Heads ---
        # 1. Time Difference Head (predicts increment for each step)
        # Use 'relu' to enforce non-negativity for time differences
        self.time_diff_head = layers.Dense(1, activation='relu', name='TimeDiffHead')

        # 2. Total Time Head (predicts total duration for the sequence)
        # Takes the aggregated sequence representation after attention
        # Use 'softplus' or 'relu' for non-negativity
        self.total_time_head = layers.Dense(1, activation='softplus', name='TotalTimeHead')

    def call(self, inputs, training=False):
        """
        Forward pass of the model.

        Args:
            inputs (tf.Tensor): Input tensor (batch_size, seq_len, num_features).
            training (bool): Whether the model is in training mode (for dropout).

        Returns:
            tuple: (predicted_time_diffs, predicted_total_time, predicted_cumulative_times)
        """
        # Input normalization
        x = self.input_normalization(inputs)

        # --- LSTM Block with Residual ---
        lstm_out1 = self.lstm_layer1(x, training=training)
        residual = self.residual_dense(x) # Project input for residual connection
        lstm_out1 = self.add_layer1([lstm_out1, residual])
        lstm_out1 = self.norm_layer1(lstm_out1) # Normalize after adding

        # Second LSTM layer
        lstm_out2 = self.lstm_layer2(lstm_out1, training=training)

        # --- Attention Block with Residual ---
        # Use lstm_out2 as query, key, and value for self-attention
        attn_out = self.attention(query=lstm_out2, value=lstm_out2, key=lstm_out2, training=training)
        # Add residual connection from before attention
        attn_out = self.add_layer2([attn_out, lstm_out2])
        attn_out = self.norm_layer2(attn_out) # Normalize after adding

        # --- Output Generation ---
        # 1. Time Differences (per step)
        # Apply the dense head to the output of the attention block
        time_diffs = self.time_diff_head(attn_out)
        # Remove the last dimension (Dense adds a dim of 1)
        time_diffs = tf.squeeze(time_diffs, axis=-1) # Shape: (batch_size, seq_len)

        # 2. Total Time (per sequence)
        # Aggregate the sequence information (e.g., mean pooling over time)
        sequence_encoding = tf.reduce_mean(attn_out, axis=1) # Shape: (batch_size, hidden_units//2)
        total_time = self.total_time_head(sequence_encoding) # Shape: (batch_size, 1)

        # 3. Cumulative Times (calculated from predicted differences)
        # Ensure calculation happens correctly even if time_diffs has NaNs (shouldn't if using relu)
        cumulative_times = tf.cumsum(time_diffs, axis=1) # Shape: (batch_size, seq_len)

        return time_diffs, total_time, cumulative_times

    # Optional: Build method to define input shape explicitly
    def build(self, input_shape):
        super().build(input_shape)

In [41]:
# Model Training Function (Added build confirmation)
def train_improved_lstm(transformer_predictions_file, epochs=100, batch_size=32):
    """
    Preprocess data, build, compile, and train the improved LSTM model.

    Args:
        transformer_predictions_file (str): Path to the transformer predictions CSV.
        epochs (int): Number of training epochs.
        batch_size (int): Training batch size.

    Returns:
        tuple: (trained_model, training_history, processed_data)
    """
    print("-" * 30)
    print("Starting LSTM Model Training Process")
    print("-" * 30)

    # 1. Process Data
    print("Processing data...")
    data = process_transformer_predictions(transformer_predictions_file)
    X_train = data['X']
    y_inc_train = data['y_increments']
    y_cum_train = data['y_cumulative']
    y_total_train = data['y_total_times']
    masks_train = data['masks']

    if X_train.shape[0] == 0:
        print("Error: No data available for training after processing.")
        return None, None, data

    input_shape = (X_train.shape[1], X_train.shape[2])
    print(f"Input shape for LSTM: {input_shape}")
    print(f"Number of sequences for training/validation: {X_train.shape[0]}")
    print(f"Target shapes: Increments {y_inc_train.shape}, Cumulative {y_cum_train.shape}, Total {y_total_train.shape}")
    print(f"Mask shape: {masks_train.shape}, Mask sum (total valid steps): {np.sum(masks_train)}")

    # 2. Build Model
    print("Building model...")
    lstm_model = ImprovedTimeDiffLSTM()
    # Explicitly build the model
    lstm_model.build(input_shape=(None,) + input_shape)
    # *** Add confirmation print ***
    print(f"Model built status: {lstm_model.built}")
    # Print summary *after* building
    lstm_model.summary(line_length=100)

    # 3. Define Loss Functions
    print("Defining loss functions...")
    masked_huber_loss_fn = custom_masked_huber_loss(tf.constant(masks_train, dtype=tf.float32))
    total_time_loss_fn = tf.keras.losses.MeanSquaredError(name='total_time_mse')
    loss_functions = [masked_huber_loss_fn, total_time_loss_fn, masked_huber_loss_fn]
    loss_weights = [0.4, 0.3, 0.3]
    print(f"Loss functions set. Weights: {loss_weights}")

    # 4. Define Optimizer
    print("Configuring optimizer...")
    learning_rate_schedule = ExponentialDecay(
        initial_learning_rate=0.001, decay_steps=1000, decay_rate=0.96, staircase=True
    )
    optimizer = AdamW(learning_rate=learning_rate_schedule, weight_decay=0.001)

    # 5. Compile Model
    print("Compiling model...")
    lstm_model.compile(
        optimizer=optimizer,
        loss=loss_functions,
        loss_weights=loss_weights,
    )
    print("Model compiled successfully.")

    # 6. Define Callbacks
    print("Configuring callbacks...")
    callbacks = [
        EarlyStopping(
            monitor='val_loss', patience=15, restore_best_weights=True, verbose=1
        )
    ]

    # 7. Train Model
    print(f"Starting training for {epochs} epochs with batch size {batch_size}...")
    history = lstm_model.fit(
        X_train,
        [y_inc_train, y_total_train, y_cum_train],
        epochs=epochs,
        batch_size=batch_size,
        validation_split=0.2,
        callbacks=callbacks,
        verbose=1
    )
    print("Training finished.")
    final_train_loss = history.history['loss'][-1]
    final_val_loss = history.history.get('val_loss', [np.nan])[-1]
    print(f"Final Training Loss: {final_train_loss:.4f}")
    print(f"Final Validation Loss: {final_val_loss:.4f}")
    print("-" * 30)

    return lstm_model, history, data


In [42]:
# Prediction Generation Function (Changed output filename)
def generate_lstm_predictions_from_transformer_csv(lstm_model, data):
    """
    Generate predictions using the trained LSTM model and compare with transformer.

    Args:
        lstm_model (tf.keras.Model): The trained LSTM model.
        data (dict): The processed data dictionary from `process_transformer_predictions`.

    Returns:
        pandas.DataFrame: Dataframe containing original data, transformer predictions,
                          LSTM predictions, and comparison metrics.
    """
    print("\n" + "-" * 30)
    print("Generating LSTM predictions...")
    print("-" * 30)

    X_predict = data['X']
    original_df = data['df']
    processed_sequences = data['sequences']
    masks_predict = data['masks']

    if X_predict.shape[0] == 0:
        print("Error: No data available for prediction.")
        return pd.DataFrame()

    print(f"Predicting on {X_predict.shape[0]} sequences...")
    try:
        time_diffs_pred, total_time_pred, cumulative_times_pred = lstm_model.predict(X_predict)
        print("Model prediction successful.")
        print(f"Predicted shapes: Diffs {time_diffs_pred.shape}, Total {total_time_pred.shape}, Cumul {cumulative_times_pred.shape}")
    except Exception as e:
        print(f"Error during model prediction: {e}")
        traceback.print_exc()
        return pd.DataFrame()

    results_list = []
    print(f"Processing predictions for {len(processed_sequences)} sequences...")

    for seq_idx, seq_id in enumerate(processed_sequences):
        seq_original_data = original_df[original_df['Sequence'] == seq_id].sort_values('Step').copy()
        seq_len = int(np.sum(masks_predict[seq_idx]))

        if seq_len == 0:
            print(f"Warning: Sequence {seq_id} (index {seq_idx}) has length 0 in mask during prediction. Skipping.")
            continue
        if seq_len > time_diffs_pred.shape[1]:
             print(f"Warning: Sequence {seq_id} (index {seq_idx}) mask length ({seq_len}) exceeds prediction dimension ({time_diffs_pred.shape[1]}). Clamping length.")
             seq_len = time_diffs_pred.shape[1]

        current_time_diffs = time_diffs_pred[seq_idx, :seq_len]
        current_cumulative = cumulative_times_pred[seq_idx, :seq_len]
        current_total = total_time_pred[seq_idx, 0]

        if len(seq_original_data) == seq_len:
            seq_original_data['LSTM_Predicted_TimeDiff'] = current_time_diffs
            seq_original_data['LSTM_Predicted_Cumulative'] = current_cumulative
            seq_original_data['LSTM_Predicted_TotalTime'] = current_total
        elif len(seq_original_data) > seq_len:
             print(f"Warning: Original data length ({len(seq_original_data)}) > mask length ({seq_len}) for seq {seq_id}. Padding predictions.")
             padded_diffs = np.pad(current_time_diffs, (0, len(seq_original_data) - seq_len), constant_values=np.nan)
             padded_cumul = np.pad(current_cumulative, (0, len(seq_original_data) - seq_len), constant_values=np.nan)
             seq_original_data['LSTM_Predicted_TimeDiff'] = padded_diffs
             seq_original_data['LSTM_Predicted_Cumulative'] = padded_cumul
             seq_original_data['LSTM_Predicted_TotalTime'] = current_total
        else:
             print(f"Error: Original data length ({len(seq_original_data)}) < mask length ({seq_len}) for seq {seq_id}. Skipping.")
             continue

        required_orig_cols = ['GroundTruth_Increment', 'Predicted_Increment',
                              'GroundTruth_Cumulative', 'Predicted_Cumulative']
        if not all(col in seq_original_data for col in required_orig_cols):
             print(f"Warning: Missing required original prediction columns for sequence {seq_id}. Cannot calculate improvements.")
             seq_original_data['TimeDiff_Improvement_Abs'] = np.nan
             seq_original_data['Cumulative_Improvement_Abs'] = np.nan
        else:
            transformer_diff_error = abs(seq_original_data['GroundTruth_Increment'] - seq_original_data['Predicted_Increment'])
            lstm_diff_error = abs(seq_original_data['GroundTruth_Increment'] - seq_original_data['LSTM_Predicted_TimeDiff'])
            transformer_cum_error = abs(seq_original_data['GroundTruth_Cumulative'] - seq_original_data['Predicted_Cumulative'])
            lstm_cum_error = abs(seq_original_data['GroundTruth_Cumulative'] - seq_original_data['LSTM_Predicted_Cumulative'])
            seq_original_data['TimeDiff_Improvement_Abs'] = transformer_diff_error - lstm_diff_error
            seq_original_data['Cumulative_Improvement_Abs'] = transformer_cum_error - lstm_cum_error

        results_list.append(seq_original_data)

    if not results_list:
        print("Error: No results generated after processing predictions.")
        return pd.DataFrame()

    results_df = pd.concat(results_list, ignore_index=True)
    print(f"Finished processing predictions. Combined results shape: {results_df.shape}")
    print("Combined results head:\n", results_df.head())
    print("Combined results tail:\n", results_df.tail())
    print("Checking for NaNs in LSTM predictions:")
    print(results_df[['LSTM_Predicted_TimeDiff', 'LSTM_Predicted_Cumulative', 'LSTM_Predicted_TotalTime']].isnull().sum())

    # *** Changed output filename ***
    output_filename = 'predictions_lstm_refined_175974.csv'
    output_path = os.path.abspath(output_filename)
    try:
        results_df.to_csv(output_filename, index=False)
        print(f"Combined predictions successfully saved to: {output_path}")
    except Exception as e:
        print(f"Error saving results to CSV ({output_path}): {e}")
        traceback.print_exc()

    print("\n" + "-" * 30)
    print("Overall Model Performance Comparison (Mean Absolute Error)")
    print("-" * 30)
    comparison_cols = required_orig_cols + ['LSTM_Predicted_TimeDiff', 'LSTM_Predicted_Cumulative']
    valid_results = results_df.dropna(subset=comparison_cols)
    print(f"Calculating MAE based on {len(valid_results)} valid rows (after dropping NaNs in comparison columns).")

    if valid_results.empty:
        print("Warning: No valid data points found for calculating overall MAE.")
    else:
        try:
            transformer_time_diff_mae = np.mean(abs(valid_results['GroundTruth_Increment'] - valid_results['Predicted_Increment']))
            lstm_time_diff_mae = np.mean(abs(valid_results['GroundTruth_Increment'] - valid_results['LSTM_Predicted_TimeDiff']))
            transformer_cumulative_mae = np.mean(abs(valid_results['GroundTruth_Cumulative'] - valid_results['Predicted_Cumulative']))
            lstm_cumulative_mae = np.mean(abs(valid_results['GroundTruth_Cumulative'] - valid_results['LSTM_Predicted_Cumulative']))

            time_diff_improvement_pct = (1 - lstm_time_diff_mae / transformer_time_diff_mae) * 100 if transformer_time_diff_mae != 0 else float('inf') if lstm_time_diff_mae < transformer_time_diff_mae else 0
            cumulative_improvement_pct = (1 - lstm_cumulative_mae / transformer_cumulative_mae) * 100 if transformer_cumulative_mae != 0 else float('inf') if lstm_cumulative_mae < transformer_cumulative_mae else 0

            print(f"Time Differences MAE:")
            print(f"  Transformer: {transformer_time_diff_mae:.4f}")
            print(f"  LSTM:        {lstm_time_diff_mae:.4f}")
            print(f"  Improvement: {time_diff_improvement_pct:.2f}%")
            print(f"\nCumulative Times MAE:")
            print(f"  Transformer: {transformer_cumulative_mae:.4f}")
            print(f"  LSTM:        {lstm_cumulative_mae:.4f}")
            print(f"  Improvement: {cumulative_improvement_pct:.2f}%")
        except Exception as e:
            print(f"Error calculating overall MAE: {e}")
            traceback.print_exc()

    print("-" * 30)
    return results_df

In [43]:
# Main Execution Block
def main():
    """
    Main function to run the LSTM model training, prediction, and visualization.
    """
    print("=" * 50)
    print(" LSTM Model Training and Evaluation Pipeline")
    print("=" * 50)

    # --- Configuration ---
    transformer_predictions_file = "predictions_transformer.csv"
    training_epochs = 100
    training_batch_size = 32
    output_suffix = "_runFinal182625" # Suffix for output files

    lstm_model = None
    history = None
    data = None
    results_df = None

    try:
        # --- Step 1: Train LSTM Model ---
        print("\n--- Step 1: Training LSTM Model ---")
        lstm_model, history, data = train_improved_lstm(
            transformer_predictions_file,
            epochs=training_epochs,
            batch_size=training_batch_size
        )

        # --- Step 2: Generate Predictions ---
        print("\n--- Step 2: Generating LSTM Predictions ---")
        if lstm_model is None or data is None:
             print("Model training or data processing failed. Cannot generate predictions.")
        else:
            # Pass suffix to prediction function if needed, or handle filenames inside main
            results_df = generate_lstm_predictions_from_transformer_csv(lstm_model, data) # Filename handled inside

        # --- Step 3: Display Sample Results ---
        print("\n--- Step 3: Displaying Sample Results ---")
        if results_df is not None and not results_df.empty:
            print("\nSample Combined Predictions (Head):")
            display_cols = ['Sequence', 'Step', 'SourceID',
                            'GroundTruth_Increment', 'Predicted_Increment', 'LSTM_Predicted_TimeDiff',
                            'GroundTruth_Cumulative','Predicted_Cumulative','LSTM_Predicted_Cumulative',
                            'TimeDiff_Improvement_Abs', 'Cumulative_Improvement_Abs']
            display_cols = [col for col in display_cols if col in results_df.columns]
            print(results_df[display_cols].head(10).to_string())
        else:
            print("No results dataframe generated or it is empty.")

        # --- Step 4: Generate Visualizations ---
        print("\n--- Step 4: Generating Visualizations ---")
        # Pass suffix to visualization function if needed, or handle filenames inside main
        visualize_results(results_df, history, num_samples=3) # Filename handled inside

        print("\n" + "=" * 50)
        print(" Pipeline Execution Completed")
        # Provide paths to output files with the new suffix
        print(f"  - Combined predictions CSV saved to: {os.path.abspath(f'predictions_lstm_refined{output_suffix}.csv')}")
        print(f"  - Training performance plot saved to: {os.path.abspath(f'training_performance{output_suffix}.png')}")
        print(f"  - Sequence comparison plots saved as: sequence_<ID>_comparison{output_suffix}.png")
        print("=" * 50)

    except FileNotFoundError as fnf_error:
        print(f"\nFatal Error: {fnf_error}")
        print("Please ensure the input CSV file exists and the path is correct.")
    except ValueError as val_error:
        print(f"\nFatal Error: {val_error}")
        print("Please check the data format, required columns, and content in the input file.")
    except Exception as e:
        print(f"\nAn unexpected fatal error occurred in main execution: {e}")
        print("\n--- Traceback ---")
        traceback.print_exc()
        print("--- End Traceback ---")

In [44]:
# Visualization Function (Changed output filenames)
def visualize_results(results_df, history, num_samples=3):
    """
    Generate visualizations for training history and prediction comparisons.

    Args:
        results_df (pd.DataFrame): Dataframe with combined predictions.
        history (tf.keras.callbacks.History): Training history object.
        num_samples (int): Number of sample sequences to plot.
    """
    print("\n" + "-" * 30)
    print("Generating visualizations...")
    print("-" * 30)

    # --- Plot Suffix ---
    file_suffix = "_run2" # Suffix for plot filenames

    if history is None:
        print("Warning: Training history object is None. Skipping history plots.")
    else:
        # --- 1. Plot Training History ---
        try:
            plt.figure(figsize=(14, 6))
            plt.subplot(1, 2, 1)
            plt.plot(history.history['loss'], label='Training Loss')
            if 'val_loss' in history.history:
                plt.plot(history.history['val_loss'], label='Validation Loss')
            else:
                 print("Warning: 'val_loss' not found in history.")
            plt.title('Model Loss During Training')
            plt.xlabel('Epoch')
            plt.ylabel('Loss (Weighted Sum)')
            plt.legend()
            plt.grid(True, linestyle='--', alpha=0.6)

            plt.subplot(1, 2, 2)
            plotted_specific_loss = False
            loss_keys = [k for k in history.history.keys() if 'loss' in k and k != 'loss' and k != 'val_loss']
            if loss_keys:
                 for key in loss_keys[:1]:
                     val_key = 'val_' + key
                     plt.plot(history.history[key], label=f'Train {key}')
                     if val_key in history.history:
                         plt.plot(history.history[val_key], label=f'Val {key}')
                     plt.title(f'{key.replace("_"," ").title()}')
                     plotted_specific_loss = True
                 plt.xlabel('Epoch')
                 plt.ylabel('Loss')
                 plt.legend()
            else:
                plt.text(0.5, 0.5, 'No specific loss keys found', horizontalalignment='center', verticalalignment='center')
                plt.title('Additional Training Metrics')
                plt.xlabel('Epoch')
            plt.grid(True, linestyle='--', alpha=0.6)

            plt.tight_layout()
            # *** Changed output filename ***
            output_filename = f'training_performance{file_suffix}.png'
            output_path = os.path.abspath(output_filename)
            print(f"Attempting to save training performance plot to: {output_path}")
            plt.savefig(output_filename)
            print(f"Successfully saved training performance plot.")
            plt.close()
        except Exception as e:
            print(f"Error generating or saving training history plot: {e}")
            traceback.print_exc()
            plt.close()

    # --- 2. Plot Prediction Comparison for Sample Sequences ---
    if results_df is None or results_df.empty:
        print("Warning: Results dataframe is empty or None. Skipping prediction comparison plots.")
        return

    required_lstm_cols = ['LSTM_Predicted_TimeDiff', 'LSTM_Predicted_Cumulative']
    if not all(col in results_df.columns for col in required_lstm_cols):
        print(f"Warning: Missing required LSTM prediction columns ({required_lstm_cols}) in results_df. Skipping comparison plots.")
        return
    if results_df[required_lstm_cols].isnull().all().all():
         print(f"Warning: LSTM prediction columns ({required_lstm_cols}) contain only NaN values. Skipping comparison plots.")
         return

    sequences_to_plot = results_df['Sequence'].unique()
    if len(sequences_to_plot) == 0:
        print("No unique sequences found in results_df to plot.")
        return

    np.random.seed(42)
    np.random.shuffle(sequences_to_plot)
    sample_sequences = sequences_to_plot[:min(num_samples, len(sequences_to_plot))]
    print(f"Attempting to plot prediction comparisons for {len(sample_sequences)} sample sequences: {sample_sequences}")

    plot_cols_increments = ['GroundTruth_Increment', 'Predicted_Increment', 'LSTM_Predicted_TimeDiff']
    plot_cols_cumulative = ['GroundTruth_Cumulative', 'Predicted_Cumulative', 'LSTM_Predicted_Cumulative']
    plot_labels = ['Ground Truth', 'Transformer Pred.', 'LSTM Pred.']
    plot_styles = ['o-', 's--', '^-']

    for seq_id in sample_sequences:
        print(f"\nProcessing plot for Sequence ID: {seq_id}")
        seq_results = results_df[results_df['Sequence'] == seq_id].sort_values('Step')
        if seq_results.empty:
            print(f"  Skipping plot for sequence {seq_id} as no data found in results_df.")
            continue
        print(f"  Data shape for sequence {seq_id}: {seq_results.shape}")

        try:
            plt.figure(figsize=(15, 7))
            plt.subplot(1, 2, 1)
            for i, col in enumerate(plot_cols_increments):
                if col in seq_results.columns and not seq_results[col].isnull().all():
                    plt.plot(seq_results['Step'], seq_results[col], plot_styles[i], label=plot_labels[i], alpha=0.8)
                    print(f"  Plotted '{col}' for increments.")
                else:
                    print(f"  Skipping plot for '{col}' (increments) - column missing or all NaN.")
            plt.title(f'Sequence {seq_id} - Time Increments')
            plt.xlabel('Step')
            plt.ylabel('Time Increment')
            plt.legend()
            plt.grid(True, linestyle='--', alpha=0.6)

            plt.subplot(1, 2, 2)
            for i, col in enumerate(plot_cols_cumulative):
                 if col in seq_results.columns and not seq_results[col].isnull().all():
                    plt.plot(seq_results['Step'], seq_results[col], plot_styles[i], label=plot_labels[i], alpha=0.8)
                    print(f"  Plotted '{col}' for cumulative.")
                 else:
                    print(f"  Skipping plot for '{col}' (cumulative) - column missing or all NaN.")
            plt.title(f'Sequence {seq_id} - Cumulative Time')
            plt.xlabel('Step')
            plt.ylabel('Cumulative Time')
            plt.legend()
            plt.grid(True, linestyle='--', alpha=0.6)

            plt.tight_layout()
            # *** Changed output filename ***
            output_filename = f'sequence_{seq_id}_comparison{file_suffix}.png'
            output_path = os.path.abspath(output_filename)
            print(f"  Attempting to save comparison plot to: {output_path}")
            plt.savefig(output_filename)
            print(f"  Successfully saved comparison plot for sequence {seq_id}.")
            plt.close()

        except Exception as e:
            print(f"Error generating or saving comparison plot for sequence {seq_id}: {e}")
            traceback.print_exc()
            plt.close()

    print("-" * 30)
    print("Visualization generation finished.")
    print("-" * 30)



In [45]:
# Main Execution Block
def main():
    """
    Main function to run the LSTM model training, prediction, and visualization.
    """
    print("=" * 50)
    print(" LSTM Model Training and Evaluation Pipeline")
    print("=" * 50)

    # --- Configuration ---
    transformer_predictions_file = "predictions_transformer_175974.csv" # Ensure this file exists!
    training_epochs = 100 # Use the original epoch count
    training_batch_size = 32

    lstm_model = None
    history = None
    data = None
    results_df = None

    try:
        # --- Step 1: Train LSTM Model ---
        print("\n--- Step 1: Training LSTM Model ---")
        lstm_model, history, data = train_improved_lstm(
            transformer_predictions_file,
            epochs=training_epochs,
            batch_size=training_batch_size
        )

        # --- Step 2: Generate Predictions ---
        print("\n--- Step 2: Generating LSTM Predictions ---")
        if lstm_model is None or data is None:
             print("Model training or data processing failed. Cannot generate predictions.")
             # Optionally: try loading a pre-trained model if available
        else:
            results_df = generate_lstm_predictions_from_transformer_csv(lstm_model, data)

        # --- Step 3: Display Sample Results ---
        print("\n--- Step 3: Displaying Sample Results ---")
        if results_df is not None and not results_df.empty:
            print("\nSample Combined Predictions (Head):")
            display_cols = ['Sequence', 'Step', 'SourceID',
                            'GroundTruth_Increment', 'Predicted_Increment', 'LSTM_Predicted_TimeDiff',
                            'GroundTruth_Cumulative','Predicted_Cumulative','LSTM_Predicted_Cumulative',
                            'TimeDiff_Improvement_Abs', 'Cumulative_Improvement_Abs']
            display_cols = [col for col in display_cols if col in results_df.columns] # Filter existing columns
            print(results_df[display_cols].head(10).to_string())
        else:
            print("No results dataframe generated or it is empty.")

        # --- Step 4: Generate Visualizations ---
        print("\n--- Step 4: Generating Visualizations ---")
        # Pass history and results_df even if they are None/empty, the function handles it
        visualize_results(results_df, history, num_samples=3)

        print("\n" + "=" * 50)
        print(" Pipeline Execution Completed")
        # Provide paths to output files
        print(f"  - Combined predictions CSV saved to: {os.path.abspath('predictions_lstm_refined_1.csv')}")
        print(f"  - Training performance plot saved to: {os.path.abspath('training_performance.png')}")
        print("  - Sequence comparison plots saved as: sequence_<ID>_comparison.png")
        print("=" * 50)


    except FileNotFoundError as fnf_error:
        print(f"\nFatal Error: {fnf_error}")
        print("Please ensure the input CSV file exists and the path is correct.")
    except ValueError as val_error:
        print(f"\nFatal Error: {val_error}")
        print("Please check the data format, required columns, and content in the input file.")
    except Exception as e:
        print(f"\nAn unexpected fatal error occurred in main execution: {e}")
        print("\n--- Traceback ---")
        traceback.print_exc()
        print("--- End Traceback ---")

In [46]:
# Script Entry Point
if __name__ == "__main__":
    np.random.seed(42)
    tf.random.set_seed(42)
    # import matplotlib
    # matplotlib.use('Agg') # Uncomment if running in a non-GUI environment
    main()

 LSTM Model Training and Evaluation Pipeline

--- Step 1: Training LSTM Model ---
------------------------------
Starting LSTM Model Training Process
------------------------------
Processing data...
Attempting to load data from: predictions_transformer_175974.csv
Successfully loaded CSV. Shape: (3349, 8)
Columns found: ['Sequence', 'Step', 'SourceID', 'Predicted_Proportion', 'Predicted_Increment', 'Predicted_Cumulative', 'GroundTruth_Increment', 'GroundTruth_Cumulative']
All required columns are present.
Data validation checks passed.
Found 137 unique sequences in the data.
Successfully processed 137 sequences.
Padding sequences to max length: 170
Data processing and padding complete.
Input shape for LSTM: (170, 4)
Number of sequences for training/validation: 137
Target shapes: Increments (137, 170), Cumulative (137, 170), Total (137,)
Mask shape: (137, 170), Mask sum (total valid steps): 3349.0
Building model...
Model built status: True


Defining loss functions...
Loss functions set. Weights: [0.4, 0.3, 0.3]
Configuring optimizer...
Compiling model...
Model compiled successfully.
Configuring callbacks...
Starting training for 100 epochs with batch size 32...
Epoch 1/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 272ms/step - loss: 95.3831 - masked_huber_delta_10_loss: 50.2491 - total_time_mse_loss: 0.0736 - val_loss: 27.4389 - val_masked_huber_delta_10_loss: 16.1503 - val_total_time_mse_loss: 3.1342e-04
Epoch 2/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step - loss: 90.6827 - masked_huber_delta_10_loss: 45.9130 - total_time_mse_loss: 0.0014 - val_loss: 26.2535 - val_masked_huber_delta_10_loss: 14.8058 - val_total_time_mse_loss: 9.8587e-04
Epoch 3/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step - loss: 87.9340 - masked_huber_delta_10_loss: 43.2589 - total_time_mse_loss: 0.0028 - val_loss: 26.6464 - val_masked_huber_delta_10_loss: 15.0640 - val_