In [7]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# --- 1. Constants and Configuration ---

# Fixed number of time steps per proportion sequence: load_and_preprocess_data
# passes this as `maxlen` when padding the sequences to a uniform length.
# This should match the MAX_SEQ_LEN from the Transformer model for consistency.
MAX_SEQ_LEN = 128

# --- 2. Data Loading and Preprocessing ---

def load_and_preprocess_data(proportions_file, ground_truth_file):
    """
    Build the LSTM inputs and targets from the first model's predictions.

    Loads the predicted proportions, recomputes the true total time of every
    sequence from the original data, and aligns both per `SeqOrder`. The number
    of steps of each sequence is extracted as an additional scalar feature.

    Sequences are emitted in order of first appearance in `proportions_file`,
    i.e. the same order as `props_df['SeqOrder'].unique()`, so callers can zip
    that array with per-sequence predictions.

    Args:
        proportions_file (str): Path to the CSV file containing predicted proportions.
            Must provide the columns 'SeqOrder' and 'predicted_proportion'.
        ground_truth_file (str): Path to the original data file to get the true total time.
            Must provide the columns 'SeqOrder', 'timediff' and 'sourceID'.

    Returns:
        A tuple containing:
        - Padded sequences of proportions (X_seq), shape (n_sequences, MAX_SEQ_LEN, 1).
        - An array of the number of steps for each sequence (X_steps), shape (n_sequences, 1).
        - An array of total times (y), shape (n_sequences, 1).
        - The dataframe from the proportions_file for final output generation.
        If either input file does not exist, an error is printed and
        (None, None, None, None) is returned instead.
    """
    if not os.path.exists(proportions_file):
        print(f"❌ Error: Proportions file not found at '{proportions_file}'")
        return None, None, None, None
    if not os.path.exists(ground_truth_file):
        print(f"❌ Error: Ground truth data file not found at '{ground_truth_file}'")
        return None, None, None, None

    # Load the predictions from the first model
    props_df = pd.read_csv(proportions_file)

    # Load the original data to get the true total time
    truth_df = pd.read_csv(ground_truth_file)

    # --- Re-calculate the true total time using the definitive logic ---
    # 'timediff' is treated as cumulative within a sequence: the per-step duration
    # is the difference to the previous row; the first row of each sequence has no
    # predecessor, so its own 'timediff' is used.
    truth_df['step_duration'] = truth_df.groupby('SeqOrder')['timediff'].diff().fillna(truth_df['timediff'])
    # Negative differences are clamped to zero.
    truth_df['step_duration'] = truth_df['step_duration'].clip(lower=0)
    truth_df['Step'] = truth_df.groupby('SeqOrder').cumcount()

    # Everything after the first row with sourceID == 10 does not count towards the
    # total. Sequences without such a row get NaN here; the comparison below is then
    # False for all their rows, so every step is kept.
    end_marker_step = truth_df[truth_df['sourceID'] == 10].groupby('SeqOrder')['Step'].first()
    truth_df['end_marker_step'] = truth_df['SeqOrder'].map(end_marker_step)
    truth_df.loc[truth_df['Step'] > truth_df['end_marker_step'], 'step_duration'] = 0

    total_times = truth_df.groupby('SeqOrder')['step_duration'].sum()

    # --- Prepare data for the LSTM ---
    seq_ids = []
    X_sequences = []
    X_num_steps = []

    # sort=False keeps groups in order of first appearance. The default (sorted keys)
    # would disagree with props_df['SeqOrder'].unique() whenever the file is not
    # sorted by SeqOrder, silently pairing features with the wrong targets.
    for seq_id, g in props_df.groupby('SeqOrder', sort=False):
        seq_ids.append(seq_id)
        X_sequences.append(g['predicted_proportion'].values.reshape(-1, 1))
        X_num_steps.append(len(g))  # Add the number of steps as a feature

    # The target variable is the true total time, looked up with the very same keys
    # (and order) that produced the features.
    missing_ids = [seq_id for seq_id in seq_ids if seq_id not in total_times.index]
    if missing_ids:
        # These sequences keep the historical fallback target of 0; make that visible.
        print(
            f"⚠️ Warning: {len(missing_ids)} sequence(s) have no ground truth total time "
            f"and will use a target of 0: {missing_ids}"
        )
    y_sequences = np.array([total_times.get(seq_id, 0) for seq_id in seq_ids])

    # Pad the proportion sequences to a uniform length.
    # NOTE: pad_sequences truncates from the front by default (truncating='pre'), so
    # sequences longer than MAX_SEQ_LEN lose their earliest steps, while X_num_steps
    # still holds the untruncated length.
    X_padded_seq = tf.keras.preprocessing.sequence.pad_sequences(
        X_sequences, maxlen=MAX_SEQ_LEN, padding='post', dtype='float32'
    )

    X_steps_arr = np.array(X_num_steps, dtype='float32').reshape(-1, 1)

    print(f"Successfully processed {len(X_padded_seq)} sequences.")

    return X_padded_seq, X_steps_arr, y_sequences.reshape(-1, 1), props_df


# --- 3. LSTM Model Architecture ---

def build_lstm_model(sequence_shape, scalar_shape):
    """
    Assemble the two-input Keras model that predicts the (scaled) total time.

    Args:
        sequence_shape: Shape of one proportion sequence, without the batch axis.
        scalar_shape: Shape of the per-sequence scalar feature, without the batch axis.

    Returns:
        An uncompiled `tf.keras.Model` taking `[sequence_input, scalar_input]` and
        producing a single linear output named 'total_time_output'.
    """
    # --- Sequence branch: proportions -> Masking -> single LSTM ---
    seq_in = layers.Input(name='sequence_input', shape=sequence_shape)
    # Time steps whose value equals 0 are masked so the LSTM skips them.
    seq_masked = layers.Masking(mask_value=0.)(seq_in)
    # One small LSTM layer only; it emits just its final output vector.
    seq_encoding = layers.LSTM(32, return_sequences=False)(seq_masked)

    # --- Scalar branch: number of steps, fed in as-is ---
    steps_in = layers.Input(name='scalar_input', shape=scalar_shape)

    # --- Head: join both branches and regress ---
    merged = layers.concatenate([seq_encoding, steps_in])
    hidden = layers.Dense(16, activation='relu')(merged)

    # Single neuron without activation: the target is a scaled real value.
    total_time = layers.Dense(1, name='total_time_output')(hidden)

    return tf.keras.Model(inputs=[seq_in, steps_in], outputs=total_time)


# --- 4. Training and Prediction Orchestration ---

def main(
    proportions_file='prediction_176401_proportions_final_all.csv',
    ground_truth_file='data/176401/encoded_176401_condensed_full.csv',
    output_predictions_file='prediction_176401_total_time_all.csv',
):
    """
    Run the data processing, training, and prediction end to end.

    Trains the dual-input LSTM on an 80/20 train/validation split, predicts the
    total time for every sequence, and writes the proportions dataframe with an
    extra 'predicted_total_time' column to `output_predictions_file`. The
    prediction is placed on the first row with sourceID == 10 of each sequence;
    all other rows hold NaN in that column.

    Args:
        proportions_file (str): CSV produced by the previous (Transformer) model.
        ground_truth_file (str): Original data file used to derive the true total time.
        output_predictions_file (str): Destination CSV for the final predictions.

    Returns:
        None. Returns early (after printing an error) if the input data could not
        be loaded or contains fewer than two sequences.
    """
    X_seq, X_steps, y, props_df = load_and_preprocess_data(proportions_file, ground_truth_file)
    if X_seq is None:
        return

    # train_test_split cannot produce a non-empty train and validation set from
    # fewer than two samples; fail with a clear message instead of a traceback.
    if len(X_seq) < 2:
        print(f"❌ Error: Need at least 2 sequences to train and validate, got {len(X_seq)}.")
        return

    # --- Prepare data for training with scaling ---
    X_seq_train, X_seq_val, X_steps_train, X_steps_val, y_train, y_val = train_test_split(
        X_seq, X_steps, y, test_size=0.2, random_state=42
    )

    # --- Scaling Features ---
    # Scale the scalar input (number of steps). The scaler is fitted on the training
    # split only so no validation statistics leak into training.
    scaler_steps = StandardScaler()
    X_steps_train_scaled = scaler_steps.fit_transform(X_steps_train)
    X_steps_val_scaled = scaler_steps.transform(X_steps_val)

    # Scale the target variable (total time), again fitted on the training split only.
    scaler_y = StandardScaler()
    y_train_scaled = scaler_y.fit_transform(y_train)
    y_val_scaled = scaler_y.transform(y_val)

    print(f"\nData shapes (Train): X_seq={X_seq_train.shape}, X_steps={X_steps_train_scaled.shape}, y={y_train_scaled.shape}")
    print(f"Data shapes (Val):   X_seq={X_seq_val.shape}, X_steps={X_steps_val_scaled.shape}, y={y_val_scaled.shape}")

    # Define the input shapes for the dual-input model
    sequence_shape = X_seq_train.shape[1:]
    scalar_shape = (1,)
    model = build_lstm_model(sequence_shape, scalar_shape)

    # Compile the model with Mean Squared Error, as we are now predicting a scaled value.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.005),  # A slightly higher learning rate can help
        loss='mse',
        metrics=['mae']
    )
    model.summary()

    print("\n--- Starting LSTM Model Training ---")
    model.fit(
        [X_seq_train, X_steps_train_scaled],
        y_train_scaled,
        validation_data=([X_seq_val, X_steps_val_scaled], y_val_scaled),
        epochs=200,  # Upper bound only; early stopping normally ends training sooner
        batch_size=32,
        # Stop once val_loss has not improved for 20 epochs and roll back to the best weights.
        callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)]
    )
    print("--- LSTM Model Training Finished ---\n")

    # --- Generate Predictions and Create Final Output ---
    print("--- Generating total time predictions for the entire dataset ---")
    # Scale the full X_steps data before prediction (same scaler as used in training)
    X_steps_scaled = scaler_steps.transform(X_steps)
    scaled_predictions = model.predict([X_seq, X_steps_scaled])

    # IMPORTANT: Inverse transform the predictions to get them back to the original scale (seconds)
    predicted_times = scaler_y.inverse_transform(scaled_predictions).flatten()

    # Create a mapping from SeqOrder to the predicted time.
    # NOTE: this relies on the rows of X_seq following the order of
    # props_df['SeqOrder'].unique() (order of first appearance in the file).
    seq_order_to_time = dict(zip(props_df['SeqOrder'].unique(), predicted_times))

    # Add a new column for the predicted total time
    props_df['predicted_total_time'] = np.nan

    # Find the index of the first occurrence of sourceID 10 for each sequence
    # (the row with the smallest 'Step' among the sourceID == 10 rows).
    end_marker_indices = props_df[props_df['sourceID'] == 10].groupby('SeqOrder')['Step'].idxmin()

    # Place the predicted total time on the row with sourceID == 10.
    # Sequences without such a row receive no prediction in the output.
    for seq_order, idx in end_marker_indices.items():
        if seq_order in seq_order_to_time:
            props_df.loc[idx, 'predicted_total_time'] = seq_order_to_time[seq_order]

    # Save the final, merged dataframe to a new CSV file
    props_df.to_csv(output_predictions_file, index=False)
    print(f"✅ Final predictions with total time saved to '{output_predictions_file}'")

    print("\n--- Sample of Final Predictions ---")
    # Display the rows of sequence 0 to verify the output
    sample_output = props_df[props_df['SeqOrder'] == 0]
    print(sample_output)

In [9]:
# Run the full pipeline only when this file is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Successfully processed 35 sequences.

Data shapes (Train): X_seq=(28, 128, 1), X_steps=(28, 1), y=(28, 1)
Data shapes (Val):   X_seq=(7, 128, 1), X_steps=(7, 1), y=(7, 1)



--- Starting LSTM Model Training ---
Epoch 1/200




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 1.1225 - mae: 0.5854 - val_loss: 21.9973 - val_mae: 1.9875
Epoch 2/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step - loss: 1.0902 - mae: 0.5983 - val_loss: 21.7844 - val_mae: 2.0173
Epoch 3/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step - loss: 1.0638 - mae: 0.6221 - val_loss: 21.5106 - val_mae: 2.0505
Epoch 4/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - loss: 1.0370 - mae: 0.6460 - val_loss: 21.3903 - val_mae: 2.1036
Epoch 5/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - loss: 1.0133 - mae: 0.6714 - val_loss: 21.3847 - val_mae: 2.1697
Epoch 6/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - loss: 1.0053 - mae: 0.7082 - val_loss: 21.4308 - val_mae: 2.1878
Epoch 7/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - loss: 1.0024 - mae: 0.



[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 153ms/step
✅ Final predictions with total time saved to 'prediction_176401_total_time.csv'

--- Sample of Final Predictions ---
    SeqOrder  Step  sourceID  timediff  step_duration  true_proportion  \
0          0     0        11         0            0.0         0.000000   
1          0     1         4         4            4.0         0.011799   
2          0     2         5        13            9.0         0.026549   
3          0     3         5        14            1.0         0.002950   
4          0     4         5        28           14.0         0.041298   
5          0     5         0        28            0.0         0.000000   
6          0     6         1        36            8.0         0.023599   
7          0     7         1        45            9.0         0.026549   
8          0     8         1       106           61.0         0.179941   
9          0     9         5       140           34.0         0.100295