In [16]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [None]:
# --- 1. Constants and Configuration ---

MAX_SEQ_LEN = 128
# --- UPDATED FEATURES ---
# Give the model more context by including the sourceID and step number.
FEATURE_COLUMNS = ['predicted_proportion', 'sourceID', 'Step']

# --- 2. Data Loading and Preparation for LSTM ---

def load_and_prepare_data(file_path):
    """
    Loads proportion data, determines the true total time for each sequence
    using the final cumulative timediff, and prepares data for the LSTM.
    """
    if not os.path.exists(file_path):
        print(f"❌ Error: Predictions file not found at '{file_path}'")
        return None, None, None

    df = pd.read_csv(file_path)
    df['true_total_time'] = df.groupby('SeqOrder')['timediff'].transform('max')

    grouped = df.groupby('SeqOrder')
    sequences = []
    total_times = []
    
    print(f"Processing {len(grouped)} sequences for the LSTM model...")
    for _, group in grouped:
        sequences.append(group[FEATURE_COLUMNS].values)
        total_times.append(group['true_total_time'].iloc[0])

    return sequences, np.array(total_times), df

# --- 3. LSTM Model Architecture (Improved) ---

def build_lstm_model(input_shape):
    """
    Builds a more powerful LSTM model for total time prediction.
    """
    inputs = layers.Input(shape=(None, input_shape[-1]))
    masking = layers.Masking(mask_value=0.0)(inputs)
    # Increased complexity to capture non-linear patterns
    lstm1 = layers.LSTM(64, return_sequences=True)(masking)
    lstm2 = layers.LSTM(32, return_sequences=False)(lstm1)
    # Dropout helps prevent overfitting
    dropout = layers.Dropout(0.2)(lstm2)
    dense1 = layers.Dense(16, activation='relu')(dropout)
    outputs = layers.Dense(1)(dense1)
    
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model

# --- 4. Visualization Function ---

def create_visualizations(history, results_df):
    """
    Generates and saves plots for model analysis for ALL sequences.
    """
    print("\n--- Generating Visualizations ---")
    
    output_plot_dir = 'sequence_plots'
    os.makedirs(output_plot_dir, exist_ok=True)
    
    # Plot 1: Training & Validation Loss (Global)
    plt.figure(figsize=(10, 6))
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Training and Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss (MSE)')
    plt.legend()
    plt.grid(True)
    plt.savefig('training_loss_plot.png')
    print("✅ Saved training loss plot.")
    plt.close()

    all_seq_orders = results_df['SeqOrder'].unique()
    print(f"Generating comparison plots for {len(all_seq_orders)} sequences...")

    for seq_order in all_seq_orders:
        sample_df = results_df[results_df['SeqOrder'] == seq_order]

        # Plot 2: Cumulative Time Comparison
        plt.figure(figsize=(12, 7))
        plt.plot(sample_df['Step'], sample_df['timediff'], label='True Cumulative Time', marker='o')
        plt.plot(sample_df['Step'], sample_df['predicted_cumulative_time'], label='Predicted Cumulative Time', marker='x', linestyle='--')
        plt.title(f'Cumulative Time Comparison for Sequence {seq_order}')
        plt.xlabel('Step in Sequence')
        plt.ylabel('Time (seconds)')
        plt.legend()
        plt.grid(True)
        plt.savefig(os.path.join(output_plot_dir, f'cumulative_time_comparison_seq_{seq_order}.png'))
        plt.close()
        
        # Plot 3: Time Increment Comparison
        true_increments = sample_df['timediff'].diff().fillna(sample_df['timediff'].iloc[0])
        plt.figure(figsize=(12, 7))
        plt.plot(sample_df['Step'], true_increments, label='True Time Increment', marker='o')
        plt.plot(sample_df['Step'], sample_df['predicted_time_increment'], label='Predicted Time Increment', marker='x', linestyle='--')
        plt.title(f'Time Increment Comparison for Sequence {seq_order}')
        plt.xlabel('Step in Sequence')
        plt.ylabel('Time (seconds)')
        plt.legend()
        plt.grid(True)
        plt.savefig(os.path.join(output_plot_dir, f'time_increment_comparison_seq_{seq_order}.png'))
        plt.close()

    print(f"✅ Saved individual sequence plots to the '{output_plot_dir}' directory.")

    # Plot 4: Total Time Prediction Analysis (Global)
    total_time_analysis = results_df[['SeqOrder', 'true_total_time', 'predicted_total_time']].drop_duplicates()
    plt.figure(figsize=(10, 10))
    plt.scatter(total_time_analysis['true_total_time'], total_time_analysis['predicted_total_time'], alpha=0.6, label='Predictions')
    plt.plot([0, total_time_analysis['true_total_time'].max()], [0, total_time_analysis['true_total_time'].max()], color='red', linestyle='--', label='Perfect Prediction Line')
    plt.title('True vs. Predicted Total Time')
    plt.xlabel('True Total Time (s)')
    plt.ylabel('Predicted Total Time (s)')
    plt.legend()
    plt.grid(True)
    plt.axis('equal')
    plt.savefig('total_time_prediction_analysis.png')
    print("✅ Saved total time prediction analysis plot.")
    plt.close()

# --- 5. Main Orchestration ---

def main():
    """Main function to run the data processing, training, and prediction."""
    
    proportions_file = 'prediction_176401_proportions.csv'
    output_file = 'predictions_total_time_176401.csv'
    
    sequences, total_times, original_df = load_and_prepare_data(proportions_file)
    if sequences is None: return

    indices = np.arange(len(sequences))
    train_indices, val_indices = train_test_split(indices, test_size=0.2, random_state=42)

    X_train_unpadded = [sequences[i] for i in train_indices]
    y_train_raw = total_times[train_indices]
    X_val_unpadded = [sequences[i] for i in val_indices]
    y_val_raw = total_times[val_indices]
    
    # --- SCALING INPUT AND OUTPUT ---
    # Scale the input features (X)
    scaler_X = StandardScaler()
    # Fit the scaler ONLY on the training data to avoid data leakage
    X_train_combined = np.vstack(X_train_unpadded)
    scaler_X.fit(X_train_combined)
    
    # Transform both training and validation sets
    X_train_scaled = [scaler_X.transform(seq) for seq in X_train_unpadded]
    X_val_scaled = [scaler_X.transform(seq) for seq in X_val_unpadded]

    # Scale the output/target variable (y)
    scaler_y = StandardScaler()
    y_train = scaler_y.fit_transform(y_train_raw.reshape(-1, 1))
    y_val = scaler_y.transform(y_val_raw.reshape(-1, 1))

    X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train_scaled, maxlen=MAX_SEQ_LEN, padding='post', dtype='float32')
    X_val = tf.keras.preprocessing.sequence.pad_sequences(X_val_scaled, maxlen=MAX_SEQ_LEN, padding='post', dtype='float32')

    model = build_lstm_model(X_train.shape)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mean_squared_error')
    model.summary()

    print("\n--- Starting LSTM Model Training ---")
    
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    reduce_lr_on_plateau = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=100,
        batch_size=16,
        callbacks=[early_stopping, reduce_lr_on_plateau]
    )
    print("--- LSTM Model Training Finished ---\n")

    print("--- Generating total time predictions for all sequences ---")
    # Scale and pad all sequences for the final prediction
    all_sequences_scaled = [scaler_X.transform(seq) for seq in sequences]
    X_all_padded = tf.keras.preprocessing.sequence.pad_sequences(all_sequences_scaled, maxlen=MAX_SEQ_LEN, padding='post', dtype='float32')
    
    scaled_predictions = model.predict(X_all_padded)
    predicted_total_times = scaler_y.inverse_transform(scaled_predictions).flatten()

    seq_order_to_time = {seq_order: time for seq_order, time in zip(original_df['SeqOrder'].unique(), predicted_total_times)}
    
    results_df = original_df.copy()
    results_df['predicted_total_time'] = results_df['SeqOrder'].map(seq_order_to_time)
    results_df['predicted_time_increment'] = results_df['predicted_proportion'] * results_df['predicted_total_time']
    results_df['predicted_cumulative_time'] = results_df.groupby('SeqOrder')['predicted_time_increment'].cumsum()

    results_df.to_csv(output_file, index=False)
    print(f"✅ Final predictions saved to '{output_file}'")
    
    create_visualizations(history, results_df)
    
    print("\n--- Sample of Final Output ---")
    print(results_df[['SeqOrder', 'Step', 'timediff', 'true_total_time', 'predicted_total_time', 'predicted_cumulative_time']].head(20))


In [18]:
if __name__ == "__main__":
    # Note: To run this, you need to have the data file 
    # 'data/176401/encoded_176401_condensed.csv' available in the correct path.
    # If the file is not found, the script will print an error and exit.
    # You may need to create the 'data' directory structure or update the path in main().
    main()

KeyError: 'Step'