In [10]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [11]:
# --- 1. Constants and Configuration ---

# Timestep length used when padding sequences into fixed-size arrays for the
# model (passed as `maxlen` to pad_sequences in main()).
MAX_SEQ_LEN = 128
# Use multiple features to give the model context for predicting increments.
# These CSV columns are stacked, in this order, into each timestep's feature vector.
FEATURE_COLUMNS = ['predicted_proportion', 'sourceID', 'Step']

# --- 2. Data Loading and Preparation: Targeting Increments ---

def load_and_prepare_data(file_path):
    """
    Load the predictions CSV and split it into per-sequence model inputs/targets.

    The 'timediff' column is differenced within each 'SeqOrder' group to obtain
    a per-step time increment; the first row of every group has no predecessor,
    so it keeps its own 'timediff' value as its increment.

    Args:
        file_path: Path to a CSV containing 'SeqOrder', 'timediff' and every
            column listed in FEATURE_COLUMNS.

    Returns:
        A 3-tuple ``(sequences_X, sequences_y, df)``:
          * sequences_X -- list with one 2-D array per sequence,
            shape (n_steps, len(FEATURE_COLUMNS)).
          * sequences_y -- list with one array per sequence, shape (n_steps, 1),
            holding the time increments.
          * df -- the loaded DataFrame with the added 'time_increment' column.
        Sequences are listed in the iteration order of ``df.groupby('SeqOrder')``.
        If the file does not exist, ``(None, None, None)`` is returned so callers
        can always unpack exactly three values.
    """
    if not os.path.exists(file_path):
        print(f"❌ Error: Predictions file not found at '{file_path}'")
        # Same arity as the success path; returning four values here would make
        # the caller's three-way unpacking raise instead of exiting cleanly.
        return None, None, None

    df = pd.read_csv(file_path)

    # --- TARGET: per-step time increments ---
    # diff() leaves NaN on the first row of each group; fill it with that row's
    # own 'timediff' so the increments of a sequence sum back to its last value.
    df['time_increment'] = df.groupby('SeqOrder')['timediff'].diff().fillna(df['timediff'])

    grouped = df.groupby('SeqOrder')
    sequences_X = []
    sequences_y = []  # One (n_steps, 1) array of increments per sequence

    print(f"Processing {len(grouped)} sequences for the LSTM model...")
    for _, group in grouped:
        sequences_X.append(group[FEATURE_COLUMNS].values)
        sequences_y.append(group['time_increment'].values.reshape(-1, 1))

    return sequences_X, sequences_y, df

# --- 3. LSTM Model Architecture: Sequence-to-Sequence ---

def build_seq2seq_lstm_model(input_shape):
    """
    Assemble the per-timestep regression network.

    Only the last entry of ``input_shape`` (the feature count) is used; the time
    axis is declared as ``None`` so the model accepts any sequence length.

    Args:
        input_shape: Shape tuple whose final element is the number of features.

    Returns:
        An uncompiled ``tf.keras.Model`` that emits one value per timestep.
    """
    feature_dim = input_shape[-1]
    sequence_in = layers.Input(shape=(None, feature_dim))

    # Layers are listed in application order. Each one is instantiated and
    # applied inside the loop, so construction order matches the stack order.
    layer_specs = (
        # Skip timesteps whose features are all exactly 0.0.
        (layers.Masking, {'mask_value': 0.0}),
        # Both recurrent layers return full sequences so every step gets an output.
        (layers.LSTM, {'units': 64, 'return_sequences': True}),
        (layers.Dropout, {'rate': 0.2}),
        (layers.LSTM, {'units': 32, 'return_sequences': True}),
        # A plain Dense acts on the last axis, i.e. on each timestep's features,
        # so no TimeDistributed wrapper is used.
        (layers.Dense, {'units': 1}),
    )

    tensor = sequence_in
    for layer_cls, layer_kwargs in layer_specs:
        tensor = layer_cls(**layer_kwargs)(tensor)

    return tf.keras.Model(inputs=sequence_in, outputs=tensor)

# --- 4. Visualization Function ---

def _save_step_comparison_plot(steps, true_values, predicted_values,
                               true_label, predicted_label, title, save_path):
    """Save a single true-vs-predicted line plot over the step axis to save_path."""
    fig, ax = plt.subplots(figsize=(12, 7))
    ax.plot(steps, true_values, label=true_label, marker='o')
    ax.plot(steps, predicted_values, label=predicted_label, marker='x', linestyle='--')
    ax.set_title(title)
    ax.set_xlabel('Step')
    ax.set_ylabel('Time (s)')
    ax.legend()
    ax.grid(True)
    fig.savefig(save_path)
    # Close explicitly: this runs once per sequence and open figures accumulate.
    plt.close(fig)


def create_visualizations(history, results_df):
    """
    Generate and save the diagnostic plots for a trained model.

    Writes to the current working directory:
      * 'training_loss_plot.png' -- training vs. validation loss per epoch.
      * 'sequence_plots/' -- for every distinct 'SeqOrder', one cumulative-time
        comparison and one time-increment comparison.
      * 'total_time_prediction_analysis.png' -- scatter of true vs. predicted
        total time per sequence with a y = x reference line.

    Args:
        history: Object returned by ``model.fit``; its ``history`` dict must
            contain 'loss' and 'val_loss'.
        results_df: DataFrame with columns 'SeqOrder', 'Step', 'timediff',
            'time_increment', 'predicted_time_increment' and
            'predicted_cumulative_time'.
    """
    print("\n--- Generating Visualizations ---")

    output_plot_dir = 'sequence_plots'
    os.makedirs(output_plot_dir, exist_ok=True)

    # --- Loss curves ---
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(history.history['loss'], label='Training Loss')
    ax.plot(history.history['val_loss'], label='Validation Loss')
    ax.set_title('Model Training and Validation Loss')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss (MSE)')
    ax.legend()
    ax.grid(True)
    fig.savefig('training_loss_plot.png')
    print("✅ Saved training loss plot.")
    plt.close(fig)

    # --- Per-sequence comparisons ---
    all_seq_orders = results_df['SeqOrder'].unique()
    print(f"Generating comparison plots for {len(all_seq_orders)} sequences...")

    # sort=False keeps first-appearance order, matching unique() above.
    for seq_order, sample_df in results_df.groupby('SeqOrder', sort=False):
        _save_step_comparison_plot(
            sample_df['Step'],
            sample_df['timediff'],
            sample_df['predicted_cumulative_time'],
            'True Cumulative Time',
            'Predicted Cumulative Time',
            f'Cumulative Time Comparison for Sequence {seq_order}',
            os.path.join(output_plot_dir, f'cumulative_time_comparison_seq_{seq_order}.png'),
        )
        _save_step_comparison_plot(
            sample_df['Step'],
            sample_df['time_increment'],
            sample_df['predicted_time_increment'],
            'True Time Increment',
            'Predicted Time Increment',
            f'Time Increment Comparison for Sequence {seq_order}',
            os.path.join(output_plot_dir, f'time_increment_comparison_seq_{seq_order}.png'),
        )

    print(f"✅ Saved individual sequence plots to the '{output_plot_dir}' directory.")

    # --- Total time per sequence ---
    # The predicted total is the FINAL cumulative value. Predicted increments
    # may be negative, so the running sum is not guaranteed to be monotone and
    # its maximum can overstate the total.
    total_time_analysis = results_df.groupby('SeqOrder').agg(
        true_total_time=('timediff', 'max'),
        predicted_total_time=('predicted_cumulative_time', 'last')
    ).reset_index()

    diagonal_end = total_time_analysis['true_total_time'].max()

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.scatter(total_time_analysis['true_total_time'], total_time_analysis['predicted_total_time'], alpha=0.6)
    # y = x reference: points on this line are perfect predictions.
    ax.plot([0, diagonal_end], [0, diagonal_end], color='red', linestyle='--')
    ax.set_title('True vs. Predicted Total Time')
    ax.set_xlabel('True Total Time (s)')
    ax.set_ylabel('Predicted Total Time (s)')
    ax.grid(True)
    ax.axis('equal')
    fig.savefig('total_time_prediction_analysis.png')
    print("✅ Saved total time prediction analysis plot.")
    plt.close(fig)

# --- 5. Main Orchestration ---

def _scale_and_pad(sequences, scaler, pad_len):
    """Apply a fitted scaler to each sequence, then zero-pad (post) to pad_len timesteps."""
    scaled = [scaler.transform(seq) for seq in sequences]
    return tf.keras.preprocessing.sequence.pad_sequences(
        scaled, maxlen=pad_len, padding='post', dtype='float32'
    )


def main():
    """
    Run the full pipeline: load data, train the LSTM, predict, save, and plot.

    Steps:
      1. Load per-sequence features and increment targets from the proportions CSV.
      2. Split sequences 80/20 into train/validation and fit the scalers on the
         training sequences only.
      3. Scale, pad, and train the sequence-to-sequence model with early stopping
         and learning-rate reduction on plateau.
      4. Predict increments for every sequence, undo the target scaling, and
         write them back to the matching rows together with their per-sequence
         cumulative sum.
      5. Save the results CSV and the diagnostic plots.

    Returns early (after printing a message) when the input file is missing or
    contains fewer than two sequences.
    """
    proportions_file = 'prediction_176401_proportions.csv'
    output_file = 'predictions_total_time_176401.csv'
    seed = 42

    sequences_X, sequences_y, original_df = load_and_prepare_data(proportions_file)
    if sequences_X is None: return

    if len(sequences_X) < 2:
        print("❌ Error: Need at least two sequences to build a train/validation split.")
        return

    # Seed Python, NumPy and TensorFlow so weight init, dropout and shuffling
    # are repeatable across runs (the split below is already seeded).
    tf.keras.utils.set_random_seed(seed)

    # Never pad to fewer steps than the longest sequence: pad_sequences would
    # otherwise silently truncate it, and the predictions could no longer be
    # mapped back to every row of that sequence.
    pad_len = max(MAX_SEQ_LEN, max(len(seq) for seq in sequences_X))

    indices = np.arange(len(sequences_X))
    train_indices, val_indices = train_test_split(indices, test_size=0.2, random_state=seed)

    X_train_unpadded = [sequences_X[i] for i in train_indices]
    y_train_unpadded = [sequences_y[i] for i in train_indices]
    X_val_unpadded = [sequences_X[i] for i in val_indices]
    y_val_unpadded = [sequences_y[i] for i in val_indices]

    # Fit scalers on training rows only to avoid leaking validation statistics.
    scaler_X = StandardScaler().fit(np.vstack(X_train_unpadded))
    scaler_y = StandardScaler().fit(np.vstack(y_train_unpadded))

    X_train = _scale_and_pad(X_train_unpadded, scaler_X, pad_len)
    y_train = _scale_and_pad(y_train_unpadded, scaler_y, pad_len)
    X_val = _scale_and_pad(X_val_unpadded, scaler_X, pad_len)
    y_val = _scale_and_pad(y_val_unpadded, scaler_y, pad_len)

    model = build_seq2seq_lstm_model(X_train.shape)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mean_squared_error')
    model.summary()

    print("\n--- Starting LSTM Model Training ---")
    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
        tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.00001)
    ]
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=16, callbacks=callbacks)
    print("--- LSTM Model Training Finished ---\n")

    print("--- Generating predictions for all sequences ---")
    X_all_padded = _scale_and_pad(sequences_X, scaler_X, pad_len)
    predictions_scaled = model.predict(X_all_padded)

    # The scaler expects a 2-D (n_rows, 1) array: flatten, invert, restore shape.
    predicted_increments_padded = scaler_y.inverse_transform(
        predictions_scaled.reshape(-1, 1)
    ).reshape(predictions_scaled.shape)

    # Map every row to (sequence number, step position within its sequence).
    # ngroup() numbers groups in groupby iteration order, which is the order in
    # which the sequences were built, so predictions land on the right rows even
    # when the CSV is not sorted by 'SeqOrder'.
    results_df = original_df.copy()
    by_sequence = original_df.groupby('SeqOrder')
    seq_idx = by_sequence.ngroup().to_numpy()
    step_idx = by_sequence.cumcount().to_numpy()
    if (seq_idx < 0).any():
        # ngroup() yields -1 for rows whose key is missing; those rows belong to
        # no sequence and therefore have no prediction.
        raise ValueError("Rows with a missing 'SeqOrder' cannot be matched to a predicted sequence.")

    results_df['predicted_time_increment'] = predicted_increments_padded[seq_idx, step_idx, 0]
    results_df['predicted_cumulative_time'] = results_df.groupby('SeqOrder')['predicted_time_increment'].cumsum()

    results_df.to_csv(output_file, index=False)
    print(f"✅ Final predictions saved to '{output_file}'")

    create_visualizations(history, results_df)

    print("\n--- Sample of Final Output ---")
    print(results_df[['SeqOrder', 'Step', 'timediff', 'predicted_cumulative_time', 'time_increment', 'predicted_time_increment']].head(20))


In [12]:
# Entry point: run the full load / train / predict / plot pipeline defined in main().
if __name__ == "__main__":
    main()

Processing 35 sequences for the LSTM model...



--- Starting LSTM Model Training ---
Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 572ms/step - loss: 1.0017 - val_loss: 14.0093 - learning_rate: 0.0010
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step - loss: 1.1790 - val_loss: 13.9797 - learning_rate: 0.0010
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step - loss: 0.9701 - val_loss: 13.9756 - learning_rate: 0.0010
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step - loss: 0.7342 - val_loss: 13.9814 - learning_rate: 0.0010
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step - loss: 0.9493 - val_loss: 13.9720 - learning_rate: 0.0010
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step - loss: 1.0544 - val_loss: 13.9602 - learning_rate: 0.0010
Epoch 7/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step - loss: 0.894