In [16]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import matplotlib.pyplot as plt
import os # For checking file existence
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler # For target scaling

In [17]:
# --- 1. Constants and Configuration ---

# Fixed number of timesteps per sequence; passed as `maxlen` to pad_sequences
# in main(), so every sequence is padded (or truncated) to this length.
MAX_SEQ_LEN = 128
# Columns selected from each sequence's rows as the per-step LSTM input features.
FEATURE_COLUMNS = ['predicted_proportion']

# --- 2. Data Loading and Preparation for LSTM ---

def load_and_prepare_data(file_path):
    """
    Read the proportions CSV and split it into per-sequence LSTM inputs.

    The CSV is grouped by 'SeqOrder'. For every group the target is the
    maximum 'timediff' value of that group (the original notes treat
    'timediff' as a cumulative time, so its maximum is the sequence total).

    Args:
        file_path: Path of the CSV file. The code reads the columns
            'SeqOrder', 'timediff' and every column in FEATURE_COLUMNS.

    Returns:
        A tuple (sequences, total_times, df):
          * sequences   - list with one 2-D array per sequence holding the
                          FEATURE_COLUMNS values of its rows, in the order
                          in which groupby('SeqOrder') iterates the groups
                          (sorted by key under pandas' default sort=True).
          * total_times - 1-D NumPy array with the total time of each
                          sequence, aligned with `sequences`.
          * df          - the loaded DataFrame with an added
                          'true_total_time' column.
        If the file does not exist, an error is printed and
        (None, None, None) is returned.
    """
    file_missing = not os.path.exists(file_path)
    if file_missing:
        print(f"❌ Error: Predictions file not found at '{file_path}'")
        return None, None, None

    df = pd.read_csv(file_path)

    # Broadcast each sequence's largest timediff onto all of its rows.
    per_sequence_max = df.groupby('SeqOrder')['timediff'].transform('max')
    df['true_total_time'] = per_sequence_max

    by_sequence = df.groupby('SeqOrder')
    print(f"Processing {len(by_sequence)} sequences for the LSTM model...")

    # Both comprehensions walk the groups in the same order, so the feature
    # arrays and the targets stay index-aligned.
    sequences = [rows[FEATURE_COLUMNS].values for _, rows in by_sequence]
    total_times = [rows['true_total_time'].iloc[0] for _, rows in by_sequence]

    return sequences, np.array(total_times), df

# --- 3. LSTM Model Architecture (Refined) ---

def build_lstm_model(input_shape):
    """
    Assemble the (uncompiled) LSTM regressor that predicts one total time.

    Architecture, in order: Input -> Masking(0.0) -> LSTM(32, sequences)
    -> LSTM(16, last state) -> Dense(8, relu) -> Dense(1).

    Args:
        input_shape: Shape of the training array; only its last entry
            (the number of features per timestep) is used. The time axis
            is declared as None, so the model accepts any sequence length.

    Returns:
        A tf.keras.Model mapping a batch of sequences to one scalar each.
    """
    feature_count = input_shape[-1]
    sequence_input = layers.Input(shape=(None, feature_count))

    # Timesteps whose features all equal 0.0 are masked out. This hides the
    # zero padding, but would equally hide a genuine all-zero timestep.
    x = layers.Masking(mask_value=0.0)(sequence_input)
    x = layers.LSTM(32, return_sequences=True)(x)
    x = layers.LSTM(16, return_sequences=False)(x)
    x = layers.Dense(8, activation='relu')(x)
    total_time_output = layers.Dense(1)(x)

    return tf.keras.Model(inputs=sequence_input, outputs=total_time_output)

# --- 4. Visualization Function ---

def create_visualizations(history, results_df):
    """
    Render four diagnostic figures and write each one to a PNG file.

    Figures (written to the current working directory):
      1. training_loss_plot.png               - training vs. validation loss.
      2. cumulative_time_comparison.png       - true vs. predicted cumulative
                                                time for one sample sequence.
      3. time_increment_comparison.png        - true vs. predicted per-step
                                                increments for that sequence.
      4. total_time_prediction_analysis.png   - scatter of true vs. predicted
                                                total time over all sequences.

    Args:
        history: Object returned by model.fit; its `.history` dict must hold
            the 'loss' and 'val_loss' series.
        results_df: DataFrame with the columns 'SeqOrder', 'Step', 'timediff',
            'predicted_cumulative_time', 'predicted_time_increment',
            'true_total_time' and 'predicted_total_time'.
    """
    print("\n--- Generating Visualizations ---")

    def _label_save_close(title, x_label, y_label, png_path, done_message, equal_axes=False):
        # Shared tail of every figure: annotate, persist, report, release.
        plt.title(title)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.legend()
        plt.grid(True)
        if equal_axes:
            plt.axis('equal')
        plt.savefig(png_path)
        print(done_message)
        plt.close()

    # Plot 1: Training & Validation Loss
    plt.figure(figsize=(10, 6))
    for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
        plt.plot(history.history[history_key], label=curve_label)
    _label_save_close(
        'Model Training and Validation Loss',
        'Epoch',
        'Loss (MSE)',
        'training_loss_plot.png',
        "✅ Saved training loss plot.",
    )

    # The first SeqOrder value appearing in the frame is used as the sample.
    sample_seq_order = results_df['SeqOrder'].unique()[0]
    sample_df = results_df[results_df['SeqOrder'] == sample_seq_order]
    steps = sample_df['Step']
    true_cumulative = sample_df['timediff']

    # Plot 2: Cumulative Time Comparison
    plt.figure(figsize=(12, 7))
    plt.plot(steps, true_cumulative, label='True Cumulative Time', marker='o')
    plt.plot(
        steps,
        sample_df['predicted_cumulative_time'],
        label='Predicted Cumulative Time',
        marker='x',
        linestyle='--',
    )
    _label_save_close(
        f'Cumulative Time Comparison for Sequence {sample_seq_order}',
        'Step in Sequence',
        'Time (seconds)',
        'cumulative_time_comparison.png',
        "✅ Saved cumulative time comparison plot.",
    )

    # Plot 3: Time Increment Comparison
    # diff() leaves the first step as NaN; it is filled with the first
    # cumulative value so the first increment equals the first timediff.
    true_increments = true_cumulative.diff().fillna(true_cumulative.iloc[0])
    plt.figure(figsize=(12, 7))
    plt.plot(steps, true_increments, label='True Time Increment', marker='o')
    plt.plot(
        steps,
        sample_df['predicted_time_increment'],
        label='Predicted Time Increment',
        marker='x',
        linestyle='--',
    )
    _label_save_close(
        f'Time Increment Comparison for Sequence {sample_seq_order}',
        'Step in Sequence',
        'Time (seconds)',
        'time_increment_comparison.png',
        "✅ Saved time increment comparison plot.",
    )

    # Plot 4: Total Time Prediction Analysis
    # Totals are repeated on every row of a sequence; keep one row per combo.
    per_sequence_totals = results_df[
        ['SeqOrder', 'true_total_time', 'predicted_total_time']
    ].drop_duplicates()
    true_totals = per_sequence_totals['true_total_time']
    diagonal = [0, true_totals.max()]

    plt.figure(figsize=(10, 10))
    plt.scatter(true_totals, per_sequence_totals['predicted_total_time'], alpha=0.6, label='Predictions')
    plt.plot(diagonal, diagonal, color='red', linestyle='--', label='Perfect Prediction Line')
    _label_save_close(
        'True vs. Predicted Total Time',
        'True Total Time (s)',
        'Predicted Total Time (s)',
        'total_time_prediction_analysis.png',
        "✅ Saved total time prediction analysis plot.",
        equal_axes=True,
    )

# --- 5. Main Orchestration ---

def main():
    """
    Run the whole pipeline: load data, train the LSTM, predict and export.

    Steps:
      1. Load the per-sequence features and true total times from the
         proportions CSV (returns early if the file is missing).
      2. Split the sequences 80/20 into training and validation sets and
         standardise the target with a scaler fitted on the training
         targets only.
      3. Zero-pad every sequence to MAX_SEQ_LEN and train the model with
         early stopping on the validation loss.
      4. Predict a total time for every sequence, attach it to the rows of
         that sequence and derive per-step increments and cumulative times.
      5. Write the results CSV, save the diagnostic plots and print a sample.

    Returns:
        None.

    Raises:
        ValueError: if the number of predictions does not match the number
            of sequence ids (internal consistency check).
    """
    proportions_file = 'prediction_176401_proportions.csv'
    output_file = 'predictions_total_time.csv'
    seed = 42

    sequences, total_times, original_df = load_and_prepare_data(proportions_file)
    if sequences is None:
        return

    # Seed Python, NumPy and TensorFlow so weight initialisation and batch
    # shuffling are repeatable; previously only the split was seeded.
    tf.keras.utils.set_random_seed(seed)

    def _pad(unpadded):
        # Zeros are appended after the data ('post'); sequences longer than
        # MAX_SEQ_LEN are cut by pad_sequences (default truncating='pre').
        return tf.keras.preprocessing.sequence.pad_sequences(
            unpadded, maxlen=MAX_SEQ_LEN, padding='post', dtype='float32'
        )

    # Split by sequence index so a whole sequence lands in exactly one set.
    indices = np.arange(len(sequences))
    train_indices, val_indices = train_test_split(indices, test_size=0.2, random_state=seed)

    X_train = _pad([sequences[i] for i in train_indices])
    X_val = _pad([sequences[i] for i in val_indices])

    # Fit the target scaler on training targets only to avoid leaking
    # validation statistics into training.
    scaler = StandardScaler()
    y_train = scaler.fit_transform(total_times[train_indices].reshape(-1, 1))
    y_val = scaler.transform(total_times[val_indices].reshape(-1, 1))

    model = build_lstm_model(X_train.shape)
    model.compile(optimizer='adam', loss='mean_squared_error')
    model.summary()

    print("\n--- Starting LSTM Model Training ---")
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=100,
        batch_size=16,
        callbacks=[EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)]
    )
    print("--- LSTM Model Training Finished ---\n")

    print("--- Generating total time predictions for all sequences ---")
    scaled_predictions = model.predict(_pad(sequences))
    predicted_total_times = scaler.inverse_transform(scaled_predictions).flatten()

    # `sequences` was built by iterating df.groupby('SeqOrder'), which yields
    # groups in sorted key order, whereas Series.unique() returns values in
    # order of first appearance. Re-derive the ids with the same groupby
    # iteration so each prediction is mapped to the sequence it was made for,
    # even when the CSV is not sorted by SeqOrder.
    seq_ids = [seq_id for seq_id, _ in original_df.groupby('SeqOrder')]
    if len(seq_ids) != len(predicted_total_times):
        raise ValueError(
            f"Got {len(predicted_total_times)} predictions for {len(seq_ids)} sequences"
        )
    seq_order_to_time = dict(zip(seq_ids, predicted_total_times))

    results_df = original_df.copy()
    results_df['predicted_total_time'] = results_df['SeqOrder'].map(seq_order_to_time)
    # Each step's share of the predicted total, then its running sum per sequence.
    results_df['predicted_time_increment'] = results_df['predicted_proportion'] * results_df['predicted_total_time']
    results_df['predicted_cumulative_time'] = results_df.groupby('SeqOrder')['predicted_time_increment'].cumsum()

    results_df.to_csv(output_file, index=False)
    print(f"✅ Final predictions saved to '{output_file}'")

    create_visualizations(history, results_df)

    print("\n--- Sample of Final Output ---")
    print(results_df[['SeqOrder', 'Step', 'timediff', 'true_total_time', 'predicted_total_time', 'predicted_cumulative_time']].head(20))


In [18]:
# Entry point: run the full load / train / predict / plot pipeline when this
# code is executed as the main module.
if __name__ == "__main__":
    main()

Processing 35 sequences for the LSTM model...



--- Starting LSTM Model Training ---
Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 395ms/step - loss: 1.1247 - val_loss: 21.6057
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - loss: 0.8692 - val_loss: 21.6181
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step - loss: 0.8733 - val_loss: 21.6235
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step - loss: 0.9695 - val_loss: 21.6171
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step - loss: 0.9796 - val_loss: 21.6034
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step - loss: 1.1192 - val_loss: 21.5888
Epoch 7/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step - loss: 0.8126 - val_loss: 21.5873
Epoch 8/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - loss: 1.1023 - val_loss: 21.5744
Epoch 9/1