In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [2]:
# --- 1. Constants and Configuration ---

# The maximum number of steps we'll consider in a single sequence.
# Sequences longer than this will be truncated, shorter ones will be padded.
MAX_SEQ_LEN = 128

# The features from the CSV file that the model will use as input.
FEATURE_COLUMNS = [
    'sourceID', 'PTAB', 'BodyGroup_from', 'BodyGroup_to',
    'Position_encoded', 'Direction_encoded'
]


# --- 2. Data Loading and Preprocessing (Corrected based on new rules) ---

def load_and_preprocess_data(file_path):
    """
    Loads data with cumulative timestamps, calculates individual step durations and time 
    proportions based on specific sequence rules, and groups the data into sequences.
    This version PRESERVES the original file order to respect the cumulative timediff logic.
    """
    if not os.path.exists(file_path):
        print(f"❌ Error: Data file not found at '{file_path}'")
        return None, None, None

    df = pd.read_csv(file_path)

    # --- Correctly Calculate Step Durations and Proportions ---
    
    # DO NOT sort the dataframe. The original order is crucial for the cumulative timediff logic.
    
    # Robustly create the 'Step' column in case it doesn't exist.
    df['Step'] = df.groupby('SeqOrder').cumcount()
    
    # Rule 1: Create a new 'step_duration' column. This is the true time difference for each step.
    # It's calculated by taking the difference from the previous cumulative 'timediff'.
    df['step_duration'] = df.groupby('SeqOrder')['timediff'].diff().fillna(df['timediff'])
    # Ensure no negative durations, which can happen if a timer resets.
    df['step_duration'] = df['step_duration'].clip(lower=0)
    
    # Rule 2: The business logic states the FIRST step with sourceID == 10 is the true end marker.
    # Find the step number of this marker for each sequence.
    end_marker_step = df[df['sourceID'] == 10].groupby('SeqOrder')['Step'].first()
    df['end_marker_step'] = df['SeqOrder'].map(end_marker_step)

    # Any step AFTER the end marker is considered post-sequence and should have a duration of 0.
    df.loc[df['Step'] > df['end_marker_step'], 'step_duration'] = 0

    # Rule 3: The total time for the sequence (the denominator for proportions) is the SUM
    # of all the now-corrected step durations. This ensures the proportions sum to 1.
    df['total_time'] = df.groupby('SeqOrder')['step_duration'].transform('sum')

    # Rule 4: Calculate the final, correct proportion using the correct step duration and total time.
    # This ensures the target proportions for each sequence are valid and based on the rules.
    df['true_proportion'] = df['step_duration'] / (df['total_time'] + 1e-9)


    # Group data by sequence for the model
    grouped = df.groupby('SeqOrder')
    sequences = []
    proportions = []
    
    print(f"Processing {len(grouped)} sequences...")

    for _, group in grouped:
        seq_features = group[FEATURE_COLUMNS].values
        seq_proportions = group['true_proportion'].values.reshape(-1, 1)
        
        sequences.append(seq_features)
        proportions.append(seq_proportions)

    # Return the processed dataframe for creating the final output file
    return sequences, proportions, df


# --- 3. Transformer Model Architecture ---

class PositionalEmbedding(layers.Layer):
    """Adds positional information to the input embeddings."""
    def __init__(self, max_len, embed_dim):
        super(PositionalEmbedding, self).__init__()
        self.pos_encoding = self.positional_encoding(max_len, embed_dim)

    def get_config(self):
        config = super().get_config()
        config.update({
            'max_len': self.pos_encoding.shape[1],
            'embed_dim': self.pos_encoding.shape[2]
        })
        return config

    def positional_encoding(self, max_len, embed_dim):
        pos = np.arange(max_len)[:, np.newaxis]
        i = np.arange(embed_dim)[np.newaxis, :]
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(embed_dim))
        angle_rads = pos * angle_rates
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
        pos_encoding = angle_rads[np.newaxis, ...]
        return tf.cast(pos_encoding, dtype=tf.float32)

    def call(self, x):
        return x + self.pos_encoding[:, :tf.shape(x)[1], :]

class TransformerEncoder(layers.Layer):
    """Transformer Encoder Block."""
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerEncoder, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def get_config(self):
        config = super().get_config()
        config.update({
            'embed_dim': self.att.key_dim,
            'num_heads': self.att.num_heads,
            'ff_dim': self.ffn.layers[0].units,
            'rate': self.dropout1.rate
        })
        return config

    def call(self, inputs, training=False):
        # The mask is implicitly passed through the layers.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

class MaskedSoftmax(layers.Layer):
    """Applies softmax activation while respecting the mask."""
    def __init__(self, **kwargs):
        super(MaskedSoftmax, self).__init__(**kwargs)
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is None:
            return tf.keras.activations.softmax(inputs, axis=1)

        # Expand mask dimensions to match inputs
        mask = tf.expand_dims(mask, -1)
        
        # Set logits for masked steps to a large negative number
        masked_inputs = tf.where(mask, inputs, -1e9)
        
        return tf.keras.activations.softmax(masked_inputs, axis=1)
        
def build_transformer_model(input_shape, num_heads=4, ff_dim=32, embed_dim=32, num_transformer_blocks=2):
    """
    Builds the single-output Transformer model for proportion prediction.
    """
    num_features = input_shape[-1]
    
    inputs = layers.Input(shape=(None, num_features), name="input_features")
    
    # This layer creates a mask that is passed to all subsequent layers.
    # It masks timesteps where all features are 0 (our padding value).
    masking_layer = layers.Masking(mask_value=0.)(inputs)
    
    dense_proj = layers.Dense(embed_dim, activation="relu")(masking_layer)
    x = PositionalEmbedding(max_len=MAX_SEQ_LEN, embed_dim=embed_dim)(dense_proj)
    
    for _ in range(num_transformer_blocks):
        x = TransformerEncoder(embed_dim, num_heads, ff_dim)(x)
    
    # --- Output Branch: Proportions ---
    time_step_logits = layers.Dense(1, name="time_step_logits")(x)
    proportions_output = MaskedSoftmax(name="proportions_output")(time_step_logits)
    
    model = tf.keras.Model(
        inputs=inputs, 
        outputs=proportions_output
    )
    return model


# --- 4. Training and Prediction Orchestration ---

def main():
    """Main function to run the data processing, training, and prediction."""
    
    data_file = 'data/176401/encoded_176401_condensed_full.csv'
    output_predictions_file = 'prediction_176401_proportions_final_all.csv'
    
    sequences, proportions, processed_df = load_and_preprocess_data(data_file)
    if sequences is None:
        return

    # --- Prepare data for training and prediction ---
    sequence_indices = np.arange(len(sequences))
    train_indices, val_indices = train_test_split(sequence_indices, test_size=0.2, random_state=42)

    X_train_unpadded = [sequences[i] for i in train_indices]
    y_prop_train_unpadded = [proportions[i] for i in train_indices]
    
    X_val_unpadded = [sequences[i] for i in val_indices]
    y_prop_val_unpadded = [proportions[i] for i in val_indices]

    X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train_unpadded, maxlen=MAX_SEQ_LEN, padding='post', dtype='float32')
    y_prop_train = tf.keras.preprocessing.sequence.pad_sequences(y_prop_train_unpadded, maxlen=MAX_SEQ_LEN, padding='post', dtype='float32')
    X_val = tf.keras.preprocessing.sequence.pad_sequences(X_val_unpadded, maxlen=MAX_SEQ_LEN, padding='post', dtype='float32')
    y_prop_val = tf.keras.preprocessing.sequence.pad_sequences(y_prop_val_unpadded, maxlen=MAX_SEQ_LEN, padding='post', dtype='float32')
    
    X_all_padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_SEQ_LEN, padding='post', dtype='float32')
    
    print(f"\nData shapes (Train): X={X_train.shape}, y_proportions={y_prop_train.shape}")
    print(f"Data shapes (Val):   X={X_val.shape}, y_proportions={y_prop_val.shape}")

    input_shape = X_train.shape[1:]
    model = build_transformer_model(input_shape)
    
    model.compile(
        optimizer="adam", 
        loss=tf.keras.losses.KLDivergence()
    )
    model.summary()
    
    print("\n--- Starting Model Training ---")
    model.fit(
        X_train, 
        y_prop_train,
        validation_data=(X_val, y_prop_val),
        epochs=50,
        batch_size=32,
        callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)]
    )
    print("--- Model Training Finished ---\n")

    # --- Generate Predictions and Create Final Output ---
    print("--- Generating predictions for the entire dataset ---")
    pred_proportions_padded = model.predict(X_all_padded)
    
    # Add the predicted proportions back to the original dataframe for easy output generation
    # First, create a placeholder column
    processed_df['predicted_proportion'] = 0.0
    
    unique_seq_orders = processed_df['SeqOrder'].unique()
    
    for i, seq_order_val in enumerate(unique_seq_orders):
        # Get the slice of the dataframe for the current sequence
        seq_indices = processed_df[processed_df['SeqOrder'] == seq_order_val].index
        actual_len = len(seq_indices)
        
        # Get the corresponding predictions
        pred_props = pred_proportions_padded[i, :actual_len, 0]
        
        # Assign the predictions to the correct rows in the dataframe
        processed_df.loc[seq_indices, 'predicted_proportion'] = pred_props
            
    # Select and order columns for the final output file for clarity and verification
    output_columns = [
        'SeqOrder',
        'Step',
        'sourceID',
        'timediff', # Original cumulative timediff for verification
        'step_duration', # The calculated individual step duration
        'true_proportion',
        'predicted_proportion'
    ]
    
    final_df = processed_df[output_columns]

    final_df.to_csv(output_predictions_file, index=False)
    print(f"✅ Predictions for all sequences saved to '{output_predictions_file}'")

    print("\n--- Sample of Predictions ---")
    print(final_df.head(20))

    print("\n--- Verifying Predicted Proportions Sum to 1 (for first 5 sequences) ---")
    print(final_df.groupby('SeqOrder')['predicted_proportion'].sum().head())
    
    print("\n--- Verifying True Proportions Sum to 1 (for first 5 sequences) ---")
    print(final_df.groupby('SeqOrder')['true_proportion'].sum().head())




In [3]:
if __name__ == "__main__":
    main()

Processing 223 sequences...

Data shapes (Train): X=(178, 128, 6), y_proportions=(178, 128, 1)
Data shapes (Val):   X=(45, 128, 6), y_proportions=(45, 128, 1)






--- Starting Model Training ---
Epoch 1/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 110ms/step - loss: 0.0244 - val_loss: 0.0133
Epoch 2/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - loss: 0.0150 - val_loss: 0.0118
Epoch 3/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - loss: 0.0131 - val_loss: 0.0130
Epoch 4/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - loss: 0.0144 - val_loss: 0.0128
Epoch 5/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - loss: 0.0134 - val_loss: 0.0126
Epoch 6/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - loss: 0.0132 - val_loss: 0.0126
Epoch 7/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - loss: 0.0135 - val_loss: 0.0127
Epoch 8/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - loss: 0.0132 - val_loss: 0.0111
Epoch 9/50
[1m6/6[0m [32m━━