In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [2]:
# --- 1. Constants and Configuration ---

# The maximum number of steps we'll consider in a single sequence.
# Sequences longer than this will be truncated, shorter ones will be padded.
MAX_SEQ_LEN = 128

# The features from the CSV file that the model will use as input.
FEATURE_COLUMNS = [
    'sourceID', 'PTAB', 'BodyGroup_from', 'BodyGroup_to',
    'Position_encoded', 'Direction_encoded'
]

# --- 2. Data Loading and Preprocessing ---

def load_and_preprocess_data(file_path):
    """
    Loads data, calculates time proportions, and groups it into sequences.
    This function returns unpadded lists of sequences.
    """
    if not os.path.exists(file_path):
        print(f"❌ Error: Data file not found at '{file_path}'")
        return None, None, None

    df = pd.read_csv(file_path)

    # --- Calculate Time Proportions (The Target Variable) ---
    df['total_time'] = df.groupby('SeqOrder')['timediff'].transform('sum')
    df['time_proportion'] = df['timediff'] / (df['total_time'] + 1e-9)

    # Group data by sequence
    grouped = df.groupby('SeqOrder')
    sequences = []
    proportions = []
    
    print(f"Processing {len(grouped)} sequences...")

    for _, group in grouped:
        seq_features = group[FEATURE_COLUMNS].values
        seq_proportions = group['time_proportion'].values.reshape(-1, 1)
        sequences.append(seq_features)
        proportions.append(seq_proportions)

    return sequences, proportions, df

# --- 3. Transformer Model Architecture ---

class PositionalEmbedding(layers.Layer):
    """Adds positional information to the input embeddings."""
    def __init__(self, max_len, embed_dim):
        super(PositionalEmbedding, self).__init__()
        self.pos_encoding = self.positional_encoding(max_len, embed_dim)

    def get_config(self):
        config = super().get_config()
        return config

    def positional_encoding(self, max_len, embed_dim):
        pos = np.arange(max_len)[:, np.newaxis]
        i = np.arange(embed_dim)[np.newaxis, :]
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(embed_dim))
        angle_rads = pos * angle_rates
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
        pos_encoding = angle_rads[np.newaxis, ...]
        return tf.cast(pos_encoding, dtype=tf.float32)

    def call(self, x):
        return x + self.pos_encoding[:, :tf.shape(x)[1], :]

class TransformerEncoder(layers.Layer):
    """Transformer Encoder Block."""
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerEncoder, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def get_config(self):
        config = super().get_config()
        return config

    def call(self, inputs, training=False, mask=None):
        # The attention layer will use the mask to ignore padded inputs
        attn_output = self.att(inputs, inputs, attention_mask=mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

def build_transformer_model(input_shape, num_heads=4, ff_dim=32, embed_dim=32, num_transformer_blocks=2):
    """Builds the Transformer model for proportion prediction."""
    num_features = input_shape[-1]
    
    inputs = layers.Input(shape=(None, num_features))
    
    # This layer creates a mask that is passed to all subsequent layers.
    # It masks timesteps where all features are 0 (our padding value).
    masking_layer = layers.Masking(mask_value=0.)(inputs)
    
    dense_proj = layers.Dense(embed_dim, activation="relu")(masking_layer)
    x = PositionalEmbedding(max_len=MAX_SEQ_LEN, embed_dim=embed_dim)(dense_proj)
    
    for _ in range(num_transformer_blocks):
        x = TransformerEncoder(embed_dim, num_heads, ff_dim)(x)
        
    time_step_logits = layers.Dense(1, name="time_step_logits")(x)
    proportions_output = layers.Softmax(axis=1, name="proportions_output")(time_step_logits)
    
    model = tf.keras.Model(inputs=inputs, outputs=proportions_output)
    return model

# --- 4. Custom Loss Function ---

def asymmetric_mse(y_true, y_pred):
    """
    Custom Mean Squared Error loss function that penalizes underestimation more heavily.
    """
    error = y_true - y_pred
    is_underestimation = error > 0
    penalty_weight = 10.0
    loss = tf.where(is_underestimation, penalty_weight * tf.square(error), tf.square(error))
    mask = tf.cast(tf.not_equal(y_true, 0), dtype=tf.float32)
    loss *= mask
    return tf.reduce_sum(loss) / tf.reduce_sum(mask)


# --- 5. Training and Prediction Orchestration ---

def main():
    """Main function to run the data processing, training, and prediction."""
    
    data_file = 'data/176401/encoded_176401_condensed.csv'
    output_predictions_file = 'prediction_176401_proportions.csv'
    
    sequences, proportions, original_df = load_and_preprocess_data(data_file)
    if sequences is None:
        return

    # --- Prepare data for training and prediction ---
    sequence_indices = np.arange(len(sequences))
    train_indices, val_indices = train_test_split(sequence_indices, test_size=0.2, random_state=42)

    # Create training and validation sets for model fitting
    X_train_unpadded = [sequences[i] for i in train_indices]
    y_train_unpadded = [proportions[i] for i in train_indices]
    X_val_unpadded = [sequences[i] for i in val_indices]
    y_val_unpadded = [proportions[i] for i in val_indices]

    X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train_unpadded, maxlen=MAX_SEQ_LEN, padding='post', dtype='float32')
    y_train = tf.keras.preprocessing.sequence.pad_sequences(y_train_unpadded, maxlen=MAX_SEQ_LEN, padding='post', dtype='float32')
    X_val = tf.keras.preprocessing.sequence.pad_sequences(X_val_unpadded, maxlen=MAX_SEQ_LEN, padding='post', dtype='float32')
    y_val = tf.keras.preprocessing.sequence.pad_sequences(y_val_unpadded, maxlen=MAX_SEQ_LEN, padding='post', dtype='float32')
    
    # Pad the entire dataset for final predictions after training
    X_all_padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_SEQ_LEN, padding='post', dtype='float32')
    
    print(f"\nData shapes: X_train: {X_train.shape}, y_train: {y_train.shape}")
    print(f"Data shapes: X_val: {X_val.shape}, y_val: {y_val.shape}")

    input_shape = X_train.shape[1:]
    model = build_transformer_model(input_shape)
    
    model.compile(optimizer="adam", loss=asymmetric_mse)
    model.summary()
    
    print("\n--- Starting Model Training ---")
    model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=50,
        batch_size=32,
        callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)]
    )
    print("--- Model Training Finished ---\n")

    # --- Generate Predictions for the ENTIRE Dataset ---
    print("--- Generating predictions for the entire dataset ---")
    predictions = model.predict(X_all_padded)
    
    results = []
    unique_seq_orders = original_df['SeqOrder'].unique()
    
    # Loop over all sequences to build the final results
    for i, original_seq_index in enumerate(sequence_indices):
        actual_len = len(sequences[original_seq_index])
        seq_order_val = unique_seq_orders[original_seq_index]
        
        true_props = proportions[original_seq_index].flatten()
        pred_props = predictions[i, :actual_len, 0]
        
        seq_data = original_df[original_df['SeqOrder'] == seq_order_val]
        
        for j in range(actual_len):
            results.append({
                'SeqOrder': seq_order_val,
                'Step': j,
                'sourceID': seq_data.iloc[j]['sourceID'],
                'timediff': seq_data.iloc[j]['timediff'],
                'true_proportion': true_props[j],
                'predicted_proportion': pred_props[j]
            })
            
    results_df = pd.DataFrame(results)

    # Sort the final results and save to CSV
    results_df = results_df.sort_values(by=['SeqOrder', 'Step']).reset_index(drop=True)
    results_df.to_csv(output_predictions_file, index=False)
    print(f"✅ Predictions for all sequences saved to '{output_predictions_file}'")


    print("\n--- Sample of Predictions ---")
    print(results_df.head(20))

    print("\n--- Verifying Proportions Sum to 1 (for first 5 sequences) ---")
    print(results_df.groupby('SeqOrder')['predicted_proportion'].sum().head())





In [3]:
if __name__ == "__main__":
    main()

Processing 35 sequences...

Data shapes: X_train: (28, 128, 6), y_train: (28, 128, 1)
Data shapes: X_val: (7, 128, 6), y_val: (7, 128, 1)






--- Starting Model Training ---
Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - loss: 0.0202 - val_loss: 0.0174
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - loss: 0.0158 - val_loss: 0.0170
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step - loss: 0.0151 - val_loss: 0.0177
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step - loss: 0.0151 - val_loss: 0.0175
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - loss: 0.0145 - val_loss: 0.0161
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step - loss: 0.0134 - val_loss: 0.0134
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - loss: 0.0118 - val_loss: 0.0115
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step - loss: 0.0104 - val_loss: 0.0111
Epoch 9/50
[1m1/1[0m [32m━━━━━