In [51]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [52]:
# Constants and Encoding
START_TOKEN = 13
END_TOKEN = 14
ENCODING_LEGEND = {
    'MRI_CCS_11': 1, 'MRI_EXU_95': 2, 'MRI_FRR_18': 3, 'MRI_FRR_257': 4,
    'MRI_FRR_264': 5, 'MRI_FRR_3': 6, 'MRI_FRR_34': 7, 'MRI_MPT_1005': 8,
    'MRI_MSR_100': 9, 'MRI_MSR_104': 10, 'MRI_MSR_21': 11, 'MRI_MSR_34': 12,
    'START': START_TOKEN, 'END': END_TOKEN
}
reverse_encoding = {v: k for k, v in ENCODING_LEGEND.items()}

In [53]:
def load_and_preprocess_data(data_file):
    data = pd.read_csv(data_file)
    
    # Split sequences based on "SeqOrder"
    all_sequences_tokens = []
    all_sequences_times = []
    all_sequences_sourceids = []

    current_tokens = []
    current_times = []
    current_sourceids = []

    for idx, row in data.iterrows():
        seq_order = row['SeqOrder']
        s_id = row['sourceID']
        t_diff = float(row['timediff'])
        
        if seq_order == 0 and current_tokens:
            # Finalize previous sequence
            token_seq = [START_TOKEN] + [int(ENCODING_LEGEND.get(str(x), x)) for x in current_tokens] + [END_TOKEN]
            time_seq = [0.0] + current_times
            
            all_sequences_tokens.append(token_seq)
            all_sequences_times.append(time_seq)
            all_sequences_sourceids.append(current_sourceids)
            
            current_tokens = []
            current_times = []
            current_sourceids = []
        
        current_tokens.append(s_id)
        current_times.append(t_diff)
        current_sourceids.append(str(s_id))

    # Add last sequence
    if current_tokens:
        token_seq = [START_TOKEN] + [int(ENCODING_LEGEND.get(str(x), x)) for x in current_tokens] + [END_TOKEN]
        time_seq = [0.0] + current_times
        
        all_sequences_tokens.append(token_seq)
        all_sequences_times.append(time_seq)
        all_sequences_sourceids.append(current_sourceids)

    return all_sequences_tokens, all_sequences_times, all_sequences_sourceids

In [54]:
def prepare_training_data(sequences_tokens, sequences_times):
    X_list, Y_list, masks_list, total_times_list = [], [], [], []
    
    for tokens, times in zip(sequences_tokens, sequences_times):
        total_time = times[-1]
        
        x_seq = tokens[:-1]    # input
        y_seq = times[1:]      # target cumulative times
        
        # Mask: valid tokens are those not equal to the pad value
        mask_seq = [1 if t != END_TOKEN else 0 for t in x_seq]
        
        X_list.append(x_seq)
        Y_list.append(y_seq)
        masks_list.append(mask_seq)
        total_times_list.append(total_time)

    max_len = max(len(x) for x in X_list)
    
    # Pad sequences
    X_train = pad_sequences(X_list, maxlen=max_len, padding='post', value=END_TOKEN)
    Y_cum_target = pad_sequences(Y_list, maxlen=max_len, padding='post', value=0.0)
    mask_train = pad_sequences(masks_list, maxlen=max_len, padding='post', value=0)
    
    X_train = np.array(X_train, dtype=np.int32)
    Y_cum_target = np.array(Y_cum_target, dtype=np.float32)
    mask_train = np.array(mask_train, dtype=np.float32)
    total_times = np.array(total_times_list, dtype=np.float32)
    
    return X_train, Y_cum_target, mask_train, total_times

In [55]:
# ----------------------------
# Transformer Components (unchanged)
# ----------------------------
def positional_encoding(length, depth):
    depth = depth / 2
    positions = np.arange(length)[:, np.newaxis]
    depths = np.arange(depth)[np.newaxis, :] / depth
    angle_rates = 1 / (10000 ** depths)
    angle_rads = positions * angle_rates
    pos_encoding = np.concatenate([np.sin(angle_rads), np.cos(angle_rads)], axis=-1)
    return tf.cast(pos_encoding, dtype=tf.float32)

class PositionalEmbedding(layers.Layer):
    def __init__(self, vocab_size, d_model, max_len=16384, use_embedding=True):
        super(PositionalEmbedding, self).__init__()
        self.d_model = d_model
        self.use_embedding = use_embedding
        if self.use_embedding:
            self.embedding = layers.Embedding(vocab_size, d_model, mask_zero=True)
        else:
            self.embedding = layers.Dense(d_model, activation="relu")
        self.max_len = max_len
        self.pos_encoding = positional_encoding(self.max_len, d_model)
    
    def compute_mask(self, *args, **kwargs):
        if self.use_embedding:
            return self.embedding.compute_mask(*args, **kwargs)
        else:
            return None
    
    def call(self, x):
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        seq_len = tf.shape(x)[1]
        x += self.pos_encoding[tf.newaxis, :seq_len, :]
        return x

class FeedForward(layers.Layer):
    def __init__(self, d_model, dff, dropout_rate=0.1):
        super().__init__()
        self.seq = tf.keras.Sequential([
            layers.Dense(dff, activation='relu'),
            layers.Dense(d_model),
            layers.Dropout(dropout_rate)
        ])
        self.add = layers.Add()
        self.layer_norm = layers.LayerNormalization()
    
    def call(self, x):
        x = self.add([x, self.seq(x)])
        x = self.layer_norm(x)
        return x

class CausalSelfAttention(layers.Layer):
    def __init__(self, num_heads, d_model, dropout_rate=0.1):
        super().__init__()
        self.mha = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model, dropout=dropout_rate)
        self.add = layers.Add()
        self.layer_norm = layers.LayerNormalization()
    
    def call(self, x):
        attn_output = self.mha(query=x, key=x, value=x, use_causal_mask=True)
        x = self.add([x, attn_output])
        x = self.layer_norm(x)
        return x

class SelfAttentionFeedForwardLayer(layers.Layer):
    def __init__(self, d_model, num_heads, dff, dropout_rate=0.1):
        super().__init__()
        self.self_attention = CausalSelfAttention(num_heads=num_heads, d_model=d_model, dropout_rate=dropout_rate)
        self.ffn = FeedForward(d_model, dff, dropout_rate)
    
    def call(self, x):
        x = self.self_attention(x)
        x = self.ffn(x)
        return x

class Encoder(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, vocab_size, dropout_rate=0.1, max_len=16384):
        super().__init__()
        self.pos_embedding = PositionalEmbedding(vocab_size, d_model)
        self.enc_layers = [SelfAttentionFeedForwardLayer(d_model, num_heads, dff, dropout_rate)
                           for _ in range(num_layers)]
        self.dropout = layers.Dropout(dropout_rate)
    
    def call(self, x):
        x = self.pos_embedding(x)
        x = self.dropout(x)
        for layer in self.enc_layers:
            x = layer(x)
        return x

class Decoder(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, vocab_size, dropout_rate=0.1, max_len=16384):
        super().__init__()
        self.pos_embedding = PositionalEmbedding(vocab_size, d_model)
        self.dropout = layers.Dropout(dropout_rate)
        self.dec_layers = [SelfAttentionFeedForwardLayer(d_model, num_heads, dff, dropout_rate)
                           for _ in range(num_layers)]
    
    def call(self, x, context):
        x = self.pos_embedding(x)
        x = self.dropout(x)
        for layer in self.dec_layers:
            x = layer(x)
        return x

In [56]:
class TimeDiffTransformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, dropout_rate=0.1, max_len=16384):
        super().__init__()
        self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, dropout_rate)
        
        # Modified heads to handle individual sequences
        self.proportion_head = layers.Dense(1)  # Remove activation
        self.total_time_head = layers.Dense(1, activation='relu')
    
    def call(self, inputs):
        encoder_out = self.encoder(inputs)
        
        # Sequence-level attention for proportions
        proportions = self.proportion_head(encoder_out)
        proportions = tf.squeeze(proportions, axis=-1)
        proportions = tf.nn.softmax(proportions, axis=1)  # Apply softmax explicitly
        
        # Total time prediction
        seq_attention = tf.reduce_mean(encoder_out, axis=1)
        total_time = self.total_time_head(seq_attention)
        
        return proportions, total_time

In [57]:
def compute_time_differences(proportions, total_time, mask):
    # Ensure proportions and mask have compatible shapes
    proportions = tf.reshape(proportions, tf.shape(mask))
    
    # Apply mask to ensure only valid tokens contribute
    proportions *= tf.cast(mask, tf.float32)
    
    # Compute row-wise sum for normalization to handle variable-length sequences
    row_sums = tf.reduce_sum(proportions, axis=1, keepdims=True)
    row_sums = tf.where(row_sums == 0, tf.ones_like(row_sums), row_sums)
    
    # Normalize proportions
    proportions /= row_sums
    
    # Compute increments (broadcasting total_time)
    increments = proportions * tf.expand_dims(total_time, axis=1)
    
    # Compute cumulative times
    cumulative_times = tf.math.cumsum(increments, axis=1)
    
    return proportions, increments, cumulative_times

In [58]:
def train_transformer(data_file, epochs=50, batch_size=32):
    try:
        # Load and preprocess data
        sequences_tokens, sequences_times, sequences_sourceids = load_and_preprocess_data(data_file)
        X_train, Y_cum_target, mask_train, total_times = prepare_training_data(sequences_tokens, sequences_times)
        
        # Model parameters
        vocab_size = max(ENCODING_LEGEND.values()) + 1
        max_seq_len = X_train.shape[1]
        
        model = TimeDiffTransformer(
            num_layers=3, 
            d_model=64, 
            num_heads=8, 
            dff=128,
            input_vocab_size=vocab_size, 
            dropout_rate=0.1, 
            max_len=max_seq_len
        )
        
        # Initial model call to build weights
        _ = model(X_train)
        
        optimizer = tf.keras.optimizers.Adam()
        
        @tf.function
        def train_step(x, y_cum, mask, total_time):
            with tf.GradientTape() as tape:
                pred_props, pred_total = model(x)
                
                # Compute true proportions
                time_diffs = y_cum[:, 1:] - y_cum[:, :-1]
                true_total = total_time[:, tf.newaxis]
                true_props = time_diffs / tf.where(true_total == 0, tf.ones_like(true_total), true_total)
                
                # Pad true_props to match prediction shape
                true_props_padded = tf.pad(true_props, [[0, 0], [0, 1]], constant_values=0)
                
                # Compute masked losses
                props_loss = tf.keras.losses.MeanSquaredError()(true_props_padded, pred_props)
                total_time_loss = tf.keras.losses.MeanSquaredError()(total_time, pred_total)
                
                total_loss = props_loss + total_time_loss
            
            grads = tape.gradient(total_loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
            
            return total_loss, props_loss, total_time_loss
        
        # Training loop
        for epoch in range(epochs):
            loss, props_loss, total_loss = train_step(X_train, Y_cum_target, mask_train, total_times)
            print(f"Epoch {epoch+1}/{epochs} - Total Loss: {loss.numpy():.4f}")
        
        return model, X_train, Y_cum_target, mask_train, total_times, sequences_sourceids
    
    except Exception as e:
        print(f"Error in train_transformer: {e}")
        raise

In [59]:
def generate_predictions_csv(model, X_train, Y_cum_target, mask_train, total_times, sequences_sourceids):
    # Predict time differences
    proportions_pred, increments_pred, cumulative_pred = compute_time_differences(
        model(X_train)[0], total_times, mask_train
    )
    
    # Convert TensorFlow tensors to NumPy arrays
    proportions_pred = proportions_pred.numpy()
    increments_pred = increments_pred.numpy()
    cumulative_pred = cumulative_pred.numpy()
    
    # Ground truth computations
    gt_increments = np.concatenate([Y_cum_target[:, 0:1], Y_cum_target[:, 1:] - Y_cum_target[:, :-1]], axis=1)
    
    # Collect predictions in a list of DataFrames
    output_dataframes = []
    
    for seq_idx in range(X_train.shape[0]):
        # Identify valid indices
        valid_mask = mask_train[seq_idx] == 1
        valid_indices = np.where(valid_mask)[0]
        
        # Safety checks
        if len(valid_indices) == 0:
            print(f"Warning: No valid indices for sequence {seq_idx}")
            continue
        
        # Ensure sourceids list is long enough
        safe_sourceids = sequences_sourceids[seq_idx] if seq_idx < len(sequences_sourceids) else []
        
        # Prepare data with safe indexing
        seq_df = pd.DataFrame({
            'Sequence': [seq_idx] * len(valid_indices),
            'Step': range(1, len(valid_indices) + 1),
            'SourceID': [safe_sourceids[i] if i < len(safe_sourceids) else f'Unknown_{i}' for i in range(len(valid_indices))],
            'Predicted_Proportion': proportions_pred[seq_idx][valid_indices],
            'Predicted_Increment': increments_pred[seq_idx][valid_indices],
            'Predicted_Cumulative': cumulative_pred[seq_idx][valid_indices],
            'GroundTruth_Increment': gt_increments[seq_idx][valid_indices],
            'GroundTruth_Cumulative': Y_cum_target[seq_idx][valid_indices]
        })
        
        output_dataframes.append(seq_df)
    
    # Combine and save
    if not output_dataframes:
        print("No valid sequences found for predictions.")
        return pd.DataFrame()
    
    predictions_df = pd.concat(output_dataframes, ignore_index=True)
    predictions_df.to_csv('predictions_transformer.csv', index=False)
    print("Predictions saved to predictions_transformer.csv")
    
    return predictions_df

In [60]:
def main():
    try:
        data_file = "encoded_182625.csv"  # Replace with your actual file path
        
        # Train model and get predictions
        result = train_transformer(data_file)
        if result is None:
            print("Training failed, no results returned.")
            return
        
        model, X_train, Y_cum_target, mask_train, total_times, sequences_sourceids = result
        
        predictions_df = generate_predictions_csv(
            model, X_train, Y_cum_target, mask_train, total_times, sequences_sourceids
        )
        
        print("\nSample Predictions:")
        print(predictions_df.head(10))
    
    except Exception as e:
        print(f"Error in main: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()



Epoch 1/50 - Total Loss: 261922.9531
Epoch 2/50 - Total Loss: 261922.9531
Epoch 3/50 - Total Loss: 261087.2344
Epoch 4/50 - Total Loss: 257413.1562
Epoch 5/50 - Total Loss: 255475.0469
Epoch 6/50 - Total Loss: 254331.1875
Epoch 7/50 - Total Loss: 253649.4219
Epoch 8/50 - Total Loss: 253306.2969
Epoch 9/50 - Total Loss: 253087.5000
Epoch 10/50 - Total Loss: 252934.1562
Epoch 11/50 - Total Loss: 252820.9062
Epoch 12/50 - Total Loss: 252732.1562
Epoch 13/50 - Total Loss: 252661.0625
Epoch 14/50 - Total Loss: 252597.5625
Epoch 15/50 - Total Loss: 252536.5938
Epoch 16/50 - Total Loss: 252473.8750
Epoch 17/50 - Total Loss: 252406.7188
Epoch 18/50 - Total Loss: 252335.5938
Epoch 19/50 - Total Loss: 252263.4219
Epoch 20/50 - Total Loss: 252192.3594
Epoch 21/50 - Total Loss: 252123.3438
Epoch 22/50 - Total Loss: 252056.3125
Epoch 23/50 - Total Loss: 251990.6094
Epoch 24/50 - Total Loss: 251925.6875
Epoch 25/50 - Total Loss: 251861.0469
Epoch 26/50 - Total Loss: 251796.3906
Epoch 27/50 - Total L