In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers

# Output library versions
print(f"numpy version: {np.__version__}")
print(f"tensorflow version: {tf.__version__}")

numpy version: 1.26.4
tensorflow version: 2.18.0


In [196]:
# ----------------------------
# Constants and Encoding Legend
# ----------------------------
START_TOKEN = 13
END_TOKEN = 14

ENCODING_LEGEND = {
    'MRI_CCS_11': 1, 'MRI_EXU_95': 2, 'MRI_FRR_18': 3, 'MRI_FRR_257': 4,
    'MRI_FRR_264': 5, 'MRI_FRR_3': 6, 'MRI_FRR_34': 7, 'MRI_MPT_1005': 8,
    'MRI_MSR_100': 9, 'MRI_MSR_104': 10, 'MRI_MSR_21': 11, 'MRI_MSR_34': 12,
    'START': START_TOKEN,
    'END': END_TOKEN
}
# Build reverse mapping for decoding:
reverse_encoding = {v: k for k, v in ENCODING_LEGEND.items()}

CHAR_TO_INT = {
    '0': 0,
    '1': 1,
    '2': 2,
    '3': 3,
    '4': 4,
    '5': 5,
    '6': 6,
    '7': 7,
    '8': 8,
    '9': 9,
    '10': 10,
    '11': 11,
    '12': 12,
}

In [197]:
# ----------------------------
# Data Loading and Preparation
# ----------------------------
data_file = "encoded_182625.csv"  # Ensure this file is in your working directory.
data = pd.read_csv(data_file)
print("Loaded CSV with columns:", data.columns.tolist())

Loaded CSV with columns: ['SeqOrder', 'sourceID', 'timediff', 'PTAB', 'BodyGroup_from', 'BodyGroup_to']


In [198]:
# We assume the CSV contains columns "sourceID" and "timediff"
source_ids = data['sourceID'].dropna().astype(float).tolist()
cumulative_times = data['timediff'].dropna().astype(float).tolist()

# Create a token sequence: add START at the beginning and END at the end.
sequence = [START_TOKEN] + [int(s) for s in source_ids] + [END_TOKEN]

# Adjust cumulative times:
# Always prepend 0 and then, if necessary, append the last cumulative time so that
# the number of cumulative times equals the number of tokens in 'sequence'.
cumulative_times = [0.0] + cumulative_times  # Prepend 0 unconditionally.
if len(sequence) != len(cumulative_times):
    cumulative_times = cumulative_times + [cumulative_times[-1]]

# Now, sequence and cumulative_times both have length = n + 2.
# For training, we shift the data:
# Use all tokens except the last as input.
# Use cumulative times from the second token onward as targets.
X_train = np.expand_dims(np.array(sequence[:-1], dtype=np.int32), axis=0)       # shape: (1, n+1)
Y_cum_target = np.expand_dims(np.array(cumulative_times[1:], dtype=np.float32), axis=0)  # shape: (1, n+1

In [199]:
# ----------------------------
# Target Computation Function
# ----------------------------
def compute_true_targets(cumulative_times):
    """
    Given cumulative times (shape: [batch, L_target]), compute:
      - true_proportions: the incremental differences normalized by total time.
      - true_total: the total time (last cumulative value) per sample.
    """
    # Compute increments: first value plus differences.
    diffs = cumulative_times[:, 0:1]  # shape: (batch, 1)
    diffs = tf.concat([diffs, cumulative_times[:, 1:] - cumulative_times[:, :-1]], axis=1)
    true_total = cumulative_times[:, -1:]
    true_total_safe = tf.where(true_total == 0, tf.ones_like(true_total), true_total)
    true_proportions = diffs / true_total_safe
    return true_proportions, true_total

true_prop, true_total = compute_true_targets(tf.convert_to_tensor(Y_cum_target))

In [200]:
# ----------------------------
# Transformer Components
# ----------------------------
def positional_encoding(length, depth):
    depth = depth / 2
    positions = np.arange(length)[:, np.newaxis]      # shape: (length, 1)
    depths = np.arange(depth)[np.newaxis, :] / depth    # shape: (1, depth)
    angle_rates = 1 / (10000 ** depths)
    angle_rads = positions * angle_rates
    pos_encoding = np.concatenate([np.sin(angle_rads), np.cos(angle_rads)], axis=-1)
    return tf.cast(pos_encoding, dtype=tf.float32)

In [201]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, vocab_size, d_model, max_len=4096, use_embedding=True):
        super(PositionalEmbedding, self).__init__()
        self.d_model = d_model
        self.use_embedding = use_embedding
        if self.use_embedding:
            self.embedding = layers.Embedding(vocab_size, d_model, mask_zero=True)
        else:
            self.embedding = layers.Dense(d_model, activation="relu")
        self.max_len = max_len
        self.pos_encoding = positional_encoding(self.max_len, d_model)
    
    def compute_mask(self, *args, **kwargs):
        if self.use_embedding:
            return self.embedding.compute_mask(*args, **kwargs)
        else:
            return None
    
    def call(self, x):
        # x shape: (batch, seq_len)
        x = self.embedding(x)  # (batch, seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        seq_len = tf.shape(x)[1]
        x += self.pos_encoding[tf.newaxis, :seq_len, :]
        return x

In [202]:
class FeedForward(layers.Layer):
    def __init__(self, d_model, dff, dropout_rate=0.1):
        super().__init__()
        self.seq = tf.keras.Sequential([
            layers.Dense(dff, activation='relu'),
            layers.Dense(d_model),
            layers.Dropout(dropout_rate)
        ])
        self.add = layers.Add()
        self.layer_norm = layers.LayerNormalization()
    
    def call(self, x):
        x = self.add([x, self.seq(x)])
        x = self.layer_norm(x)
        return x

In [None]:
class CausalSelfAttention(layers.Layer):
    def __init__(self, num_heads, d_model, dropout_rate=0.1):
        super().__init__()
        self.mha = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model, dropout=dropout_rate)
        self.add = layers.Add()
        self.layer_norm = layers.LayerNormalization()
    
    def call(self, x):
        attn_output = self.mha(query=x, key=x, value=x, use_causal_mask=True)
        x = self.add([x, attn_output])
        x = self.layer_norm(x)
        return x

In [203]:
class SelfAttentionFeedForwardLayer(layers.Layer):
    def __init__(self, d_model, num_heads, dff, dropout_rate=0.1):
        super().__init__()
        self.self_attention = CausalSelfAttention(num_heads=num_heads, d_model=d_model, dropout_rate=dropout_rate)
        self.ffn = FeedForward(d_model, dff, dropout_rate)
    
    def call(self, x):
        x = self.self_attention(x)
        x = self.ffn(x)
        return x

In [204]:
class Encoder(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, vocab_size, dropout_rate=0.1):
        super().__init__()
        self.pos_embedding = PositionalEmbedding(vocab_size, d_model)
        self.enc_layers = [SelfAttentionFeedForwardLayer(d_model, num_heads, dff, dropout_rate)
                           for _ in range(num_layers)]
        self.dropout = layers.Dropout(dropout_rate)
    
    def call(self, x):
        x = self.pos_embedding(x)
        x = self.dropout(x)
        for layer in self.enc_layers:
            x = layer(x)
        return x  # (batch, seq_len, d_model)

In [206]:
class Decoder(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, vocab_size, dropout_rate=0.1):
        super().__init__()
        self.pos_embedding = PositionalEmbedding(vocab_size, d_model)
        self.dropout = layers.Dropout(dropout_rate)
        self.dec_layers = [SelfAttentionFeedForwardLayer(d_model, num_heads, dff, dropout_rate)
                           for _ in range(num_layers)]
    
    def call(self, x, context):
        x = self.pos_embedding(x)
        x = self.dropout(x)
        for layer in self.dec_layers:
            x = layer(x)
        return x  # (batch, seq_len, d_model)

In [207]:
# ----------------------------
# TimeDiffTransformer Model
# ----------------------------
class TimeDiffTransformer(tf.keras.Model):
    """
    This model takes a sequence input (tokenized examination steps) and predicts:
      1. A sequence of per-step proportions (via softmax so they sum to 1).
      2. An overall total time (a nonnegative scalar via ReLU).
    The predicted per-step time differences (increments) are computed by multiplying
    the proportions with the total time.
    """
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, dropout_rate=0.1):
        super().__init__()
        self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, dropout_rate)
        self.decoder = Decoder(num_layers, d_model, num_heads, dff, input_vocab_size, dropout_rate)
        self.proportion_head = layers.Dense(1)  # one value per token
        self.total_time_head = layers.Dense(1, activation='relu')  # one scalar per sample
    
    def call(self, inputs):
        encoder_out = self.encoder(inputs)              # (batch, seq_len, d_model)
        decoder_out = self.decoder(inputs, encoder_out)   # (batch, seq_len, d_model)
        proportions_logits = self.proportion_head(decoder_out)  # (batch, seq_len, 1)
        proportions_logits = tf.squeeze(proportions_logits, axis=-1)  # (batch, seq_len)
        proportions = tf.nn.softmax(proportions_logits, axis=-1)
        pooled_encoder = tf.reduce_mean(encoder_out, axis=1)  # (batch, d_model)
        total_time = self.total_time_head(pooled_encoder)       # (batch, 1)
        return proportions, total_time
    
    def predict_time_differences(self, inputs):
        proportions, total_time = self(inputs)
        pred_increments = proportions * total_time  # (batch, seq_len)
        pred_cumulative = tf.math.cumsum(pred_increments, axis=1)
        return proportions, pred_increments, pred_cumulative

In [208]:
# ----------------------------
# Loss Functions
# ----------------------------
def proportion_loss(y_true, y_pred):
    return tf.keras.losses.MeanSquaredError()(y_true, y_pred)

def total_time_loss(y_true, y_pred):
    return tf.keras.losses.MeanSquaredError()(y_true, y_pred)

In [None]:
# ----------------------------
# Model Instantiation, Compilation, and Training
# ----------------------------
vocab_size = max(ENCODING_LEGEND.values()) + 1  # e.g., 15
model = TimeDiffTransformer(num_layers=3, d_model=32, num_heads=8, dff=128,
                            input_vocab_size=vocab_size, dropout_rate=0.1)

# Force a forward pass to build the model's weights.
_ = model(X_train)
model.summary()

# Compile with two losses.
model.compile(optimizer='adam', loss=[proportion_loss, total_time_loss])

# Train the model.
# (For demonstration; for real use, you will need more sequences.)
model.fit(X_train, [true_prop, true_total], epochs=5)



Epoch 1/20


ValueError: Dimensions must be equal, but are 3800 and 3801 for '{{node compile_loss/proportion_loss/mean_squared_error/sub}} = Sub[T=DT_FLOAT](data_1, time_diff_transformer_10_1/Softmax)' with input shapes: [?,3800], [?,3801].

In [None]:
# ----------------------------
# Inference and CSV Output
# ----------------------------
proportions_pred, increments_pred, cumulative_pred = model.predict_time_differences(X_train)

# Convert predictions to numpy arrays.
proportions_pred = proportions_pred.numpy()[0]  # shape: (L,)
increments_pred = increments_pred.numpy()[0]    # shape: (L,)
cumulative_pred = cumulative_pred.numpy()[0]    # shape: (L,)

# Ground truth for the shifted targets.
ground_truth_increments = np.concatenate([Y_cum_target[:, 0:1], Y_cum_target[:, 1:] - Y_cum_target[:, :-1]], axis=1)[0]
ground_truth_cumulative = Y_cum_target[0]

# Decode the sourceIDs corresponding to the target steps (i.e., excluding the START token).
decoded_sourceIDs = []
for token in sequence[1:]:
    if token == END_TOKEN:
        decoded_sourceIDs.append("END")
    else:
        decoded_sourceIDs.append(reverse_encoding.get(token, f"UNK({token})"))

# Create an output DataFrame.
output_df = pd.DataFrame({
    "Step": np.arange(1, len(sequence)),
    "SourceID": decoded_sourceIDs,
    "Predicted_Proportion": proportions_pred,
    "Predicted_Increment": increments_pred,
    "Predicted_Cumulative": cumulative_pred,
    "GroundTruth_Increment": ground_truth_increments,
    "GroundTruth_Cumulative": ground_truth_cumulative
})

# Save predictions to CSV.
output_csv = "predictions.csv"
output_df.to_csv(output_csv, index=False)
print(f"\nPredictions saved to {output_csv}")
print("\nSample predictions:")
print(output_df.head(10))

ValueError: All arrays must be of the same length