In [113]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

In [114]:
# Set the data directory and sequence length
data_directory = "../data/filtered_blocks_padded/"
sequence_length = 36
batch_size = 32  # Adjust batch size as needed

In [115]:
# Encoding legend
encoding_legend = {
    1: 'MRI_CCS_11', 2: 'MRI_EXU_95', 3: 'MRI_FRR_18', 4: 'MRI_FRR_257',
    5: 'MRI_FRR_264', 6: 'MRI_FRR_3', 7: 'MRI_FRR_34', 8: 'MRI_MPT_1005',
    9: 'MRI_MSR_100', 10: 'START', 11: 'MRI_MSR_21', 12: 'MRI_MSR_34',
    0: 'PADDED',  # Add a padding category
    10: 'START',  # Start token
    9: 'END'      # End token
}

In [129]:
def load_data(directory, sequence_length):
    inputs_list = []
    targets_list = []
    file_count = 0  # Track the number of files processed

    for file_name in os.listdir(directory):
        if file_name.endswith('.csv'):
            file_count += 1
            file_path = os.path.join(directory, file_name)
            df = pd.read_csv(file_path)
            
            inputs = df['sourceID'].values
            targets = df['sourceID'].shift(-1).fillna(0).values  # Shift by 1 and replace NaN with 0

            inputs = inputs.astype(int)
            targets = targets.astype(int)

            input_sequences = []
            target_sequences = []

            # Pad sequences shorter than sequence_length
            for i in range(len(inputs) - sequence_length + 1):
                input_sequences.append(inputs[i:i + sequence_length])
                target_sequences.append(targets[i:i + sequence_length])

            inputs_list.append(np.array(input_sequences))
            targets_list.append(np.array(target_sequences))
    
    # Combine all the sequences from all blocks
    inputs = np.concatenate(inputs_list, axis=0)
    targets = np.concatenate(targets_list, axis=0)

    print(f"Total files processed: {file_count}")
    print(f"Combined inputs shape: {inputs.shape}")
    print(f"Combined targets shape: {targets.shape}")

    return inputs, targets


In [130]:
# Load data from the directory
inputs, targets = load_data(data_directory, sequence_length)
print(f"Loaded dataset with inputs shape {inputs.shape} and targets shape {targets.shape}")

Total files processed: 326
Combined inputs shape: (326, 36)
Combined targets shape: (326, 36)
Loaded dataset with inputs shape (326, 36) and targets shape (326, 36)


In [118]:
# Create a TensorFlow dataset
dataset = tf.data.Dataset.from_tensor_slices((inputs, targets))
dataset = dataset.batch(batch_size, drop_remainder=False)

In [119]:
# Transformer Model Implementation
class PositionalEncoding(layers.Layer):
    def __init__(self, sequence_length, d_model):
        super(PositionalEncoding, self).__init__()
        self.pos_encoding = self.positional_encoding(sequence_length, d_model)
    
    def get_angles(self, pos, i, d_model):
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
        return pos * angle_rates

    def positional_encoding(self, position, d_model):
        angle_rads = self.get_angles(np.arange(position)[:, np.newaxis], np.arange(d_model)[np.newaxis, :], d_model)

        # Apply sine to even indices in the array
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

        # Apply cosine to odd indices in the array
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

        pos_encoding = angle_rads[np.newaxis, ...]
        return tf.cast(pos_encoding, dtype=tf.float32)

    def call(self, inputs):
        return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :]

In [120]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [121]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [131]:
def create_transformer_model(num_classes, sequence_length, embedding_dim=64, num_heads=4, ff_dim=128, num_blocks=4):
    print(f"Creating Transformer Model with:")
    print(f"- Vocabulary size: {num_classes}")
    print(f"- Sequence length: {sequence_length}")
    print(f"- Embedding dimension: {embedding_dim}")
    print(f"- Number of heads: {num_heads}")
    print(f"- FFN dimension: {ff_dim}")
    print(f"- Number of blocks: {num_blocks}")

    inputs = tf.keras.Input(shape=(sequence_length,))
    embedding = tf.keras.layers.Embedding(num_classes, embedding_dim)(inputs)
    
    x = embedding
    for _ in range(num_blocks):
        attention_output = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)(x, x)
        attention_output = tf.keras.layers.LayerNormalization()(attention_output + x)
        
        ffn_output = tf.keras.layers.Dense(ff_dim, activation='relu')(attention_output)
        ffn_output = tf.keras.layers.Dense(embedding_dim)(ffn_output)
        x = tf.keras.layers.LayerNormalization()(ffn_output + attention_output)
    
    output = tf.keras.layers.Dense(num_classes)(x)
    
    model = tf.keras.Model(inputs=inputs, outputs=output)
    return model


In [132]:
# Parameters for the model
vocab_size = np.max(inputs) + 1  # Number of unique sourceIDs
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

In [124]:
# Create the model
model = create_transformer_model(vocab_size, sequence_length, embed_dim, num_heads, ff_dim)
model.compile(optimizer=Adam(), loss=SparseCategoricalCrossentropy(from_logits=True), metrics=["accuracy"])
model.summary()

In [125]:
# Training the model
epochs = 10
for batch in dataset.take(1):  # Take one batch as an example
    input_batch, target_batch = batch
    print(f"Example batch - Input shape: {input_batch.shape}, Target shape: {target_batch.shape}")

history = model.fit(dataset, epochs=epochs)


Epoch 1/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 35ms/step - accuracy: 0.7351 - loss: 1.1846
Epoch 2/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.8553 - loss: 0.4760
Epoch 3/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.8579 - loss: 0.4258
Epoch 4/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.8594 - loss: 0.4045
Epoch 5/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.8609 - loss: 0.3920
Epoch 6/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.8619 - loss: 0.3827
Epoch 7/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.8628 - loss: 0.3758
Epoch 8/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.8637 - loss: 0.3702
Epoch 9/100
[1m11/11[0m [32m━━━━━━━━

In [126]:
# Mapping predictions back to human-readable labels
def map_prediction_to_label(prediction, encoding_legend):
    mapped_labels = []
    for token in prediction.flatten():  # Flatten to handle each token individually
        label = encoding_legend.get(token, "Unknown")  # Retrieve label or mark as "Unknown"
        mapped_labels.append(label)
    return mapped_labels

In [127]:
def predict_next_sequence(input_sequence, model):
    print(f"Input sequence for prediction: {input_sequence}")
    input_sequence = np.array(input_sequence).reshape(1, -1)  # Reshape for single input
    prediction = model.predict(input_sequence)
    
    print(f"Prediction raw output shape: {prediction.shape}")
    
    predicted_sequence = np.argmax(prediction, axis=-1)  # Get predicted token (sourceID)
    print(f"Predicted sequence (encoded): {predicted_sequence}")
    
    # Mapping to human-readable labels
    human_readable_prediction = map_prediction_to_label(predicted_sequence[0], encoding_legend)
    print(f"Predicted sequence (human-readable): {human_readable_prediction}")
    
    return predicted_sequence, human_readable_prediction


In [128]:
# Example prediction
example_input = inputs[0]  # Use the first input sequence as an example
predicted_sequence, human_readable_sequence = predict_next_sequence(example_input, model)
print(f"Predicted sequence (encoded): {predicted_sequence}")
print(f"Predicted sequence (human-readable): {human_readable_sequence}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Predicted sequence (encoded): [[4 5 4 5 8 4 9 5 4 4 5 4 5 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
Predicted sequence (human-readable): ['MRI_FRR_257', 'MRI_FRR_264', 'MRI_FRR_257', 'MRI_FRR_264', 'MRI_MPT_1005', 'MRI_FRR_257', 'END', 'MRI_FRR_264', 'MRI_FRR_257', 'MRI_FRR_257', 'MRI_FRR_264', 'MRI_FRR_257', 'MRI_FRR_264', 'MRI_FRR_257', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED']
