In [298]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Embedding, Dropout, LayerNormalization
from tensorflow.keras.models import Model
import numpy as np
import os
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [299]:
encoding_legend = {
    1: 'MRI_CCS_11', 2: 'MRI_EXU_95', 3: 'MRI_FRR_18', 4: 'MRI_FRR_257',
    5: 'MRI_FRR_264', 6: 'MRI_FRR_3', 7: 'MRI_FRR_34', 8: 'MRI_MPT_1005',
    9: 'MRI_MSR_100', 10: 'MRI_MSR_104', 11: 'MRI_MSR_21', 12: 'MRI_MSR_34',
    0: 'PADDED',  # Add a padding category
    10: 'START',  # Start token
    9: 'END'      # End token
}

In [300]:
# Positional Encoding
def positional_encoding(position, d_model):
    angle_rads = np.arange(position)[:, np.newaxis] / np.power(10000, (2 * (np.arange(d_model) // 2)) / np.float32(d_model))
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    return tf.cast(angle_rads[np.newaxis, ...], dtype=tf.float32)

In [301]:
# Define MultiHeadAttention Layer
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % num_heads == 0
        self.depth = d_model // num_heads
        self.wq = Dense(d_model)
        self.wk = Dense(d_model)
        self.wv = Dense(d_model)
        self.dense = Dense(d_model)

    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]
        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)
        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)
        attention, _ = self.scaled_dot_product_attention(q, k, v, mask)
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        attention = tf.reshape(attention, (batch_size, -1, self.d_model))
        return self.dense(attention)

    def scaled_dot_product_attention(self, q, k, v, mask):
        matmul_qk = tf.matmul(q, k, transpose_b=True)
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
        if mask is not None:
            scaled_attention_logits += (mask * -1e9)
        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
        output = tf.matmul(attention_weights, v)
        return output, attention_weights

In [302]:
# Define Positionwise Feedforward Layer
class PositionwiseFeedforward(tf.keras.layers.Layer):
    def __init__(self, d_model, dff):
        super(PositionwiseFeedforward, self).__init__()
        self.d_model = d_model
        self.dff = dff
        self.dense1 = Dense(dff, activation='relu')
        self.dense2 = Dense(d_model)

    def call(self, x):
        x = self.dense1(x)
        x = self.dense2(x)
        return x

# Define Transformer Block
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, dropout_rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadAttention(d_model, num_heads)
        self.ffn = PositionwiseFeedforward(d_model, dff)
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)

    def call(self, x, training, mask):
        attn_output = self.att(x, x, x, mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)
        return out2

# Define Encoder Layer
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding, dropout_rate=0.1):
        super(Encoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)
        self.dropout = Dropout(dropout_rate)
        self.enc_layers = [TransformerBlock(d_model, num_heads, dff, dropout_rate) for _ in range(num_layers)]

    def call(self, x, training, mask):
        seq_len = tf.shape(x)[1]
        x = self.embedding(x)
        x += self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x, training=training)
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)
        return x

# Define Decoder Layer
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, maximum_position_encoding, dropout_rate=0.1):
        super(Decoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)
        self.dropout = Dropout(dropout_rate)
        self.dec_layers = [TransformerBlock(d_model, num_heads, dff, dropout_rate) for _ in range(num_layers)]

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        seq_len = tf.shape(x)[1]
        x = self.embedding(x)
        x += self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x, training=training)
        for i in range(self.num_layers):
            x = self.dec_layers[i](x, enc_output, training, look_ahead_mask)
        return x

In [303]:
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, dropout_rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadAttention(d_model, num_heads)
        self.ffn = PositionwiseFeedforward(d_model, dff)
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)

    def call(self, x, training=None, mask=None):  # Ensure training, mask passed as keywords
        attn_output = self.att(x, x, x, mask)  # x already contains embeddings
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)
        return out2


In [304]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding, dropout_rate=0.1):
        super(Encoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)
        self.dropout = Dropout(dropout_rate)
        self.enc_layers = [TransformerBlock(d_model, num_heads, dff, dropout_rate) for _ in range(num_layers)]

    def call(self, x, training=None, mask=None):  # training and mask as keyword arguments
        seq_len = tf.shape(x)[1]
        x = self.embedding(x)
        x += self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x, training=training)
        
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training=training, mask=mask)
        
        return x

In [305]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, maximum_position_encoding, dropout_rate=0.1):
        super(Decoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)
        self.dropout = Dropout(dropout_rate)
        self.dec_layers = [TransformerBlock(d_model, num_heads, dff, dropout_rate) for _ in range(num_layers)]

    def call(self, x, enc_output, training=None, look_ahead_mask=None, padding_mask=None):  # Ensure keyword args
        seq_len = tf.shape(x)[1]
        x = self.embedding(x)
        x += self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x, training=training)
        
        for i in range(self.num_layers):
            x = self.dec_layers[i](x, training=training, mask=look_ahead_mask)
        
        return x

In [306]:
class Transformer(Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
        super(Transformer, self).__init__()
        self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate)
        self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate)
        self.final_layer = Dense(target_vocab_size)

    def call(self, inputs, targets, training=None, look_ahead_mask=None, padding_mask=None):
        enc_output = self.encoder(inputs, training=training, mask=padding_mask)
        dec_output = self.decoder(targets, enc_output, training=training, look_ahead_mask=look_ahead_mask, padding_mask=padding_mask)
        final_output = self.final_layer(dec_output)
        return final_output


In [307]:
# Load and preprocess data
def preprocess_data(directory, encoding_legend):
    all_blocks = []
    all_source_ids = np.array(list(encoding_legend.keys())).reshape(-1, 1)
    encoder = OneHotEncoder(sparse=False)
    encoder.fit(all_source_ids)

    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            file_path = os.path.join(directory, filename)
            block = pd.read_csv(file_path)

            assert block.shape == (36, 5), f"Block {filename} has an unexpected shape {block.shape}"

            source_ids = block[['sourceID']].values.reshape(-1, 1)
            one_hot_encoded_sourceID = encoder.transform(source_ids)

            # Normalize additional features
            timediff = block[['timediff']].values
            ptab = np.nan_to_num(block[['PTAB']].values)
            timediff = (timediff - np.mean(timediff)) / np.std(timediff)
            ptab = (ptab - np.mean(ptab)) / np.std(ptab)
            body_group_from = block[['BodyGroup_from']].values
            body_group_to = block[['BodyGroup_to']].values

            # Create input block with start and end tokens
            X_block = np.concatenate((one_hot_encoded_sourceID, timediff, ptab, body_group_from, body_group_to), axis=1)
            all_blocks.append(X_block)

    all_blocks = np.stack(all_blocks, axis=0)
    return all_blocks, encoder

In [308]:
# Directory containing the 150 CSV data blocks
data_directory = "../data/filtered_blocks_padded/"

In [309]:
# Preprocess data and obtain one-hot encoding for sourceIDs
all_blocks, encoder = preprocess_data(data_directory, encoding_legend)

# Parameters for model
num_layers = 4
d_model = 128
num_heads = 8
dff = 512
input_vocab_size = len(encoder.categories_[0])  # Number of unique sourceIDs
target_vocab_size = input_vocab_size  # Assuming prediction is similar to input
pe_input = 1000
pe_target = 1000
dropout_rate = 0.1



In [310]:
# Create the Transformer model
transformer = Transformer(num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, dropout_rate)

In [311]:
# Example Input (creating dummy data for testing)
# Adjust batch size based on your data length
batch_size = min(32, len(df) // 2)  # Ensure at least some data for batching
max_seq_length = 34  # Adjust as necessary

In [312]:
# Generate dummy input and target sequences (for illustration)
inputs = tf.random.uniform((batch_size, max_seq_length), dtype=tf.int64, minval=0, maxval=input_vocab_size)
targets = tf.random.uniform((batch_size, max_seq_length), dtype=tf.int64, minval=0, maxval=target_vocab_size)
look_ahead_mask = None
padding_mask = None

In [313]:
# Forward Pass
output = transformer(inputs=inputs, targets=targets, training=True, look_ahead_mask=look_ahead_mask, padding_mask=padding_mask)
print(output.shape)  # Should now work correctly


(17, 34, 13)


In [314]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))  # Mask out padding tokens
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)


In [315]:
import pandas as pd

# Load the data block into a DataFrame
data_block = """
sourceID,timediff,PTAB,BodyGroup_from,BodyGroup_to
10,0.0,-500.0,1,4
4,8.0,-1128900.0,1,4
5,16.0,-1128900.0,1,4
1,29.0,-1128900.0,1,4
12,35.0,-1128900.0,1,4
8,36.0,-1128900.0,1,4
1,55.0,-1128900.0,1,4
4,57.0,-200.0,1,4
5,64.0,-200.0,1,4
9,89.0,-200.0,1,4
0,89.0,-200.0,1,4
0,89.0,-200.0,1,4
0,89.0,-200.0,1,4
0,89.0,-200.0,1,4
0,89.0,-200.0,1,4
0,89.0,-200.0,1,4
0,89.0,-200.0,1,4
0,89.0,-200.0,1,4
0,89.0,-200.0,1,4
0,89.0,-200.0,1,4
0,89.0,-200.0,1,4
0,89.0,-200.0,1,4
0,89.0,-200.0,1,4
0,89.0,-200.0,1,4
0,89.0,-200.0,1,4
0,89.0,-200.0,1,4
0,89.0,-200.0,1,4
0,89.0,-200.0,1,4
0,89.0,-200.0,1,4
0,89.0,-200.0,1,4
0,89.0,-200.0,1,4
0,89.0,-200.0,1,4
0,89.0,-200.0,1,4
0,89.0,-200.0,1,4
"""

# Convert the string to a pandas DataFrame
from io import StringIO
df = pd.read_csv(StringIO(data_block))

# Check the first few rows to ensure it's loaded correctly
print(df.head())


   sourceID  timediff       PTAB  BodyGroup_from  BodyGroup_to
0        10       0.0     -500.0               1             4
1         4       8.0 -1128900.0               1             4
2         5      16.0 -1128900.0               1             4
3         1      29.0 -1128900.0               1             4
4        12      35.0 -1128900.0               1             4


In [316]:
# Adjusting the preprocessing function
def preprocess_data(df):
    inputs = df['sourceID'].values
    targets = df['sourceID'].shift(-1).fillna(0).values  # Shift by 1 and replace NaN with 0
    
    # Convert to integer type for model input
    inputs = inputs.astype(int)
    targets = targets.astype(int)

    # Reshape inputs and targets into sequences (batch_size, sequence_length)
    inputs = np.reshape(inputs, (-1, 1))  # Reshape for each input to be a sequence
    targets = np.reshape(targets, (-1, 1))  # Reshape for each target to be a sequence

    return inputs, targets

# After modifying, apply the preprocessing
inputs, targets = preprocess_data(df)

# Create a dataset from the inputs and targets
dataset = tf.data.Dataset.from_tensor_slices((inputs, targets))

# Create a dataset with a proper batch size
dataset = dataset.batch(batch_size, drop_remainder=True)

# Print the dataset size to ensure it's not empty
print(f"Dataset size: {len(dataset)}")


# Define loss and optimizer
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam()

@tf.function
def train_step(batch_inputs, batch_targets):
    with tf.GradientTape() as tape:
        predictions = transformer(batch_inputs, batch_targets, training=True, look_ahead_mask=None, padding_mask=None)
        loss = loss_function(batch_targets, predictions)  # Ensure loss_function is defined
    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
    
    return loss  # Make sure this returns the loss
    print(f"Loss: {loss.numpy()}")
    print(f"Inputs shape: {batch_inputs.shape}, Targets shape: {batch_targets.shape}")

Dataset size: 2


In [317]:
transformer.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=loss_function
)

In [318]:
num_epochs = 10

for epoch in range(num_epochs):
    epoch_loss = 0
    for batch_inputs, batch_targets in dataset:
        print(f"Inputs shape: {batch_inputs.shape}, Targets shape: {batch_targets.shape}")
        
        # Ensure the shapes are correct for your model
        batch_inputs = tf.reshape(batch_inputs, [batch_size, -1])
        batch_targets = tf.reshape(batch_targets, [batch_size, -1])

        # Get the loss from the train_step function
        loss = train_step(batch_inputs, batch_targets)
        epoch_loss += loss.numpy()

    # Only print loss if dataset is not empty
    if len(dataset) > 0:
        print(f"Epoch {epoch + 1}, Loss: {epoch_loss / len(dataset)}")
    else:
        print(f"Epoch {epoch + 1}, No batches to train on.")


Inputs shape: (17, 1), Targets shape: (17, 1)




Inputs shape: (17, 1), Targets shape: (17, 1)
Epoch 1, Loss: 1.1127723455429077
Inputs shape: (17, 1), Targets shape: (17, 1)
Inputs shape: (17, 1), Targets shape: (17, 1)
Epoch 2, Loss: 0.9065909385681152
Inputs shape: (17, 1), Targets shape: (17, 1)
Inputs shape: (17, 1), Targets shape: (17, 1)
Epoch 3, Loss: 0.8176503777503967
Inputs shape: (17, 1), Targets shape: (17, 1)
Inputs shape: (17, 1), Targets shape: (17, 1)
Epoch 4, Loss: 0.49855658411979675
Inputs shape: (17, 1), Targets shape: (17, 1)
Inputs shape: (17, 1), Targets shape: (17, 1)
Epoch 5, Loss: 0.5470494627952576
Inputs shape: (17, 1), Targets shape: (17, 1)
Inputs shape: (17, 1), Targets shape: (17, 1)
Epoch 6, Loss: 0.47231337428092957
Inputs shape: (17, 1), Targets shape: (17, 1)
Inputs shape: (17, 1), Targets shape: (17, 1)
Epoch 7, Loss: 0.4809064269065857
Inputs shape: (17, 1), Targets shape: (17, 1)
Inputs shape: (17, 1), Targets shape: (17, 1)
Epoch 8, Loss: 0.4690232276916504
Inputs shape: (17, 1), Targets shape

In [320]:
# Assume we have some input data for prediction
# For example, you can use a small batch of data from your dataset
sample_inputs = np.array([[10, 4, 5, 1, 12, 8]])  # Sample input
sample_inputs = tf.convert_to_tensor(sample_inputs)

# Add padding if your model expects sequences of a fixed length
# For example, if the model expects a maximum sequence length of 34, pad as necessary
# Here, let's pad to length 34 with zeros (or adjust accordingly)
padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(sample_inputs, maxlen=34, padding='post')

# Make predictions
predictions = transformer(padded_inputs, targets=tf.zeros((1, 34)), training=False, look_ahead_mask=None, padding_mask=None)

# Convert predictions from logits to class indices
predicted_indices = tf.argmax(predictions, axis=-1).numpy()

# Print the predicted sequences
for i, seq in enumerate(predicted_indices):
    print(f"Predicted sequence {i+1}: {seq}")


Predicted sequence 1: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
