In [1]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, MultiHeadAttention, LayerNormalization, Embedding, Dropout
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder




In [2]:
# Parameters

# Optimisation / attention settings
learning_rate = 0.01        # step size handed to the Adam optimizer
num_heads, key_dim = 8, 16  # MultiHeadAttention: number of heads, per-head key size

# Sequence construction
sequence_length = 1000  # data rows per window; adjust as needed
start_token = [0]       # value(s) of the row prepended to each window; customize for your input format
end_token = [0]         # value(s) of the row appended to each window; customize for your input format

# Data location
data_directory = "../data/filtered_blocks/"  # directory containing block CSV files

In [3]:
# Load all block CSV files and concatenate them
def load_blocks(data_dir):
    """Load every ``block_*.csv`` file found in ``data_dir`` into one DataFrame.

    Only the directory itself is scanned (no recursion). Matching files are
    read in sorted filename order and concatenated row-wise with a fresh
    0..n-1 index.

    Args:
        data_dir: Path of the directory holding the block CSV files.

    Returns:
        pandas.DataFrame containing the rows of all matching files.

    Raises:
        ValueError: If ``data_dir`` contains no file matching ``block_*.csv``.
    """
    # NOTE(review): sorted() is lexicographic, so "block_10.csv" would come
    # before "block_2.csv" — confirm the file naming if row order matters.
    block_files = [
        name
        for name in sorted(os.listdir(data_dir))
        if name.startswith("block_") and name.endswith(".csv")
    ]
    if not block_files:
        # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
        raise ValueError(f"No block_*.csv files found in {data_dir!r}")

    frames = [pd.read_csv(os.path.join(data_dir, name)) for name in block_files]
    return pd.concat(frames, ignore_index=True)

# Read all block files under data_directory into a single DataFrame and report how many rows were loaded.
df = load_blocks(data_directory)
print(f"Loaded data from {len(df)} rows across all blocks.")

Loaded data from 4493 rows across all blocks.


In [4]:
# Drop unnecessary columns
# Every column listed here is removed from df; whatever is not listed is kept
# and shown by the print below.
# NOTE(review): df is rebound to the reduced frame, so re-running this cell
# without reloading the data is expected to fail in drop() because the labels
# are no longer present.
columns_to_drop = ['datetime', 'SN', 'ZAxisInPossible', 'ZAxisOutPossible', 'YAxisDownPossible',
                   'YAxisUpPossible', 'BC', 'S1', 'S10', 'S11', 'S12', 'S2', 'S3', 'S4',
                   'S5', 'S6', 'S7', 'S8', 'S9', 'BO1', 'BO2', 'BO3', 'B1', 'B2', 'B3', 'B4',
                   'B5', 'HE2', 'HE4', 'NE2', 'HE1', 'HE3', 'NE1', 'SHA', 'HW1', 'HW2', 'HW3',
                   '18K', 'FA', 'TO', 'BAL', 'BAR', 'BCL', 'BCR', 'HC2', 'HC4', 'HC6', 'HC7',
                   'NC2', 'HC1', 'HC3', 'HC5', 'NC1', 'Na', 'UFL', 'PA1', 'PA2', 'PA3', 'PA4',
                   'PA5', 'PA6', 'SP1', 'SP2', 'SP3', 'SP4', 'SP5', 'SP6', 'SP7', 'SP8', 'BL8',
                   'BR8', 'UFS', 'HEA', 'HEP', 'SC', 'PeH', 'PeN', 'FS', 'FL', 'BY1', 'BY2',
                   'BY3', 'BL', 'BR', 'HE', 'BL4', 'BR4', 'BL1', 'BR1', 'BL2', 'BR2', 'L7',
                   'L4', 'H2L', 'N2L', 'H1U', 'N1U', 'He1', 'He2', 'TR1', 'TR2', 'TR3', 'TR4',
                   'TR5', 'TR6', 'MR', 'ML', 'BL5', 'BR5', 'C24', 'EN', 'SHL', 'SHS', 'BodyPart_from',
                   'BodyPart_to', 'PatientID_from', 'PatientID_to']
df = df.drop(columns=columns_to_drop)
print(f"Dropped unnecessary columns. Remaining columns: {df.columns.tolist()}")


Dropped unnecessary columns. Remaining columns: ['sourceID', 'timediff', 'PTAB', 'BodyGroup_from', 'BodyGroup_to']


In [5]:
# Replace NaN values with 0
# Rebinding the result (rather than inplace=True) keeps the step a plain
# assignment, consistent with the way df is rebound by the column drop.
df = df.fillna(0)

In [6]:
# Split the frame: 'sourceID' is the prediction target, all remaining columns are input features
y_sourceid = df['sourceID']
X = df.drop(columns=['sourceID'])

In [7]:
# One-hot encode 'sourceID'
# The keyword that requests a dense array is `sparse_output` in scikit-learn >= 1.2;
# the older `sparse` keyword was deprecated in 1.2 and removed in 1.4.
# Try the current name first and fall back for older installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

# fit_transform expects a 2-D input, hence the reshape to a single column.
y_sourceid_encoded = encoder.fit_transform(y_sourceid.values.reshape(-1, 1))

# Category labels in the column order of the one-hot matrix.
original_sourceids = encoder.categories_[0]



In [8]:
# Scale the feature data
# MinMaxScaler is fit on all rows of X and applied to the same rows.
# NOTE(review): no train/test split is visible before this point, so the
# scaling statistics come from the full data set — confirm this is intended.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [9]:
# Update sequence creation to include start and end tokens
def create_sequences(data, target, seq_length, start_token, end_token):
    """Slice ``data`` into overlapping windows framed by start/end token rows.

    Window ``i`` holds rows ``i .. i + seq_length - 1`` of ``data`` with one
    token row prepended and one appended, so each sequence has
    ``seq_length + 2`` rows. Its label is ``target[i + seq_length]``, i.e. the
    entry that follows the window.

    Args:
        data: 2-D array-like of shape ``(n_rows, n_features)``.
        target: Indexable of labels aligned with the rows of ``data``.
        seq_length: Number of data rows per window.
        start_token: A scalar or one-element sequence (repeated for every
            feature), or a sequence holding one value per feature.
        end_token: Same accepted forms as ``start_token``.

    Returns:
        Tuple ``(sequences, targets)`` of NumPy arrays. When at least one
        window fits, ``sequences`` has shape
        ``(n_rows - seq_length, seq_length + 2, n_features)``.

    Raises:
        ValueError: If a token has more than one value but not exactly
            ``n_features`` values.
    """
    data = np.asarray(data)
    n_features = data.shape[1]

    def token_row(token):
        # Normalise scalar / list / array tokens to a flat Python list.
        values = np.ravel(token).tolist()
        if len(values) == 1:
            return values * n_features
        if len(values) != n_features:
            raise ValueError(
                f"token must have 1 or {n_features} values, got {len(values)}"
            )
        return values

    # The token rows do not depend on the window, so build them once.
    start_row = token_row(start_token)
    end_row = token_row(end_token)

    sequences = []
    targets = []
    for i in range(len(data) - seq_length):
        window = data[i:i + seq_length].tolist()
        sequences.append([start_row] + window + [end_row])
        targets.append(target[i + seq_length])
    return np.array(sequences), np.array(targets)

In [None]:
# Updated version***
def create_sequences(data, target, seq_length, start_token=10, end_token=9):
    """Slice ``data`` into overlapping windows framed by start/end token rows.

    Window ``i`` holds rows ``i .. i + seq_length - 1`` of ``data`` with one
    token row prepended and one appended, so each sequence has
    ``seq_length + 2`` rows. Its label is ``target[i + seq_length]``, i.e. the
    entry that follows the window.

    Args:
        data: 2-D array-like of shape ``(n_rows, n_features)``.
        target: Indexable of labels aligned with the rows of ``data``.
        seq_length: Number of data rows per window.
        start_token: A scalar or one-element sequence (repeated for every
            feature), or a sequence holding one value per feature.
            Defaults to 10.
        end_token: Same accepted forms as ``start_token``. Defaults to 9.

    Returns:
        Tuple ``(sequences, targets)`` of NumPy arrays. When at least one
        window fits, ``sequences`` has shape
        ``(n_rows - seq_length, seq_length + 2, n_features)``.

    Raises:
        ValueError: If a token has more than one value but not exactly
            ``n_features`` values.
    """
    data = np.asarray(data)
    n_features = data.shape[1]

    def token_row(token):
        # Tokens may arrive as scalars (the defaults) or as lists such as [0];
        # flatten first so both forms expand to one flat row of numbers.
        values = np.ravel(token).tolist()
        if len(values) == 1:
            return values * n_features
        if len(values) != n_features:
            raise ValueError(
                f"token must have 1 or {n_features} values, got {len(values)}"
            )
        return values

    # The token rows do not depend on the window, so build them once.
    start_row = token_row(start_token)
    end_row = token_row(end_token)

    sequences = []
    targets = []
    for i in range(len(data) - seq_length):
        window = data[i:i + seq_length].tolist()
        sequences.append([start_row] + window + [end_row])
        targets.append(target[i + seq_length])
    return np.array(sequences), np.array(targets)

# Call the updated function
# start_token / end_token are omitted here, so the function defaults (10 and 9) are used.
# NOTE(review): a later cell assigns X_sequences / y_sequences again using the
# configured tokens, which replaces this result when the notebook runs top to bottom.
X_sequences, y_sequences = create_sequences(X_scaled, y_sourceid_encoded, sequence_length)


In [10]:
# Create sequences with start and end tokens
# Uses sequence_length, start_token and end_token from the parameters cell and
# (re)assigns X_sequences / y_sequences, the arrays later passed to model.fit.
X_sequences, y_sequences = create_sequences(X_scaled, y_sourceid_encoded, sequence_length, start_token, end_token)

In [11]:
# Transformer Encoder model
def transformer_model(input_shape, output_dim, ff_dim=256):
    """Build a one-block Transformer-encoder classifier.

    Layout: Dense projection -> multi-head self-attention -> Add & Norm ->
    feed-forward -> Add & Norm -> softmax over the last timestep.

    The attention configuration is read from the module-level ``num_heads``
    and ``key_dim`` variables.

    Args:
        input_shape: Shape of one sample without the batch axis; the model
            indexes it as ``(timesteps, features)``.
        output_dim: Number of output classes (units of the softmax layer).
        ff_dim: Hidden width of the feed-forward sub-layer. Defaults to 256.

    Returns:
        An uncompiled ``tf.keras`` ``Model`` mapping a batch of sequences to
        one probability vector of length ``output_dim`` per sequence.
    """
    inputs = Input(shape=input_shape)

    # Project the raw features to the width used inside the encoder block.
    x = Dense(128, activation='relu')(inputs)

    # Self-attention: query and value are the same tensor.
    attention_output = MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)(x, x)

    # Add & Norm around the attention sub-layer.
    x = LayerNormalization(epsilon=1e-6)(x + attention_output)

    # Feed-forward sub-layer, projected back to the block width so the
    # residual addition below is shape-compatible.
    x_ffn = Dense(ff_dim, activation='relu')(x)
    x_ffn = Dense(x.shape[-1])(x_ffn)

    # Add & Norm around the feed-forward sub-layer. Without this step the
    # feed-forward output is never connected to the model output.
    x = LayerNormalization(epsilon=1e-6)(x + x_ffn)

    # Final classification layer (softmax for multi-class), fed only from the
    # last timestep of the encoded sequence.
    outputs = Dense(output_dim, activation='softmax')(x[:, -1, :])

    return Model(inputs=inputs, outputs=outputs)

In [12]:
# Build the model from the data dimensions
output_dim = y_sourceid_encoded.shape[1]  # one output unit per one-hot column
input_shape = (sequence_length + 2, X_scaled.shape[1])  # +2 rows for the start and end tokens
model = transformer_model(input_shape=input_shape, output_dim=output_dim)




In [13]:
# Configure training: Adam with the configured learning rate, cross-entropy on the one-hot targets
model.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
)

In [None]:
# Fit on all generated sequences; the returned History object is kept in `history`
history = model.fit(
    x=X_sequences,
    y=y_sequences,
    batch_size=32,
    epochs=30,
)

Epoch 1/30

Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30

In [None]:
# Predict on training data
# X_sequences is the same array that was passed to model.fit, so these are
# in-sample predictions: one row of softmax class probabilities per sequence.
predicted_sourceids = model.predict(X_sequences)

In [None]:
# Map every prediction row back to a sourceID label through the fitted encoder
predicted_sourceids_final = encoder.inverse_transform(predicted_sourceids)

# Show the header and the decoded labels, each on its own line
print("Predicted SourceIDs:", predicted_sourceids_final, sep="\n")

In [None]:
# Convert one-hot back to original sourceID using the encoder
# NOTE(review): this repeats the assignment made in the previous cell (without
# the printout) and looks like a leftover duplicate.
predicted_sourceids_final = encoder.inverse_transform(predicted_sourceids)