In [16]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, MultiHeadAttention, LayerNormalization, Embedding, Dropout
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder




In [51]:
# Variables
learning_rate = 0.01
key_dim = 16
num_heads = 8

In [35]:
# Load the already encoded data
file_path = "../data/encoded_176398_HEAD.csv"
df = pd.read_csv(file_path)
print(f"Data loaded successfully from {file_path}.\n")
print("First 5 rows of the dataset before any processing:")
print(df.head())  # Print first few rows to understand the data structure

Data loaded successfully from ../data/encoded_176398_HEAD.csv.

First 5 rows of the dataset before any processing:
              datetime  sourceID  timediff  ZAxisInPossible  ZAxisOutPossible  \
0  2023-03-27 08:14:34        10       0.0              NaN               NaN   
1  2023-03-27 08:14:49         4      15.0              NaN               NaN   
2  2023-03-27 08:14:56         5      22.0              1.0               0.0   
3  2023-03-27 08:15:08         1      34.0              1.0               0.0   
4  2023-03-27 08:15:39        12      65.0              1.0               0.0   

   YAxisDownPossible  YAxisUpPossible       PTAB  BC  S1  ...  C24  EN  SHL  \
0                NaN              NaN        NaN   0 NaN  ...  NaN NaN  NaN   
1                NaN              NaN -1127700.0   0 NaN  ...  NaN NaN  NaN   
2                1.0              1.0 -1127700.0   0 NaN  ...  NaN NaN  NaN   
3                1.0              1.0 -1127700.0   1 NaN  ...  NaN NaN  NaN   
4  

In [36]:
# Drop unnecessary columns (based on your latest specification)
columns_to_drop = ['datetime', 'SN', 'ZAxisInPossible', 'ZAxisOutPossible', 'YAxisDownPossible', 
                   'YAxisUpPossible', 'BC', 'S1', 'S10', 'S11', 'S12', 'S2', 'S3', 'S4', 
                   'S5', 'S6', 'S7', 'S8', 'S9', 'BO1', 'BO2', 'BO3', 'B1', 'B2', 'B3', 'B4', 
                   'B5', 'HE2', 'HE4', 'NE2', 'HE1', 'HE3', 'NE1', 'SHA', 'HW1', 'HW2', 'HW3', 
                   '18K', 'FA', 'TO', 'BAL', 'BAR', 'BCL', 'BCR', 'HC2', 'HC4', 'HC6', 'HC7', 
                   'NC2', 'HC1', 'HC3', 'HC5', 'NC1', 'Na', 'UFL', 'PA1', 'PA2', 'PA3', 'PA4', 
                   'PA5', 'PA6', 'SP1', 'SP2', 'SP3', 'SP4', 'SP5', 'SP6', 'SP7', 'SP8', 'BL8', 
                   'BR8', 'UFS', 'HEA', 'HEP', 'SC', 'PeH', 'PeN', 'FS', 'FL', 'BY1', 'BY2', 
                   'BY3', 'BL', 'BR', 'HE', 'BL4', 'BR4', 'BL1', 'BR1', 'BL2', 'BR2', 'L7', 
                   'L4', 'H2L', 'N2L', 'H1U', 'N1U', 'He1', 'He2', 'TR1', 'TR2', 'TR3', 'TR4', 
                   'TR5', 'TR6', 'MR', 'ML', 'BL5', 'BR5', 'C24', 'EN', 'SHL', 'SHS','BodyPart_from', 
                   'BodyPart_to', 'PatientID_from', 'PatientID_to']
df = df.drop(columns=columns_to_drop)
print("Dropped unnecessary columns.")
print("Remaining columns:", df.columns.tolist())
print(df.head())

Dropped unnecessary columns.
Remaining columns: ['sourceID', 'timediff', 'PTAB', 'BodyGroup_from', 'BodyGroup_to']
   sourceID  timediff       PTAB  BodyGroup_from  BodyGroup_to
0        10       0.0        NaN               1             4
1         4      15.0 -1127700.0               1             4
2         5      22.0 -1127700.0               1             4
3         1      34.0 -1127700.0               1             4
4        12      65.0 -1127700.0               1             4


In [37]:
# Replace NaN values with 0 in the DataFrame
df.fillna(0, inplace=True)
print(df.head())

   sourceID  timediff       PTAB  BodyGroup_from  BodyGroup_to
0        10       0.0        0.0               1             4
1         4      15.0 -1127700.0               1             4
2         5      22.0 -1127700.0               1             4
3         1      34.0 -1127700.0               1             4
4        12      65.0 -1127700.0               1             4


In [38]:
# Separate features and target (sourceID)
X = df.drop(columns=['sourceID'])
y_sourceid = df['sourceID']

In [39]:
# One-hot encode 'sourceID'
encoder = OneHotEncoder(sparse=False)
y_sourceid_encoded = encoder.fit_transform(y_sourceid.values.reshape(-1, 1))
original_sourceids = encoder.categories_[0]
# Print the first few rows of y_sourceid_encoded to confirm one-hot encoding worked
print("\nFirst few one-hot encoded 'sourceID' values:")
print(y_sourceid_encoded[:5])
print("Shape of one-hot encoded 'sourceID':", y_sourceid_encoded.shape)


First few one-hot encoded 'sourceID' values:
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
Shape of one-hot encoded 'sourceID': (4501, 12)




In [40]:
# Scale the feature data
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [41]:
# Assuming sequence length of 5 for Transformer input
sequence_length = 1000

In [42]:
def create_sequences(data, target, seq_length):
    sequences = []
    targets = []
    for i in range(len(data) - seq_length):
        sequences.append(data[i:i+seq_length])
        targets.append(target[i+seq_length])
    return np.array(sequences), np.array(targets)

X_sequences, y_sequences = create_sequences(X_scaled, y_sourceid_encoded, sequence_length)

In [43]:
# Transformer Encoder model
def transformer_model(input_shape, output_dim):
    inputs = Input(shape=input_shape)
    
    # Embedding layer to transform input to higher dimensional space (if needed)
    x = Dense(128, activation='relu')(inputs)
    
    # Multi-head attention layer
    attention_output = MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)(x, x)
    
    # Add & Norm
    x = LayerNormalization(epsilon=1e-6)(x + attention_output)
    
    # Feedforward layer
    ff_dim = 256  # You can adjust this dimension as needed
    x_ffn = Dense(ff_dim, activation='relu')(x)
    x_ffn = Dense(x.shape[-1])(x_ffn) 
    
    # Final classification layer (softmax for multi-class)
    outputs = Dense(output_dim, activation='softmax')(x[:, -1, :])  # Only last timestep output

    return Model(inputs=inputs, outputs=outputs)

In [44]:
# Define model
input_shape = (sequence_length, X_scaled.shape[1])
output_dim = y_sourceid_encoded.shape[1]
model = transformer_model(input_shape, output_dim)

In [45]:
# Compile model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
              loss='categorical_crossentropy')

In [46]:
# Train model
history = model.fit(X_sequences, y_sequences, epochs=30, batch_size=32)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [47]:
# Predict on training data
predicted_sourceids = model.predict(X_sequences)



In [48]:
# Convert predicted probabilities back to class indices
predicted_classes = np.argmax(predicted_sourceids, axis=1)

In [49]:
# Convert one-hot back to original sourceID using the encoder
predicted_sourceids_final = encoder.inverse_transform(predicted_sourceids)
# Print results
print("\nFirst few predicted sourceIDs:")
print(predicted_sourceids_final[:32])


First few predicted sourceIDs:
[[5]
 [5]
 [4]
 [1]
 [1]
 [1]
 [1]
 [4]
 [4]
 [4]
 [4]
 [4]
 [4]
 [4]
 [5]
 [4]
 [1]
 [1]
 [1]
 [1]
 [4]
 [4]
 [4]
 [4]
 [4]
 [4]
 [4]
 [5]
 [4]
 [1]
 [1]
 [1]]


In [50]:
# Function to map one-hot encoded predictions back to sourceIDs
def map_onehot_to_sourceid(onehot_predictions, encoding_legend):
    sourceids = []
    for prediction in onehot_predictions:
        index = np.argmax(prediction)  # Find the index of the highest value
        sourceid = encoding_legend[index + 1]  # Map back using the legend (1-based index)
        sourceids.append(sourceid)
    return sourceids

# Encoding legend mapping (this is just a sample, adjust to your actual encoding legend)
encoding_legend = {
    1: 'MRI_CCS_11',
    2: 'MRI_EXU_95',
    3: 'MRI_FRR_18',
    4: 'MRI_FRR_257',
    5: 'MRI_FRR_264',
    6: 'MRI_FRR_3',
    7: 'MRI_FRR_34',
    8: 'MRI_MPT_1005',
    9: 'MRI_MSR_100',
    10: 'MRI_MSR_104',
    11: 'MRI_MSR_21',
    12: 'MRI_MSR_34'
}

# Map predicted one-hot encodings to original sourceIDs
predicted_sourceids_final = map_onehot_to_sourceid(predicted_sourceids, encoding_legend)

# Print the final predicted sourceIDs
print("\nFirst few predicted sourceIDs:")
print(predicted_sourceids_final[:32])

# Optional: Save the predicted sourceIDs to a CSV file
output_df = pd.DataFrame(predicted_sourceids_final, columns=["Predicted SourceID"])
output_df.to_csv("predicted_sourceids_final_1.csv", index=False)
print("Predicted sourceIDs saved to 'predicted_sourceids_final_1.csv'.")



First few predicted sourceIDs:
['MRI_FRR_264', 'MRI_FRR_264', 'MRI_FRR_257', 'MRI_CCS_11', 'MRI_CCS_11', 'MRI_CCS_11', 'MRI_CCS_11', 'MRI_FRR_257', 'MRI_FRR_257', 'MRI_FRR_257', 'MRI_FRR_257', 'MRI_FRR_257', 'MRI_FRR_257', 'MRI_FRR_257', 'MRI_FRR_264', 'MRI_FRR_257', 'MRI_CCS_11', 'MRI_CCS_11', 'MRI_CCS_11', 'MRI_CCS_11', 'MRI_FRR_257', 'MRI_FRR_257', 'MRI_FRR_257', 'MRI_FRR_257', 'MRI_FRR_257', 'MRI_FRR_257', 'MRI_FRR_257', 'MRI_FRR_264', 'MRI_FRR_257', 'MRI_CCS_11', 'MRI_CCS_11', 'MRI_CCS_11']
Predicted sourceIDs saved to 'predicted_sourceids_final_lower_lr.csv'.
