In [None]:
# ✅ Import Necessary Libraries
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.layers import LSTM, Dense, Embedding, Masking, Input, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# ✅ Reproducibility: one call seeds Python's `random`, NumPy and TensorFlow
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# ✅ Load Data
file_path_activity = "Sepsis_Merged_Selected_Features_Activity.csv"
file_path_biomarkers = "Sepsis_Biomarkers_Next_Activity.csv"
df_activity = pd.read_csv(file_path_activity)
df_biomarkers = pd.read_csv(file_path_biomarkers)

# ✅ Create 'Activity_Sequence' Column by Merging Activity Columns
# Every column whose name contains "Activity" contributes one step; missing steps are skipped.
activity_columns = [col for col in df_activity.columns if "Activity" in col]
df_activity["Activity_Sequence"] = df_activity[activity_columns].apply(
    lambda row: " -> ".join(row.dropna().astype(str)), axis=1
)

# ✅ Merge Biomarkers with Activity Data on 'Case ID' to Ensure Inclusion
# A left join keeps every activity row; a case with several biomarker rows appears several times.
df_merged = df_activity.merge(df_biomarkers, on="Case ID", how="left")

# ✅ Select the Case-Level Features Fed to the Model
# NOTE: the biomarker columns are carried along in df_expanded but are not model inputs.
selected_features = [
    "SIRSCriteria2OrMore", "Infusion", "SIRSCritTemperature", "DiagnosticLacticAcid",
    "SIRSCritHeartRate", "DiagnosticXthorax", "SIRSCritTachypnea",
    "DiagnosticUrinarySediment", "Age", "InfectionSuspected"
]

# ✅ Create an Expanded Dataset Where Each Row Represents a Progressive Sequence
# The feature values are copied while expanding so that every row stays aligned with
# its sequence and label. Re-merging on 'Case ID' afterwards would multiply rows
# whenever a case occurs more than once in df_merged.
expanded_data = []

for row in df_merged.to_dict("records"):
    full_sequence = row["Activity_Sequence"].split(" -> ")
    case_features = {name: row[name] for name in selected_features}

    for i in range(1, len(full_sequence)):  # Prefix of length i predicts activity i
        expanded_data.append({
            "Case ID": row["Case ID"],
            "Input_Sequence": " -> ".join(full_sequence[:i]),
            "Next_Activity": full_sequence[i],
            "Biomarker": row["Biomarker"],
            "Biomarker_Value": row["Value"],
            "Biomarker_Range": row["Range"],
            **case_features,
        })

# ✅ Convert to DataFrame
df_expanded = pd.DataFrame(expanded_data)
if df_expanded.empty:
    raise ValueError("No training prefixes could be built: every case has fewer than two activities.")

# ✅ Encode Next Activity as the Target Variable
label_encoder = LabelEncoder()
df_expanded["Next_Activity_Encoded"] = label_encoder.fit_transform(df_expanded["Next_Activity"])

# ✅ Tokenize Sequences
# filters='' keeps every character; with the default space split, multi-word activity
# names become several tokens and "->" is a token of its own.
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(df_expanded["Input_Sequence"])
sequences = tokenizer.texts_to_sequences(df_expanded["Input_Sequence"])
max_sequence_length = max(map(len, sequences))
X_seq = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

# ✅ Convert Next Activity to One-Hot Encoding
num_classes = len(label_encoder.classes_)
y_seq = tf.keras.utils.to_categorical(df_expanded["Next_Activity_Encoded"], num_classes=num_classes)

# ✅ Ensure All Datasets Have the Same Length Before Training
# All three are derived from the same df_expanded rows, so a mismatch is a bug, not something to trim.
assert len(X_seq) == len(df_expanded) == len(y_seq), "sequence, feature and label rows are misaligned"

# ✅ Split Data into Training and Testing Sets
# Split by case: all prefixes of one case land on the same side, so the test set
# never contains a sequence whose shorter or longer prefix was seen in training.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)
train_idx, test_idx = next(splitter.split(X_seq, groups=df_expanded["Case ID"]))

X_train_seq, X_test_seq = X_seq[train_idx], X_seq[test_idx]
y_train, y_test = y_seq[train_idx], y_seq[test_idx]

# ✅ Normalize Features
# Fit the scaler on the training rows only so test statistics do not leak into training.
raw_features = df_expanded[selected_features]
scaler = StandardScaler()
X_train_features = pd.DataFrame(scaler.fit_transform(raw_features.iloc[train_idx]), columns=selected_features)
X_test_features = pd.DataFrame(scaler.transform(raw_features.iloc[test_idx]), columns=selected_features)
X_features = pd.DataFrame(scaler.transform(raw_features), columns=selected_features)

# ✅ Define the LSTM-Based Model
sequence_input = Input(shape=(max_sequence_length,))
# mask_zero=True makes the LSTM skip the zero padding added by pad_sequences.
# A Masking layer placed after the embedding would compare learned embedding
# vectors against 0.0 and therefore would not mask the padded steps.
embedding_layer = Embedding(
    input_dim=len(tokenizer.word_index) + 1,  # +1 because index 0 is reserved for padding
    output_dim=64,
    mask_zero=True,
)(sequence_input)
lstm_layer = LSTM(128, return_sequences=False, dropout=0.2)(embedding_layer)

feature_input = Input(shape=(len(selected_features),))
feature_dense = Dense(32, activation='relu')(feature_input)

merged = Concatenate()([lstm_layer, feature_dense])
dense_layer = Dense(64, activation='relu')(merged)
output_layer = Dense(num_classes, activation='softmax')(dense_layer)

# ✅ Compile the Model
model = Model(inputs=[sequence_input, feature_input], outputs=output_layer)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# ✅ Train the Model
history = model.fit(
    [X_train_seq, X_train_features],
    y_train,
    validation_data=([X_test_seq, X_test_features], y_test),
    epochs=10,
    batch_size=32,
)

# ✅ Evaluate the Model
test_loss, test_accuracy = model.evaluate([X_test_seq, X_test_features], y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")




Epoch 1/10
[1m2957/2957[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 67ms/step - accuracy: 0.2596 - loss: 2.0803 - val_accuracy: 0.2671 - val_loss: 2.0357
Epoch 2/10
[1m2957/2957[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m219s[0m 74ms/step - accuracy: 0.2647 - loss: 2.0303 - val_accuracy: 0.2669 - val_loss: 2.0202
Epoch 3/10
[1m2957/2957[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m195s[0m 66ms/step - accuracy: 0.2658 - loss: 2.0168 - val_accuracy: 0.2669 - val_loss: 2.0149
Epoch 4/10
[1m2957/2957[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 67ms/step - accuracy: 0.2651 - loss: 2.0069 - val_accuracy: 0.2668 - val_loss: 2.0093
Epoch 5/10
[1m2957/2957[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - accuracy: 0.2693 - loss: 2.0065

In [None]:

# ✅ Example Prediction Function
def predict_next_activity(activity_sequence, feature_values):
    """Predict the most likely next activity for a partially observed case.

    Parameters
    ----------
    activity_sequence : str
        Activities observed so far, joined with " -> " (the same format as the
        sequences the tokenizer was fitted on).
    feature_values : sequence of numbers
        One value per entry of `selected_features`, in that order.

    Returns
    -------
    The label that `label_encoder` maps the highest-scoring class back to.

    Raises
    ------
    ValueError
        If none of the tokens in `activity_sequence` is known to the tokenizer,
        or if `feature_values` does not hold exactly one value per feature.
    """
    sequence = tokenizer.texts_to_sequences([activity_sequence])
    if not sequence[0]:
        # Unknown tokens are dropped by the tokenizer; an all-padding input would
        # produce a prediction that does not depend on the sequence at all.
        raise ValueError(
            f"No known activity tokens in sequence: {activity_sequence!r}"
        )
    padded_sequence = pad_sequences(sequence, maxlen=max_sequence_length, padding='post')

    feature_array = np.asarray(feature_values, dtype=float).reshape(1, -1)
    if feature_array.shape[1] != len(selected_features):
        raise ValueError(
            f"Expected {len(selected_features)} feature values "
            f"({', '.join(selected_features)}), got {feature_array.shape[1]}"
        )
    # Wrap in a DataFrame with the feature names before applying the fitted scaler.
    feature_array = scaler.transform(pd.DataFrame(feature_array, columns=selected_features))

    # verbose=0: no progress bar for a single-row prediction.
    model_prediction = model.predict([padded_sequence, feature_array], verbose=0)
    predicted_class = np.argmax(model_prediction, axis=1)
    return label_encoder.inverse_transform(predicted_class)[0]

# ✅ Example Usage
# Activities observed so far for the example case, in order.
example_sequence = "ER Registration -> ER Triage -> Leucocytes"

# Example feature values, written out by name; the insertion order below
# follows `selected_features`, which is the order the prediction function expects.
example_feature_map = {
    "SIRSCriteria2OrMore": 1,
    "Infusion": 1,
    "SIRSCritTemperature": 1,
    "DiagnosticLacticAcid": 1,
    "SIRSCritHeartRate": 1,
    "DiagnosticXthorax": 0,
    "SIRSCritTachypnea": 0,
    "DiagnosticUrinarySediment": 1,
    "Age": 50,
    "InfectionSuspected": 1,
}
example_features = list(example_feature_map.values())

predicted_next_activity = predict_next_activity(example_sequence, example_features)
print(f"Predicted Next Activity: {predicted_next_activity}")
