<h1  align='center'>Sepsis Next Activity and Remaining Time Prediction Model</h1>

<h1  align='center'>Model Training & Prediction</h1>

In [56]:
# ✅ Import Necessary Libraries
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Embedding, Masking, Input, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import save_model


In [57]:

# ✅ Load Data
file_path_activity = "Sepsis_Merged_Selected_Features_Activity.csv"
file_path_biomarkers = "Sepsis_Biomarkers_Next_Activity.csv"
df_activity = pd.read_csv(file_path_activity)
df_biomarkers = pd.read_csv(file_path_biomarkers)


In [58]:

# ✅ Create 'Activity_Sequence' Column by Merging Activity Columns
activity_columns = [col for col in df_activity.columns if "Activity" in col]
df_activity["Activity_Sequence"] = df_activity[activity_columns].apply(lambda row: " -> ".join(row.dropna().astype(str)), axis=1)

# ✅ Merge Biomarkers with Activity Data on 'Case ID' to Ensure Inclusion
df_merged = df_activity.merge(df_biomarkers, on="Case ID", how="left")


In [59]:

# ✅ Create an Expanded Dataset Where Each Row Represents a Progressive Sequence
expanded_data = []

for _, row in df_merged.iterrows():
    full_sequence = row["Activity_Sequence"].split(" -> ")
    
    for i in range(1, len(full_sequence)):  # Create progressive sequences
        input_seq = " -> ".join(full_sequence[:i])  # Keep increasing sequence length
        next_activity = full_sequence[i]  # The next step in the sequence
        
        expanded_data.append({
            "Case ID": row["Case ID"],
            "Input_Sequence": input_seq,
            "Next_Activity": next_activity,
            "Biomarker": row["Biomarker"],
            "Biomarker_Value": row["Value"],
            "Biomarker_Range": row["Range"]
        })

# ✅ Convert to DataFrame
df_expanded = pd.DataFrame(expanded_data)


In [60]:

# ✅ Encode Next Activity as the Target Variable
label_encoder = LabelEncoder()
df_expanded["Next_Activity_Encoded"] = label_encoder.fit_transform(df_expanded["Next_Activity"])

# ✅ Tokenize Sequences
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(df_expanded["Input_Sequence"])
sequences = tokenizer.texts_to_sequences(df_expanded["Input_Sequence"])
max_sequence_length = max(map(len, sequences))
X_seq = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

# ✅ Select Relevant Features (Including Biomarkers)
selected_features = [
    "SIRSCriteria2OrMore", "Infusion", "SIRSCritTemperature", "DiagnosticLacticAcid",
    "SIRSCritHeartRate", "DiagnosticXthorax", "SIRSCritTachypnea",
    "DiagnosticUrinarySediment", "Age", "InfectionSuspected"
]

# ✅ Merge Features from df_merged Based on 'Case ID'
df_expanded = df_expanded.merge(df_merged[["Case ID"] + selected_features], on="Case ID", how="left")


In [61]:

# ✅ Ensure the Feature Dataset Matches the Sequence Dataset
X_features = df_expanded[selected_features]

# ✅ Normalize Features
scaler = StandardScaler()
X_features = pd.DataFrame(scaler.fit_transform(X_features), columns=selected_features)

# ✅ Convert Next Activity to One-Hot Encoding
num_classes = df_expanded["Next_Activity_Encoded"].nunique()

y_seq = tf.keras.utils.to_categorical(df_expanded["Next_Activity_Encoded"], num_classes=num_classes)

# ✅ Ensure All Datasets Have the Same Length Before Training
min_length = min(len(X_seq), len(X_features), len(y_seq))
X_seq = X_seq[:min_length]
X_features = X_features[:min_length]
y_seq = y_seq[:min_length]


In [62]:

# ✅ Split Data into Training and Testing Sets
X_train_seq, X_test_seq, X_train_features, X_test_features, y_train, y_test = train_test_split(
    X_seq, X_features, y_seq, test_size=0.2, random_state=42, stratify=y_seq
)


In [63]:

# ✅ Define the LSTM-Based Model
sequence_input = Input(shape=(max_sequence_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=64, input_length=max_sequence_length)(sequence_input)
masking_layer = Masking(mask_value=0.0)(embedding_layer)
lstm_layer = LSTM(128, return_sequences=False, dropout=0.2)(masking_layer)

feature_input = Input(shape=(len(selected_features),))
feature_dense = Dense(32, activation='relu')(feature_input)

merged = Concatenate()([lstm_layer, feature_dense])
dense_layer = Dense(64, activation='relu')(merged)
output_layer = Dense(num_classes, activation='softmax')(dense_layer)




In [64]:

# ✅ Compile the Model
model = Model(inputs=[sequence_input, feature_input], outputs=output_layer)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# ✅ Train the Model
history = model.fit([X_train_seq, X_train_features], y_train, validation_data=([X_test_seq, X_test_features], y_test), epochs=10, batch_size=32)

# ✅ Evaluate the Model
test_loss, test_accuracy = model.evaluate([X_test_seq, X_test_features], y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")


Epoch 1/10
[1m2957/2957[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 62ms/step - accuracy: 0.2598 - loss: 2.0765 - val_accuracy: 0.2658 - val_loss: 2.0340
Epoch 2/10
[1m2957/2957[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 62ms/step - accuracy: 0.2652 - loss: 2.0311 - val_accuracy: 0.2583 - val_loss: 2.0276
Epoch 3/10
[1m2957/2957[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m181s[0m 61ms/step - accuracy: 0.2636 - loss: 2.0211 - val_accuracy: 0.2658 - val_loss: 2.0163
Epoch 4/10
[1m2957/2957[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m181s[0m 61ms/step - accuracy: 0.2676 - loss: 2.0147 - val_accuracy: 0.2702 - val_loss: 2.0118
Epoch 5/10
[1m2957/2957[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 61ms/step - accuracy: 0.2699 - loss: 2.0052 - val_accuracy: 0.2703 - val_loss: 2.0059
Epoch 6/10
[1m2957/2957[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m434s[0m 147ms/step - accuracy: 0.2724 - loss: 2.0005 - val_accuracy: 0.2695 - val_loss: 2.00

In [65]:
def predict_next_activity(activity_sequence, feature_values, biomarker_values):
    

    # Tokenize and pad the sequence
    sequence = tokenizer.texts_to_sequences([activity_sequence])
    padded_sequence = pad_sequences(sequence, maxlen=max_sequence_length, padding='post')

    # Convert feature values to DataFrame and scale
    feature_array = np.array(feature_values).reshape(1, -1)
    feature_array = scaler.transform(pd.DataFrame(feature_array, columns=selected_features))

    # Ensure biomarker features match the training setup
    biomarker_priority = ["Leucocytes", "CRP", "LacticAcid"]  # Biomarkers used in training
    biomarker_feature_vector = np.zeros(3)  # Fixed size for 3 biomarker features

    for i, biomarker in enumerate(biomarker_priority):
        if biomarker in biomarker_values:
            biomarker_range = biomarker_values[biomarker]
            biomarker_feature_vector[i] = label_encoder.transform([biomarker_range])[0] if biomarker_range in label_encoder.classes_ else 0

    # Check if model expects only clinical features (i.e., no biomarkers were included in training)
    expected_feature_size = model.input_shape[1][1]  # Get expected feature count from model

    if expected_feature_size == len(selected_features):  # Model was trained only on clinical features
        full_feature_array = feature_array  # Ignore biomarker features
    else:  # Model includes biomarker features
        full_feature_array = np.concatenate((feature_array, biomarker_feature_vector.reshape(1, -1)), axis=1)

    # Ensure feature array matches training dimensions
    if full_feature_array.shape[1] != expected_feature_size:
        raise ValueError(f"Feature size mismatch! Expected {expected_feature_size}, but got {full_feature_array.shape[1]}.")

    # Predict using the model
    model_prediction = model.predict([padded_sequence, full_feature_array])
    predicted_class = np.argmax(model_prediction, axis=1)

    return label_encoder.inverse_transform(predicted_class)[0]


In [66]:
# Example activity sequence
example_sequence = "ER Registration -> ER Triage -> Leucocytes -> CRP -> LacticAcid"

# Example clinical feature values 
example_features_new = [1, 1, 1, 1, 1, 0, 0, 1, 50, 1]  

# Example biomarker values (all expected biomarkers)
biomarker_values = {
    "Leucocytes": "Elevated",
    "CRP": "Severe",
    "LacticAcid": "High"
}

# Predict the next activity
predicted_next_activity = predict_next_activity(
    example_sequence, 
    example_features_new, 
    biomarker_values
)

print(f"Predicted Next Activity: {predicted_next_activity}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 347ms/step
Predicted Next Activity: Leucocytes


In [75]:
# Define multiple test cases
test_cases = [
    {
        "activity_sequence": "ER Registration -> ER Triage -> Leucocytes -> CRP -> LacticAcid",
        "feature_values": [1, 1, 1, 1, 1, 0, 0, 1, 50, 1],
        "biomarker_values": {"Leucocytes": "High", "CRP": "Severe", "LacticAcid": "Normal"}
    },
    {
        "activity_sequence": "ER Registration -> ER Triage -> Leucocytes -> CRP -> LacticAcid -> Leucocytes",
        "feature_values": [1, 1, 1, 1, 1, 0, 0, 1, 60, 1],
        "biomarker_values": {"Leucocytes": "High", "CRP": "Severe", "LacticAcid": "Normal", "Leucocytes": "Elevated"}
    },
    {
        "activity_sequence": "ER Registration -> ER Triage -> CRP -> LacticAcid -> CRP",
        "feature_values": [0, 1, 1, 1, 1, 0, 0, 1, 40, 0],
        "biomarker_values": {"CRP": "Moderate", "LacticAcid": "Critical", "CRP": "Severe"}
    },
    {
        "activity_sequence": "ER Registration -> ER Triage -> LacticAcid -> CRP -> LacticAcid",
        "feature_values": [1, 0, 1, 1, 1, 0, 0, 0, 55, 1],
        "biomarker_values": {"LacticAcid": "High", "CRP": "Severe", "LacticAcid": "Critical"}
    }
]


In [76]:

# Run predictions for each test case using predict_next_activity
predictions = []

for test_case in test_cases:
    activity_sequence = test_case["activity_sequence"]
    feature_values = test_case["feature_values"]
    biomarker_values = test_case["biomarker_values"]
    predicted_next_activity = predict_next_activity(activity_sequence, feature_values, biomarker_values)

    predictions.append({
        "activity_sequence": activity_sequence,
        "feature_values": feature_values,
        "biomarker_values": biomarker_values,
        "predicted_next_activity": predicted_next_activity
    })


    
    
# Display the predictions

for prediction in predictions:
    print(f"Activity Sequence: {prediction['activity_sequence']}")
    print(f"Feature Values: {prediction['feature_values']}")
    print(f"Biomarker Values: {prediction['biomarker_values']}")
    print(f"Predicted Next Activity: {prediction['predicted_next_activity']}\n")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
Activity Sequence: ER Registration -> ER Triage -> Leucocytes -> CRP -> LacticAcid
Feature Values: [1, 1, 1, 1, 1, 0, 0, 1, 50, 1]
Biomarker Values: {'Leucocytes': 'High', 'CRP': 'Severe', 'LacticAcid': 'Normal'}
Predicted Next Activity: Leucocytes

Activity Sequence: ER Registration -> ER Triage -> Leucocytes -> CRP -> LacticAcid -> Leucocytes
Feature Values: [1, 1, 1, 1, 1, 0, 0, 1, 60, 1]
Biomarker Values: {'Leucocytes': 'Elevated', 'CRP': 'Severe', 'LacticAcid': 'Normal'}
Predicted Next Activity: Leucocytes

Activity Sequence: ER Registration -> ER Triage -> CRP -> LacticAcid -> CRP
Feature Values: [0, 1, 1, 1, 1, 0, 0, 1, 40, 0]
Biomarker Values: {'CRP': 'Severe', 'LacticAcid': 'Critical'}
Predicte

# Training for Remaining Time prediction

In [68]:
# Load Data
file_path_time = "Sepsis_Cases_Log.csv"
df_time = pd.read_csv(file_path_time)

# Convert timestamps to datetime format
df_time["Complete Timestamp"] = pd.to_datetime(df_time["Complete Timestamp"], errors='coerce')
df_time = df_time.dropna(subset=["Complete Timestamp"])
df_time = df_time.sort_values(by=["Case ID", "Complete Timestamp"])

# Compute duration between activities within each case
df_time["Next Timestamp"] = df_time.groupby("Case ID")["Complete Timestamp"].shift(-1)
df_time["Activity Duration"] = (df_time["Next Timestamp"] - df_time["Complete Timestamp"]).dt.total_seconds()
df_time["Case Start Time"] = df_time.groupby("Case ID")["Complete Timestamp"].transform("first")
df_time["Total Case Duration"] = (df_time["Next Timestamp"] - df_time["Case Start Time"]).dt.total_seconds()
df_time = df_time[["Case ID", "Activity", "Activity Duration", "Total Case Duration"]].dropna()

# Compute average remaining time per activity
df_time_avg = df_time.groupby("Activity")["Total Case Duration"].mean().reset_index()
df_time_avg.rename(columns={"Total Case Duration": "Avg Remaining Time"}, inplace=True)
df_time = df_time.merge(df_time_avg, on="Activity", how="left")


  df_time["Complete Timestamp"] = pd.to_datetime(df_time["Complete Timestamp"], errors='coerce')


In [70]:
# Train RandomForest Model for Remaining Time Prediction
X_time = df_time[["Activity Duration"]]
y_time = df_time["Avg Remaining Time"]
X_train_time, X_test_time, y_train_time, y_test_time = train_test_split(X_time, y_time, test_size=0.2, random_state=42)
time_model = RandomForestRegressor(n_estimators=100, random_state=42)
time_model.fit(X_train_time, y_train_time)

# Tokenizer for sequence processing
tokenizer = Tokenizer()
all_activities = df_time["Activity"].unique().tolist()
tokenizer.fit_on_texts(all_activities)
max_sequence_length = max([len(tokenizer.texts_to_sequences([a])[0]) for a in all_activities])


In [71]:
# Function to Predict Next Activity and Remaining Time using NEW Selected Features
def predict_next_activity_and_time_new(activity_sequence, feature_values, biomarker_values):
    # Convert activity sequence to tokenized format
    sequence = tokenizer.texts_to_sequences([activity_sequence])
    padded_sequence = pad_sequences(sequence, maxlen=max_sequence_length, padding='post')

    # Predict the next activity using the updated features
    predicted_next_activity = predict_next_activity(activity_sequence, feature_values, biomarker_values)

    # Estimate Activity Duration from Historical Data
    if predicted_next_activity in df_time_avg["Activity"].values:
        predicted_activity_duration = df_time_avg[df_time_avg["Activity"] == predicted_next_activity]["Avg Remaining Time"].values[0]
    else:
        predicted_activity_duration = 600  # Default to 10 minutes if unknown

    # Predict Remaining Time
    predicted_remaining_time = time_model.predict([[predicted_activity_duration]])[0]
    
    return predicted_next_activity, round(predicted_remaining_time, 2)


In [72]:

# Example Usage with New Features
example_sequence = "ER Registration -> ER Triage -> Leucocytes -> CRP -> LacticAcid"
example_features_new = [1, 1, 1, 1, 1, 0, 0, 1, 50, 1]  # Using only new selected features
biomarker_values = {"Leucocytes": "Elevated", "CRP": "Severe", "LacticAcid": "High"}

predicted_next_activity, predicted_remaining_time = predict_next_activity_and_time_new(
    example_sequence, example_features_new, biomarker_values
)

print(f"Predicted Next Activity: {predicted_next_activity}, Predicted Remaining Time: {predicted_remaining_time} seconds (~{predicted_remaining_time/3600:.2f} hours)")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 406ms/step
Predicted Next Activity: Leucocytes, Predicted Remaining Time: 32473.04 seconds (~9.02 hours)




In [73]:

# Run predictions for each test case using predict_next_activity_and_time_new
predictions = []

for test_case in test_cases:
    activity_sequence = test_case["activity_sequence"]
    feature_values = test_case["feature_values"]
    biomarker_values = test_case["biomarker_values"]
    predicted_next_activity = predict_next_activity_and_time_new(activity_sequence, feature_values, biomarker_values)

    predictions.append({
        "activity_sequence": activity_sequence,
        "feature_values": feature_values,
        "biomarker_values": biomarker_values,
        "predicted_next_activity": predicted_next_activity
    })


    
    
# Display the predictions

for prediction in predictions:
    print(f"Activity Sequence: {prediction['activity_sequence']}")
    print(f"Feature Values: {prediction['feature_values']}")
    print(f"Biomarker Values: {prediction['biomarker_values']}")
    print(f"Predicted Next Activity: {prediction['predicted_next_activity']}\n")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
Activity Sequence: ER Registration -> ER Triage -> Leucocytes -> CRP -> LacticAcid
Feature Values: [1, 1, 1, 1, 1, 0, 0, 1, 50, 1]
Biomarker Values: {'Leucocytes': 'High', 'CRP': 'Severe', 'LacticAcid': 'Normal'}
Predicted Next Activity: ('Leucocytes', np.float64(32473.04))

Activity Sequence: ER Registration -> ER Triage -> Leucocytes -> CRP -> LacticAcid -> Leucocytes
Feature Values: [1, 1, 1, 1, 1, 0, 0, 1, 60, 1]
Biomarker Values: {'Leucocytes': 'Elevated', 'CRP': 'Severe', 'LacticAcid': 'Normal'}
Predicted Next Activity: ('Leucocytes', np.float64(32473.04))

Activity Sequence: ER Registration -> ER Triage -> CRP -> LacticAcid -> CRP
Feature Values: [0, 1, 1, 1, 1, 0, 0, 1, 40, 0]
Biomarker Values: {'CRP': 'Severe', 'LacticAcid': 'Critical'}
Predicted Next Activity: ('Leucocytes', np.float64(32473.04))

Activity Sequence: ER Registration -> ER Triage -> LacticAcid -> CRP -> LacticAcid
Feature Values: [1, 0, 1, 



In [74]:

# Save the trained model in Keras format (recommended)
save_model(model, "sepsis_lstm_model.keras")

# Save tokenizer
with open("sepsis_tokenizer.pkl", "wb") as file:
    pickle.dump(tokenizer, file)

# Save label encoder
with open("sepsis_label_encoder.pkl", "wb") as file:
    pickle.dump(label_encoder, file)

# Save feature scaler
with open("sepsis_scaler.pkl", "wb") as file:
    pickle.dump(scaler, file)

# Save time prediction model
with open("sepsis_time_model.pkl", "wb") as file:
    pickle.dump(time_model, file)

df_time_avg.to_csv("Sepsis_Avg_Activity_Duration.csv", index=False)
