## BioMakers


In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Embedding, Masking, Input, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Load Data
file_path = "Sepsis_Merged_Selected_Features_Activity.csv"
df = pd.read_csv(file_path)
df_biomarkers = pd.read_csv("Sepsis_Biomarkers_Next_Activity.csv")

df.fillna("None", inplace=True)

# Extract activity sequence columns
activity_columns = [col for col in df.columns if "Activity" in col]
df["Activity_Sequence"] = df[activity_columns].apply(lambda row: " -> ".join(row.values), axis=1)

# Encode final activity
y = df["Final Activity"]
label_encoder = LabelEncoder()
df["Final_Activity_Encoded"] = label_encoder.fit_transform(y)

# Tokenize activity sequences
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(df["Activity_Sequence"])
sequences = tokenizer.texts_to_sequences(df["Activity_Sequence"])

# Pad sequences to uniform length
max_sequence_length = max(map(len, sequences))
X_seq = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

# Select numerical features
feature_columns = [
    "DiagnosticArtAstrup", "DiagnosticUrinarySediment", "SIRSCritHeartRate", "SIRSCritTachypnea",
    "SIRSCritTemperature", "Hypotensie", "SIRSCritLeucos", "DiagnosticLacticAcid", "Oligurie",
    "Hypoxie", "DisfuncOrg", "Infusion", "Age", "InfectionSuspected"
]
X_features = df[feature_columns]

# Normalize all numerical features
scaler = StandardScaler()
X_features = pd.DataFrame(scaler.fit_transform(X_features), columns=feature_columns)

y_seq = tf.keras.utils.to_categorical(df["Final_Activity_Encoded"], num_classes=len(label_encoder.classes_))

# Split data
X_train_seq, X_test_seq, X_train_features, X_test_features, y_train, y_test = train_test_split(
    X_seq, X_features, y_seq, test_size=0.2, random_state=42, stratify=df["Final_Activity_Encoded"]
)

# Define LSTM Model
sequence_input = Input(shape=(max_sequence_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=64, input_length=max_sequence_length)(sequence_input)
masking_layer = Masking(mask_value=0.0)(embedding_layer)
lstm_layer = LSTM(64, return_sequences=False)(masking_layer)

# Define Feature Input Model
feature_input = Input(shape=(len(feature_columns),))
feature_dense = Dense(32, activation='relu')(feature_input)

# Merge Sequence and Feature Inputs
merged = Concatenate()([lstm_layer, feature_dense])
dense_layer = Dense(32, activation='relu')(merged)
output_layer = Dense(len(label_encoder.classes_), activation='softmax')(dense_layer)

# Compile Model
model = Model(inputs=[sequence_input, feature_input], outputs=output_layer)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train Model
history = model.fit([X_train_seq, X_train_features], y_train, validation_data=([X_test_seq, X_test_features], y_test), epochs=10, batch_size=32)

# Evaluate Model
test_loss, test_accuracy = model.evaluate([X_test_seq, X_test_features], y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Save Model and Tokenizer
model.save("sepsis_lstm_model.h5")
pd.to_pickle(tokenizer, "sepsis_tokenizer.pkl")
pd.to_pickle(label_encoder, "sepsis_label_encoder.pkl")
pd.to_pickle(scaler, "sepsis_scaler.pkl")

# Biomarker-Based Activity Mapping (Prioritized Decision Making)
biomarker_priority = ["LacticAcid", "CRP", "Leucocytes"]
biomarker_next_activity_mapping = {
    "Leucocytes": {"High": "LacticAcid", "Elevated": "CRP", "Normal": "ER Triage"},
    "CRP": {"Severe": "IV Antibiotics", "Moderate": "LacticAcid", "Low": "ER Triage"},
    "LacticAcid": {"Critical": "ICU Admission", "High": "IV Fluid", "Normal": "ER Triage"}
}

# Function to Predict Next Activity with Priority-Based Biomarker Handling
def predict_next_activity(activity_sequence, feature_values, biomarker_values):
    sequence = tokenizer.texts_to_sequences([activity_sequence])
    padded_sequence = pad_sequences(sequence, maxlen=max_sequence_length, padding='post')
    
    feature_array = np.array(feature_values).reshape(1, -1)
    feature_array = scaler.transform(pd.DataFrame(feature_array, columns=feature_columns))
    
    for biomarker in biomarker_priority:
        if biomarker in biomarker_values:
            biomarker_value = biomarker_values[biomarker]
            if biomarker_value in biomarker_next_activity_mapping[biomarker]:
                return biomarker_next_activity_mapping[biomarker][biomarker_value]
    
    model_prediction = model.predict([padded_sequence, feature_array])
    predicted_class = np.argmax(model_prediction, axis=1)
    return label_encoder.inverse_transform(predicted_class)[0]

# Example Usage
example_sequence = "ER Registration -> ER Triage -> Leucocytes"
example_features = [1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 50, 1]
biomarker_values = {"Leucocytes": "High", "CRP": "Severe", "LacticAcid": "Normal"}
predicted_next_activity = predict_next_activity(example_sequence, example_features, biomarker_values)
print(f"Predicted Next Activity: {predicted_next_activity}")


  df.fillna("None", inplace=True)


Epoch 1/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 70ms/step - accuracy: 0.2505 - loss: 2.4508 - val_accuracy: 0.4211 - val_loss: 1.8736
Epoch 2/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - accuracy: 0.4290 - loss: 1.8305 - val_accuracy: 0.4421 - val_loss: 1.7168
Epoch 3/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - accuracy: 0.5453 - loss: 1.5787 - val_accuracy: 0.7000 - val_loss: 1.1160
Epoch 4/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 53ms/step - accuracy: 0.6766 - loss: 1.1065 - val_accuracy: 0.7632 - val_loss: 0.9103
Epoch 5/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - accuracy: 0.7734 - loss: 0.8449 - val_accuracy: 0.7947 - val_loss: 0.7413
Epoch 6/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - accuracy: 0.8007 - loss: 0.6684 - val_accuracy: 0.8053 - val_loss: 0.6703
Epoch 7/10
[1m24/24[0m [32m━━━━



Test Accuracy: 0.88
Predicted Next Activity: ER Triage


In [2]:
example_sequence = "ER Registration -> ER Triage -> Leucocytes -> CRP -> LacticAcid"
example_features = [1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 50, 1]
biomarker_values = {"Leucocytes": "High", "CRP": "Moderate", "LacticAcid": "Critical"}

predicted_next_activity = predict_next_activity(example_sequence, example_features, biomarker_values)
print(f"Predicted Next Activity: {predicted_next_activity}")


Predicted Next Activity: ICU Admission


In [3]:
example_sequence = "ER Registration -> ER Triage -> Leucocytes -> CRP -> LacticAcid"
example_features = [1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 45, 1]
biomarker_values = {"Leucocytes": "Normal", "CRP": "Low", "LacticAcid": "Normal"}

predicted_next_activity = predict_next_activity(example_sequence, example_features, biomarker_values)
print(f"Test 1 - Predicted Next Activity: {predicted_next_activity}")


Test 1 - Predicted Next Activity: ER Triage


In [4]:
example_sequence = "ER Registration -> ER Triage -> Leucocytes -> CRP"
example_features = [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 55, 1]
biomarker_values = {"Leucocytes": "Elevated", "CRP": "Moderate"}

predicted_next_activity = predict_next_activity(example_sequence, example_features, biomarker_values)
print(f"Test 2 - Predicted Next Activity: {predicted_next_activity}")


Test 2 - Predicted Next Activity: LacticAcid


In [5]:
example_sequence = "ER Registration -> ER Triage -> LacticAcid"
example_features = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 60, 1]
biomarker_values = {"LacticAcid": "Critical"}

predicted_next_activity = predict_next_activity(example_sequence, example_features, biomarker_values)
print(f"Test 3 - Predicted Next Activity: {predicted_next_activity}")


Test 3 - Predicted Next Activity: ICU Admission


In [6]:
example_sequence = "ER Registration -> ER Triage -> CRP"
example_features = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 65, 1]
biomarker_values = {"CRP": "Severe"}

predicted_next_activity = predict_next_activity(example_sequence, example_features, biomarker_values)
print(f"Test 4 - Predicted Next Activity: {predicted_next_activity}")


Test 4 - Predicted Next Activity: IV Antibiotics


In [7]:
example_sequence = "ER Registration -> ER Triage -> Leucocytes -> CRP -> LacticAcid"
example_features = [1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 50, 1]
biomarker_values = {"Leucocytes": "Elevated", "CRP": "Severe", "LacticAcid": "High"}

predicted_next_activity = predict_next_activity(example_sequence, example_features, biomarker_values)
print(f"Test 5 - Predicted Next Activity: {predicted_next_activity}")


Test 5 - Predicted Next Activity: IV Fluid


## Remaining Time Prediction 

In [8]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load Data
file_path_time = "Sepsis_Cases_Log.csv"
df_time = pd.read_csv(file_path_time)

# Convert timestamps to datetime format
df_time["Complete Timestamp"] = pd.to_datetime(df_time["Complete Timestamp"], errors='coerce')
df_time = df_time.dropna(subset=["Complete Timestamp"])
df_time = df_time.sort_values(by=["Case ID", "Complete Timestamp"])

# Compute duration between activities within each case
df_time["Next Timestamp"] = df_time.groupby("Case ID")["Complete Timestamp"].shift(-1)
df_time["Activity Duration"] = (df_time["Next Timestamp"] - df_time["Complete Timestamp"]).dt.total_seconds()
df_time["Case Start Time"] = df_time.groupby("Case ID")["Complete Timestamp"].transform("first")
df_time["Total Case Duration"] = (df_time["Next Timestamp"] - df_time["Case Start Time"]).dt.total_seconds()
df_time = df_time[["Case ID", "Activity", "Activity Duration", "Total Case Duration"]].dropna()

# Compute average remaining time per activity
df_time_avg = df_time.groupby("Activity")["Total Case Duration"].mean().reset_index()
df_time_avg.rename(columns={"Total Case Duration": "Avg Remaining Time"}, inplace=True)
df_time = df_time.merge(df_time_avg, on="Activity", how="left")

# Train RandomForest Model for Remaining Time Prediction
X_time = df_time[["Activity Duration"]]
y_time = df_time["Avg Remaining Time"]
X_train_time, X_test_time, y_train_time, y_test_time = train_test_split(X_time, y_time, test_size=0.2, random_state=42)
time_model = RandomForestRegressor(n_estimators=100, random_state=42)
time_model.fit(X_train_time, y_train_time)

# Tokenizer for sequence processing
tokenizer = Tokenizer()
all_activities = df_time["Activity"].unique().tolist()
tokenizer.fit_on_texts(all_activities)
max_sequence_length = max([len(tokenizer.texts_to_sequences([a])[0]) for a in all_activities])

# Function to Predict Next Activity and Remaining Time
def predict_next_activity_and_time(activity_sequence, feature_values, biomarker_values):
    # Convert activity sequence to tokenized format
    sequence = tokenizer.texts_to_sequences([activity_sequence])
    padded_sequence = pad_sequences(sequence, maxlen=max_sequence_length, padding='post')
    
    # Predict the next activity (this is a placeholder, should be replaced with trained model prediction)
    predicted_next_activity = predict_next_activity(activity_sequence, feature_values, biomarker_values)
    
    # Estimate Activity Duration from Historical Data
    if predicted_next_activity in df_time_avg["Activity"].values:
        predicted_activity_duration = df_time_avg[df_time_avg["Activity"] == predicted_next_activity]["Avg Remaining Time"].values[0]
    else:
        predicted_activity_duration = 600  # Default to 10 minutes if unknown

    # Predict Remaining Time
    predicted_remaining_time = time_model.predict([[predicted_activity_duration]])[0]
    
    return predicted_next_activity, round(predicted_remaining_time, 2)

# Example Usage
example_sequence = "ER Registration -> ER Triage -> Leucocytes -> CRP -> LacticAcid"
example_features = [1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 50, 1]
biomarker_values = {"Leucocytes": "Elevated", "CRP": "Severe", "LacticAcid": "High"}

predicted_next_activity, predicted_remaining_time = predict_next_activity_and_time(example_sequence, example_features, biomarker_values)

print(f"Predicted Next Activity: {predicted_next_activity}, Predicted Remaining Time: {predicted_remaining_time} seconds (~{predicted_remaining_time/3600:.2f} hours)")

  df_time["Complete Timestamp"] = pd.to_datetime(df_time["Complete Timestamp"], errors='coerce')


Predicted Next Activity: IV Fluid, Predicted Remaining Time: 32540.74 seconds (~9.04 hours)




In [9]:
example_sequence = "ER Registration -> ER Triage -> CRP"
example_features = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 65, 1]
biomarker_values = {"CRP": "Severe"}

predicted_next_activity, predicted_remaining_time = predict_next_activity_and_time(example_sequence, example_features, biomarker_values)

print(f"Predicted Next Activity: {predicted_next_activity}, Predicted Remaining Time: {predicted_remaining_time} seconds (~{predicted_remaining_time/3600:.2f} hours)")

Predicted Next Activity: IV Antibiotics, Predicted Remaining Time: 47704.86 seconds (~13.25 hours)




In [10]:
example_sequence = "ER Registration -> ER Triage -> LacticAcid"
example_features = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 60, 1]
biomarker_values = {"LacticAcid": "Critical"}
predicted_next_activity, predicted_remaining_time = predict_next_activity_and_time(example_sequence, example_features, biomarker_values)

print(f"Predicted Next Activity: {predicted_next_activity}, Predicted Remaining Time: {predicted_remaining_time} seconds (~{predicted_remaining_time/3600:.2f} hours)")

Predicted Next Activity: ICU Admission, Predicted Remaining Time: 32540.74 seconds (~9.04 hours)




In [11]:
example_sequence = "ER Registration -> ER Triage -> LacticAcid"
example_features = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 60, 1]
biomarker_values = {"LacticAcid": "Critical"}

predicted_next_activity, predicted_remaining_time = predict_next_activity_and_time(example_sequence, example_features, biomarker_values)

print(f"Predicted Next Activity: {predicted_next_activity}, Predicted Remaining Time: {predicted_remaining_time} seconds (~{predicted_remaining_time/3600:.2f} hours)")

Predicted Next Activity: ICU Admission, Predicted Remaining Time: 32540.74 seconds (~9.04 hours)




In [12]:
example_sequence = "ER Registration -> ER Triage -> Leucocytes -> CRP"
example_features = [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 55, 1]
biomarker_values = {"Leucocytes": "Elevated", "CRP": "Moderate"}

predicted_next_activity, predicted_remaining_time = predict_next_activity_and_time(example_sequence, example_features, biomarker_values)

print(f"Predicted Next Activity: {predicted_next_activity}, Predicted Remaining Time: {predicted_remaining_time} seconds (~{predicted_remaining_time/3600:.2f} hours)")

Predicted Next Activity: LacticAcid, Predicted Remaining Time: 31062.2 seconds (~8.63 hours)




In [13]:
example_sequence = "ER Registration -> ER Triage -> Leucocytes -> CRP -> LacticAcid"
example_features = [1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 45, 1]
biomarker_values = {"Leucocytes": "Normal", "CRP": "Low", "LacticAcid": "Normal"}

predicted_next_activity, predicted_remaining_time = predict_next_activity_and_time(example_sequence, example_features, biomarker_values)

print(f"Predicted Next Activity: {predicted_next_activity}, Predicted Remaining Time: {predicted_remaining_time} seconds (~{predicted_remaining_time/3600:.2f} hours)")



Predicted Next Activity: ER Triage, Predicted Remaining Time: 13502.19 seconds (~3.75 hours)


In [14]:
example_sequence = "ER Registration -> ER Triage -> Leucocytes -> CRP -> LacticAcid"
example_features = [1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 50, 1]
biomarker_values = {"Leucocytes": "High", "CRP": "Moderate", "LacticAcid": "Critical"}

predicted_next_activity, predicted_remaining_time = predict_next_activity_and_time(example_sequence, example_features, biomarker_values)

print(f"Predicted Next Activity: {predicted_next_activity}, Predicted Remaining Time: {predicted_remaining_time} seconds (~{predicted_remaining_time/3600:.2f} hours)")



Predicted Next Activity: ICU Admission, Predicted Remaining Time: 32540.74 seconds (~9.04 hours)


In [15]:
from tensorflow.keras.models import save_model

# Save the trained model
save_model(model, "sepsis_lstm_model.keras")


In [16]:
import pickle

# Save tokenizer
with open("sepsis_tokenizer.pkl", "wb") as file:
    pickle.dump(tokenizer, file)


In [17]:
with open("sepsis_label_encoder.pkl", "wb") as file:
    pickle.dump(label_encoder, file)


In [18]:
with open("sepsis_scaler.pkl", "wb") as file:
    pickle.dump(scaler, file)


In [19]:
with open("sepsis_time_model.pkl", "wb") as file:
    pickle.dump(time_model, file)


In [20]:
df_time_avg.to_csv("Sepsis_Avg_Activity_Duration.csv", index=False)
