In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load Data
file_path_time = "Sepsis_Cases_Log.csv"
df_time = pd.read_csv(file_path_time)

# Convert timestamps to datetime format
df_time["Complete Timestamp"] = pd.to_datetime(df_time["Complete Timestamp"], errors='coerce')
df_time = df_time.dropna(subset=["Complete Timestamp"])
df_time = df_time.sort_values(by=["Case ID", "Complete Timestamp"])

# Compute duration between activities within each case
df_time["Next Timestamp"] = df_time.groupby("Case ID")["Complete Timestamp"].shift(-1)
df_time["Activity Duration"] = (df_time["Next Timestamp"] - df_time["Complete Timestamp"]).dt.total_seconds()
df_time["Case Start Time"] = df_time.groupby("Case ID")["Complete Timestamp"].transform("first")
df_time["Total Case Duration"] = (df_time["Next Timestamp"] - df_time["Case Start Time"]).dt.total_seconds()
df_time = df_time[["Case ID", "Activity", "Activity Duration", "Total Case Duration"]].dropna()

# Compute average remaining time per activity
df_time_avg = df_time.groupby("Activity")["Total Case Duration"].mean().reset_index()
df_time_avg.rename(columns={"Total Case Duration": "Avg Remaining Time"}, inplace=True)
df_time = df_time.merge(df_time_avg, on="Activity", how="left")

# Train RandomForest Model for Remaining Time Prediction
X_time = df_time[["Activity Duration"]]
y_time = df_time["Avg Remaining Time"]
X_train_time, X_test_time, y_train_time, y_test_time = train_test_split(X_time, y_time, test_size=0.2, random_state=42)
time_model = RandomForestRegressor(n_estimators=100, random_state=42)
time_model.fit(X_train_time, y_train_time)

# Tokenizer for sequence processing
tokenizer = Tokenizer()
all_activities = df_time["Activity"].unique().tolist()
tokenizer.fit_on_texts(all_activities)
max_sequence_length = max([len(tokenizer.texts_to_sequences([a])[0]) for a in all_activities])

# Function to Predict Next Activity and Remaining Time
def predict_next_activity_and_time(activity_sequence, feature_values, biomarker_values):
    # Convert activity sequence to tokenized format
    sequence = tokenizer.texts_to_sequences([activity_sequence])
    padded_sequence = pad_sequences(sequence, maxlen=max_sequence_length, padding='post')
    
    # Predict the next activity (this is a placeholder, should be replaced with trained model prediction)
    predicted_next_activity = all_activities[np.random.randint(0, len(all_activities))]
    
    # Estimate Activity Duration from Historical Data
    if predicted_next_activity in df_time_avg["Activity"].values:
        predicted_activity_duration = df_time_avg[df_time_avg["Activity"] == predicted_next_activity]["Avg Remaining Time"].values[0]
    else:
        predicted_activity_duration = 600  # Default to 10 minutes if unknown

    # Predict Remaining Time
    predicted_remaining_time = time_model.predict([[predicted_activity_duration]])[0]
    
    return predicted_next_activity, round(predicted_remaining_time, 2)

# Example Usage
example_sequence = "ER Registration -> ER Triage -> Leucocytes -> CRP -> LacticAcid"
example_features = [1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 50, 1]
biomarker_values = {"Leucocytes": "Elevated", "CRP": "Severe", "LacticAcid": "High"}

predicted_next_activity, predicted_remaining_time = predict_next_activity_and_time(
    example_sequence, example_features, biomarker_values)

print(f"Predicted Next Activity: {predicted_next_activity}, Predicted Remaining Time: {predicted_remaining_time} seconds (~{predicted_remaining_time/3600:.2f} hours)")

  df_time["Complete Timestamp"] = pd.to_datetime(df_time["Complete Timestamp"], errors='coerce')


Predicted Next Activity: Release B, Predicted Remaining Time: 28728.1 seconds (~7.98 hours)


