In [18]:
import pandas as pd

# Load the datasets
activity_flow_path = "Sepsis_Activity_Flow.csv"
biomarkers_path = "Sepsis_Biomarkers_Next_Activity.csv"
features_path = "Sepsis_Selected_Features.csv"

activity_flow_df = pd.read_csv(activity_flow_path)
biomarkers_df = pd.read_csv(biomarkers_path)
features_df = pd.read_csv(features_path)

# Display the first few rows of each dataset to understand their structure
activity_flow_df.head(), biomarkers_df.head(), features_df.head()


(  Case ID       Activity 1  Activity 2        Activity 3  Activity 4  \
 0       A  ER Registration  Leucocytes               CRP  LacticAcid   
 1       B  ER Registration   ER Triage               CRP  LacticAcid   
 2       C  ER Registration   ER Triage  ER Sepsis Triage  Leucocytes   
 3       D  ER Registration   ER Triage  ER Sepsis Triage         CRP   
 4       E  ER Registration   ER Triage  ER Sepsis Triage   IV Liquid   
 
    Activity 5        Activity 6      Activity 7      Activity 8    Activity 9  \
 0   ER Triage  ER Sepsis Triage       IV Liquid  IV Antibiotics  Admission NC   
 1  Leucocytes  ER Sepsis Triage       IV Liquid  IV Antibiotics  Admission NC   
 2         CRP         IV Liquid  IV Antibiotics    Admission NC  Admission NC   
 3  LacticAcid        Leucocytes       IV Liquid  IV Antibiotics  Admission NC   
 4         CRP        Leucocytes      LacticAcid  IV Antibiotics           NaN   
 
    ... Activity 40 Activity 41 Activity 42 Activity 43 Activity 4

In [19]:
# Preprocessing the datasets to extract useful training features

# 1. Transform Activity Flow Data: Convert sequence format into a training dataset where each row represents 
#    (Current Sequence -> Next Activity)
activity_sequences = []
for _, row in activity_flow_df.iterrows():
    case_id = row["Case ID"]
    activities = row.drop("Case ID").dropna().tolist()
    for i in range(len(activities) - 1):
        activity_sequences.append({
            "Case ID": case_id,
            "Current Sequence": " -> ".join(activities[:i+1]),  # Sequence so far
            "Next Activity": activities[i+1]  # What follows next
        })
activity_flow_df_transformed = pd.DataFrame(activity_sequences)

# 2. Merge with Selected Features
#    Each Case ID has its own clinical feature set that helps filter the next activities
merged_features_df = activity_flow_df_transformed.merge(features_df, on="Case ID", how="left")

# 3. Merge with Biomarkers Data
#    Using biomarkers to further refine next activity prediction
biomarkers_df_grouped = biomarkers_df.groupby(["Case ID", "Activity"])["Range"].apply(list).reset_index()
biomarkers_df_grouped.rename(columns={"Activity": "Last Activity", "Range": "Biomarker Values"}, inplace=True)
merged_biomarkers_df = merged_features_df.merge(biomarkers_df_grouped, 
                                                left_on=["Case ID", "Next Activity"], 
                                                right_on=["Case ID", "Last Activity"], 
                                                how="left").drop(columns=["Last Activity"])

# Display the processed dataset
merged_biomarkers_df.head()

Unnamed: 0,Case ID,Current Sequence,Next Activity,SIRSCriteria2OrMore,Infusion,SIRSCritTemperature,DiagnosticLacticAcid,SIRSCritHeartRate,DiagnosticXthorax,SIRSCritTachypnea,DiagnosticUrinarySediment,Age,InfectionSuspected,Biomarker Values
0,A,ER Registration,Leucocytes,True,True,True,True,True,True,True,True,85.0,True,"[Normal, Normal, Normal, Normal, Elevated, Nor..."
1,A,ER Registration -> Leucocytes,CRP,True,True,True,True,True,True,True,True,85.0,True,"[Low, Moderate, Low, Low, Low, Low, Low]"
2,A,ER Registration -> Leucocytes -> CRP,LacticAcid,True,True,True,True,True,True,True,True,85.0,True,[Elevated]
3,A,ER Registration -> Leucocytes -> CRP -> Lactic...,ER Triage,True,True,True,True,True,True,True,True,85.0,True,
4,A,ER Registration -> Leucocytes -> CRP -> Lactic...,ER Sepsis Triage,True,True,True,True,True,True,True,True,85.0,True,


In [30]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Encode the Current Sequence using Label Encoding
seq_encoder = LabelEncoder()
merged_biomarkers_df["Encoded Sequence"] = seq_encoder.fit_transform(merged_biomarkers_df["Current Sequence"])

# Encode the Next Activity as the target variable
activity_encoder = LabelEncoder()
merged_biomarkers_df["Encoded Next Activity"] = activity_encoder.fit_transform(merged_biomarkers_df["Next Activity"])

# Encode Biomarker Values (One-Hot Encoding for categorical biomarker ranges)
biomarker_ohe = OneHotEncoder(handle_unknown="ignore")
biomarker_encoded = biomarker_ohe.fit_transform(merged_biomarkers_df["Biomarker Values"].fillna("").astype(str).values.reshape(-1, 1)).toarray()

# Convert boolean and categorical features into numerical format
feature_columns = [
    "SIRSCriteria2OrMore", "Infusion", "SIRSCritTemperature", 
    "DiagnosticLacticAcid", "SIRSCritHeartRate", "DiagnosticXthorax", 
    "SIRSCritTachypnea", "DiagnosticUrinarySediment", "Age", "InfectionSuspected"
]

X = np.hstack((merged_biomarkers_df[feature_columns].fillna(0).astype(float).values, biomarker_encoded, merged_biomarkers_df["Encoded Sequence"].values.reshape(-1,1)))
y = merged_biomarkers_df["Encoded Next Activity"].values

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions
y_pred = rf_model.predict(X_test)

# Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, target_names=activity_encoder.classes_)

accuracy, classification_rep

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(0.72233564720294,
 '                  precision    recall  f1-score   support\n\n    Admission IC       0.00      0.00      0.00        18\n    Admission NC       0.32      0.37      0.35       211\n             CRP       0.96      0.94      0.95       535\nER Sepsis Triage       0.79      0.85      0.82       197\n       ER Triage       0.89      0.92      0.90       205\n  IV Antibiotics       0.28      0.36      0.32       146\n       IV Liquid       0.62      0.46      0.53       164\n      LacticAcid       0.84      0.84      0.84       220\n      Leucocytes       0.92      0.92      0.92       554\n       Release A       0.07      0.07      0.07       121\n       Release B       0.00      0.00      0.00        16\n       Release C       0.00      0.00      0.00         4\n       Release D       0.00      0.00      0.00         6\n       Release E       0.00      0.00      0.00         2\n       Return ER       0.04      0.04      0.04        50\n\n        accuracy               

In [32]:
# Define the example input data
example_inputs = [
    {
        "sequence": "ER Registration -> ER Triage -> ER Sepsis Triage -> Leucocytes -> LacticAcid -> CRP -> IV Antibiotics -> IV Liquid",
        "features": [True, True, True, True, True, True, False, True, 85.0, True],  # Clinical feature values
        "biomarkers": {'Leucocytes': 'High', 'LacticAcid': 'Normal', 'CRP': 'Mild'}  # Biomarker values
    },
    {
        "sequence": "ER Registration -> ER Triage -> ER Sepsis Triage -> Leucocytes -> CRP -> LacticAcid -> IV Liquid -> IV Antibiotics -> Admission NC -> CRP -> Leucocytes -> Leucocytes -> CRP -> Release A",
        "features": [True, True, True, True, True, True, True, True, 70.0, True],  # Clinical feature values
        "biomarkers": {'Leucocytes': 'Low', 'CRP': 'Low', 'LacticAcid': 'Normal'}  # Biomarker values
    },
    {
        "sequence": "ER Registration -> ER Triage -> ER Sepsis Triage -> Leucocytes -> LacticAcid -> CRP -> IV Liquid -> IV Antibiotics -> Admission NC -> Leucocytes -> Leucocytes -> CRP -> Leucocytes -> Leucocytes -> CRP -> Release B",
        "features": [True, True, True, True, True, True, False, True, 75.0, True],  # Clinical feature values
        "biomarkers": {'Leucocytes': 'Critical', 'LacticAcid': 'Borderline', 'CRP': 'Mild'}  # Biomarker values
    },
    {
        "sequence": "ER Registration -> ER Triage -> ER Sepsis Triage -> CRP -> Leucocytes -> IV Liquid -> IV Antibiotics -> Admission NC -> Admission NC -> CRP -> CRP -> Release A -> Return ER",
        "features": [True, True, False, True, True, True, True, True, 65.0, True],  # Clinical feature values
        "biomarkers": {'CRP': 'Low', 'Leucocytes': 'Low'}  # Biomarker values
    },
    {
        "sequence": "ER Registration -> ER Triage -> ER Sepsis Triage -> CRP -> Leucocytes -> Admission NC -> CRP -> Leucocytes -> CRP -> Release A",
        "features": [False, False, False, False, False, False, False, False, 80.0, False],  # Clinical feature values
        "biomarkers": {'CRP': 'Moderate', 'Leucocytes': 'Normal'}  # Biomarker values
    },
    {
        "sequence": "ER Registration -> ER Triage -> ER Sepsis Triage -> Leucocytes -> CRP -> LacticAcid -> IV Liquid -> IV Antibiotics -> Admission NC -> Leucocytes -> CRP -> Release A -> Return ER",
        "features": [True, True, True, True, False, True, True, False, 75.0, True],  # Clinical feature values
        "biomarkers": {'Leucocytes': 'Normal', 'CRP': 'Moderate', 'LacticAcid': 'Normal'}  # Biomarker values
    },
    {
        "sequence": "ER Registration -> ER Triage -> CRP -> Leucocytes -> ER Sepsis Triage",
        "features": [False, False, False, False, False, False, False, False, 45.0, False],  # Clinical feature values
        "biomarkers": {'Leucocytes': 'Normal'}  # Biomarker values
    },
    {
        "sequence": "ER Registration -> ER Triage -> ER Sepsis Triage -> Leucocytes -> CRP -> LacticAcid -> Admission NC -> Release B",
        "features": [True, False, True, True, False, True, True, True, 90.0, True],  # Clinical feature values
        "biomarkers": {'Leucocytes': 'High', 'LacticAcid': 'Borderline'}  # Biomarker values
    },
    {
        "sequence": "ER Registration -> ER Triage -> ER Sepsis Triage -> CRP -> Leucocytes",
        "features": [False, False, False, False, False, False, False, False, 90.0, False],  # Clinical feature values
        "biomarkers": {'CRP': 'Moderate', 'Leucocytes': 'Critical'}  # Biomarker values
    },
    {
        "sequence": "ER Registration -> ER Triage -> ER Sepsis Triage -> Leucocytes -> CRP -> LacticAcid -> IV Liquid -> IV Antibiotics -> Admission NC -> LacticAcid -> CRP -> CRP -> LacticAcid -> Leucocytes -> Leucocytes -> CRP -> Leucocytes -> CRP -> Release B",
        "features": [True, True, True, True, True, True, False, True, 85.0, True],  # Clinical feature values
        "biomarkers": {'Leucocytes': 'Normal', 'CRP': 'Low', 'LacticAcid': 'Elevated'}  # Biomarker values
    }
]


In [33]:

# Extend the encoder classes to handle unseen sequences
all_sequences = np.concatenate((seq_encoder.classes_, [ex["sequence"] for ex in example_inputs]))
seq_encoder.fit(all_sequences)

for example in example_inputs:
    sequence = example["sequence"]
    
    # Check if the sequence is known to the encoder
    if sequence in seq_encoder.classes_:
        example_sequence_encoded = seq_encoder.transform([sequence])[0]
    else:
        print(f"Warning: Unseen sequence encountered: {sequence}")
        example_sequence_encoded = -1  # Assign a default unknown value

    # Encode the biomarker values using the same encoding logic
    biomarker_vector = biomarker_ohe.transform(
        np.array([str(list(example["biomarkers"].values()))]).reshape(-1, 1)
    ).toarray()

    # Prepare the input feature vector (stacking clinical features, biomarkers, and encoded sequence)
    example_input_features = np.hstack((
        np.array(example["features"]).reshape(1, -1),  # Convert clinical features to numpy array
        biomarker_vector,  # Include one-hot encoded biomarker values
        np.array([[example_sequence_encoded]])  # Add the encoded sequence
    ))

    # Ensure input shape matches model expectations
    if example_input_features.shape[1] != X_train.shape[1]:
        print(f"Error: Feature shape mismatch. Expected {X_train.shape[1]}, got {example_input_features.shape[1]}")
        continue  # Skip prediction for this sample

    # Predict the next activity
    predicted_activity_encoded = rf_model.predict(example_input_features)[0]
    predicted_activity = activity_encoder.inverse_transform([predicted_activity_encoded])[0]

    # Display result
    print(f"Predicted next activity for sequence: {sequence}")
    print(f"Predicted Activity: {predicted_activity}\n")

Predicted next activity for sequence: ER Registration -> ER Triage -> ER Sepsis Triage -> Leucocytes -> LacticAcid -> CRP -> IV Antibiotics -> IV Liquid
Predicted Activity: CRP

Predicted next activity for sequence: ER Registration -> ER Triage -> ER Sepsis Triage -> Leucocytes -> CRP -> LacticAcid -> IV Liquid -> IV Antibiotics -> Admission NC -> CRP -> Leucocytes -> Leucocytes -> CRP -> Release A
Predicted Activity: Leucocytes

Predicted next activity for sequence: ER Registration -> ER Triage -> ER Sepsis Triage -> Leucocytes -> LacticAcid -> CRP -> IV Liquid -> IV Antibiotics -> Admission NC -> Leucocytes -> Leucocytes -> CRP -> Leucocytes -> Leucocytes -> CRP -> Release B
Predicted Activity: CRP

Predicted next activity for sequence: ER Registration -> ER Triage -> ER Sepsis Triage -> CRP -> Leucocytes -> IV Liquid -> IV Antibiotics -> Admission NC -> Admission NC -> CRP -> CRP -> Release A -> Return ER
Predicted Activity: CRP

Predicted next activity for sequence: ER Registration

In [34]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np


# Load the dataset
# Replace this with the correct path to your dataset
sepsis_log_path = "Sepsis_Cases_Log.csv"
sepsis_log_df = pd.read_csv(sepsis_log_path)


# Function to convert timestamps to seconds
def parse_timestamp_to_seconds(timestamp):
    try:
        parts = timestamp.split(":")
        if len(parts) == 3:
            hours = int(parts[0])
            minutes = int(parts[1])
            seconds = int(parts[2].split(".")[0])
            return hours * 3600 + minutes * 60 + seconds
        elif len(parts) == 2:
            hours = int(parts[0])
            minutes = int(parts[1].split(".")[0])
            return hours * 3600 + minutes * 60
        else:
            return 0
    except Exception:
        return 0


# Apply conversion to seconds
sepsis_log_df["Complete Timestamp (seconds)"] = sepsis_log_df["Complete Timestamp"].apply(parse_timestamp_to_seconds)


# Normalize elapsed time and compute remaining time
sepsis_log_df["Elapsed Time (seconds)"] = sepsis_log_df.groupby("Case ID")["Complete Timestamp (seconds)"].transform(
    lambda x: x - x.min()
)
sepsis_log_df["Remaining Time (seconds)"] = sepsis_log_df.groupby("Case ID")["Elapsed Time (seconds)"].transform(
    lambda x: x.max() - x
)


# Generate all possible subsequences starting from "ER Registration"
def generate_subsequences(case_id, activities):
    subsequences = []
    current_sequence = []
    for activity in activities:
        current_sequence.append(activity)
        subsequences.append(" -> ".join(current_sequence))
    return subsequences


# Group by case and create subsequences for each case
sepsis_log_df["Subsequences"] = sepsis_log_df.groupby("Case ID")["Activity"].transform(
    lambda x: generate_subsequences(x.name, x)
)


# Explode the subsequences so that each row corresponds to a unique subsequence
sepsis_log_df_exploded = sepsis_log_df[["Case ID", "Subsequences", "Remaining Time (seconds)"]].explode("Subsequences")


# Calculate average remaining time for each subsequence
remaining_time_data = (
    sepsis_log_df_exploded.groupby("Subsequences")
    .agg({"Remaining Time (seconds)": "mean"})
    .reset_index()
    .rename(columns={"Remaining Time (seconds)": "Avg Remaining Time (seconds)"})
)


# Encode subsequences
seq_encoder = LabelEncoder()
remaining_time_data["Encoded Subsequence"] = seq_encoder.fit_transform(remaining_time_data["Subsequences"])


# Define features and target for the model
X_remaining_time = remaining_time_data["Encoded Subsequence"].values.reshape(-1, 1)
y_remaining_time = remaining_time_data["Avg Remaining Time (seconds)"].values


# Train a Random Forest Regressor
remaining_time_model = RandomForestRegressor(n_estimators=100, random_state=42)
remaining_time_model.fit(X_remaining_time, y_remaining_time)


In [None]:


# Prediction function
def predict_remaining_time(sequence):
    
    try:
        # Encode the input sequence
        encoded_sequence = seq_encoder.transform([sequence])[0].reshape(-1, 1)
        # Predict remaining time (in seconds)
        predicted_remaining_time_seconds = remaining_time_model.predict(encoded_sequence)
        # Convert to hours
        return predicted_remaining_time_seconds[0] / 3600
    except Exception as e:
        print(f"Error in predicting remaining time: {e}")
        return None


# Example usage
example_sequence = "ER Registration -> ER Triage -> Leucocytes -> CRP"
predicted_time = predict_remaining_time(example_sequence)
print(f"Predicted remaining time (in hours): {predicted_time}")


Predicted remaining time (in hours): 25.286002777777778


In [36]:
# predict remaining time for example sequences
for example in example_inputs:
    sequence = example["sequence"]
    predicted_time = predict_remaining_time(sequence)
    print(f"Predicted remaining time for sequence: {sequence}")
    print(f"Predicted Remaining Time (hours): {predicted_time}\n")

Predicted remaining time for sequence: ER Registration -> ER Triage -> ER Sepsis Triage -> Leucocytes -> LacticAcid -> CRP -> IV Antibiotics -> IV Liquid
Predicted Remaining Time (hours): 23.299083333333332

Predicted remaining time for sequence: ER Registration -> ER Triage -> ER Sepsis Triage -> Leucocytes -> CRP -> LacticAcid -> IV Liquid -> IV Antibiotics -> Admission NC -> CRP -> Leucocytes -> Leucocytes -> CRP -> Release A
Predicted Remaining Time (hours): 47.02

Predicted remaining time for sequence: ER Registration -> ER Triage -> ER Sepsis Triage -> Leucocytes -> LacticAcid -> CRP -> IV Liquid -> IV Antibiotics -> Admission NC -> Leucocytes -> Leucocytes -> CRP -> Leucocytes -> Leucocytes -> CRP -> Release B
Predicted Remaining Time (hours): 35.659333333333336

Predicted remaining time for sequence: ER Registration -> ER Triage -> ER Sepsis Triage -> CRP -> Leucocytes -> IV Liquid -> IV Antibiotics -> Admission NC -> Admission NC -> CRP -> CRP -> Release A -> Return ER
Predict