In [33]:
import pandas as pd

# Load the datasets
activity_flow_path = "Sepsis_Activity_Flow.csv"
biomarkers_path = "Sepsis_Biomarkers_Next_Activity.csv"
features_path = "Sepsis_Selected_Features.csv"

activity_flow_df = pd.read_csv(activity_flow_path)
biomarkers_df = pd.read_csv(biomarkers_path)
features_df = pd.read_csv(features_path)

# Display the first few rows of each dataset to understand their structure
activity_flow_df.head(), biomarkers_df.head(), features_df.head()


(  Case ID       Activity 1  Activity 2        Activity 3  Activity 4  \
 0       A  ER Registration  Leucocytes               CRP  LacticAcid   
 1       B  ER Registration   ER Triage               CRP  LacticAcid   
 2       C  ER Registration   ER Triage  ER Sepsis Triage  Leucocytes   
 3       D  ER Registration   ER Triage  ER Sepsis Triage         CRP   
 4       E  ER Registration   ER Triage  ER Sepsis Triage   IV Liquid   
 
    Activity 5        Activity 6      Activity 7      Activity 8    Activity 9  \
 0   ER Triage  ER Sepsis Triage       IV Liquid  IV Antibiotics  Admission NC   
 1  Leucocytes  ER Sepsis Triage       IV Liquid  IV Antibiotics  Admission NC   
 2         CRP         IV Liquid  IV Antibiotics    Admission NC  Admission NC   
 3  LacticAcid        Leucocytes       IV Liquid  IV Antibiotics  Admission NC   
 4         CRP        Leucocytes      LacticAcid  IV Antibiotics           NaN   
 
    ... Activity 40 Activity 41 Activity 42 Activity 43 Activity 4

In [34]:
# Preprocessing the datasets to extract useful training features

# 1. Transform Activity Flow Data: Convert sequence format into a training dataset where each row represents 
#    (Current Sequence -> Next Activity)
activity_sequences = []
for _, row in activity_flow_df.iterrows():
    case_id = row["Case ID"]
    activities = row.drop("Case ID").dropna().tolist()
    for i in range(len(activities) - 1):
        activity_sequences.append({
            "Case ID": case_id,
            "Current Sequence": " -> ".join(activities[:i+1]),  # Sequence so far
            "Next Activity": activities[i+1]  # What follows next
        })
activity_flow_df_transformed = pd.DataFrame(activity_sequences)

# 2. Merge with Selected Features
#    Each Case ID has its own clinical feature set that helps filter the next activities
merged_features_df = activity_flow_df_transformed.merge(features_df, on="Case ID", how="left")

# 3. Merge with Biomarkers Data
#    Using biomarkers to further refine next activity prediction
biomarkers_df_grouped = biomarkers_df.groupby(["Case ID", "Activity"])["Range"].apply(list).reset_index()
biomarkers_df_grouped.rename(columns={"Activity": "Last Activity", "Range": "Biomarker Values"}, inplace=True)
merged_biomarkers_df = merged_features_df.merge(biomarkers_df_grouped, 
                                                left_on=["Case ID", "Next Activity"], 
                                                right_on=["Case ID", "Last Activity"], 
                                                how="left").drop(columns=["Last Activity"])

# Display the processed dataset
merged_biomarkers_df.head()

Unnamed: 0,Case ID,Current Sequence,Next Activity,SIRSCriteria2OrMore,Infusion,SIRSCritTemperature,DiagnosticLacticAcid,SIRSCritHeartRate,DiagnosticXthorax,SIRSCritTachypnea,DiagnosticUrinarySediment,Age,InfectionSuspected,Biomarker Values
0,A,ER Registration,Leucocytes,True,True,True,True,True,True,True,True,85.0,True,"[Normal, Normal, Normal, Normal, Elevated, Nor..."
1,A,ER Registration -> Leucocytes,CRP,True,True,True,True,True,True,True,True,85.0,True,"[Low, Moderate, Low, Low, Low, Low, Low]"
2,A,ER Registration -> Leucocytes -> CRP,LacticAcid,True,True,True,True,True,True,True,True,85.0,True,[Elevated]
3,A,ER Registration -> Leucocytes -> CRP -> Lactic...,ER Triage,True,True,True,True,True,True,True,True,85.0,True,
4,A,ER Registration -> Leucocytes -> CRP -> Lactic...,ER Sepsis Triage,True,True,True,True,True,True,True,True,85.0,True,


In [35]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np

# Encode the Current Sequence using Label Encoding
seq_encoder = LabelEncoder()
merged_biomarkers_df["Encoded Sequence"] = seq_encoder.fit_transform(merged_biomarkers_df["Current Sequence"])

# Encode the Next Activity as the target variable
activity_encoder = LabelEncoder()
merged_biomarkers_df["Encoded Next Activity"] = activity_encoder.fit_transform(merged_biomarkers_df["Next Activity"])

# Encode Biomarker Values (One-Hot Encoding for categorical biomarker ranges)
biomarker_ohe = OneHotEncoder(handle_unknown="ignore")
biomarker_encoded = biomarker_ohe.fit_transform(merged_biomarkers_df["Biomarker Values"].fillna("").astype(str).values.reshape(-1, 1)).toarray()

# Convert boolean and categorical features into numerical format
feature_columns = [
    "SIRSCriteria2OrMore", "Infusion", "SIRSCritTemperature", 
    "DiagnosticLacticAcid", "SIRSCritHeartRate", "DiagnosticXthorax", 
    "SIRSCritTachypnea", "DiagnosticUrinarySediment", "Age", "InfectionSuspected"
]

X = np.hstack((merged_biomarkers_df[feature_columns].fillna(0).astype(float).values, biomarker_encoded, merged_biomarkers_df["Encoded Sequence"].values.reshape(-1,1)))
y = merged_biomarkers_df["Encoded Next Activity"].values

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Checking the processed feature shape
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((9793, 600), (2449, 600), (9793,), (2449,))

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train a Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions
y_pred = rf_model.predict(X_test)

# Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, target_names=activity_encoder.classes_)

accuracy, classification_rep


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(0.72233564720294,
 '                  precision    recall  f1-score   support\n\n    Admission IC       0.00      0.00      0.00        18\n    Admission NC       0.32      0.37      0.35       211\n             CRP       0.96      0.94      0.95       535\nER Sepsis Triage       0.79      0.85      0.82       197\n       ER Triage       0.89      0.92      0.90       205\n  IV Antibiotics       0.28      0.36      0.32       146\n       IV Liquid       0.62      0.46      0.53       164\n      LacticAcid       0.84      0.84      0.84       220\n      Leucocytes       0.92      0.92      0.92       554\n       Release A       0.07      0.07      0.07       121\n       Release B       0.00      0.00      0.00        16\n       Release C       0.00      0.00      0.00         4\n       Release D       0.00      0.00      0.00         6\n       Release E       0.00      0.00      0.00         2\n       Return ER       0.04      0.04      0.04        50\n\n        accuracy               

In [41]:
# Define the example input data
example_sequence = "ER Registration -> ER Triage -> Leucocytes -> CRP -> LacticAcid"
example_features = [1, 1, 1, 1, 1, 0, 0, 1, 50, 1]  # Clinical feature values
example_biomarkers = {"Leucocytes": "Elevated", "CRP": "Severe", "LacticAcid": "High"}  # Biomarker values

# Encode the input sequence using the trained sequence encoder
example_sequence_encoded = seq_encoder.transform([example_sequence])[0]

# Encode the biomarker values using the same encoding logic
biomarker_vector = biomarker_ohe.transform(
    np.array([str(list(example_biomarkers.values()))]).reshape(-1, 1)
).toarray()

# Prepare the input feature vector
example_input_features = np.hstack((np.array(example_features).reshape(1, -1), biomarker_vector, np.array([[example_sequence_encoded]])))

# Predict the next activity
predicted_activity_encoded = rf_model.predict(example_input_features)[0]
predicted_activity = activity_encoder.inverse_transform([predicted_activity_encoded])[0]

# Get the probability of the predicted next activity
predicted_probabilities = rf_model.predict_proba(example_input_features)
predicted_confidence = np.max(predicted_probabilities) * 100  # Convert to percentage

# Display the prediction result
{
    "Current Sequence": example_sequence,
    "Clinical Features": example_features,
    "Biomarkers": example_biomarkers,
    "Predicted Next Activity": predicted_activity,
    "Confidence": f"{predicted_confidence:.2f}%"
}


{'Current Sequence': 'ER Registration -> ER Triage -> Leucocytes -> CRP -> LacticAcid',
 'Clinical Features': [1, 1, 1, 1, 1, 0, 0, 1, 50, 1],
 'Biomarkers': {'Leucocytes': 'Elevated',
  'CRP': 'Severe',
  'LacticAcid': 'High'},
 'Predicted Next Activity': 'CRP',
 'Confidence': '83.00%'}