In [446]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder


In [447]:
# Load and clean data
df = pd.read_csv("Sepsis_Cases_Log.csv")
df = df[df.groupby('Case ID')['Activity'].transform('first') == 'ER Registration']


In [448]:
df = df.drop(columns=['org:group', 'Variant', 'Variant index', 'Diagnose', 'lifecycle:transition'])

In [449]:
df['Complete Timestamp'] = df['Complete Timestamp'].apply(
    lambda x: int(x.split(':')[0]) * 60 + float(x.split(':')[1])
)

In [450]:
print(df[['Case ID', 'Activity', 'Complete Timestamp']].isnull().sum())

Case ID               0
Activity              0
Complete Timestamp    0
dtype: int64


In [451]:
df[['Leucocytes', 'CRP', 'LacticAcid']] = df[['Leucocytes', 'CRP', 'LacticAcid']].fillna(0)

In [452]:
df['Age'] = df['Age'].fillna(df['Age'].median())

In [453]:
missing_columns = df.columns[df.isnull().any()].tolist()
print("Columns with missing values:", missing_columns)

Columns with missing values: ['InfectionSuspected', 'DiagnosticBlood', 'DisfuncOrg', 'SIRSCritTachypnea', 'Hypotensie', 'SIRSCritHeartRate', 'Infusion', 'DiagnosticArtAstrup', 'DiagnosticIC', 'DiagnosticSputum', 'DiagnosticLiquor', 'DiagnosticOther', 'SIRSCriteria2OrMore', 'DiagnosticXthorax', 'SIRSCritTemperature', 'DiagnosticUrinaryCulture', 'SIRSCritLeucos', 'Oligurie', 'DiagnosticLacticAcid', 'Hypoxie', 'DiagnosticUrinarySediment', 'DiagnosticECG']


In [454]:
df[missing_columns] = df[missing_columns].fillna(False)

  df[missing_columns] = df[missing_columns].fillna(False)


In [455]:
print(df.isnull().sum())

Case ID                      0
Activity                     0
Complete Timestamp           0
InfectionSuspected           0
DiagnosticBlood              0
DisfuncOrg                   0
SIRSCritTachypnea            0
Hypotensie                   0
SIRSCritHeartRate            0
Infusion                     0
DiagnosticArtAstrup          0
Age                          0
DiagnosticIC                 0
DiagnosticSputum             0
DiagnosticLiquor             0
DiagnosticOther              0
SIRSCriteria2OrMore          0
DiagnosticXthorax            0
SIRSCritTemperature          0
DiagnosticUrinaryCulture     0
SIRSCritLeucos               0
Oligurie                     0
DiagnosticLacticAcid         0
Hypoxie                      0
DiagnosticUrinarySediment    0
DiagnosticECG                0
Leucocytes                   0
CRP                          0
LacticAcid                   0
dtype: int64


In [456]:
df['Activity_Sequence'] = df.groupby('Case ID').cumcount() + 1

In [457]:
df['Time_Diff'] = df.groupby('Case ID')['Complete Timestamp'].diff().fillna(0)

In [458]:
patient_features = df.groupby('Case ID').agg({
    'Age': 'first',  # Age is constant for each patient
    'InfectionSuspected': 'max',  # True if any activity has True
    'DiagnosticBlood': 'max',
    'DisfuncOrg': 'max',
    # Add other patient-specific attributes here
}).reset_index()

In [459]:
blood_tests = df.groupby('Case ID').agg({
    'Leucocytes': 'max',
    'CRP': 'max',
    'LacticAcid': 'max'
}).reset_index()

In [460]:
df = df.merge(patient_features, on='Case ID', how='left')
df = df.merge(blood_tests, on='Case ID', how='left')

In [461]:
df.head()

Unnamed: 0,Case ID,Activity,Complete Timestamp,InfectionSuspected_x,DiagnosticBlood_x,DisfuncOrg_x,SIRSCritTachypnea,Hypotensie,SIRSCritHeartRate,Infusion,...,LacticAcid_x,Activity_Sequence,Time_Diff,Age_y,InfectionSuspected_y,DiagnosticBlood_y,DisfuncOrg_y,Leucocytes_y,CRP_y,LacticAcid_y
0,A,ER Registration,941.0,True,True,True,True,True,True,True,...,0.0,1,0.0,85.0,True,True,True,13.0,109.0,2.2
1,A,Leucocytes,1620.0,False,False,False,False,False,False,False,...,0.0,2,679.0,85.0,True,True,True,13.0,109.0,2.2
2,A,CRP,1620.0,False,False,False,False,False,False,False,...,0.0,3,0.0,85.0,True,True,True,13.0,109.0,2.2
3,A,LacticAcid,1620.0,False,False,False,False,False,False,False,...,2.2,4,0.0,85.0,True,True,True,13.0,109.0,2.2
4,A,ER Triage,2017.0,False,False,False,False,False,False,False,...,0.0,5,397.0,85.0,True,True,True,13.0,109.0,2.2


In [462]:
df = df.drop(columns=[col for col in df.columns if col.endswith('_x')])

In [463]:
df = df.rename(columns={col: col.replace('_y', '') for col in df.columns if col.endswith('_y')})

In [464]:
df.head()

Unnamed: 0,Case ID,Activity,Complete Timestamp,SIRSCritTachypnea,Hypotensie,SIRSCritHeartRate,Infusion,DiagnosticArtAstrup,DiagnosticIC,DiagnosticSputum,...,DiagnosticECG,Activity_Sequence,Time_Diff,Age,InfectionSuspected,DiagnosticBlood,DisfuncOrg,Leucocytes,CRP,LacticAcid
0,A,ER Registration,941.0,True,True,True,True,True,True,False,...,True,1,0.0,85.0,True,True,True,13.0,109.0,2.2
1,A,Leucocytes,1620.0,False,False,False,False,False,False,False,...,False,2,679.0,85.0,True,True,True,13.0,109.0,2.2
2,A,CRP,1620.0,False,False,False,False,False,False,False,...,False,3,0.0,85.0,True,True,True,13.0,109.0,2.2
3,A,LacticAcid,1620.0,False,False,False,False,False,False,False,...,False,4,0.0,85.0,True,True,True,13.0,109.0,2.2
4,A,ER Triage,2017.0,False,False,False,False,False,False,False,...,False,5,397.0,85.0,True,True,True,13.0,109.0,2.2


In [465]:
# Next Activity
df['Next_Activity'] = df.groupby('Case ID')['Activity'].shift(-1)

# Total Time Remaining
df['Total_Time_Remaining'] = df.groupby('Case ID')['Complete Timestamp'].transform(lambda x: x.max() - x)

In [466]:
df = df.dropna(subset=['Next_Activity'])

In [467]:
print(df[['Case ID', 'Activity', 'Complete Timestamp', 'Next_Activity', 'Total_Time_Remaining']].head())

  Case ID         Activity  Complete Timestamp     Next_Activity  \
0       A  ER Registration               941.0        Leucocytes   
1       A       Leucocytes              1620.0               CRP   
2       A              CRP              1620.0        LacticAcid   
3       A       LacticAcid              1620.0         ER Triage   
4       A        ER Triage              2017.0  ER Sepsis Triage   

   Total_Time_Remaining  
0                1099.0  
1                 420.0  
2                 420.0  
3                 420.0  
4                  23.0  


In [468]:
df = pd.get_dummies(df, columns=['Activity'], prefix='Activity')

In [469]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [470]:
# Features (X)
X = df.drop(columns=['Case ID', 'Complete Timestamp', 'Next_Activity', 'Total_Time_Remaining'])

# Target (y)
y_next_activity = df['Next_Activity']
y_total_time = df['Total_Time_Remaining']

In [471]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_next_activity_train, y_next_activity_test, y_total_time_train, y_total_time_test = train_test_split(
    X, y_next_activity, y_total_time, test_size=0.2, random_state=42
)

In [472]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Train the model
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X, y_next_activity)

# Predict on the test set
y_next_activity_pred = rf_classifier.predict(test_df.drop(columns=['Case ID', 'Complete Timestamp', 'Next_Activity', 'Total_Time_Remaining']))

# Evaluate the model
accuracy = accuracy_score(test_df['Next_Activity'], y_next_activity_pred)
print(f"Accuracy for Next Activity Prediction: {accuracy:.2f}")

Accuracy for Next Activity Prediction: 1.00


In [473]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Train the model
rf_regressor = RandomForestRegressor(random_state=42)
rf_regressor.fit(X, y_total_time)

# Predict on the test set
y_total_time_pred = rf_regressor.predict(test_df.drop(columns=['Case ID', 'Complete Timestamp', 'Next_Activity', 'Total_Time_Remaining']))

# Evaluate the model
mse = mean_squared_error(test_df['Total_Time_Remaining'], y_total_time_pred)
print(f"Mean Squared Error for Total Time Remaining Prediction: {mse:.2f}")

Mean Squared Error for Total Time Remaining Prediction: 78830.15
