In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

# Load and clean df
df = pd.read_csv("Sepsis_Cases_Log.csv")

In [54]:
# Drop unnecessary columns
columns_to_drop = ['org:group', 'Variant', 'Variant index', 'Diagnose', 'lifecycle:transition']
df = df.drop(columns=columns_to_drop)

# Convert 'Complete Timestamp' to total seconds
def time_to_seconds(time_str):
    """Convert HH:MM.S format to total seconds."""
    try:
        parts = time_str.split(':')
        minutes = int(parts[0])
        seconds = float(parts[1])
        return minutes * 60 + seconds
    except:
        return None  # Handle potential errors

df['Complete Timestamp'] = df['Complete Timestamp'].apply(time_to_seconds)

# Display the updated dataframe structure
df.head()


Unnamed: 0,Case ID,Activity,Complete Timestamp,InfectionSuspected,DiagnosticBlood,DisfuncOrg,SIRSCritTachypnea,Hypotensie,SIRSCritHeartRate,Infusion,...,DiagnosticUrinaryCulture,SIRSCritLeucos,Oligurie,DiagnosticLacticAcid,Hypoxie,DiagnosticUrinarySediment,DiagnosticECG,Leucocytes,CRP,LacticAcid
0,A,ER Registration,941.0,True,True,True,True,True,True,True,...,True,False,False,True,False,True,True,,,
1,A,Leucocytes,1620.0,,,,,,,,...,,,,,,,,9.6,,
2,A,CRP,1620.0,,,,,,,,...,,,,,,,,,21.0,
3,A,LacticAcid,1620.0,,,,,,,,...,,,,,,,,,,2.2
4,A,ER Triage,2017.0,,,,,,,,...,,,,,,,,,,


In [55]:
# Encode activities as numerical labels
activity_mapping = {activity: idx for idx, activity in enumerate(df['Activity'].unique())}
df['Activity ID'] = df['Activity'].map(activity_mapping)

# Create the 'Next Activity' column (shifted activity within the same Case ID)
df['Next Activity'] = df.groupby('Case ID')['Activity'].shift(-1)
df['Next Activity ID'] = df['Next Activity'].map(activity_mapping)

# Create the 'Time to End' feature (remaining time until the last activity of the case)
df['Time to End'] = df.groupby('Case ID')['Complete Timestamp'].transform('max') - df['Complete Timestamp']

# Drop rows where 'Next Activity' is NaN (last step in each case has no next activity)
df = df.dropna(subset=['Next Activity'])

df.head()


Unnamed: 0,Case ID,Activity,Complete Timestamp,InfectionSuspected,DiagnosticBlood,DisfuncOrg,SIRSCritTachypnea,Hypotensie,SIRSCritHeartRate,Infusion,...,Hypoxie,DiagnosticUrinarySediment,DiagnosticECG,Leucocytes,CRP,LacticAcid,Activity ID,Next Activity,Next Activity ID,Time to End
0,A,ER Registration,941.0,True,True,True,True,True,True,True,...,False,True,True,,,,0,Leucocytes,1.0,1099.0
1,A,Leucocytes,1620.0,,,,,,,,...,,,,9.6,,,1,CRP,2.0,420.0
2,A,CRP,1620.0,,,,,,,,...,,,,,21.0,,2,LacticAcid,3.0,420.0
3,A,LacticAcid,1620.0,,,,,,,,...,,,,,,2.2,3,ER Triage,4.0,420.0
4,A,ER Triage,2017.0,,,,,,,,...,,,,,,,4,ER Sepsis Triage,5.0,23.0


In [56]:
# Step 1: Remove cases where the first activity is not 'ER Registration'
# Identify the first activity for each case
first_activities = df.groupby('Case ID').first()['Activity']
valid_cases = first_activities[first_activities == 'ER Registration'].index

# Keep only valid cases
df = df[df['Case ID'].isin(valid_cases)]

# Step 2: Forward fill missing values within each 'Case ID'
df = df.groupby('Case ID').apply(lambda group: group.ffill()).reset_index(drop=True)

df.head()

  df = df.groupby('Case ID').apply(lambda group: group.ffill()).reset_index(drop=True)
  df = df.groupby('Case ID').apply(lambda group: group.ffill()).reset_index(drop=True)


Unnamed: 0,Case ID,Activity,Complete Timestamp,InfectionSuspected,DiagnosticBlood,DisfuncOrg,SIRSCritTachypnea,Hypotensie,SIRSCritHeartRate,Infusion,...,Hypoxie,DiagnosticUrinarySediment,DiagnosticECG,Leucocytes,CRP,LacticAcid,Activity ID,Next Activity,Next Activity ID,Time to End
0,A,ER Registration,941.0,True,True,True,True,True,True,True,...,False,True,True,,,,0,Leucocytes,1.0,1099.0
1,A,Leucocytes,1620.0,True,True,True,True,True,True,True,...,False,True,True,9.6,,,1,CRP,2.0,420.0
2,A,CRP,1620.0,True,True,True,True,True,True,True,...,False,True,True,9.6,21.0,,2,LacticAcid,3.0,420.0
3,A,LacticAcid,1620.0,True,True,True,True,True,True,True,...,False,True,True,9.6,21.0,2.2,3,ER Triage,4.0,420.0
4,A,ER Triage,2017.0,True,True,True,True,True,True,True,...,False,True,True,9.6,21.0,2.2,4,ER Sepsis Triage,5.0,23.0


In [57]:
df.to_csv('Sepsis_Cases_Log_cleaned.csv', index=False)

In [58]:
# Fill missing values in Leucocytes, CRP, LacticAcid with zero
for col in ['Leucocytes', 'CRP', 'LacticAcid']:
    df[col] = df[col].fillna(0)

# Verify that there are no more missing values
missing_values_after = df.isnull().sum()

In [59]:
missing_values_after

Case ID                      0
Activity                     0
Complete Timestamp           0
InfectionSuspected           0
DiagnosticBlood              0
DisfuncOrg                   0
SIRSCritTachypnea            0
Hypotensie                   0
SIRSCritHeartRate            0
Infusion                     0
DiagnosticArtAstrup          0
Age                          0
DiagnosticIC                 0
DiagnosticSputum             0
DiagnosticLiquor             0
DiagnosticOther              0
SIRSCriteria2OrMore          0
DiagnosticXthorax            0
SIRSCritTemperature          0
DiagnosticUrinaryCulture     0
SIRSCritLeucos               0
Oligurie                     0
DiagnosticLacticAcid         0
Hypoxie                      0
DiagnosticUrinarySediment    0
DiagnosticECG                0
Leucocytes                   0
CRP                          0
LacticAcid                   0
Activity ID                  0
Next Activity                0
Next Activity ID             0
Time to 

In [61]:
df_filtered =df

In [62]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA

# Step 1: Remove low-variance features
num_cols = df_filtered.select_dtypes(include=[np.number]).columns.tolist()
num_cols.remove("Activity ID")
num_cols.remove("Next Activity ID")
num_cols.remove("Time to End")

var_thresh = VarianceThreshold(threshold=0.01)
selected_data = var_thresh.fit_transform(df_filtered[num_cols])
selected_columns = [col for col, keep in zip(num_cols, var_thresh.get_support()) if keep]

# Step 2: Group similar features
df_filtered["Blood_Test_Sum"] = df_filtered["Leucocytes"] + df_filtered["CRP"] + df_filtered["LacticAcid"]

diagnostic_features = ["DiagnosticBlood", "DiagnosticUrinaryCulture", "DiagnosticLacticAcid",
                       "DiagnosticUrinarySediment", "DiagnosticECG"]
df_filtered["Diagnostic_Sum"] = df_filtered[diagnostic_features].sum(axis=1)

# Step 3: Apply PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # Keep 95% variance
pca_features = pca.fit_transform(df_filtered[selected_columns])
pca_columns = [f'PCA_{i}' for i in range(pca_features.shape[1])]
df_filtered[pca_columns] = pca_features

# Drop original high-dimensional features
df_filtered.drop(columns=selected_columns, errors="ignore", inplace=True)


In [63]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical columns
label_encoders = {}
categorical_columns = ["Activity", "Next Activity"]

for col in categorical_columns:
    if col in df_filtered.columns:
        le = LabelEncoder()
        df_filtered[col] = le.fit_transform(df_filtered[col])
        label_encoders[col] = le  # Store encoder for inverse transformation


In [64]:
from sklearn.model_selection import train_test_split

# Define Features (X) and Targets (Y)
X = df_filtered.drop(columns=["Next Activity ID", "Time to End"])  # Features
y_classification = df_filtered["Next Activity ID"].astype(int)  # Target for Next Activity Prediction
y_regression = df_filtered["Time to End"].astype(float)  # Target for Time to End Prediction

# Split the dataset (90% Train, 10% Test)
X_train, X_test, y_train_class, y_test_class = train_test_split(X, y_classification, test_size=0.1, random_state=42)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X, y_regression, test_size=0.1, random_state=42)


In [66]:
# Identify non-numeric columns
non_numeric_columns = X_train.select_dtypes(exclude=[np.number]).columns.tolist()

# Display columns that contain non-numeric values
print("Non-Numeric Columns:", non_numeric_columns)


Non-Numeric Columns: ['Case ID', 'InfectionSuspected', 'DiagnosticBlood', 'DisfuncOrg', 'SIRSCritTachypnea', 'Hypotensie', 'SIRSCritHeartRate', 'Infusion', 'DiagnosticArtAstrup', 'DiagnosticIC', 'DiagnosticSputum', 'DiagnosticLiquor', 'DiagnosticOther', 'SIRSCriteria2OrMore', 'DiagnosticXthorax', 'SIRSCritTemperature', 'DiagnosticUrinaryCulture', 'SIRSCritLeucos', 'Oligurie', 'DiagnosticLacticAcid', 'Hypoxie', 'DiagnosticUrinarySediment', 'DiagnosticECG']


In [67]:
from sklearn.preprocessing import LabelEncoder

# Encode remaining categorical columns
for col in non_numeric_columns:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.transform(X_test[col])  # Apply the same transformation

print("Categorical Columns Encoded Successfully!")


Categorical Columns Encoded Successfully!


In [68]:
# Convert entire dataset to numeric
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')

# Drop any remaining NaN values (if necessary)
X_train = X_train.dropna()
X_test = X_test.dropna()

# Check if the issue is resolved
print("Final Data Types in X_train:\n", X_train.dtypes)


Final Data Types in X_train:
 Case ID                        int64
Activity                       int64
InfectionSuspected             int64
DiagnosticBlood                int64
DisfuncOrg                     int64
SIRSCritTachypnea              int64
Hypotensie                     int64
SIRSCritHeartRate              int64
Infusion                       int64
DiagnosticArtAstrup            int64
DiagnosticIC                   int64
DiagnosticSputum               int64
DiagnosticLiquor               int64
DiagnosticOther                int64
SIRSCriteria2OrMore            int64
DiagnosticXthorax              int64
SIRSCritTemperature            int64
DiagnosticUrinaryCulture       int64
SIRSCritLeucos                 int64
Oligurie                       int64
DiagnosticLacticAcid           int64
Hypoxie                        int64
DiagnosticUrinarySediment      int64
DiagnosticECG                  int64
Activity ID                    int64
Next Activity                  int64
Blood_Te

In [69]:
# Train the classification model again
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train_class)
y_pred_class = rf_classifier.predict(X_test)

print("Model Training Successful!")


Model Training Successful!


In [70]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Train the classification model
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train_class)
y_pred_class = rf_classifier.predict(X_test)

# Train the regression model
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train_reg, y_train_reg)
y_pred_reg = rf_regressor.predict(X_test_reg)


ValueError: could not convert string to float: 'VIA'

In [71]:
from sklearn.metrics import accuracy_score, mean_absolute_error

# Classification Accuracy
classification_accuracy = accuracy_score(y_test_class, y_pred_class)

# Regression Mean Absolute Error
regression_mae = mean_absolute_error(y_test_reg, y_pred_reg)

# Print Results
print(f"Classification Accuracy: {classification_accuracy * 100:.2f}%")
print(f"Regression Mean Absolute Error: {regression_mae:.2f} seconds")


NameError: name 'y_pred_reg' is not defined