In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score

# Sample dataset
# Each sequence leads to a different next activity
sequence_data = [
    ("ER Registration -> ER Triage", [1, 1, 1, 1, 1, 0, 0, 1, 50, 1], {"Leucocytes": "Elevated"}, "Leucocytes"),
    ("ER Registration -> ER Triage -> Leucocytes", [1, 1, 1, 1, 1, 0, 0, 1, 50, 1], {"Leucocytes": "Elevated", "CRP": "Severe"}, "CRP"),
    ("ER Registration -> ER Triage -> Leucocytes -> CRP", [1, 1, 1, 1, 1, 0, 0, 1, 50, 1], {"Leucocytes": "Elevated", "CRP": "Severe", "LacticAcid": "High"}, "LacticAcid"),
]

# Convert to DataFrame
df = pd.DataFrame(sequence_data, columns=["sequence", "features", "biomarkers", "next_activity"])

# Encode sequences
sequence_encoder = LabelEncoder()
df["sequence_encoded"] = sequence_encoder.fit_transform(df["sequence"])

# Encode biomarker values
biomarker_features = ["Leucocytes", "CRP", "LacticAcid"]
biomarker_df = pd.DataFrame([{key: value for key, value in row.items()} for row in df["biomarkers"]], columns=biomarker_features).fillna("Normal")
biomarker_df = biomarker_df.apply(LabelEncoder().fit_transform)  # Convert categorical to numeric

df = pd.concat([df, biomarker_df], axis=1)

# Extract features and target
X = np.hstack([df[["sequence_encoded"]].values, np.vstack(df["features"].values), biomarker_df.values])
y = df["next_activity"]

# Encode target variable
y_encoder = LabelEncoder()
y = y_encoder.fit_transform(y)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Function to predict next activity
def predict_next_activity(sequence, features, biomarkers):
    seq_encoded = sequence_encoder.transform([sequence])[0]
    bio_encoded = [biomarker_df[col].unique().tolist().index(biomarkers.get(col, "Normal")) for col in biomarker_features]
    input_features = np.hstack([[seq_encoded], features, bio_encoded])
    prediction = model.predict([input_features])
    return y_encoder.inverse_transform(prediction)[0]

# Example prediction
new_sequence = "ER Registration -> ER Triage -> Leucocytes"
new_features = [1, 1, 1, 1, 1, 0, 0, 1, 50, 1]
new_biomarkers = {"Leucocytes": "Elevated", "CRP": "Severe"}
next_activity = predict_next_activity(new_sequence, new_features, new_biomarkers)
print(f"Predicted Next Activity: {next_activity}")


Model Accuracy: 0.00%


ValueError: 'Elevated' is not in list