In [95]:
import pandas as pd

df = pd.read_csv('./data/ED_triage.csv')
df

Unnamed: 0,triage_code,gender,age,admission_year,admission_month,admission_day,admission_weekday,admission_hour,kindref,ChiefComplaint,...,BlooddpressurSystol,BlooddpressurDiastol,PulseRate,RespiratoryRate,Temperature,O2Saturation,AVPU,TriageGrade,operational_patient,ref_specialist
0,13960101008,Female,77,2017,3,21,2,2,5,Z03.89,...,,,,,,,,5,0,0
1,13960101009,Male,42,2017,3,21,2,2,6,T07,...,,,86.0,18.0,,96.0,A,3,0,0
2,13960101010,Female,71,2017,3,21,2,2,6,R10.84,...,,,,,,,,2,0,0
3,13960101011,Male,77,2017,3,21,2,2,6,R53,...,,,,,,,,2,0,0
4,13960101012,Male,39,2017,3,21,2,2,6,T79.9,...,,,,,,,,4,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143577,14001229114,Female,55,2022,3,20,0,23,3,K92.2,...,,,,,,,,2,0,0
143578,14001229115,Female,60,2022,3,20,0,23,6,T18.9,...,,,,,,,A,4,1,0
143579,14001229117,Female,70,2022,3,20,0,23,6,R55,...,,,,,,,,1,0,0
143580,14001229118,Male,67,2022,3,20,0,23,6,K92.2,...,,,,,,,,2,0,0


In [96]:
# Import necessary packages
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [97]:
len(df.columns)

28

# inspect

In [98]:
# slice out unnecessary columns
df = df.drop(columns=["triage_code", "admission_year", "admission_month", "admission_day", "admission_weekday", "admission_hour"])

In [99]:
# inspect distribution of TriageGrade
df['TriageGrade'].value_counts(normalize=True).sort_index(ascending=True)

1    0.071847
2    0.564855
3    0.239334
4    0.123832
5    0.000132
Name: TriageGrade, dtype: float64

# test one, drop all columns with null values

In [100]:
missing_values = df.isnull().sum() # default value for the axis parameter in isnull() is 0, which means it operates column-wise.
columns_with_missing_values = missing_values[missing_values > 0].index

# print the columns with missing values and their counts
for column in columns_with_missing_values:
    print(f"{column}: {missing_values[column]} missing values")

# drop columns with missing values > 20%
df = df.drop(columns=missing_values[missing_values > 0.2 * len(df)].index)

# We remove the rows with missing values in the remaining columns
df = df.dropna()

# Check the distribution of TriageGrade again
df['TriageGrade'].value_counts(normalize=True).sort_index(ascending=True)

# It seems the missing values were due to NeedFastExecute being True, which resulted in the loss of most triage grade 1 patients


explainer_id: 109327 missing values
CriticalStatus: 10380 missing values
StuporStatus: 10380 missing values
PainGrade: 10316 missing values
MentalDistress: 10380 missing values
MaterialDistress: 10380 missing values
Source: 87549 missing values
BlooddpressurSystol: 118921 missing values
BlooddpressurDiastol: 118622 missing values
PulseRate: 108360 missing values
RespiratoryRate: 110143 missing values
Temperature: 143443 missing values
O2Saturation: 105923 missing values
AVPU: 90558 missing values


1    0.000008
2    0.608865
3    0.257962
4    0.133024
5    0.000143
Name: TriageGrade, dtype: float64

In [101]:
# Inspect how NeedFastExecute is distributed across TriageGrade
df.groupby('TriageGrade')['NeedFastExecute'].value_counts(normalize=True).unstack().fillna(0).sort_index(ascending=True)

# We can drop the NeedFastExecute column, as it is directly related to TriageGrade = 1. We can also drop rows with NeedFastExecute/TriageGrade = 1
df = df.drop(columns=['NeedFastExecute'])
df = df[df['TriageGrade'] != 1]

In [102]:
# One-hot encode the categorical variables
df = pd.get_dummies(df, columns=["gender", "ChiefComplaint"], drop_first=True)

# Check the new columns
df.columns

Index(['age', 'kindref', 'CriticalStatus', 'StuporStatus', 'PainGrade',
       'MentalDistress', 'MaterialDistress', 'TriageGrade',
       'operational_patient', 'ref_specialist',
       ...
       'ChiefComplaint_Z93.1', 'ChiefComplaint_Z93.2', 'ChiefComplaint_Z93.5',
       'ChiefComplaint_Z94', 'ChiefComplaint_Z94.0', 'ChiefComplaint_Z94.4',
       'ChiefComplaint_Z95.0', 'ChiefComplaint_Z95.5', 'ChiefComplaint_Z96.89',
       'ChiefComplaint_Z98.890'],
      dtype='object', length=897)

In [103]:
X = df.drop(['TriageGrade'], axis=1)
y = df['TriageGrade']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Decision Tree Classifier

In [104]:
def train_decision_tree(X_train, X_test, y_train, y_test, model):
    # Train the model
    model.fit(X_train, y_train)

    # Predict the target variable for the test set
    y_pred = model.predict(X_test)
    y_pred_train = model.predict(X_train)

    
    print("=====FOR TRAINING:=====")
    

    # Calculate accuracy of each class
    class_accuracy = classification_report(y_train, y_pred_train)
    print("Accuracy of each class:\n", class_accuracy)

    # Calculate accuracy of all classes
    accuracy = accuracy_score(y_train, y_pred_train)
    print("Accuracy of all classes:", accuracy)

    # Calculate macro F1 score
    macro_f1 = f1_score(y_train, y_pred_train, average='macro')
    print("Macro F1 score:", macro_f1)

    # Calculate micro F1 score
    micro_f1 = f1_score(y_train, y_pred_train, average='micro')
    print("Micro F1 score:", micro_f1)

    print("====FOR TESTING:====")

    # Calculate accuracy of each class
    class_accuracy = classification_report(y_test, y_pred)
    print("Accuracy of each class:\n", class_accuracy)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy of all classes:", accuracy)

    # Calculate macro F1 score
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    print("Macro F1 score:", macro_f1)

    # Calculate micro F1 score
    micro_f1 = f1_score(y_test, y_pred, average='micro')
    print("Micro F1 score:", micro_f1)

In [105]:
from sklearn.exceptions import UndefinedMetricWarning
import warnings

# Suppress the warning for zero division
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=UndefinedMetricWarning, module='sklearn.metrics._classification')

from sklearn.model_selection import train_test_split


model = DecisionTreeClassifier(min_samples_split=20, max_depth=10, min_samples_leaf=2)
# model = DecisionTreeClassifier()

train_decision_tree(X_train, X_test, y_train, y_test, model)


=====FOR TRAINING:=====
Accuracy of each class:
               precision    recall  f1-score   support

           2       1.00      1.00      1.00     64864
           3       0.95      0.88      0.91     27479
           4       0.80      0.92      0.85     14201
           5       0.00      0.00      0.00        16

    accuracy                           0.96    106560
   macro avg       0.69      0.70      0.69    106560
weighted avg       0.96      0.96      0.96    106560

Accuracy of all classes: 0.9568318318318318
Macro F1 score: 0.6915887320094003
Micro F1 score: 0.9568318318318318
====FOR TESTING:====
Accuracy of each class:
               precision    recall  f1-score   support

           2       1.00      1.00      1.00     16238
           3       0.94      0.86      0.90      6882
           4       0.77      0.90      0.83      3518
           5       0.00      0.00      0.00         3

    accuracy                           0.95     26641
   macro avg       0.68      0

# logistic regression

In [106]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score

def train_logistic_regression(X_train, X_test, y_train, y_test, model, n_components):
    # Feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # PCA
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    # Train the model
    model.fit(X_train_pca, y_train)

    # Predict the target variable for the test set
    y_pred = model.predict(X_test_pca)
    y_pred_train = model.predict(X_train_pca)

    # Predict probabilities for AUC calculation
    y_pred_proba = model.predict_proba(X_test_pca)
    y_pred_train_proba = model.predict_proba(X_train_pca)

    print("=====FOR TRAINING:=====")

    # Calculate accuracy of each class
    class_accuracy = classification_report(y_train, y_pred_train)
    print("Accuracy of each class:\n", class_accuracy)

    # Calculate accuracy of all classes
    accuracy = accuracy_score(y_train, y_pred_train)
    print("Accuracy of all classes:", accuracy)

    # Calculate macro F1 score
    macro_f1 = f1_score(y_train, y_pred_train, average='macro')
    print("Macro F1 score:", macro_f1)

    # Calculate micro F1 score
    micro_f1 = f1_score(y_train, y_pred_train, average='micro')
    print("Micro F1 score:", micro_f1)

    # Calculate AUC
    auc_train = roc_auc_score(y_train, y_pred_train_proba, multi_class='ovr')
    print("AUC:", auc_train)

    print("====FOR TESTING:====")

    # Calculate accuracy of each class
    class_accuracy = classification_report(y_test, y_pred)
    print("Accuracy of each class:\n", class_accuracy)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy of all classes:", accuracy)

    # Calculate macro F1 score
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    print("Macro F1 score:", macro_f1)

    # Calculate micro F1 score
    micro_f1 = f1_score(y_test, y_pred, average='micro')
    print("Micro F1 score:", micro_f1)

    # Calculate AUC
    auc_test = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
    print("AUC:", auc_test)

In [107]:
# Example usage
model = LogisticRegression(max_iter=300, random_state=42)
train_logistic_regression(X_train, X_test, y_train, y_test, model, n_components=10)

=====FOR TRAINING:=====
Accuracy of each class:
               precision    recall  f1-score   support

           2       0.85      0.92      0.88     64864
           3       0.63      0.49      0.55     27479
           4       0.77      0.80      0.79     14201
           5       0.00      0.00      0.00        16

    accuracy                           0.80    106560
   macro avg       0.56      0.55      0.56    106560
weighted avg       0.78      0.80      0.79    106560

Accuracy of all classes: 0.7950168918918918
Macro F1 score: 0.5561741476647923
Micro F1 score: 0.7950168918918918
AUC: 0.8945250200034696
====FOR TESTING:====
Accuracy of each class:
               precision    recall  f1-score   support

           2       0.85      0.92      0.88     16238
           3       0.64      0.49      0.56      6882
           4       0.78      0.81      0.79      3518
           5       0.00      0.00      0.00         3

    accuracy                           0.80     26641
   mac

# XGBoost

In [108]:
from sklearn.ensemble import RandomForestClassifier

def train_random_forest(X_train, X_test, y_train, y_test, model):
    # Feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train the model
    model.fit(X_train_scaled, y_train)

    # Predict the target variable for the test set
    y_pred = model.predict(X_test_scaled)
    y_pred_train = model.predict(X_train_scaled)

    # Predict probabilities for AUC calculation
    y_pred_proba = model.predict_proba(X_test_scaled)
    y_pred_train_proba = model.predict_proba(X_train_scaled)

    print("=====FOR TRAINING:=====")

    # Calculate accuracy of each class
    class_accuracy = classification_report(y_train, y_pred_train)
    print("Accuracy of each class:\n", class_accuracy)

    # Calculate accuracy of all classes
    accuracy = accuracy_score(y_train, y_pred_train)
    print("Accuracy of all classes:", accuracy)

    # Calculate macro F1 score
    macro_f1 = f1_score(y_train, y_pred_train, average='macro')
    print("Macro F1 score:", macro_f1)

    # Calculate micro F1 score
    micro_f1 = f1_score(y_train, y_pred_train, average='micro')
    print("Micro F1 score:", micro_f1)

    # Calculate AUC
    auc_train = roc_auc_score(y_train, y_pred_train_proba, multi_class='ovr')
    print("AUC:", auc_train)

    print("====FOR TESTING:====")

    # Calculate accuracy of each class
    class_accuracy = classification_report(y_test, y_pred)
    print("Accuracy of each class:\n", class_accuracy)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy of all classes:", accuracy)

    # Calculate macro F1 score
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    print("Macro F1 score:", macro_f1)

    # Calculate micro F1 score
    micro_f1 = f1_score(y_test, y_pred, average='micro')
    print("Micro F1 score:", micro_f1)

    # Calculate AUC
    auc_test = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
    print("AUC:", auc_test)



In [109]:
# model = RandomForestClassifier(random_state=42, max_depth=10, min_samples_leaf=2)
model = RandomForestClassifier()
train_random_forest(X_train, X_test, y_train, y_test, model)

=====FOR TRAINING:=====
Accuracy of each class:
               precision    recall  f1-score   support

           2       1.00      1.00      1.00     64864
           3       0.98      0.95      0.97     27479
           4       0.91      0.97      0.94     14201
           5       1.00      0.69      0.81        16

    accuracy                           0.98    106560
   macro avg       0.97      0.90      0.93    106560
weighted avg       0.98      0.98      0.98    106560

Accuracy of all classes: 0.9841028528528528
Macro F1 score: 0.9317081051547461
Micro F1 score: 0.9841028528528528
AUC: 0.9991222809505041
====FOR TESTING:====
Accuracy of each class:
               precision    recall  f1-score   support

           2       1.00      1.00      1.00     16238
           3       0.91      0.88      0.89      6882
           4       0.78      0.85      0.81      3518
           5       0.00      0.00      0.00         3

    accuracy                           0.95     26641
   mac

# use neural network to predict esi

In [110]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score, f1_score

def train_neural_network(X_train, X_test, y_train, y_test, model, n_components):
    # Feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # PCA
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    # Train the model
    model.fit(X_train_pca, y_train)

    # Predict the target variable for the test set
    y_pred = model.predict(X_test_pca)
    y_pred_train = model.predict(X_train_pca)

    print("=====FOR TRAINING:=====")

    # Calculate accuracy of each class
    class_accuracy = classification_report(y_train, y_pred_train)
    print("Accuracy of each class:\n", class_accuracy)

    # Calculate accuracy of all classes
    accuracy = accuracy_score(y_train, y_pred_train)
    print("Accuracy of all classes:", accuracy)

    # Calculate macro F1 score
    macro_f1 = f1_score(y_train, y_pred_train, average='macro')
    print("Macro F1 score:", macro_f1)

    # Calculate micro F1 score
    micro_f1 = f1_score(y_train, y_pred_train, average='micro')
    print("Micro F1 score:", micro_f1)

    print("====FOR TESTING:====")

    # Calculate accuracy of each class
    class_accuracy = classification_report(y_test, y_pred)
    print("Accuracy of each class:\n", class_accuracy)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy of all classes:", accuracy)

    # Calculate macro F1 score
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    print("Macro F1 score:", macro_f1)

    # Calculate micro F1 score
    micro_f1 = f1_score(y_test, y_pred, average='micro')
    print("Micro F1 score:", micro_f1)

In [111]:
# Example usage
model = MLPClassifier(hidden_layer_sizes=(64, 16), max_iter=100, random_state=42)
train_neural_network(X_train, X_test, y_train, y_test, model, n_components=500)

=====FOR TRAINING:=====
Accuracy of each class:
               precision    recall  f1-score   support

           2       1.00      1.00      1.00     64864
           3       0.95      0.87      0.91     27479
           4       0.80      0.93      0.86     14201
           5       0.00      0.00      0.00        16

    accuracy                           0.95    106560
   macro avg       0.69      0.70      0.69    106560
weighted avg       0.96      0.95      0.95    106560

Accuracy of all classes: 0.9542323573573573
Macro F1 score: 0.690890934100886
Micro F1 score: 0.9542323573573573
====FOR TESTING:====
Accuracy of each class:
               precision    recall  f1-score   support

           2       0.99      0.99      0.99     16238
           3       0.93      0.85      0.89      6882
           4       0.78      0.91      0.84      3518
           5       0.00      0.00      0.00         3

    accuracy                           0.94     26641
   macro avg       0.67      0.

# Inspect relationship between NeedFastExecute and TriageGrade

In [115]:
# Reinitialise data
df = pd.read_csv('./data/ED_triage.csv')

# inspect distribution of TriageGrade against NeedFastExecute
df.groupby('TriageGrade')['NeedFastExecute'].value_counts(normalize=True).unstack().fillna(0).sort_index(ascending=True)


NeedFastExecute,0,1,2
TriageGrade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,9.7e-05,0.999903,0.0
2,1.2e-05,0.0,0.999988
3,8.7e-05,0.0,0.999913
4,0.003431,0.0,0.996569
5,0.0,0.0,1.0
