In [40]:
import pandas as pd

# Load the ED triage export into a DataFrame (path is relative to the notebook's
# working directory).
df = pd.read_csv('./data/ED_triage.csv')
# Bare last expression: the notebook renders the frame as the cell output.
df

Unnamed: 0,triage_code,gender,age,admission_year,admission_month,admission_day,admission_weekday,admission_hour,kindref,ChiefComplaint,...,BlooddpressurSystol,BlooddpressurDiastol,PulseRate,RespiratoryRate,Temperature,O2Saturation,AVPU,TriageGrade,operational_patient,ref_specialist
0,13960101008,Female,77,2017,3,21,2,2,5,Z03.89,...,,,,,,,,5,0,0
1,13960101009,Male,42,2017,3,21,2,2,6,T07,...,,,86.0,18.0,,96.0,A,3,0,0
2,13960101010,Female,71,2017,3,21,2,2,6,R10.84,...,,,,,,,,2,0,0
3,13960101011,Male,77,2017,3,21,2,2,6,R53,...,,,,,,,,2,0,0
4,13960101012,Male,39,2017,3,21,2,2,6,T79.9,...,,,,,,,,4,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143577,14001229114,Female,55,2022,3,20,0,23,3,K92.2,...,,,,,,,,2,0,0
143578,14001229115,Female,60,2022,3,20,0,23,6,T18.9,...,,,,,,,A,4,1,0
143579,14001229117,Female,70,2022,3,20,0,23,6,R55,...,,,,,,,,1,0,0
143580,14001229118,Male,67,2022,3,20,0,23,6,K92.2,...,,,,,,,,2,0,0


In [41]:
# Import necessary packages
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import torch
import numpy as np

In [42]:
# Number of columns in the freshly loaded frame (shape is (rows, columns)).
df.shape[1]

28

In [43]:
# Remove triage_code and the admission date/time breakdown columns; they are
# not carried forward into the feature set.
df = df.drop(
    columns=[
        "triage_code",
        "admission_year",
        "admission_month",
        "admission_day",
        "admission_weekday",
        "admission_hour",
    ]
)

In [44]:
# Relative frequency of each TriageGrade, listed in grade order.
(
    df["TriageGrade"]
    .value_counts(normalize=True)
    .sort_index()  # ascending is the default
)

TriageGrade
1    0.071847
2    0.564855
3    0.239334
4    0.123832
5    0.000132
Name: proportion, dtype: float64

In [45]:
# Per-column NaN counts: isna() is the same method as isnull(), and summing the
# boolean frame adds down the rows, giving one count per column.
missing_values = df.isna().sum()
columns_with_missing_values = missing_values.loc[missing_values.gt(0)].index

# Report every column that has at least one missing value.
for column in columns_with_missing_values:
    print(f"{column}: {missing_values[column]} missing values")

# Keep only the columns whose missing count does not exceed 20% of the rows
# (equivalent to dropping the columns that are more than 20% missing).
df = df.loc[:, missing_values <= 0.2 * len(df)]

# Discard every row that still contains a missing value in the kept columns.
# NOTE(review): per the outputs of this notebook, grade 1 shrinks from ~7.2% of
# rows to ~0.0008% after this step, i.e. almost all grade-1 rows are removed here.
df = df.dropna(axis=0, how="any")

# Re-check the TriageGrade distribution after the clean-up.
df["TriageGrade"].value_counts(normalize=True).sort_index()

explainer_id: 109327 missing values
CriticalStatus: 10380 missing values
StuporStatus: 10380 missing values
PainGrade: 10316 missing values
MentalDistress: 10380 missing values
MaterialDistress: 10380 missing values
Source: 87549 missing values
BlooddpressurSystol: 118921 missing values
BlooddpressurDiastol: 118622 missing values
PulseRate: 108360 missing values
RespiratoryRate: 110143 missing values
Temperature: 143443 missing values
O2Saturation: 105923 missing values
AVPU: 90558 missing values


TriageGrade
1    0.000008
2    0.608865
3    0.257962
4    0.133024
5    0.000143
Name: proportion, dtype: float64

In [46]:
# Inspect how NeedFastExecute is distributed across TriageGrade (shares are
# normalised within each grade). The table is stored in a variable because a
# bare expression in the middle of a cell is evaluated and then discarded, so
# it would never be shown; it is returned as the cell's last expression below.
fast_execute_by_grade = (
    df.groupby('TriageGrade')['NeedFastExecute']
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)  # grade/value combinations that never occur get a share of 0
    .sort_index(ascending=True)
)

# We can drop the NeedFastExecute column, as it is directly related to
# TriageGrade = 1. We can also drop rows with NeedFastExecute/TriageGrade = 1.
# NOTE: this cell is not re-runnable on the same kernel state, because the
# column no longer exists after the first run.
df = df.drop(columns=['NeedFastExecute'])
df = df[df['TriageGrade'] != 1]

# Last expression of the cell -> rendered as the cell output.
fast_execute_by_grade

In [47]:
# One-hot encode the two categorical columns. drop_first=True omits the first
# level of each encoded column, so that level is represented by all-zero dummies.
df = pd.get_dummies(
    df,
    columns=["gender", "ChiefComplaint"],
    drop_first=True,
)

# Show the resulting column index (original columns followed by the dummies).
df.columns

Index(['age', 'kindref', 'CriticalStatus', 'StuporStatus', 'PainGrade',
       'MentalDistress', 'MaterialDistress', 'TriageGrade',
       'operational_patient', 'ref_specialist',
       ...
       'ChiefComplaint_Z93.1', 'ChiefComplaint_Z93.2', 'ChiefComplaint_Z93.5',
       'ChiefComplaint_Z94', 'ChiefComplaint_Z94.0', 'ChiefComplaint_Z94.4',
       'ChiefComplaint_Z95.0', 'ChiefComplaint_Z95.5', 'ChiefComplaint_Z96.89',
       'ChiefComplaint_Z98.890'],
      dtype='object', length=897)

In [None]:
# Convert the boolean dummy columns to 0/1 integers.
# The previous element-wise lambda (`0 if x is False else 1`) mapped EVERY value
# that is not the object `False` to 1, so numeric columns such as age, kindref
# and the target TriageGrade were all overwritten with the constant 1.
# Casting only the bool-typed columns leaves the numeric columns untouched.
bool_columns = df.select_dtypes(include="bool").columns
df = df.astype({name: "int64" for name in bool_columns})  # False -> 0, True -> 1

# Drop any remaining non-numerical columns.
df = df.select_dtypes(include=[np.number])
df

In [None]:
# Feature matrix: every column except the target, as a float32 NumPy array.
X = df.drop(columns=['TriageGrade']).to_numpy().astype(np.float32)
# Target vector: the triage grade of each row.
y = df['TriageGrade'].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Features become float64 tensors directly (same values as building float32
# tensors and then calling .double()); labels become int64 tensors.
X_train_tensor = torch.tensor(X_train, dtype=torch.float64)
X_test_tensor = torch.tensor(X_test, dtype=torch.float64)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

print("Training set shape:", X_train_tensor.shape, y_train_tensor.shape)
print("Test set shape:", X_test_tensor.shape, y_test_tensor.shape)

Training set shape: torch.Size([106560, 896]) torch.Size([106560])
Test set shape: torch.Size([26641, 896]) torch.Size([26641])


## KAN (Kolmogorov-Arnold Network) model

In [None]:
from kan import *

# Make float64 the default dtype for newly created floating-point tensors.
torch.set_default_dtype(torch.float64)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

# Create a KAN with layer widths [282, 5, 1]: 282 inputs, 5 hidden neurons and
# a 1D output. Cubic splines (k=3), 3 grid intervals (grid=3).
# NOTE(review): the training tensors printed earlier in this notebook have 896
# features, which does not match the input width of 282 -- confirm which
# feature set this model is meant to consume.
model = KAN(width=[282, 5, 1], grid=3, k=3, seed=42, device=device)

cpu
checkpoint directory created: ./model
saving model version 0.0


In [None]:
# Keep the columns at positions 1, 2 and 17..297 (283 columns in total).
# NOTE(review): the selection is positional, so it depends on the current column
# order, and re-running this cell slices the already-sliced frame again.
df = df.iloc[:, np.r_[1:3, 17:298]]
df

Unnamed: 0,kindref,CriticalStatus,ChiefComplaint_0M910ZZ,ChiefComplaint_0W9840Z,ChiefComplaint_0W9G3ZX,ChiefComplaint_0WP830Z,ChiefComplaint_2W3QX1Z,ChiefComplaint_2W5TX2Z,ChiefComplaint_2Y51X5Z,ChiefComplaint_30233K1,...,ChiefComplaint_J45,ChiefComplaint_J45.909,ChiefComplaint_J66.8,ChiefComplaint_J70.5,ChiefComplaint_J81.0,ChiefComplaint_J85.2,ChiefComplaint_J86.0,ChiefComplaint_J90,ChiefComplaint_J93.11,ChiefComplaint_J93.9
0,5,0.0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,6,0.0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,6,1.0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,6,1.0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,6,0.0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143576,6,1.0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
143577,3,1.0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
143578,6,0.0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
143580,6,1.0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Wrap the training tensors into the dataset dict used for KAN training.
# NOTE(review): the output below reports 85248 training rows although 106560
# rows are passed in, so the helper appears to hold part of them out -- confirm
# against the pykan documentation.
dataset = create_dataset_from_data(X_train_tensor, y_train_tensor, device=device)

# Shapes of the training inputs and training labels.
tuple(dataset[split_key].shape for split_key in ('train_input', 'train_label'))

(torch.Size([85248, 896]), torch.Size([85248]))