In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset path used later in this notebook
# lives under that mount point.
drive.mount('/content/drive')

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import random
import xgboost as xgb
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt

# Seed every random number generator the notebook uses so runs are repeatable.
SEED = 42
for _seed_fn in (np.random.seed, torch.manual_seed, random.seed):
    _seed_fn(SEED)
if torch.cuda.is_available():
    # Seeds the generators of all visible CUDA devices.
    torch.cuda.manual_seed_all(SEED)

import seaborn as sns

In [None]:
# Read the dataset CSV from the mounted Google Drive into a DataFrame.
DATASET_PATH = '/content/drive/MyDrive/thesis/Dataset.csv'
data = pd.read_csv(DATASET_PATH)

In [None]:
# Column groups by dtype. Later cells reuse these names, so they still list every
# numeric / non-numeric column of the frame (the target included, whichever group it is in).
numeric_cols = data.select_dtypes(include=['number']).columns
categorical_cols = data.select_dtypes(exclude=['number']).columns

# A row without a label cannot be used for supervised training. Drop such rows rather than
# letting the imputers below invent a 'disposition' value (column mean or mode) for them.
target_col = 'disposition'
if target_col in data.columns:
    data = data.dropna(subset=[target_col]).reset_index(drop=True)

# Only feature columns are imputed; the target is left exactly as loaded.
numeric_feature_cols = numeric_cols.drop(target_col, errors='ignore')
categorical_feature_cols = categorical_cols.drop(target_col, errors='ignore')

# NOTE: both imputers are fitted on the full dataset, i.e. before the train/validation/test
# split performed later in the notebook, so held-out rows contribute to the fill values.

# Numeric features: fill missing values with the column mean.
imputer_numeric = SimpleImputer(strategy='mean')
if len(numeric_feature_cols) > 0:
    data[numeric_feature_cols] = imputer_numeric.fit_transform(data[numeric_feature_cols])

# Non-numeric features: fill missing values with the most frequent value of the column.
imputer_categorical = SimpleImputer(strategy='most_frequent')
if len(categorical_feature_cols) > 0:
    data[categorical_feature_cols] = imputer_categorical.fit_transform(data[categorical_feature_cols])

In [None]:
# Keep only the date part of 'charttime': the text before the first space.
# Taking the first piece of a single split works for any number of spaces in a value,
# whereas expanding the split into exactly two columns ('Date', 'Time') raises when the
# values contain no space at all or more than one.
data['Date'] = data['charttime'].str.split(' ', n=1).str[0]

# The raw timestamp is no longer needed once the date has been extracted.
data = data.drop(columns=['charttime'])

In [None]:
# Target: the 'disposition' column.
y = data['disposition']

# Features: every numeric column, minus the target in case it is numeric itself.
X = data.loc[:, numeric_cols].drop(columns=['disposition'], errors='ignore')

In [None]:
# Standardize the numeric feature columns held in X.
# X.columns is used rather than numeric_cols: numeric_cols can still contain the target
# column, which was dropped from X, and indexing X with it would raise a KeyError.
scaler = StandardScaler()
feature_cols = list(X.columns)
X[feature_cols] = scaler.fit_transform(X[feature_cols])

# Encode the target as integer class ids unless it is already numeric. Testing for
# "not numeric" also covers string dtypes other than 'object' and the 'category' dtype.
if not pd.api.types.is_numeric_dtype(y):
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

# Convert to tensors. Going through NumPy arrays makes the conversion positional, so it
# does not depend on the pandas index of X or y.
X_num_tensor = torch.tensor(X.to_numpy(), dtype=torch.float32)
y_tensor = torch.tensor(np.asarray(y), dtype=torch.long)

print(X_num_tensor.shape)
print(y_tensor.shape)

In [None]:
# Text/categorical columns that are tokenized and fed to the embedding branch.
categorical_columns = [
    "gender",
    "race",
    "Date",
    "arrival_transport",
    "rhythm",
    "icd_code",
    "chiefcomplaint",
    "icd_title",
    "name",
    "etcdescription",
]

In [None]:
# Build one vocabulary per categorical column and encode every cell as a list of token ids.
#   word_to_index_map: column name -> {token: id}
#   encoded_data:      one entry per column (same order as categorical_columns), each a
#                      list with one list of token ids per row
word_to_index_map = {}
encoded_data = []

for column in categorical_columns:
    # Lower-case and whitespace-tokenize each cell once. str() keeps non-string cells
    # (numbers, NaN) from crashing on .lower(); they are tokenized by their text form.
    tokenized_column = [str(value).lower().split() for value in data[column].tolist()]

    # Sort the vocabulary so token ids are identical on every run. Iterating a bare set
    # of strings yields an order that can change between interpreter sessions, which
    # would silently change the meaning of every id (and of any saved model weights).
    vocabulary = sorted({token for tokens in tokenized_column for token in tokens})
    word_to_index = {word: i for i, word in enumerate(vocabulary)}
    word_to_index_map[column] = word_to_index

    # NOTE: ids start at 0, and the padding cell further down also pads with 0, so the
    # first token of each vocabulary is indistinguishable from padding.
    encoded_data.append(
        [[word_to_index[token] for token in tokens] for tokens in tokenized_column]
    )

In [None]:
# Longest token sequence per column; default=0 covers a column with no rows.
max_lens = [max((len(sentence) for sentence in column), default=0) for column in encoded_data]
print(max_lens)

# Right-pad every sequence with 0 up to its column's maximum length and convert each
# column to an integer tensor. The dtype is explicit so a column made only of empty
# sequences still yields an integer tensor.
padded_data = [
    torch.tensor(
        [sentence + [0] * (max_len - len(sentence)) for sentence in column],
        dtype=torch.long,
    )
    for column, max_len in zip(encoded_data, max_lens)
]

# Report shapes instead of printing every padded row, which floods the notebook output
# with the entire encoded dataset.
for column_name, column_tensor in zip(categorical_columns, padded_data):
    print(f"{column_name}: padded shape {tuple(column_tensor.shape)}")

# Vocabulary sizes for two of the columns (the full dictionaries are in word_to_index_map).
print("chiefcomplaint vocabulary size:", len(word_to_index_map['chiefcomplaint']))
print("icd_code vocabulary size:", len(word_to_index_map['icd_code']))

In [None]:
# Total number of distinct tokens over all categorical columns. The model cell uses this
# as the size of its embedding table.
input_dim = sum(len(word_to_index) for word_to_index in word_to_index_map.values())
# NOTE: the model-construction cell passes embedding_dim=64 explicitly and does not read
# this value.
embedding_dim = 10  # Embedding dimension

# Encode the target column as integer class ids.
label_encoder = LabelEncoder()
encoded_targets = label_encoder.fit_transform(data["disposition"])

# Number of classes taken from the data rather than hard-coded.
num_classes = len(label_encoder.classes_)

# Long tensor, as expected by classification losses.
targets = torch.tensor(encoded_targets, dtype=torch.long)

# Concatenate the padded columns side by side: (rows, sum of padded lengths).
X_col = torch.cat(padded_data, dim=1)
assert X_col.shape[0] == targets.shape[0], "categorical features and targets differ in row count"
print(X_col[0].shape)

In [None]:
# Categorical token ids as a NumPy array.
X_categorical = X_col.numpy()

# Numerical features as a 2-D NumPy array; a 1-D tensor becomes a single column.
X_numerical = X_num_tensor.numpy()
if X_num_tensor.dim() <= 1:
    X_numerical = X_numerical.reshape(-1, 1)

# Put numerical and categorical features side by side so both are split with the same rows.
n_numeric = X_numerical.shape[1]
X_combined = np.concatenate((X_numerical, X_categorical), axis=1)

# 80/20 stratified split into train+validation and test ...
X_train_val, X_test_nn, y_train_val, y_test_nn = train_test_split(
    X_combined, y_tensor, test_size=0.2, random_state=42, stratify=y_tensor
)
# ... then 75/25 stratified split of the first part into train and validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val
)


def _split_to_tensors(features, labels):
    """Cut a combined feature matrix back into (numerical, categorical, label) tensors."""
    numerical = torch.tensor(features[:, :n_numeric], dtype=torch.float32)
    categorical = torch.tensor(features[:, n_numeric:], dtype=torch.long)
    return numerical, categorical, torch.tensor(labels.numpy(), dtype=torch.long)


X_train_numerical, X_train_categorical, y_train = _split_to_tensors(X_train, y_train)
X_val_numerical, X_val_categorical, y_val = _split_to_tensors(X_val, y_val)
X_test_nn_numerical, X_test_nn_categorical, y_test_nn = _split_to_tensors(X_test_nn, y_test_nn)

# Report the shapes of the three partitions.
print("Train data shapes:", X_train_numerical.shape, X_train_categorical.shape, y_train.shape)
print("Validation data shapes:", X_val_numerical.shape, X_val_categorical.shape, y_val.shape)
print("Test data shapes:", X_test_nn_numerical.shape, X_test_nn_categorical.shape, y_test_nn.shape)


In [None]:
# Sanity check on the feature arrays built above.
print("Type of X_categorical:", type(X_categorical))

# Make sure X_categorical is a NumPy array before reading its shape.
X_categorical = X_categorical if isinstance(X_categorical, np.ndarray) else np.array(X_categorical)
print("X_categorical shape:", X_categorical.shape)

for caption, array in (("X_numerical shape:", X_numerical), ("X_categorical shape:", X_categorical)):
    print(caption, array.shape)

In [None]:
# Use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')

batch_size = 64
# Pinned host memory is only requested when CUDA is available.
use_pinned_memory = torch.cuda.is_available()

# One dataset per partition: (numerical features, categorical token ids, labels).
train_dataset = TensorDataset(X_train_numerical, X_train_categorical, y_train)
val_dataset = TensorDataset(X_val_numerical, X_val_categorical, y_val)
test_dataset = TensorDataset(X_test_nn_numerical, X_test_nn_categorical, y_test_nn)


def _build_loader(dataset, shuffle):
    """Wrap a dataset in a DataLoader using the notebook-wide batch settings."""
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, pin_memory=use_pinned_memory)


# Only the training data is shuffled; validation and test keep their row order.
train_loader = _build_loader(train_dataset, shuffle=True)
val_loader = _build_loader(val_dataset, shuffle=False)
test_loader = _build_loader(test_dataset, shuffle=False)

# Define your model
class Model(nn.Module):
    """Two-branch classifier over numerical features and categorical token ids.

    The categorical branch embeds every token id, flattens the embeddings and reduces them
    to 16 units through two linear layers. The numerical branch maps the numerical
    features to 16 units with one linear layer. The two 16-unit vectors are added
    element-wise and a final linear layer produces one logit per class.

    Args:
        n_features: number of numerical input features per row.
        cat_features: number of categorical token ids per row.
        embedding_dim: size of each token embedding.
        n_classes: number of output logits.
        vocab_size: number of rows in the embedding table. When omitted, the
            module-level ``input_dim`` is used; it is looked up when the model is
            instantiated rather than when the class is defined.
    """

    def __init__(self, n_features=14, cat_features=77, embedding_dim=64, n_classes=8, vocab_size=None):
        super().__init__()

        if vocab_size is None:
            # Fall back to the notebook-level vocabulary size.
            vocab_size = input_dim

        # Layer attribute names are kept stable: they are the keys of saved state dicts.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.prelu_1 = nn.PReLU()
        self.embedding_l1 = nn.Linear(cat_features * embedding_dim, 32)
        self.prelu_2 = nn.PReLU()
        self.embedding_to_add_with_n_features = nn.Linear(32, 16)
        self.prelu_3 = nn.PReLU()

        self.numerical_to_match_cat = nn.Linear(n_features, 16)
        self.prelu_4 = nn.PReLU()
        self.prelu_5 = nn.PReLU()
        self.classify = nn.Linear(16, n_classes)

    def forward(self, n, c):
        """Return class logits of shape (batch_size, n_classes).

        Args:
            n: float tensor of numerical features, shape (batch_size, n_features).
            c: integer tensor of token ids, shape (batch_size, cat_features).

        Both arguments are required; the previous integer defaults could never be used
        because the layers only accept tensors.
        """
        # Categorical branch
        c_out1 = self.embedding(c)                   # (batch_size, cat_features, embedding_dim)
        c_out1 = torch.flatten(c_out1, start_dim=1)  # (batch_size, cat_features * embedding_dim)
        c_out1 = self.prelu_1(c_out1)
        c_out1 = self.embedding_l1(c_out1)           # (batch_size, 32)
        c_out1 = self.prelu_2(c_out1)
        c_out1 = self.embedding_to_add_with_n_features(c_out1)  # (batch_size, 16)
        c_out1 = self.prelu_3(c_out1)

        # Numerical branch
        n_out1 = self.numerical_to_match_cat(n)      # (batch_size, 16)
        n_out1 = self.prelu_4(n_out1)

        # Fuse the two branches by element-wise addition, then classify.
        out = self.prelu_5(n_out1 + c_out1)          # (batch_size, 16)
        predictions = self.classify(out)
        return predictions

# Number of output classes, taken from the labels instead of being hard-coded, so the
# network always has one logit per class id (class ids are assumed to start at 0).
n_classes = int(torch.cat((y_train, y_val, y_test_nn)).max().item()) + 1

# Initialize the model and move it to the selected device
model = Model(
    n_features=X_train_numerical.shape[1],
    cat_features=X_train_categorical.shape[1],
    embedding_dim=64,
    n_classes=n_classes,
).to(device)

# Hyperparameters
num_epochs = 100
learning_rate = 0.001
patience = 10  # Number of epochs without validation improvement before stopping

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


def _run_epoch(net, loader, loss_fn, opt=None):
    """Run one pass over ``loader`` and return the mean of the per-batch losses.

    With an optimizer the network is put in training mode and updated after every batch;
    without one it is put in evaluation mode and gradients are disabled.
    """
    training = opt is not None
    net.train(training)
    loss_sum = 0.0
    with torch.set_grad_enabled(training):
        for num_features, cat_features, labels in loader:
            num_features = num_features.to(device)
            cat_features = cat_features.to(device)
            labels = labels.to(device)
            if training:
                opt.zero_grad()
            loss = loss_fn(net(num_features, cat_features), labels)
            if training:
                loss.backward()
                opt.step()
            loss_sum += loss.item()
    return loss_sum / len(loader)


# Per-epoch loss history (used by the loss plot)
train_losses = []
val_losses = []
# Epoch index of the best validation loss so far, i.e. of the checkpoint on disk.
early_stopping_epoch = None

best_val_loss = float('inf')
patience_counter = 0

# Training loop with validation and early stopping
for epoch in range(num_epochs):
    train_loss = _run_epoch(model, train_loader, criterion, optimizer)
    train_losses.append(train_loss)

    val_loss = _run_epoch(model, val_loader, criterion)
    val_losses.append(val_loss)

    print(f"Epoch {epoch+1}, Training Loss: {train_loss}, Validation Loss: {val_loss}")

    if val_loss < best_val_loss:
        # New best: reset the patience counter and checkpoint the weights.
        best_val_loss = val_loss
        patience_counter = 0
        early_stopping_epoch = epoch
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered")
            break

# If no epoch ever improved (e.g. the loss was NaN throughout), nothing was saved in this
# run; loading 'best_model.pth' would then silently pick up a file from an earlier run.
if early_stopping_epoch is None:
    raise RuntimeError("No checkpoint was saved during training; validation loss never improved.")

# Load the best model
model.load_state_dict(torch.load('best_model.pth', weights_only=True))

# Evaluation on the test set
model.eval()
label_batches = []
prob_batches = []
with torch.no_grad():
    for num_features, cat_features, labels in test_loader:
        logits = model(num_features.to(device), cat_features.to(device))
        prob_batches.append(torch.softmax(logits, dim=1).cpu())
        label_batches.append(labels.cpu())

all_labels = torch.cat(label_batches).numpy()
pred_probs = torch.cat(prob_batches).numpy()
all_preds = pred_probs.argmax(axis=1)  # most probable class per row

# Compute metrics
total = len(all_labels)
correct = int((all_preds == all_labels).sum())
accuracy = 100 * correct / total
precision = precision_score(all_labels, all_preds, average='weighted')
recall = recall_score(all_labels, all_preds, average='weighted')
f1 = f1_score(all_labels, all_preds, average='weighted')
auc = roc_auc_score(y_true=all_labels, y_score=pred_probs, multi_class='ovr', average='weighted')
conf_matrix = confusion_matrix(all_labels, all_preds)

print(f"Test Accuracy: {accuracy:.2f}%")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC: {auc:.4f}")

In [None]:
# Plot the training and validation loss per epoch and save the figure as a PDF.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(train_losses, label='Training Loss')
ax.plot(val_losses, label='Validation Loss')
if early_stopping_epoch is not None:
    # early_stopping_epoch is updated every time the validation loss improves, so it marks
    # the epoch of the saved checkpoint, not the epoch at which training stopped.
    ax.axvline(x=early_stopping_epoch, color='red', linestyle='--', label='Best Validation Loss (saved checkpoint)')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss')
ax.legend()
fig.savefig('loss_plot.pdf', format='pdf', dpi=300)
# Show the figure in the notebook, then release it. Clearing the figure instead leaves an
# empty canvas as the cell output and keeps the figure object alive.
plt.show()
plt.close(fig)

In [None]:
# Heatmap of the neural network's test-set confusion matrix, saved as a PDF.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', annot_kws={"size": 14}, ax=ax)
ax.set_xlabel('Predicted labels', fontsize=16)
ax.set_ylabel('True labels', fontsize=16)
ax.set_title('Confusion Matrix', fontsize=18)
# Enlarge the tick labels on both axes.
plt.setp(ax.get_xticklabels(), fontsize=14)
plt.setp(ax.get_yticklabels(), fontsize=14)
fig.savefig('confusion_matrix_nn.pdf', format='pdf', dpi=300)
fig.clf()

In [None]:
# Define a wrapper for the PyTorch model
from sklearn.base import BaseEstimator, ClassifierMixin

class NeuralNetClassifier(ClassifierMixin, BaseEstimator):
    """scikit-learn style wrapper around an already trained PyTorch model.

    ``fit`` does not train anything; it only records the class labels. Inputs are 2-D
    arrays whose first ``X_train_numerical.shape[1]`` columns are the numerical features
    and whose remaining columns are the categorical token ids. The module-level names
    ``X_train_numerical`` and ``device`` are read at prediction time.

    ``ClassifierMixin`` is listed before ``BaseEstimator``: scikit-learn expects mixins
    to come first so the estimator is tagged as a classifier.

    Args:
        model: trained network, callable as ``model(numerical, categorical)`` and
            returning one logit per class.
    """

    def __init__(self, model):
        self.model = model

    def fit(self, X, y):
        """Record the class labels seen in ``y``; the wrapped model is not modified."""
        # Fitted classifiers are expected to expose classes_.
        self.classes_ = np.unique(np.asarray(y))
        return self

    def _class_probabilities(self, X):
        """Return softmax class probabilities for ``X`` as a tensor on ``device``."""
        self.model.eval()
        n_numerical = X_train_numerical.shape[1]
        num_features = torch.tensor(X[:, :n_numerical], dtype=torch.float32).to(device)
        cat_features = torch.tensor(X[:, n_numerical:], dtype=torch.long).to(device)
        with torch.no_grad():
            outputs = self.model(num_features, cat_features)
            return torch.softmax(outputs, dim=1)

    def predict(self, X):
        """Return the index of the most probable class for every row of ``X``."""
        probabilities = self._class_probabilities(X)
        return torch.argmax(probabilities, dim=1).cpu().numpy()

    def predict_proba(self, X):
        """Return class probabilities, shape (n_rows, n_classes)."""
        return self._class_probabilities(X).cpu().numpy()

# Initialize the model and move it to the GPU
# model = Model(n_features=X_train_numerical.shape[1], cat_features=X_train_categorical.shape[1], n_classes=8).to(device)


In [None]:
# Restore the best checkpoint of the neural network and switch it to evaluation mode.
model.load_state_dict(torch.load('best_model.pth', weights_only=True))
model.eval()

# Neural-network member of the ensemble (already trained; its fit() does no training).
nn_classifier = NeuralNetClassifier(model)

# XGBoost member. It is NOT trained yet: VotingClassifier.fit trains it below.
# The device follows GPU availability instead of always requesting CUDA.
xgb_classifier = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=8,
    max_leaves=100,
    tree_method="hist",
    enable_categorical=True,
    device="cuda" if torch.cuda.is_available() else "cpu",
)
print(X_train_numerical.shape)

# Combine numerical and categorical features into plain NumPy matrices for scikit-learn.
X_train_combined = np.hstack((X_train_numerical.numpy(), X_train_categorical.numpy()))
X_test_combined = np.hstack((X_test_nn_numerical.numpy(), X_test_nn_categorical.numpy()))
y_train_np = y_train.numpy()
y_test_np = y_test_nn.numpy()
print(X_train_combined.shape)
print(y_train.shape)

estimators = [('xgb', xgb_classifier), ('nn', nn_classifier)]

# Soft voting: average the class probabilities of both members.
soft_voting = VotingClassifier(estimators=estimators, voting='soft')
soft_voting.fit(X_train_combined, y_train_np)
y_pred_soft = soft_voting.predict(X_test_combined)
y_pred_proba = soft_voting.predict_proba(X_test_combined)

# All metrics use this cell's own test labels, so the cell does not depend on label
# arrays left behind by the neural-network evaluation cell.
soft_voting_score = accuracy_score(y_test_np, y_pred_soft)
soft_voting_precision = precision_score(y_test_np, y_pred_soft, average='weighted')
soft_voting_recall = recall_score(y_test_np, y_pred_soft, average='weighted')
soft_voting_f1 = f1_score(y_test_np, y_pred_soft, average='weighted')
soft_voting_auc = roc_auc_score(y_true=y_test_np, y_score=y_pred_proba, multi_class='ovr', average='weighted')
# `auc` is still assigned, as before, in case other cells read it — TODO confirm and remove.
auc = soft_voting_auc
conf_matrix_es = confusion_matrix(y_test_np, y_pred_soft)

print(f"Soft Voting Score: {soft_voting_score:.2f}")
print(f"Precision: {soft_voting_precision:.4f}")
print(f"Recall: {soft_voting_recall:.4f}")
print(f"F1 Score: {soft_voting_f1:.4f}")
print(f"AUC: {soft_voting_auc:.4f}")

# Print the shape of the data used to train the clf model
# print(f"XGBoost model trained on data with shape: {clf.get_booster().num_features()}, {len(y_train)} samples.")
# print(f"Shape of training data used in clf: {X_train.shape}")


In [None]:
# Heatmap of the soft-voting ensemble's test-set confusion matrix, saved as a PDF.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix_es, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
fig.savefig('confusion_matrix_ensemble.pdf', dpi=300)
fig.clf()