In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import imodels
import numpy as np
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset

# Prepare the data

The data can be downloaded from Hugging Face: https://huggingface.co/datasets/imodels/diabetes-readmission/tree/main

In [3]:
# Load the raw table and show the class balance of the target before any processing.
file_path = "./data/diabetes/reduced.csv"
df = pd.read_csv(file_path)
print(df["readmitted"].value_counts())

# Drop columns with too many unique values or low relevance for prediction
drop_columns = [
    # "encounter_id",
    # "patient_nbr",
    "weight",
    "payer_code",
    "medical_specialty",
]
df = df.drop(columns=drop_columns)

# Handling missing values: "?" is treated as a missing-value marker, then every
# column is imputed with its most frequent value.
df = df.replace("?", np.nan)
df = df.fillna(df.mode().iloc[0])

# Collapse diagnosis codes that cover less than 1% of the rows into a single "Other" bucket.
high_cardinality_cols = ["diag_1", "diag_2", "diag_3"]
for col in high_cardinality_cols:
    freq = df[col].value_counts(normalize=True)
    rare_categories = freq[freq < 0.01].index
    df[col] = df[col].replace(rare_categories, "Other")

# Apply ordinal encoding to high-cardinality columns
# (the encoder is refit for each column, so codes are per-column).
ordinal_encoder = LabelEncoder()
for col in high_cardinality_cols:
    df[col] = ordinal_encoder.fit_transform(df[col].astype(str))

# Encode the 'readmitted' column as binary target variable (1 only where the value equals True)
df["readmitted"] = df["readmitted"].eq(True).astype("int64")

# Encode categorical features
categorical_features = df.select_dtypes(include=["object"]).columns
for col in categorical_features:
    df[col] = LabelEncoder().fit_transform(df[col])

df = pd.get_dummies(df, columns=None, drop_first=False)

y = df["readmitted"]
X = df.drop(columns=["readmitted"])

# Split BEFORE scaling so that validation/test rows never influence the scaler.
# Same seeds and stratification as before, so the row assignment is unchanged.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Standardize numerical features fitting the scaler with the training data only,
# then apply the same (train-derived) mean/std to the other splits.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)
x_test = scaler.transform(x_test)

# Keep `X` as a scaled NumPy array (as it was previously), now using the train-fitted scaler.
X = scaler.transform(X)

readmitted
False    6035
True     3965
Name: count, dtype: int64


In [4]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from collections import Counter
from sklearn.metrics import accuracy_score, f1_score


# Convert datasets to PyTorch tensors
def create_dataloader(X, y, batch_size=32, shuffle=False):
    """Wrap a feature matrix and its labels in a ``DataLoader``.

    Args:
        X: Feature matrix accepted by ``torch.tensor`` (e.g. a NumPy array);
            converted to ``float32``.
        y: Integer class labels. May be a pandas Series, a NumPy array, or a
            plain sequence; converted to ``torch.long``.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the samples on every pass.

    Returns:
        A ``DataLoader`` over a ``TensorDataset`` yielding ``(X_batch, y_batch)``.
    """
    X_tensor = torch.tensor(X, dtype=torch.float32)
    # np.asarray yields the underlying values for a pandas Series and also
    # accepts ndarrays/lists, so callers need not wrap labels in a Series.
    y_tensor = torch.tensor(np.asarray(y), dtype=torch.long)
    dataset = TensorDataset(X_tensor, y_tensor)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


# Define the neural network
class NeuralNetwork(nn.Module):
    """Fully connected classifier: two ReLU hidden layers and a 2-unit output.

    The forward pass returns raw, un-normalized scores for the two classes.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2):
        super().__init__()
        # Layers are registered in input -> output order under the names fc1/fc2/fc3.
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, 2)

    def forward(self, x):
        """Map a batch of feature rows to a batch of 2-class scores."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)


# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=20):
    """Train ``model`` and return the weights from its best validation epoch.

    Each epoch runs one optimization pass over ``train_loader``, then measures
    accuracy on ``val_loader`` and prints a one-line progress report.

    Args:
        model: Module to train; it is updated in place.
        train_loader: Iterable of ``(X_batch, y_batch)`` training batches.
        val_loader: Iterable of ``(X_batch, y_batch)`` validation batches.
        criterion: Loss called as ``criterion(outputs, y_batch)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Number of passes over ``train_loader``.

    Returns:
        ``(best_state, best_val_acc)``: an independent copy of the state dict
        from the epoch with the highest validation accuracy (the earliest such
        epoch on ties), and that accuracy. ``best_state`` is ``None`` only when
        ``epochs`` is 0.
    """
    # Local import so the function does not rely on a notebook-level `import copy`.
    import copy

    best_val_acc = 0
    best_model = None

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        num_batches = 0

        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            num_batches += 1

        # Validation
        model.eval()
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                outputs = model(X_batch)
                _, predicted = torch.max(outputs, 1)
                val_total += y_batch.size(0)
                val_correct += (predicted == y_batch).sum().item()

        # Guard both averages against empty loaders instead of dividing by zero.
        val_acc = val_correct / val_total if val_total else 0.0
        mean_loss = train_loss / num_batches if num_batches else 0.0
        print(
            f"Epoch {epoch+1}/{epochs}, Loss: {mean_loss:.4f}, Val Accuracy: {val_acc:.4f}"
        )

        # `best_model is None` ensures a state is returned even if accuracy never exceeds 0.
        if best_model is None or val_acc > best_val_acc:
            best_val_acc = val_acc
            # state_dict() returns references to the live parameters, which later
            # epochs keep updating; deep-copy to freeze this epoch's weights.
            best_model = copy.deepcopy(model.state_dict())

    return best_model, best_val_acc


# Hyperparameter tuning using grid search
# Seed torch so weight initialization and batch shuffling are repeatable across runs.
torch.manual_seed(42)

input_size = x_train.shape[1]
param_grid = {
    "hidden_size1": [16, 32, 64],
    "hidden_size2": [8, 16, 32],
    "learning_rate": [0.01, 0.001, 0.0001],
}
batch_sizes = [32, 64, 128]

# Evaluation loaders and the loss do not depend on the searched hyperparameters,
# so build them once (batch size does not affect accuracy).
eval_batch_size = max(batch_sizes)
val_loader = create_dataloader(x_val, y_val, batch_size=eval_batch_size)
test_loader = create_dataloader(x_test, y_test, batch_size=eval_batch_size)
criterion = nn.CrossEntropyLoss()

best_params = None
best_batch_size = None
best_model_state = None
best_val_acc = 0

for hidden_size1 in param_grid["hidden_size1"]:
    for hidden_size2 in param_grid["hidden_size2"]:
        for lr in param_grid["learning_rate"]:
            for batch_size in batch_sizes:
                train_loader = create_dataloader(
                    x_train,
                    y_train,
                    shuffle=True,
                    batch_size=batch_size,
                )

                print(
                    f"Training with hidden_size1={hidden_size1}, hidden_size2={hidden_size2}, learning_rate={lr}"
                )
                model = NeuralNetwork(input_size, hidden_size1, hidden_size2)
                optimizer = optim.Adam(model.parameters(), lr=lr)
                model_state, val_acc = train_model(
                    model, train_loader, val_loader, criterion, optimizer, epochs=20
                )

                if best_params is None or val_acc > best_val_acc:
                    best_val_acc = val_acc
                    best_model_state = model_state
                    best_params = (hidden_size1, hidden_size2, lr)
                    # Remember the batch size too: it is part of the searched
                    # configuration and is needed for the final retraining.
                    best_batch_size = batch_size

print(
    f"Best Params: hidden_size1={best_params[0]}, hidden_size2={best_params[1]}, learning_rate={best_params[2]}"
)
print(f"Best batch size: {best_batch_size}")
print(f"Best Validation Accuracy: {best_val_acc:.4f}")

# Merge training and validation data
X_train_val = np.vstack((x_train, x_val))
y_train_val = np.hstack((y_train, y_val))

# Train the best model on the merged training and validation data,
# using the winning batch size rather than whatever the loop variable ended on.
train_val_loader = create_dataloader(
    X_train_val, pd.Series(y_train_val), shuffle=True, batch_size=best_batch_size
)
best_model = NeuralNetwork(input_size, best_params[0], best_params[1])
best_model.load_state_dict(best_model_state)
optimizer = optim.Adam(best_model.parameters(), lr=best_params[2])
# NOTE(review): the validation rows are now part of the training data, so the
# "best epoch" picked here via val_loader is selected on seen data — confirm intended.
best_model_state, _ = train_model(
    best_model, train_val_loader, val_loader, criterion, optimizer, epochs=20
)

# Load the best model and evaluate on the test set
best_model.load_state_dict(best_model_state)
best_model.eval()

test_correct = 0
test_total = 0
y_true = []
y_pred = []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = best_model(X_batch)
        _, predicted = torch.max(outputs, 1)
        test_total += y_batch.size(0)
        test_correct += (predicted == y_batch).sum().item()
        # Collect plain Python ints (not 0-d tensors) for the sklearn metric.
        y_true.extend(y_batch.tolist())
        y_pred.extend(predicted.tolist())

test_accuracy = test_correct / test_total
f1 = f1_score(y_true, y_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

# Majority classifier: always predict class 0
y_majority = np.zeros(len(y_test))
accuracy_majority = accuracy_score(y_test, y_majority)
print("Accuracy of majority classifier:", accuracy_majority)


Training with hidden_size1=16, hidden_size2=8, learning_rate=0.01
Epoch 1/20, Loss: 0.6491, Val Accuracy: 0.6444
Epoch 2/20, Loss: 0.6360, Val Accuracy: 0.6425
Epoch 3/20, Loss: 0.6316, Val Accuracy: 0.6325
Epoch 4/20, Loss: 0.6269, Val Accuracy: 0.6462
Epoch 5/20, Loss: 0.6208, Val Accuracy: 0.6400
Epoch 6/20, Loss: 0.6161, Val Accuracy: 0.6500
Epoch 7/20, Loss: 0.6108, Val Accuracy: 0.6400
Epoch 8/20, Loss: 0.6111, Val Accuracy: 0.6481
Epoch 9/20, Loss: 0.6049, Val Accuracy: 0.6394
Epoch 10/20, Loss: 0.6017, Val Accuracy: 0.6312
Epoch 11/20, Loss: 0.5989, Val Accuracy: 0.6306
Epoch 12/20, Loss: 0.5976, Val Accuracy: 0.6412
Epoch 13/20, Loss: 0.5916, Val Accuracy: 0.6375
Epoch 14/20, Loss: 0.5910, Val Accuracy: 0.6369
Epoch 15/20, Loss: 0.5867, Val Accuracy: 0.6312
Epoch 16/20, Loss: 0.5844, Val Accuracy: 0.6362
Epoch 17/20, Loss: 0.5814, Val Accuracy: 0.6225
Epoch 18/20, Loss: 0.5790, Val Accuracy: 0.6400
Epoch 19/20, Loss: 0.5750, Val Accuracy: 0.6369
Epoch 20/20, Loss: 0.5712, Val 

# Classification

In [None]:
# Baseline: fit a plain logistic regression on the training split,
# then score its predictions on the held-out test split.
model = LogisticRegression(random_state=42).fit(x_train, y_train)
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
# Reference point: a trivial classifier that predicts class 0 for every test row.
y_pred_majority = np.full(y_test.shape, 0.0)
accuracy_majority = accuracy_score(y_true=y_test, y_pred=y_pred_majority)
print(f"Accuracy of majority class classifier: {accuracy_majority}")