In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import imodels
import numpy as np
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset

# Prepare the data

The data can be downloaded from Hugging Face: https://huggingface.co/datasets/imodels/diabetes-readmission/tree/main

In [2]:
# Columns the author excludes up front (too many unique values or judged
# to have low relevance for prediction).
drop_columns = [
    "encounter_id",
    "patient_nbr",
    "weight",
    "payer_code",
    "medical_specialty",
]

# Load the raw table, remove the excluded columns, and turn the "?"
# placeholder into a real missing value so pandas can impute it.
df = (
    pd.read_csv("./data/diabetes/diabetic_data.csv")
    .drop(columns=drop_columns)
    .replace("?", np.nan)
)

# Impute every missing entry with the most frequent value of its column
# (first row of DataFrame.mode()).
column_modes = df.mode().iloc[0]
df = df.fillna(column_modes)

# Binary target: 1 when the original label is "<30", 0 for every other label.
df["readmitted"] = (df["readmitted"] == "<30").astype("int64")

# Integer-encode each remaining string (object-dtype) column. The target is
# already numeric at this point, so it is not selected here.
categorical_features = df.select_dtypes(include=["object"]).columns
label_encoder = LabelEncoder()
for col in categorical_features:
    # fit_transform re-fits the encoder from scratch for every column.
    df[col] = label_encoder.fit_transform(df[col])


In [3]:
# Class balance of the binary target (1 = original label "<30", 0 = anything else).
readmitted_counts = df["readmitted"].value_counts()
readmitted_counts

readmitted
0    90409
1    11357
Name: count, dtype: int64

In [4]:
# Separate the features from the target, then hold out 20% of the rows as a
# test set. No resampling happens in this cell; it only splits the data.
X = df.drop(columns=["readmitted"])
y = df["readmitted"]

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to both splits so that the test set does not influence
# the scaling. Every column of the feature matrix is scaled.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [6]:
# Rebalance the TRAINING split only; the test split is left untouched so that
# it keeps its original class distribution.

# Step 1: oversample the minority class (1) with SMOTE until it is half the
# size of the majority class (sampling_strategy=0.50).
smote = SMOTE(sampling_strategy=0.50, random_state=42)
X_smote, y_smote = smote.fit_resample(x_train, y_train)
print("After SMOTE:", Counter(y_smote))

# Step 2: randomly undersample the majority class (0) down to the minority
# count, which yields a 1:1 class ratio.
minority_class_count = Counter(y_smote)[1]
under_sampler = RandomUnderSampler(
    sampling_strategy={0: minority_class_count},
    # Seeded like every other stochastic step in this notebook; without it a
    # different subset of majority rows is kept on each run, so the balanced
    # training set (and everything trained on it) is not reproducible.
    random_state=42,
)
X_balanced, y_balanced = under_sampler.fit_resample(X_smote, y_smote)

print("Final class distribution after undersampling:", Counter(y_balanced))
print("Final class distribution of test set:", Counter(y_test))

After SMOTE: Counter({0: 72340, 1: 36170})
Final class distribution after undersampling: Counter({0: 36170, 1: 36170})
Final class distribution of test set: Counter({0: 18069, 1: 2285})


# Classification

In [7]:
# Train a plain logistic regression on the rebalanced training data and
# measure its accuracy on the scaled test split.
model = LogisticRegression(random_state=42)
model.fit(X=X_balanced, y=y_balanced)

y_pred = model.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6442468310897121


In [8]:
# Baseline for comparison: a constant classifier that outputs class 0 (the
# majority class) for every test row.
n_test_rows = len(y_test)
y_pred_majority = np.zeros(n_test_rows)
accuracy_majority = accuracy_score(y_true=y_test, y_pred=y_pred_majority)
print(f"Accuracy of majority class classifier: {accuracy_majority}")

Accuracy of majority class classifier: 0.887737054141692
