In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import SVC
from xgboost import XGBClassifier

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Load the data
diabetic_data = pd.read_csv('/content/drive/MyDrive/diabetic_data/diabetic_data.csv')
ids_mapping = pd.read_csv('/content/drive/MyDrive/diabetic_data/IDS_mapping.csv')

print(diabetic_data.head())
print(ids_mapping.head())

   encounter_id  patient_nbr             race  gender      age weight  \
0       2278392      8222157        Caucasian  Female   [0-10)      ?   
1        149190     55629189        Caucasian  Female  [10-20)      ?   
2         64410     86047875  AfricanAmerican  Female  [20-30)      ?   
3        500364     82442376        Caucasian    Male  [30-40)      ?   
4         16680     42519267        Caucasian    Male  [40-50)      ?   

   admission_type_id  discharge_disposition_id  admission_source_id  \
0                  6                        25                    1   
1                  1                         1                    7   
2                  1                         1                    7   
3                  1                         1                    7   
4                  1                         1                    7   

   time_in_hospital  ... citoglipton insulin  glyburide-metformin  \
0                 1  ...          No      No                   No

# Data Preprocessing

In [7]:
print('Preprocessing data .........')
# Drop columns that won't help in prediction (e.g., unique patient IDs)
diabetic_data.drop(columns=['encounter_id', 'patient_nbr'], inplace=True)

Preprocessing data .........


In [8]:
# Handle missing values
diabetic_data.replace('?', np.nan, inplace=True)
missing_values = diabetic_data.isnull().sum()
missing_columns = missing_values[missing_values > 0].index.tolist()
diabetic_data.drop(columns=missing_columns, inplace=True)

In [9]:
print('Preparing data for models .........')
# Encode categorical variables
categorical_cols = diabetic_data.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in categorical_cols:
    diabetic_data[col] = le.fit_transform(diabetic_data[col])

# Split dataset into features and target variable
X = diabetic_data.drop(columns=['readmitted'])
y = diabetic_data['readmitted']

Preparing data for models .........


In [10]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Model Building

In [11]:
print('Model building - LogisticRegression')
# Logistic Regression
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)
y_pred_logistic = logistic_model.predict(X_test)

Model building - LogisticRegression


In [12]:
print('Model building - RandomForestClassifier')
# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

Model building - RandomForestClassifier


In [13]:
# print('Model building - SVM')
# # Support Vector Machine (SVM)
# svm_model = SVC(kernel="rbf", probability=True, random_state=42)
# svm_model.fit(X_train, y_train)
# y_pred_svm = svm_model.predict(X_test)

In [14]:
print('Model building - GradientBoostingClassifier')
# Gradient Boosting Classifier
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

Model building - GradientBoostingClassifier


In [15]:
print('Model building - XGBClassifier')
# XGBoost Classifier
xgb_model = XGBClassifier(
    use_label_encoder=False, eval_metric="logloss", random_state=42
)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

Model building - XGBClassifier


Parameters: { "use_label_encoder" } are not used.



In [16]:
print('Model building - AdaBoostClassifier')
# AdaBoost Classifier
ada_model = AdaBoostClassifier(n_estimators=100, random_state=42)
ada_model.fit(X_train, y_train)
y_pred_ada = ada_model.predict(X_test)

Model building - AdaBoostClassifier


In [17]:
print('Model building - Deep Learning')
# Deep Learning Model
deep_model = Sequential()
deep_model.add(Dense(64, input_dim=X_train.shape[1], activation="relu"))
deep_model.add(Dropout(0.5))
deep_model.add(Dense(32, activation="relu"))
deep_model.add(Dropout(0.5))
deep_model.add(Dense(1, activation="sigmoid"))

# Compile the model
deep_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

print(deep_model.summary())

print('Model training - Deep Learning')
# Train the model
deep_model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_test, y_test),
    verbose=1,
)

# Predict with Deep Learning Model
y_pred_deep = (deep_model.predict(X_test) > 0.5).astype("int32")

Model building - Deep Learning


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


None
Model training - Deep Learning
Epoch 1/50
[1m2545/2545[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.3443 - loss: -3204.2566 - val_accuracy: 0.3497 - val_loss: -50909.3359
Epoch 2/50
[1m2545/2545[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - accuracy: 0.3469 - loss: -102500.9062 - val_accuracy: 0.3497 - val_loss: -316207.4375
Epoch 3/50
[1m2545/2545[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.3529 - loss: -439373.3750 - val_accuracy: 0.3497 - val_loss: -874772.6875
Epoch 4/50
[1m2545/2545[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.3482 - loss: -1103596.2500 - val_accuracy: 0.3497 - val_loss: -1821893.1250
Epoch 5/50
[1m2545/2545[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.3480 - loss: -2153636.5000 - val_accuracy: 0.3497 - val_loss: -3247073.5000
Epoch 6/50
[1m2545/2545[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms

# Model Evaluation

In [18]:
print('Model evaluation......')
def evaluate_model(model_name, y_test, y_pred):
    print(f'--- {model_name} Evaluation ---')
    print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')
    print('Classification Report:')
    print(classification_report(y_test, y_pred))

# Evaluate Logistic Regression
evaluate_model("Logistic Regression", y_test, y_pred_logistic)

# Evaluate Random Forest
evaluate_model("Random Forest", y_test, y_pred_rf)

# Evaluate SVM
# evaluate_model("Support Vector Machine (SVM)", y_test, y_pred_svm)

# Evaluate Gradient Boosting
evaluate_model("Gradient Boosting", y_test, y_pred_gb)

# Evaluate XGBoost
evaluate_model("XGBoost", y_test, y_pred_xgb)

# Evaluate AdaBoost
evaluate_model("AdaBoost", y_test, y_pred_ada)

# Evaluate Deep Learning Model
evaluate_model("Deep Learning Model", y_test, y_pred_deep)

Model evaluation......
--- Logistic Regression Evaluation ---
Accuracy: 0.57
Classification Report:
              precision    recall  f1-score   support

           0       0.39      0.01      0.02      2285
           1       0.50      0.24      0.33      7117
           2       0.58      0.90      0.71     10952

    accuracy                           0.57     20354
   macro avg       0.49      0.38      0.35     20354
weighted avg       0.53      0.57      0.50     20354

--- Random Forest Evaluation ---
Accuracy: 0.57
Classification Report:
              precision    recall  f1-score   support

           0       0.26      0.02      0.04      2285
           1       0.48      0.40      0.43      7117
           2       0.61      0.80      0.69     10952

    accuracy                           0.57     20354
   macro avg       0.45      0.40      0.39     20354
weighted avg       0.53      0.57      0.53     20354

--- Gradient Boosting Evaluation ---
Accuracy: 0.59
Classification 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
# Comparing Model accuracy
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logistic))
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
# print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_gb))
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("AdaBoost Accuracy:", accuracy_score(y_test, y_pred_ada))
print("Deep Learning Model Accuracy:", accuracy_score(y_test, y_pred_deep))

Logistic Regression Accuracy: 0.5693721135894665
Random Forest Accuracy: 0.5701581998624349
Gradient Boosting Accuracy: 0.5863220988503488
XGBoost Accuracy: 0.585683403753562
AdaBoost Accuracy: 0.5764468900461825
Deep Learning Model Accuracy: 0.34966100029478236
