In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from sklearn.utils import class_weight

In [3]:
df = pd.read_csv('diabetic_data.csv')
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

In [5]:
df.isnull().sum()

encounter_id                    0
patient_nbr                     0
race                            0
gender                          0
age                             0
weight                          0
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                      0
medical_specialty               0
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                          0
diag_2                          0
diag_3                          0
number_diagnoses                0
max_glu_serum               96420
A1Cresult                   84748
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide 

In [6]:
df['readmitted'].value_counts()

readmitted
NO     54864
>30    35545
<30    11357
Name: count, dtype: int64

In [8]:
# Converting to binary
df['readmitted'] = df['readmitted'].apply(lambda x: 1 if x == '<30' else 0)

In [9]:
df = df.drop(['encounter_id', 'patient_nbr'], axis=1)

In [10]:
df['readmitted'].value_counts()

readmitted
0    101766
Name: count, dtype: int64

In [11]:
df = pd.read_csv('diabetic_data.csv')
df['readmitted'] = df['readmitted'].map({'<30': 1, 'NO': 0, '>30': 0})
df['readmitted'].value_counts()

readmitted
0    90409
1    11357
Name: count, dtype: int64

In [12]:
df = df.drop(['encounter_id', 'patient_nbr'], axis=1)

In [13]:
# Check columns with "?" (dataset uses "?" for missing values)
for col in df.columns:
    print(f"{col}: {(df[col] == '?').sum()}")

race: 2273
gender: 0
age: 0
weight: 98569
admission_type_id: 0
discharge_disposition_id: 0
admission_source_id: 0
time_in_hospital: 0
payer_code: 40256
medical_specialty: 49949
num_lab_procedures: 0
num_procedures: 0
num_medications: 0
number_outpatient: 0
number_emergency: 0
number_inpatient: 0
diag_1: 21
diag_2: 358
diag_3: 1423
number_diagnoses: 0
max_glu_serum: 0
A1Cresult: 0
metformin: 0
repaglinide: 0
nateglinide: 0
chlorpropamide: 0
glimepiride: 0
acetohexamide: 0
glipizide: 0
glyburide: 0
tolbutamide: 0
pioglitazone: 0
rosiglitazone: 0
acarbose: 0
miglitol: 0
troglitazone: 0
tolazamide: 0
examide: 0
citoglipton: 0
insulin: 0
glyburide-metformin: 0
glipizide-metformin: 0
glimepiride-pioglitazone: 0
metformin-rosiglitazone: 0
metformin-pioglitazone: 0
change: 0
diabetesMed: 0
readmitted: 0


In [14]:
df = df.drop(['weight', 'payer_code', 'medical_specialty'], axis=1)

In [15]:
# Replace "?" with NaN
df = df.replace('?', np.nan)

# Drop rows with any NaN (only ~2000 rows affected)
df = df.dropna()

# Check shape
df.shape

(289, 45)

In [16]:
df = pd.read_csv('diabetic_data.csv')
df['readmitted'] = df['readmitted'].map({'<30': 1, 'NO': 0, '>30': 0})
df = df.drop(['encounter_id', 'patient_nbr', 'weight', 'payer_code', 'medical_specialty'], axis=1)

# Replace "?" ONLY in specific columns
df['race'] = df['race'].replace('?', df['race'].mode()[0])
df['diag_1'] = df['diag_1'].replace('?', df['diag_1'].mode()[0])
df['diag_2'] = df['diag_2'].replace('?', df['diag_2'].mode()[0])
df['diag_3'] = df['diag_3'].replace('?', df['diag_3'].mode()[0])

df.shape

(101766, 45)

In [17]:
df.dtypes

race                        object
gender                      object
age                         object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride                 object
acetohexamide               object
glipizide                   object
glyburide                   object
tolbutamide                 object
pioglitazone        

In [19]:

# Get all object columns except target
cat_cols = df.select_dtypes(include='object').columns.tolist()

# One-hot encode
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

df.shape

(101766, 2331)

In [None]:
X = df.drop('readmitted', axis=1)
y = df['readmitted']

# First split: 60% train, 40% temp
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4,    )

# Second split: 50% of temp = 20% validation, 20% test
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(X_train.shape, X_val.shape, X_test.shape)

(61059, 2330) (20353, 2330) (20354, 2330)


In [24]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

print("Data ready for neural network!")

Data ready for neural network!


In [26]:
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification
])

model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy', # loss=tf.keras.losses.BinaryCrossentropy()
    metrics=['accuracy', 'precision', 'recall']
)

In [28]:
history = model.fit(
    X_train, y_train,
    batch_size=32,
    epochs=20,
    validation_data=(X_val, y_val),
    verbose=1
)

Epoch 1/20
[1m1909/1909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 6ms/step - accuracy: 0.8867 - loss: 0.3611 - precision: 0.1063 - recall: 0.0025 - val_accuracy: 0.8880 - val_loss: 0.3439 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 2/20
[1m1909/1909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.8887 - loss: 0.3378 - precision: 0.2500 - recall: 2.9442e-04 - val_accuracy: 0.8880 - val_loss: 0.3438 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 3/20
[1m1909/1909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.8886 - loss: 0.3309 - precision: 0.4103 - recall: 0.0024 - val_accuracy: 0.8876 - val_loss: 0.3445 - val_precision: 0.3824 - val_recall: 0.0057
Epoch 4/20
[1m1909/1909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.8892 - loss: 0.3246 - precision: 0.5722 - recall: 0.0152 - val_accuracy: 0.8879 - val_loss: 0.3472 - val_precision: 0.4808 - val_recall:

In [29]:
test_loss, test_acc, test_prec, test_recall = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test Recall: {test_recall:.4f}")

[1m637/637[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8791 - loss: 0.4404 - precision: 0.2603 - recall: 0.0416
Test Accuracy: 0.8791
Test Recall: 0.0416


In [31]:
# Apply oversampling to training data only
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

print(f"Before: {y_train.value_counts()}")
print(f"After: {pd.Series(y_train_resampled).value_counts()}")

Before: readmitted
0    54266
1     6793
Name: count, dtype: int64
After: readmitted
0    54266
1    54266
Name: count, dtype: int64


In [32]:
# Standardize resampled data
X_train_resampled = scaler.fit_transform(X_train_resampled)

# Retrain model
history = model.fit(
    X_train_resampled, y_train_resampled,
    batch_size=32,
    epochs=20,
    validation_data=(X_val, y_val),
    verbose=1
)

Epoch 1/20
[1m3392/3392[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 6ms/step - accuracy: 0.8193 - loss: 0.3865 - precision: 0.8266 - recall: 0.8080 - val_accuracy: 0.7970 - val_loss: 0.5710 - val_precision: 0.1768 - val_recall: 0.2225
Epoch 2/20
[1m3392/3392[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 6ms/step - accuracy: 0.8467 - loss: 0.3422 - precision: 0.8375 - recall: 0.8602 - val_accuracy: 0.7900 - val_loss: 0.6293 - val_precision: 0.1741 - val_recall: 0.2339
Epoch 3/20
[1m3392/3392[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 6ms/step - accuracy: 0.8626 - loss: 0.3159 - precision: 0.8496 - recall: 0.8812 - val_accuracy: 0.7730 - val_loss: 0.6485 - val_precision: 0.1713 - val_recall: 0.2677
Epoch 4/20
[1m3392/3392[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 6ms/step - accuracy: 0.8704 - loss: 0.3010 - precision: 0.8579 - recall: 0.8878 - val_accuracy: 0.7862 - val_loss: 0.6857 - val_precision: 0.1673 - val_recall: 0.2286
Epoch 5/20


In [33]:
test_loss, test_acc, test_prec, test_recall = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test Recall: {test_recall:.4f}")

[1m637/637[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7907 - loss: 1.4369 - precision: 0.1660 - recall: 0.2149
Test Accuracy: 0.7907
Test Recall: 0.2149


In [35]:
# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Standardize
X_train_smote = scaler.fit_transform(X_train_smote)

print(f"SMOTE result: {pd.Series(y_train_smote).value_counts()}")

SMOTE result: readmitted
0    54266
1    54266
Name: count, dtype: int64


In [37]:
# Calculate class weights
class_weights = class_weight.compute_class_weight('balanced', 
                                                   classes=np.unique(y_train), 
                                                   y=y_train)
class_weight_dict = {0: class_weights[0], 1: class_weights[1]}

print(f"Class weights: {class_weight_dict}")

Class weights: {0: np.float64(0.5625898352559614), 1: np.float64(4.494258795819226)}


In [38]:
# Rebuild fresh model
model_improved = Sequential([
    Dense(256, activation='relu', input_shape=(X_train_smote.shape[1],)),
    Dropout(0.4),
    Dense(128, activation='relu'),
    Dropout(0.4),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model_improved.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
    loss='binary_crossentropy',
    metrics=['accuracy', 'precision', 'recall']
)

# Train with class weights
history_improved = model_improved.fit(
    X_train_smote, y_train_smote,
    batch_size=32,
    epochs=30,
    validation_data=(X_val, y_val),
    class_weight=class_weight_dict,
    verbose=1
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m3392/3392[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 11ms/step - accuracy: 0.5231 - loss: 0.8487 - precision: 0.5119 - recall: 0.9951 - val_accuracy: 0.1830 - val_loss: 1.7102 - val_precision: 0.1149 - val_recall: 0.9394
Epoch 2/30
[1m3392/3392[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 10ms/step - accuracy: 0.5737 - loss: 0.7158 - precision: 0.5401 - recall: 0.9932 - val_accuracy: 0.2307 - val_loss: 1.4942 - val_precision: 0.1189 - val_recall: 0.9153
Epoch 3/30
[1m3392/3392[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 10ms/step - accuracy: 0.6468 - loss: 0.6050 - precision: 0.5872 - recall: 0.9885 - val_accuracy: 0.2887 - val_loss: 1.5379 - val_precision: 0.1217 - val_recall: 0.8613
Epoch 4/30
[1m3392/3392[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 9ms/step - accuracy: 0.7189 - loss: 0.5136 - precision: 0.6427 - recall: 0.9858 - val_accuracy: 0.3828 - val_loss: 1.3802 - val_precision: 0.1279 - val_recall: 0.7753
Epoch 5/3

In [39]:
test_loss, test_acc, test_prec, test_recall = model_improved.evaluate(X_test, y_test)
print(f"\nImproved Model Results:")
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test Precision: {test_prec:.4f}")
print(f"Test Recall: {test_recall:.4f}")

[1m637/637[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.5940 - loss: 1.4928 - precision: 0.1198 - recall: 0.4123

Improved Model Results:
Test Accuracy: 0.5940
Test Precision: 0.1198
Test Recall: 0.4123


In [40]:
# Get predictions with probabilities
y_pred_proba = model_improved.predict(X_test)

# Try lower threshold (0.3 instead of 0.5)
y_pred_threshold = (y_pred_proba > 0.3).astype(int)

from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_test, y_pred_threshold))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_threshold))

[1m637/637[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
              precision    recall  f1-score   support

           0       0.89      0.56      0.69     18069
           1       0.12      0.48      0.19      2285

    accuracy                           0.55     20354
   macro avg       0.51      0.52      0.44     20354
weighted avg       0.81      0.55      0.63     20354


Confusion Matrix:
[[10061  8008]
 [ 1191  1094]]


In [41]:
y_pred_threshold_25 = (y_pred_proba > 0.25).astype(int)
print(classification_report(y_test, y_pred_threshold_25))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_threshold_25))

              precision    recall  f1-score   support

           0       0.89      0.54      0.67     18069
           1       0.12      0.50      0.19      2285

    accuracy                           0.53     20354
   macro avg       0.51      0.52      0.43     20354
weighted avg       0.81      0.53      0.62     20354


Confusion Matrix:
[[9698 8371]
 [1146 1139]]


In [42]:
# Save model
model_improved.save('hospital_readmission_model.h5')

# Save scaler
import pickle
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

print("✅ Model saved as: hospital_readmission_model.h5")
print("✅ Scaler saved as: scaler.pkl")
print("✅ Files are in: backend/machine_learning/")



✅ Model saved as: hospital_readmission_model.h5
✅ Scaler saved as: scaler.pkl
✅ Files are in: backend/machine_learning/


In [43]:
model_improved.save('hospital_readmission_model.keras')
print("✅ Also saved as: hospital_readmission_model.keras")

✅ Also saved as: hospital_readmission_model.keras


In [44]:
from sklearn.ensemble import RandomForestClassifier

# Get feature names
feature_names = X.columns.tolist()

# Train Random Forest to get feature importance
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)

# Get top 70 features
importances = pd.DataFrame({
    'feature': feature_names,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 70 Most Important Features:")
print(importances.head(70))

Top 70 Most Important Features:
                 feature  importance
4     num_lab_procedures    0.052712
6        num_medications    0.049293
3       time_in_hospital    0.038174
9       number_inpatient    0.035498
5         num_procedures    0.027418
...                  ...         ...
986           diag_2_411    0.002643
430           diag_1_584    0.002578
1126          diag_2_585    0.002555
2277  max_glu_serum_Norm    0.002447
998           diag_2_425    0.002368

[70 rows x 2 columns]


In [None]:
# Get top 70 feature names
top_70_features = importances.head(70)['feature'].tolist()

# Filter dataset to only use top 70 features
X_filtered = df[top_70_features]
y_filtered = df['readmitted']

# Split data
X_train_f, X_temp_f, y_train_f, y_temp_f = train_test_split(X_filtered, y_filtered, test_size=0.4, random_state=42)
X_val_f, X_test_f, y_val_f, y_test_f = train_test_split(X_temp_f, y_temp_f, test_size=0.5, random_state=42)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_smote_f, y_train_smote_f = smote.fit_resample(X_train_f, y_train_f)

# Standardize
scaler_f = StandardScaler()
X_train_smote_f = scaler_f.fit_transform(X_train_smote_f)
X_val_f = scaler_f.transform(X_val_f)
X_test_f = scaler_f.transform(X_test_f)

print(f"New shape: {X_train_smote_f.shape}")


New shape: (108532, 70)


In [46]:
# Build model for 70 features
model_70 = Sequential([
    Dense(128, activation='relu', input_shape=(70,)),
    Dropout(0.4),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_70.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
    loss='binary_crossentropy',
    metrics=['accuracy', 'precision', 'recall']
)

# Train
history_70 = model_70.fit(
    X_train_smote_f, y_train_smote_f,
    batch_size=32,
    epochs=30,
    validation_data=(X_val_f, y_val_f),
    class_weight=class_weight_dict,
    verbose=1
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m3392/3392[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 3ms/step - accuracy: 0.6524 - loss: 0.5166 - precision: 0.5940 - recall: 0.9631 - val_accuracy: 0.4341 - val_loss: 0.8677 - val_precision: 0.1306 - val_recall: 0.7170
Epoch 2/30
[1m3392/3392[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 4ms/step - accuracy: 0.6819 - loss: 0.4591 - precision: 0.6169 - recall: 0.9602 - val_accuracy: 0.4335 - val_loss: 0.8211 - val_precision: 0.1338 - val_recall: 0.7416
Epoch 3/30
[1m3392/3392[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - accuracy: 0.6937 - loss: 0.4498 - precision: 0.6261 - recall: 0.9616 - val_accuracy: 0.4618 - val_loss: 0.8093 - val_precision: 0.1383 - val_recall: 0.7275
Epoch 4/30
[1m3392/3392[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - accuracy: 0.7024 - loss: 0.4450 - precision: 0.6334 - recall: 0.9612 - val_accuracy: 0.5193 - val_loss: 0.8120 - val_precision: 0.1422 - val_recall: 0.6542
Epoch 5/30


In [47]:
test_loss, test_acc, test_prec, test_recall = model_70.evaluate(X_test_f, y_test_f)
print(f"\n70-Feature Model Results:")
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test Precision: {test_prec:.4f}")
print(f"Test Recall: {test_recall:.4f}")

[1m637/637[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.5062 - loss: 0.7623 - precision: 0.1452 - recall: 0.6954

70-Feature Model Results:
Test Accuracy: 0.5062
Test Precision: 0.1452
Test Recall: 0.6954


In [48]:
# Get predictions
y_pred_proba_70 = model_70.predict(X_test_f)

# Try 0.3 threshold
y_pred_30 = (y_pred_proba_70 > 0.3).astype(int)

from sklearn.metrics import classification_report
print(classification_report(y_test_f, y_pred_30))

[1m637/637[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
              precision    recall  f1-score   support

           0       0.94      0.10      0.18     18069
           1       0.12      0.96      0.21      2285

    accuracy                           0.19     20354
   macro avg       0.53      0.53      0.19     20354
weighted avg       0.85      0.19      0.18     20354



In [49]:
y_pred_40 = (y_pred_proba_70 > 0.4).astype(int)
print(classification_report(y_test_f, y_pred_40))

              precision    recall  f1-score   support

           0       0.94      0.26      0.40     18069
           1       0.13      0.86      0.22      2285

    accuracy                           0.32     20354
   macro avg       0.53      0.56      0.31     20354
weighted avg       0.85      0.32      0.38     20354



In [50]:
# Save model
model_70.save('hospital_readmission_70features.keras')

# Save scaler
import pickle
with open('scaler_70features.pkl', 'wb') as f:
    pickle.dump(scaler_f, f)

# Save feature list
with open('top_70_features.pkl', 'wb') as f:
    pickle.dump(top_70_features, f)

print("✅ 70-feature model saved!")
print("✅ Use threshold 0.4 for 86% recall")

✅ 70-feature model saved!
✅ Use threshold 0.4 for 86% recall


In [51]:
import pickle
with open('top_70_features.pkl', 'rb') as f:
    features = pickle.load(f)
print(features)

['num_lab_procedures', 'num_medications', 'time_in_hospital', 'number_inpatient', 'num_procedures', 'discharge_disposition_id', 'number_diagnoses', 'admission_type_id', 'admission_source_id', 'gender_Male', 'number_outpatient', 'number_emergency', 'race_Caucasian', 'age_[70-80)', 'age_[60-70)', 'insulin_Steady', 'change_No', 'age_[80-90)', 'insulin_No', 'age_[50-60)', 'metformin_Steady', 'metformin_No', 'diabetesMed_Yes', 'glipizide_No', 'age_[40-50)', 'insulin_Up', 'diag_2_276', 'A1Cresult_>8', 'glyburide_No', 'glipizide_Steady', 'diag_3_250', 'diag_1_428', 'diag_2_428', 'glyburide_Steady', 'diag_3_276', 'diag_2_427', 'diag_3_428', 'diag_3_401', 'diag_3_427', 'A1Cresult_Norm', 'pioglitazone_No', 'pioglitazone_Steady', 'rosiglitazone_No', 'diag_1_414', 'rosiglitazone_Steady', 'diag_2_496', 'diag_3_414', 'diag_3_496', 'diag_2_599', 'age_[30-40)', 'diag_1_410', 'diag_2_403', 'glimepiride_No', 'diag_2_250', 'diag_1_486', 'diag_3_585', 'glimepiride_Steady', 'diag_3_403', 'age_[90-100)', 'd

In [53]:
import pickle
with open('top_70_features.pkl', 'rb') as f:
    features = pickle.load(f)
print('\n'.join([f"{i+1}. {f}" for i, f in enumerate(features)]))

1. num_lab_procedures
2. num_medications
3. time_in_hospital
4. number_inpatient
5. num_procedures
6. discharge_disposition_id
7. number_diagnoses
8. admission_type_id
9. admission_source_id
10. gender_Male
11. number_outpatient
12. number_emergency
13. race_Caucasian
14. age_[70-80)
15. age_[60-70)
16. insulin_Steady
17. change_No
18. age_[80-90)
19. insulin_No
20. age_[50-60)
21. metformin_Steady
22. metformin_No
23. diabetesMed_Yes
24. glipizide_No
25. age_[40-50)
26. insulin_Up
27. diag_2_276
28. A1Cresult_>8
29. glyburide_No
30. glipizide_Steady
31. diag_3_250
32. diag_1_428
33. diag_2_428
34. glyburide_Steady
35. diag_3_276
36. diag_2_427
37. diag_3_428
38. diag_3_401
39. diag_3_427
40. A1Cresult_Norm
41. pioglitazone_No
42. pioglitazone_Steady
43. rosiglitazone_No
44. diag_1_414
45. rosiglitazone_Steady
46. diag_2_496
47. diag_3_414
48. diag_3_496
49. diag_2_599
50. age_[30-40)
51. diag_1_410
52. diag_2_403
53. glimepiride_No
54. diag_2_250
55. diag_1_486
56. diag_3_585
57. glim

In [54]:
import pickle
with open('top_70_features.pkl', 'rb') as f:
    features = pickle.load(f)

with open('features_list.txt', 'w') as f:
    for i, feature in enumerate(features):
        f.write(f"{i+1}. {feature}\n")

print("Saved to features_list.txt")

Saved to features_list.txt
