In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle


In [3]:
data = pd.read_csv("diabetic_data.csv")

In [4]:
data.head(1)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO


In [5]:
# drop_cols = [
#     'encounter_id', 'patient_nbr', 'weight', 'payer_code',
#     'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'examide', 'citoglipton'
# ]
# data = data.drop(columns=drop_cols)

In [6]:
features = [
    'age', 'gender', 'race', 'time_in_hospital', 'num_lab_procedures',
    'num_procedures', 'num_medications', 'number_outpatient',
    'number_emergency', 'number_inpatient', 'number_diagnoses',
    'A1Cresult', 'max_glu_serum', 'insulin', 'change', 'diabetesMed', 'metformin'
]
target = 'readmitted'

In [7]:
df = data[features + [target]]

In [8]:
df = df.replace('?', np.nan)
df = df.dropna()


In [9]:
df.isnull().sum()


age                   0
gender                0
race                  0
time_in_hospital      0
num_lab_procedures    0
num_procedures        0
num_medications       0
number_outpatient     0
number_emergency      0
number_inpatient      0
number_diagnoses      0
A1Cresult             0
max_glu_serum         0
insulin               0
change                0
diabetesMed           0
metformin             0
readmitted            0
dtype: int64

In [10]:
le = LabelEncoder()
for col in df.select_dtypes(include=['object']).columns:
    df[col] = le.fit_transform(df[col])

In [11]:
X = df[features]
y = df[target]

In [12]:
from sklearn.preprocessing import MinMaxScaler
# Normalize features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [20]:
from sklearn.svm import SVC

svm = SVC(kernel='rbf', C=10, gamma=0.1, decision_function_shape='ovo', random_state=42)
svm.fit(X_train, y_train)

y_pred = svm.predict(X_test)

print("SVM Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


SVM Accuracy: 0.5423728813559322

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.58      0.84      0.68        31
           2       0.43      0.32      0.36        19

    accuracy                           0.54        59
   macro avg       0.34      0.38      0.35        59
weighted avg       0.44      0.54      0.48        59



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    random_state=42
)
gb.fit(X_train, y_train)

y_pred = gb.predict(X_test)

print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Gradient Boosting Accuracy: 0.4576271186440678

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.59      0.55      0.57        31
           2       0.38      0.53      0.44        19

    accuracy                           0.46        59
   macro avg       0.32      0.36      0.34        59
weighted avg       0.43      0.46      0.44        59



In [18]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

xgb = XGBClassifier(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    objective='multi:softmax'
)
xgb.fit(X_train, y_train)

y_pred = xgb.predict(X_test)

print("XGBoost Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


XGBoost Accuracy: 0.4745762711864407

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.56      0.58      0.57        31
           2       0.42      0.53      0.47        19

    accuracy                           0.47        59
   macro avg       0.33      0.37      0.35        59
weighted avg       0.43      0.47      0.45        59



In [14]:
model = RandomForestClassifier(n_estimators=150, random_state=42)
model.fit(X_train, y_train)

In [15]:
y_pred = model.predict(X_test)
print("âœ… Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

âœ… Accuracy: 0.492

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.57      0.65      0.61        31
           2       0.38      0.47      0.42        19

    accuracy                           0.49        59
   macro avg       0.32      0.37      0.34        59
weighted avg       0.42      0.49      0.45        59


Confusion Matrix:
 [[ 0  5  4]
 [ 0 20 11]
 [ 0 10  9]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
pickle.dump(model, open("readmission_model.pkl", "wb"))
print("\nðŸ’¾ Model saved as 'readmission_model.pkl'")


ðŸ’¾ Model saved as 'readmission_model.pkl'


In [37]:
sample_input = np.array([
    5,   # age (encoded age group, e.g., [0â€“9] scaled to 0â€“9)
    1,   # gender (1 = male/female encoded)
    2,   # race (encoded)
    6,   # time_in_hospital (1â€“14 days)
    45,  # num_lab_procedures
    1,   # num_procedures
    10,  # num_medications
    2,   # number_outpatient
    0,   # number_emergency
    1,   # number_inpatient
    5,   # number_diagnoses
    1,   # A1Cresult (encoded)
    2,   # max_glu_serum (encoded)
    1,   # insulin (encoded)
    1,   # change (encoded)
    1,   # diabetesMed (encoded)
    1    # metformin (encoded)
]).reshape(1, -1)

In [38]:
# Predict readmission
prediction = model.predict(sample_input)[0]
probability = model.predict_proba(sample_input)[0][1] * 100 



In [39]:
# Display results
if prediction == 1:
    result = "ðŸ”´ High Risk: Patient likely to be readmitted"
else:
    result = "ðŸŸ¢ Low Risk: Patient unlikely to be readmitted"

print("\nâœ… Test Input (Sample Patient):")
print(pd.DataFrame([sample_input[0]], columns=features))
print("\nðŸ“Š Prediction Result:", result)
print(f"ðŸ“ˆ Readmission Probability: {probability:.2f}%")


âœ… Test Input (Sample Patient):
   age  gender  race  time_in_hospital  num_lab_procedures  num_procedures  \
0    5       1     2                 6                  45               1   

   num_medications  number_outpatient  number_emergency  number_inpatient  \
0               10                  2                 0                 1   

   number_diagnoses  A1Cresult  max_glu_serum  insulin  change  diabetesMed  \
0                 5          1              2        1       1            1   

   metformin  
0          1  

ðŸ“Š Prediction Result: ðŸ”´ High Risk: Patient likely to be readmitted
ðŸ“ˆ Readmission Probability: 56.00%
