# Chronic Kidney Disease Prediction - Model Comparison
This notebook compares several ML models to predict the likelihood of having diabetes.

In [None]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv("kidney_disease.csv")
# df.dropna(inplace=True)
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [None]:
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
df.dropna(inplace=True)
df["classification"] = df["classification"].map({"ckd": 1, "notckd": 0})
categorical_columns = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
encoder = OrdinalEncoder()
df[categorical_columns] = encoder.fit_transform(df[categorical_columns])

## Prepare the data

In [None]:
X = df.drop(columns=["classification"])
y = df["classification"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
print(y_train.isna().sum())
print(df["classification"].isna().sum())

0
0


## Train and evaluate models

In [None]:
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(probability=True),
    "KNN": KNeighborsClassifier(),
    # "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    proba = model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, preds)
    auc = roc_auc_score(y_test, proba)

    results.append((name, acc, auc))
    print(f"Model: {name}")
    print(f"Accuracy: {acc:.4f}, AUC: {auc:.4f}")
    print(classification_report(y_test, preds))
    print("-" * 40)

best_model_name, best_acc, best_auc = max(results, key=lambda x: x[2])
print(f"Best Model: {best_model_name} (AUC = {best_auc:.4f})")

best_model = models[best_model_name]


Model: Random Forest
Accuracy: 1.0000, AUC: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00         9

    accuracy                           1.00        32
   macro avg       1.00      1.00      1.00        32
weighted avg       1.00      1.00      1.00        32

----------------------------------------
Model: Logistic Regression
Accuracy: 1.0000, AUC: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00         9

    accuracy                           1.00        32
   macro avg       1.00      1.00      1.00        32
weighted avg       1.00      1.00      1.00        32

----------------------------------------
Model: SVM
Accuracy: 0.8125, AUC: 0.8502
              precision    recall  f1-score   support

           0       0.79      1.00      0.88        23
      

## Make a Prediction with Best Model

In [None]:
input_data = {
    "id": 402,
    "age": 25,
    "bp": 70,
    "sg": 1.02,
    "al": 3,
    "su": 0,
    "rbc": "normal",
    "pc": "abnormal",
    "pcc": "present",
    "ba": "not present",
    "bgr": 250,
    "bu": 20,
    "sc": 1.2,
    "sod": 101,
    "pot": 3.1,
    "hemo": 14.2,
    "pcv": 35,
    "wc": 7600,
    "rc": 2.5,
    "htn": "yes",
    "dm": "no",
    "cad": "no",
    "appet": "poor",
    "pe": "yes",
    "ane": "no"
}

input_df = pd.DataFrame([input_data])
categorical_columns = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
encoder = OrdinalEncoder()
input_df[categorical_columns] = encoder.fit_transform(input_df[categorical_columns])

proba = best_model.predict_proba(input_df)[0][1]
print(f"Chance of having chronic kidney disease: {proba * 100:.2f}%")

prediction = best_model.predict(input_df)[0]
print("Prediction:", "Has chronic kidney disease" if prediction == 1 else "No chronic kidney disease")


Chance of having chronic kidney disease: 52.00%
Prediction: Has chronic kidney disease


In [None]:
import joblib
joblib.dump(best_model, "best_chronic_kidney_disease_model.pkl")

['best_chronic_kidney_disease_model.pkl']