In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib  # for saving the scaler



In [2]:
# Load the dataset
data = pd.read_csv("kidney_disease.csv")

In [3]:
# Display the first few rows of the dataset
print("Dataset Overview:")
display(data.head())


Dataset Overview:


Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [4]:
# Preprocessing
print("\nPreprocessing...")

# Drop the 'id' column if not relevant
if 'id' in data.columns:
    data = data.drop(['id'], axis=1)


Preprocessing...


In [5]:
# Handle missing values - Impute or drop rows/columns
data.fillna(data.mean(numeric_only=True), inplace=True)
data.fillna("unknown", inplace=True)

In [6]:
# Convert categorical columns to numeric using LabelEncoder
categorical_columns = data.select_dtypes(include=['object']).columns
encoder = LabelEncoder()
for col in categorical_columns:
    data[col] = encoder.fit_transform(data[col])

In [7]:
# Normalize the data (optional, useful for some models like KNN and SVM)
scaler = StandardScaler()
X = scaler.fit_transform(data.drop('classification', axis=1))  # Apply scaler to features
y = data['classification']

In [8]:
# Save the scaler to a file
joblib.dump(scaler, 'kidney_scaler.pkl')  # Save the scaler


['kidney_scaler.pkl']

In [9]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Train models
print("\nTraining models...")
models = {
    "Random Forest": RandomForestClassifier(),
    "K-Nearest Neighbor": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB(),
    "SVM": SVC(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss")
}

model_results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    model_results[name] = {
        "model": model,
        "accuracy": accuracy,
        "report": classification_report(y_test, predictions)
    }
    print(f"{name} Accuracy: {accuracy:.2f}")
    if name == "Random Forest":
        with open("Random_Forest.pkl", 'wb') as f:
            pickle.dump(model, f)
        joblib.dump(model, "Random_Forest.sav")


Training models...
Random Forest Accuracy: 1.00
K-Nearest Neighbor Accuracy: 0.94
Logistic Regression Accuracy: 0.96
Naive Bayes Accuracy: 0.94
SVM Accuracy: 0.97
XGBoost Accuracy: 0.99


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "use_label_encoder" } are not used.



In [17]:
import pickle
import numpy as np
# Load the saved Random Forest model
with open('Random_Forest_kidney.pkl', 'rb') as file:
    model = pickle.load(file)

In [18]:
# Custom Input Prediction
def predict_with_custom_input(model_name, custom_input):
    print(f"\nPredicting with {model_name}...")
    # model = model_results[model_name]["model"]

    # Load the scaler for transforming the custom input
    scaler = joblib.load('kidney_scaler.pkl')  # Load the scaler
    input_scaled = scaler.transform([custom_input])
    print("scaled_input: ",input_scaled)
    prediction = model.predict(input_scaled)

    return encoder.inverse_transform(prediction)

In [21]:
# Example: Custom input (replace with actual patient data)
custom_input = [53, 90, 1.02, 2, 0, 1, 1, 1, 0, 70, 107, 7.2, 114, 3.7, 9.5, 29, 12100, 3.7, 1,1,0,0,0,1]  # Replace with your input
print("\nCustom Input Prediction:")
predict_with_custom_input("Random_Forest", custom_input)[0]


Custom Input Prediction:

Predicting with Random_Forest...
scaled_input:  [[ 8.94566365e-02  1.00531122e+00  4.83354714e-01  7.73613260e-01
  -4.37796899e-01 -4.00964932e-01  4.63681212e-02  2.43266816e+00
  -2.50872603e-01 -1.04481804e+00  1.00711107e+00  7.35687065e-01
  -2.55948721e+00 -3.29246948e-01 -1.11562443e+00 -8.10671334e-02
   4.28092555e+02 -2.32705892e+00  2.69985103e-01 -2.60709748e+00
  -2.06474160e+00 -5.07981930e-01 -4.87273502e-01  9.75745187e-01]]




'ckd'

In [12]:
import joblib

# Find the model with the highest accuracy
best_model_name = max(model_results, key=lambda name: model_results[name]["accuracy"])
best_model = model_results["Random Forest"]["model"]

print(f"The best model is {best_model_name} with an accuracy of {model_results[best_model_name]['accuracy']:.2f}")

# Save the model to a file
joblib.dump(best_model, "best_model.pkl")
print("Model saved as 'best_model.pkl'")

The best model is Random Forest with an accuracy of 1.00
Model saved as 'best_model.pkl'


In [None]:
print("Feature order during training:")
print(data.drop('classification', axis=1).columns.tolist())


Feature order during training:
['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']


In [None]:
joblib.dump(scaler, "kidney_scaler.pkl")
joblib.dump(best_model, "best_kidney.pkl")


['best_kidney.pkl']