# Import Libraries

In [1]:
import pandas as pd
import numpy as np

# preprocessing tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

# clasification models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# for model evaluation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load Dataset

In [2]:
data=pd.read_csv("kidney_disease.csv")
print(data)



      id   age    bp     sg   al   su     rbc        pc         pcc  \
0      0  48.0  80.0  1.020  1.0  0.0     NaN    normal  notpresent   
1      1   7.0  50.0  1.020  4.0  0.0     NaN    normal  notpresent   
2      2  62.0  80.0  1.010  2.0  3.0  normal    normal  notpresent   
3      3  48.0  70.0  1.005  4.0  0.0  normal  abnormal     present   
4      4  51.0  80.0  1.010  2.0  0.0  normal    normal  notpresent   
..   ...   ...   ...    ...  ...  ...     ...       ...         ...   
395  395  55.0  80.0  1.020  0.0  0.0  normal    normal  notpresent   
396  396  42.0  70.0  1.025  0.0  0.0  normal    normal  notpresent   
397  397  12.0  80.0  1.020  0.0  0.0  normal    normal  notpresent   
398  398  17.0  60.0  1.025  0.0  0.0  normal    normal  notpresent   
399  399  58.0  80.0  1.025  0.0  0.0  normal    normal  notpresent   

             ba  ...  pcv    wc   rc  htn   dm  cad appet   pe  ane  \
0    notpresent  ...   44  7800  5.2  yes  yes   no  good   no   no   
1    

# explanation of features

##### 'age' : Patient age (years)
##### 'bp' : Blood pressure (mm/Hg)
##### 'sg' : Urine specific gravity
##### 'al' : Albumin in urine (0–5)
##### 'hemo' : Hemoglobin level (g/dL)
##### 'sc' : Serum creatinine (mg/dL)
##### 'htn' : Hypertension (yes/no)
##### 'dm' : Diabetes mellitus (yes/no)
##### 'cad' : Coronary artery disease (yes/no)
##### 'appet' : Appetite status (good/poor)
##### 'pc' : Pus cell status (normal/abnormal)
##### 'classification' : CKD diagnosis (ckd/notckd)

# Exploratory Data Analysis

In [3]:
data.shape

(400, 26)

In [4]:
data.info()
data.describe()
data.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

id                  0
age                 9
bp                 12
sg                 47
al                 46
su                 49
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                44
bu                 19
sc                 17
sod                87
pot                88
hemo               52
pcv                70
wc                105
rc                130
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64

# Selecting Important Features

In [5]:
important_columns = ['age', 'bp', 'sg', 'al', 'hemo', 'sc','htn','dm','cad','appet','pc','classification']
data = data[important_columns]
data

# data=data.drop(['id','su','rbc','pcc','ba','bgr','bu','sod','pot','pcv','wc','rc','pe','ane'],axis=1)
# print(data)

Unnamed: 0,age,bp,sg,al,hemo,sc,htn,dm,cad,appet,pc,classification
0,48.0,80.0,1.020,1.0,15.4,1.2,yes,yes,no,good,normal,ckd
1,7.0,50.0,1.020,4.0,11.3,0.8,no,no,no,good,normal,ckd
2,62.0,80.0,1.010,2.0,9.6,1.8,no,yes,no,poor,normal,ckd
3,48.0,70.0,1.005,4.0,11.2,3.8,yes,no,no,poor,abnormal,ckd
4,51.0,80.0,1.010,2.0,11.6,1.4,no,no,no,good,normal,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,15.7,0.5,no,no,no,good,normal,notckd
396,42.0,70.0,1.025,0.0,16.5,1.2,no,no,no,good,normal,notckd
397,12.0,80.0,1.020,0.0,15.8,0.6,no,no,no,good,normal,notckd
398,17.0,60.0,1.025,0.0,14.2,1.0,no,no,no,good,normal,notckd


# Clean dataset

In [8]:
print(data.select_dtypes(include='object').columns)

Index(['htn', 'dm', 'cad', 'appet', 'pc', 'classification'], dtype='object')


In [9]:
for col in data.select_dtypes(include='object').columns:
    data[col]=data[col].str.strip().str.replace('\t','',regex=True)

data['classification'].value_counts()

classification
ckd       250
notckd    150
Name: count, dtype: int64

# Replacing Missing Values with mode and median

In [10]:
# List of numerical columns (median)
numerical_columns = ['age', 'bp','sc', 'hemo']

# List of categorical columns (mode)
categorical_columns = ['sg','al','htn', 'dm', 'cad','pc','appet']


# Handle missing values for numerical columns (using median)
for col in numerical_columns:
    data[col] = data[col].fillna(data[col].median())

# Handle missing values for categorical columns (using mode for this example)
for col in categorical_columns:
    data[col] = data[col].fillna(data[col].mode().iloc[0])

# Now, the data should have no missing values


In [13]:
data.isnull().sum()

age               0
bp                0
sg                0
al                0
hemo              0
sc                0
htn               0
dm                0
cad               0
appet             0
pc                0
classification    0
dtype: int64

In [14]:
data['pc'].value_counts()

pc
normal      324
abnormal     76
Name: count, dtype: int64

# Encode Categorical Variables

In [15]:
# # Save encoders for later use
# encoders = {}

# for col in data.columns:
#     if data[col].dtype == 'object':
#         encoders[col] = LabelEncoder()
#         data[col] = encoders[col].fit_transform(data[col])


In [16]:
# manually

data['htn'] = data['htn'].map({'yes':1, "no":0})
data['dm'] = data['dm'].map({'yes':1, "no":0})
data['cad'] = data['cad'].map({'yes':1, "no":0})
data['appet'] = data['appet'].map({'good':1, "poor":0})
data['pc'] = data['pc'].map({'normal':1, "abnormal":0})
data['classification'] = data['classification'].map({'ckd':1, "notckd":0})

data

Unnamed: 0,age,bp,sg,al,hemo,sc,htn,dm,cad,appet,pc,classification
0,48.0,80.0,1.020,1.0,15.4,1.2,1,1,0,1,1,1
1,7.0,50.0,1.020,4.0,11.3,0.8,0,0,0,1,1,1
2,62.0,80.0,1.010,2.0,9.6,1.8,0,1,0,0,1,1
3,48.0,70.0,1.005,4.0,11.2,3.8,1,0,0,0,0,1
4,51.0,80.0,1.010,2.0,11.6,1.4,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,15.7,0.5,0,0,0,1,1,0
396,42.0,70.0,1.025,0.0,16.5,1.2,0,0,0,1,1,0
397,12.0,80.0,1.020,0.0,15.8,0.6,0,0,0,1,1,0
398,17.0,60.0,1.025,0.0,14.2,1.0,0,0,0,1,1,0


# Feature Scaling

In [19]:
numeric_cols = ['age', 'bp', 'sg', 'al', 'hemo', 'sc']

# Initialize scaler
scaler = MinMaxScaler()

# Fit-transform the numeric columns
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

data.head()

Unnamed: 0,age,bp,sg,al,hemo,sc,htn,dm,cad,appet,pc,classification
0,0.522727,0.230769,0.75,0.2,0.836735,0.010582,1,1,0,1,1,1
1,0.056818,0.0,0.75,0.8,0.557823,0.005291,0,0,0,1,1,1
2,0.681818,0.230769,0.25,0.4,0.442177,0.018519,0,1,0,0,1,1
3,0.522727,0.153846,0.0,0.8,0.55102,0.044974,1,0,0,0,0,1
4,0.556818,0.230769,0.25,0.4,0.578231,0.013228,0,0,0,1,1,1


# Split Features and Target

In [20]:
X = data.drop('classification', axis=1)
y = data['classification']


# Applying SMOTE (To balance data)

In [21]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X,y)


In [22]:
y_balanced.value_counts()

classification
1    250
0    250
Name: count, dtype: int64

# Train-Test Split

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_test.shape)
print(y_train.shape)

(400, 11)
(100, 11)
(100,)
(400,)


# Model Comparison

In [24]:
models={
    "Logistic Regression": LogisticRegression(),
    "support vector classifier": SVC(),
    "Random forest classifier": RandomForestClassifier(),
    "K nearest neighbour": KNeighborsClassifier(),
    "Decision tree classifier": DecisionTreeClassifier(),
    "Naive bayes": GaussianNB(),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "GradientBoostingClassifier": GradientBoostingClassifier()
}

for name, models in models.items():
    print("Model:", name)
    models.fit(X_train,y_train)
    y_pred=models.predict(X_test)

 # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Print metrics
    print("Accuracy:", accuracy*100)
    print("Classification Report:\n", classification_rep)
    print("Confusion Matrix:\n", conf_matrix)    

Model: Logistic Regression
Accuracy: 96.0
Classification Report:
               precision    recall  f1-score   support

           0       0.93      1.00      0.96        54
           1       1.00      0.91      0.95        46

    accuracy                           0.96       100
   macro avg       0.97      0.96      0.96       100
weighted avg       0.96      0.96      0.96       100

Confusion Matrix:
 [[54  0]
 [ 4 42]]
Model: support vector classifier
Accuracy: 97.0
Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97        54
           1       1.00      0.93      0.97        46

    accuracy                           0.97       100
   macro avg       0.97      0.97      0.97       100
weighted avg       0.97      0.97      0.97       100

Confusion Matrix:
 [[54  0]
 [ 3 43]]
Model: Random forest classifier
Accuracy: 100.0
Classification Report:
               precision    recall  f1-score   support

     

# Selecting Best Model
### As we see most models give 100% accuracy, so we can go with Random forest Classifier

In [25]:

model_gbc=RandomForestClassifier(random_state=42)
model_gbc.fit(X_train,y_train)
Y_pred=model_gbc.predict(X_test)

accuracy = accuracy_score(y_test, Y_pred)
classification_rep = classification_report(y_test, Y_pred)
conf_matrix = confusion_matrix(y_test, Y_pred)

print("Accuracy:", accuracy*100)
print("Classification Report:\n", classification_rep)
print("Confusion Matrix:\n", conf_matrix) 

Accuracy: 100.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        54
           1       1.00      1.00      1.00        46

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100

Confusion Matrix:
 [[54  0]
 [ 0 46]]


# Save the Best Model, Encoders, and Scaler

In [None]:
import pickle

pickle.dump(scaler, open("models5/scaler.pkl",'wb'))
pickle.dump(model_gbc,open("models5/model_gbc.pkl",'wb'))

# Prediction on New Data

In [None]:
scaler = pickle.load(open("models5/scaler.pkl", 'rb'))  # Load the scaler
model_gbc = pickle.load(open("models5/model_gbc.pkl", 'rb'))  # Load the trained model

def predict_chronic_disease(age, bp, sg, al, hemo, sc, htn, dm, cad, appet, pc):
    # Create a DataFrame with input variables, following the correct order
    df_dict = {
        'age': [age],
        'bp': [bp],
        'sg': [sg],
        'al': [al],
        'hemo': [hemo],
        'sc': [sc],
        'htn': [htn],
        'dm': [dm],
        'cad': [cad],
        'appet': [appet],
        'pc': [pc]
    }
    df = pd.DataFrame(df_dict)

    # Encode the categorical columns
    df['htn'] = df['htn'].map({'yes':1, "no":0})
    df['dm'] = df['dm'].map({'yes':1, "no":0})
    df['cad'] = df['cad'].map({'yes':1, "no":0})
    df['appet'] = df['appet'].map({'good':1, "poor":0})
    df['pc'] = df['pc'].map({'normal':1, "abnormal":0})

    # Scale the numeric columns using the previously fitted scaler
    numeric_cols = ['age', 'bp', 'sg', 'al', 'hemo', 'sc']
    df[numeric_cols] = scaler.transform(df[numeric_cols])

    # Make the prediction
    prediction = model_gbc.predict(df)

    # Return the predicted class
    return prediction

# Example usage:
result = predict_chronic_disease(age=30,
                                bp=20,
                                sg=1.020,
                                al=1.0,
                                hemo=15.4,
                                sc=1.2,
                                htn="no",
                                dm="no",
                                cad='no',
                                appet='good',
                                pc='normal')

if result == 1:
    print("The Patient Has CKD....")
else:
    print("The Patient Has not CKD....")

The Patient Has not CKD....


In [29]:
result = predict_chronic_disease(
    age=65,     # Older age is a risk factor for CKD
    bp=160,     # High blood pressure
    sg=1.030,   # Abnormal specific gravity (suggestive of kidney dysfunction)
    al=3.0,     # Elevated albumin in urine (indicative of kidney damage)
    hemo=9.0,   # Low hemoglobin (anemia is common in CKD)
    sc=2.0,     # Elevated serum creatinine (indicating kidney dysfunction)
    htn="yes",   # Hypertension present
    dm="yes",       # Diabetes mellitus present
    cad="yes",      # Coronary artery disease (cardiovascular issues)
    appet="poor",    # Lack of appetite (common in advanced kidney disease)
    pc='abnormal'  # Abnormal proteinuria (strong indicator of kidney damage)
)

# Check the result and print the appropriate message
if result == 1:
    print("The Patient Has CKD....")
else:
    print("The Patient Has not CKD....")

The Patient Has CKD....
