In [1]:
import pandas as pd
import numpy as np

#preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

#clasiffication
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC #support vector classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB #When features are not independent , Medical diagnosis (e.g., disease prediction)
from sklearn.ensemble import AdaBoostClassifier#Improving weak models (boosts simple models like decision stumps).
from sklearn.ensemble import GradientBoostingClassifier

#for model evaluation
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score


In [2]:
#import data
df = pd.read_csv("kidney_disease.csv")
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [3]:
important_clmn=["age","bp","sg","al","hemo","sc","htn","dm","cad","appet","pc","classification"]
df=df[important_clmn]
df

Unnamed: 0,age,bp,sg,al,hemo,sc,htn,dm,cad,appet,pc,classification
0,48.0,80.0,1.020,1.0,15.4,1.2,yes,yes,no,good,normal,ckd
1,7.0,50.0,1.020,4.0,11.3,0.8,no,no,no,good,normal,ckd
2,62.0,80.0,1.010,2.0,9.6,1.8,no,yes,no,poor,normal,ckd
3,48.0,70.0,1.005,4.0,11.2,3.8,yes,no,no,poor,abnormal,ckd
4,51.0,80.0,1.010,2.0,11.6,1.4,no,no,no,good,normal,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,15.7,0.5,no,no,no,good,normal,notckd
396,42.0,70.0,1.025,0.0,16.5,1.2,no,no,no,good,normal,notckd
397,12.0,80.0,1.020,0.0,15.8,0.6,no,no,no,good,normal,notckd
398,17.0,60.0,1.025,0.0,14.2,1.0,no,no,no,good,normal,notckd


In [4]:
for col in df.select_dtypes(include='object').columns:#columns stored as string
    df[col]=df[col].str.strip().str.replace("\t",'',regex=True)
df["classification"].value_counts()
# df["cad"].value_counts()

# df[col]: Selects the column named col from the DataFrame df.
#.str.strip(): .str allows string operations on each element of the column.
#.strip() removes leading and trailing whitespace (spaces, tabs, newlines) from each string in the column.
#.str.replace("\t", " "):
#.str.replace() replaces occurrences of a substring with another substring.
#Here, it replaces all tab characters (\t) with a single space (" ") inside each string in the column.

classification
ckd       250
notckd    150
Name: count, dtype: int64

In [5]:
#data cleaning and preprocessing
df.isnull().sum()

age                9
bp                12
sg                47
al                46
hemo              52
sc                17
htn                2
dm                 2
cad                2
appet              1
pc                65
classification     0
dtype: int64

In [6]:
#filling missing gap
df.fillna({'age': df['age'].median()}, inplace=True)
df.fillna({'bp':df['bp'].median()},inplace=True)
df.fillna({'sg':df['sg'].mode()[0]},inplace =True)
df.fillna({'al': df['al'].mode()[0]},inplace =True)
df.fillna({'hemo': df['hemo'].median()},inplace =True)
df.fillna({'sc': df['sc'].median()},inplace =True)
df.fillna({'htn': df['htn'].mode()[0]}, inplace=True)
df.fillna({'dm': df['dm'].mode()[0]}, inplace=True)
df.fillna({'cad': df['cad'].mode()[0]}, inplace=True)
df.fillna({'appet': df['appet'].mode()[0]}, inplace=True)
df.fillna({'pc': df['pc'].mode()[0]},inplace =True)
df.isnull().sum()

#mode for categorical -> yes and no

age               0
bp                0
sg                0
al                0
hemo              0
sc                0
htn               0
dm                0
cad               0
appet             0
pc                0
classification    0
dtype: int64

In [7]:
#encoding
df['htn']= df['htn'].map({"yes":1,"no":0})
df['dm']=df['dm'].map({"yes":1,"no":0})
df['cad']=df['cad'].map({"yes":1,"no":0})
df['appet'] = df['appet'].map({"good":1,"poor":0})
df['pc'] =df['pc'].map({"normal":1,"abnormal":0})
df['classification'] = df['classification'].map({"ckd":1,"notckd":0})
df


Unnamed: 0,age,bp,sg,al,hemo,sc,htn,dm,cad,appet,pc,classification
0,48.0,80.0,1.020,1.0,15.4,1.2,1,1,0,1,1,1
1,7.0,50.0,1.020,4.0,11.3,0.8,0,0,0,1,1,1
2,62.0,80.0,1.010,2.0,9.6,1.8,0,1,0,0,1,1
3,48.0,70.0,1.005,4.0,11.2,3.8,1,0,0,0,0,1
4,51.0,80.0,1.010,2.0,11.6,1.4,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,15.7,0.5,0,0,0,1,1,0
396,42.0,70.0,1.025,0.0,16.5,1.2,0,0,0,1,1,0
397,12.0,80.0,1.020,0.0,15.8,0.6,0,0,0,1,1,0
398,17.0,60.0,1.025,0.0,14.2,1.0,0,0,0,1,1,0


In [8]:
#Scaling normalization

#Selecting numerics  columns to normalize
numeric_cols=['age','bp','sg','al','hemo','sc']

#initializer  scaler
scaler = MinMaxScaler()

#fit-transform the numeric columns
df[numeric_cols]=scaler.fit_transform(df[numeric_cols])
df.head()


Unnamed: 0,age,bp,sg,al,hemo,sc,htn,dm,cad,appet,pc,classification
0,0.522727,0.230769,0.75,0.2,0.836735,0.010582,1,1,0,1,1,1
1,0.056818,0.0,0.75,0.8,0.557823,0.005291,0,0,0,1,1,1
2,0.681818,0.230769,0.25,0.4,0.442177,0.018519,0,1,0,0,1,1
3,0.522727,0.153846,0.0,0.8,0.55102,0.044974,1,0,0,0,0,1
4,0.556818,0.230769,0.25,0.4,0.578231,0.013228,0,0,0,1,1,1


In [9]:
#Data balancing
from imblearn.over_sampling import SMOTE

x = df.drop('classification',axis=1) # input
y = df['classification']# target

smote = SMOTE(random_state = 42)
x_balanced,y_balanced=smote.fit_resample(x,y)
y_balanced.value_counts()

classification
1    250
0    250
Name: count, dtype: int64

In [10]:
#train_test_split
x_train,x_test,y_train,y_test = train_test_split(x_balanced,y_balanced,test_size=0.2,random_state=42)

#check the shape
print("Train Shape:",x_train.shape)
print("Test shape:",x_test.shape)

Train Shape: (400, 11)
Test shape: (100, 11)


In [11]:
#DEfine models
models={
  "Logistic Regression": LogisticRegression(),
    "Support Vector Classifier": SVC(),
    "Random Forest Classifier":  RandomForestClassifier(),
    "KNeighbors Classifier":  KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Gaussian Naive Bayen": GaussianNB(),
    "AdaBoost Classifier":AdaBoostClassifier(),
    "Gradient Boosting Classifier":GradientBoostingClassifier()
}
for name ,model in models.items():
    print("="*50)
    print("Model:",name)

    #train model
    model.fit(x_train,y_train)

    #predict on test set
    y_pred = model.predict(x_test)

    #calculate metrics
    accuracy = accuracy_score(y_test,y_pred)
    classification_rep =classification_report(y_test,y_pred)
    conf_matrix =confusion_matrix(y_test,y_pred)

    #print metric
    print("Accuracy:",accuracy)
    print("classification Report:\n",classification_rep)
    print("Confusion Matrix:\n",conf_matrix)

Model: Logistic Regression
Accuracy: 0.96
classification Report:
               precision    recall  f1-score   support

           0       0.93      1.00      0.96        54
           1       1.00      0.91      0.95        46

    accuracy                           0.96       100
   macro avg       0.97      0.96      0.96       100
weighted avg       0.96      0.96      0.96       100

Confusion Matrix:
 [[54  0]
 [ 4 42]]
Model: Support Vector Classifier
Accuracy: 0.97
classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97        54
           1       1.00      0.93      0.97        46

    accuracy                           0.97       100
   macro avg       0.97      0.97      0.97       100
weighted avg       0.97      0.97      0.97       100

Confusion Matrix:
 [[54  0]
 [ 3 43]]
Model: Random Forest Classifier
Accuracy: 1.0
classification Report:
               precision    recall  f1-score   support

       

In [12]:
#selecting the best model
model_gbc= GradientBoostingClassifier()
model_gbc.fit(x_train,y_train)

y_pred=model_gbc.predict(x_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n",confusion_matrix(y_test,y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99        54
           1       1.00      0.98      0.99        46

    accuracy                           0.99       100
   macro avg       0.99      0.99      0.99       100
weighted avg       0.99      0.99      0.99       100

Confusion Matrix:
 [[54  0]
 [ 1 45]]


In [13]:
#Saving model,Encoder,scaler for production
import os
import pickle

# Create models folder if not exists
os.makedirs("models", exist_ok=True)

# Save properly
pickle.dump(scaler, open("models/scaler.pkl", 'wb'))
pickle.dump(model_gbc, open("models/model_gbc.pkl", 'wb'))


In [14]:
#inference predictionon new data
scaler = pickle.load(open("models/scaler.pkl", 'rb'))
model_gbc = pickle.load (open("models/model_gbc.pkl",'rb'))

In [21]:
def predict_chronic_disease(age,bp,sg,al,hemo,sc,htn,dm,cad,appet,pc):
    df_dict= {
    'age':[age],
    'bp':[bp],
    'sg':[sg],
    'al':[al],
    'hemo':[hemo],
    'sc':[sc],
    'htn':[htn],
    'dm':[dm],
    'cad':[cad],
    'appet':[appet],
    'pc':[pc]
    
    }
    df= pd.DataFrame(df_dict)

#ecoding
    df['htn']= df['htn'].map({"yes":1,"no":0})
    df['dm']=df['dm'].map({"yes":1,"no":0})
    df['cad']=df['cad'].map({"yes":1,"no":0})
    df['appet'] = df['appet'].map({"good":1,"poor":0})
    df['pc'] =df['pc'].map({"normal":1,"abnormal":0})

#scaling
    numeric_cols=['age','bp','sg','al','hemo','sc']
    df[numeric_cols]= scaler.transform(df[numeric_cols])

#prediction
    prediction = model_gbc.predict(df)

#return the prediction
    return prediction[0]

In [22]:
result = predict_chronic_disease(age=30,bp=20,sg=1.020,al=1.0,hemo=15.4,sc=1.2,htn="no",dm="no",cad="no",appet="good",pc="normal")
if result == 1:
    print("The patient has CKD")
else:
    print("The patient has no CKD")

The patient has no CKD


In [23]:
import sklearn
print(sklearn.__version__)


1.6.1
