In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier


In [None]:
# Read the kidney-disease CSV into `data` (the path is relative to the notebook's
# working directory) and preview the first rows.
data = pd.read_csv("../input/ckdisease/kidney_disease.csv")
data.head()

In [None]:
# Show the (rows, columns) shape of the raw frame
data.shape

In [None]:
# List the column labels of the raw frame
data.columns

In [None]:
# Print per-column dtype and non-null counts to see which columns have missing values
data.info()

# Data Preprocessing


In [None]:
# Names of the columns whose dtype is `object`
categorial_cols = [name for name, dtype in data.dtypes.items() if dtype == "object"]
categorial_cols

In [None]:
# Every remaining column, i.e. those not listed in `categorial_cols`
numerical_cols = [col for col in data.columns if col not in categorial_cols]
numerical_cols

In [None]:
# 'rc', 'wc' and 'pcv' are string columns here; pull the first number out of
# each cell and convert it to float (cells without any digits become NaN).
# The pattern is a raw string and accepts an optional fractional part: the
# previous '(\d+)' kept only the leading digit run, so a value such as "5.2"
# was silently truncated to 5.
for i in ['rc','wc','pcv']:
    data[i] = data[i].str.extract(r'(\d+(?:\.\d+)?)', expand=False).astype(float)

## Simple Imputing

In [None]:
# Fill the missing values of each numeric column with that column's mean.
# The result is assigned back to the column instead of calling
# `data[i].fillna(..., inplace=True)`: the in-place call operates on a temporary
# Series, which pandas warns about and which does not update `data` under
# Copy-on-Write.
for i in ['age','bp','sg','al','su','bgr','bu','sc','sod','pot','hemo','rc','wc','pcv']:
    data[i] = data[i].fillna(data[i].mean())

## One-Hot Encoding

In [None]:
# One-hot encode the categorical column 'rbc'.
# get_dummies() encodes NaN as an all-zero row, which with drop_first=True is
# indistinguishable from the dropped first category. Missing values are
# therefore filled with the most frequent label before encoding.
rbc_filled = data[["rbc"]].fillna(data["rbc"].mode().iloc[0])
rbc = pd.get_dummies(rbc_filled,drop_first=True)
rbc.head()

In [None]:
# One-hot encode the categorical column 'pc'.
# get_dummies() encodes NaN as an all-zero row, which with drop_first=True is
# indistinguishable from the dropped first category. Missing values are
# therefore filled with the most frequent label before encoding.
pc_filled = data[["pc"]].fillna(data["pc"].mode().iloc[0])
pc = pd.get_dummies(pc_filled,drop_first=True)
pc.head()

In [None]:
# One-hot encode the categorical column 'pcc'.
# get_dummies() encodes NaN as an all-zero row, which with drop_first=True is
# indistinguishable from the dropped first category. Missing values are
# therefore filled with the most frequent label before encoding.
pcc_filled = data[["pcc"]].fillna(data["pcc"].mode().iloc[0])
pcc = pd.get_dummies(pcc_filled,drop_first=True)
pcc.head()

In [None]:
# One-hot encode the categorical column 'ba'.
# get_dummies() encodes NaN as an all-zero row, which with drop_first=True is
# indistinguishable from the dropped first category. Missing values are
# therefore filled with the most frequent label before encoding.
ba_filled = data[["ba"]].fillna(data["ba"].mode().iloc[0])
ba = pd.get_dummies(ba_filled,drop_first=True)
ba.head()

In [None]:
# Remove the original categorical columns that were one-hot encoded above
encoded_source_cols = ["rbc", "pc", "pcc", "ba"]
data = data.drop(columns=encoded_source_cols)

In [None]:
# Preview the frame after the categorical source columns were dropped
data.head()

In [None]:
# Append the one-hot encoded columns to the frame, side by side
dummy_frames = [rbc, pc, pcc, ba]
data = pd.concat([data, *dummy_frames], axis="columns")

In [None]:
# Preview the frame with the one-hot encoded columns appended
data.head()

In [None]:
# Re-check dtypes and non-null counts after imputation and encoding
data.info()

In [None]:
# Decade index of every age (age truncated to an integer, then divided by 10), as a list
k = (data["age"].astype(int) // 10).tolist()

In [None]:
# Count the records per ten-year age band (0-9, 10-19, ..., 90-99)
a = [0] * 10
for decade in k:
    a[decade] += 1
a

In [None]:
# Pie chart of how many records fall into each ten-year age band
x_labels = [f"{start}-{start + 9}" for start in range(0, 100, 10)]
y_labels = np.array(a)
plt.figure(figsize=(7, 7))
plt.pie(y_labels, labels=x_labels)
plt.show()

In [None]:
# Count how often each raw label occurs in the target column
data["classification"].value_counts()

In [None]:
# Encode the target labels as integers: notckd -> 0, ckd -> 1 (the "ckd\t" key
# covers a label variant with a trailing tab).
# replace() is not guaranteed to downcast an object column to a numeric dtype
# (pandas 2.2 deprecates that implicit downcast), so infer_objects() is called
# explicitly; otherwise 'classification' can stay `object` and be rejected as
# a target by the classifiers.
data = data.replace({"notckd":0,"ckd":1,"ckd\t":1}).infer_objects()

In [None]:
# Bar plot of the mean encoded class (0/1) for each blood-pressure value
plt.figure(figsize=(25, 10))
sns.barplot(data=data, x="bp", y="classification")
plt.show()

In [None]:
# Swarm plot of the age distribution within each class
plt.figure(figsize=(10, 10))
sns.swarmplot(data=data, x="classification", y="age")

In [None]:
# Count how often each appetite label occurs
data["appet"].value_counts()

In [None]:
# Bar plot of the mean encoded class (0/1) for each appetite label
plt.figure(figsize=(7, 7))
sns.barplot(data=data, x="appet", y="classification")
plt.show()

In [None]:
# Encode good -> 1 and poor -> 0 wherever those labels occur in the frame.
# infer_objects() is called explicitly because replace() is not guaranteed to
# downcast an object column to a numeric dtype (the implicit downcast is
# deprecated in pandas 2.2), and the encoded columns are later used as model
# features.
data = data.replace({"good":1,"poor":0}).infer_objects()

In [None]:
# Count how often each label occurs in the 'ane' column
data["ane"].value_counts()

In [None]:
# Encode no -> 0 and yes -> 1 wherever those labels occur in the frame; the
# extra keys cover variants with a leading tab or space.
# infer_objects() is called explicitly because replace() is not guaranteed to
# downcast an object column to a numeric dtype (the implicit downcast is
# deprecated in pandas 2.2), and the encoded columns are later used as model
# features.
data = data.replace({"no":0,"yes":1,"\tno":0,"\tyes":1," yes":1}).infer_objects()

In [None]:
# Re-check dtypes and non-null counts after the label replacements
data.info()

In [None]:
def _fill_with_most_frequent(column):
    """Return `column` with its nulls replaced by its most frequent value."""
    most_frequent = column.value_counts().index[0]
    return column.fillna(most_frequent)


# Any nulls still left are replaced, column by column, with the most frequent value
data = data.apply(_fill_with_most_frequent)

In [None]:
# Confirm the non-null counts after the most-frequent-value fill
data.info()

# Splitting Train Data and Test Data

In [None]:
# List the columns available for feature selection
data.columns

In [None]:
# X holds the feature columns fed to the models; y holds the target column
feature_cols = ['age', 'bp', 'rc', 'wc', 'appet', 'pc_normal', 'htn', 'hemo', 'bgr', 'dm', 'ane']
X = data[feature_cols]
y = data["classification"]

In [None]:
# Hold out 20% of the rows as the test set; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
X_train.head()

# MODELS:

## Random Forest

In [None]:
# Fit a random forest classifier on the training split.
# random_state is fixed (same seed as the train/test split) so the fitted
# forest, and therefore the scores reported below, are reproducible across
# runs instead of changing on every execution.
model = RandomForestClassifier(random_state=0)
model.fit(X_train,y_train)

In [None]:
# Predict class labels for the test features with the fitted random forest
predictions = model.predict(X_test)

In [None]:
# Mean absolute error between the test labels and the random forest predictions
rf_mae = mean_absolute_error(y_test, predictions)
print("the mean absolute error by using the RandomForest is", rf_mae)

In [None]:
# Random forest accuracy on the training split, as a percentage
rf_train_accuracy = model.score(X_train, y_train) * 100
print("the accuracy of the train data is ", rf_train_accuracy)

In [None]:
# Random forest accuracy on the test split, as a percentage
rf_test_accuracy = model.score(X_test, y_test) * 100
print("the accuracy of the test data is", rf_test_accuracy)

## XGBoost (Extreme Gradient Boosting)

In [None]:
# Fit an XGBoost classifier, constructed with no arguments, on the training split
mod = XGBClassifier()
mod.fit(X_train,y_train)

In [None]:
# Mean absolute error between the test labels and the XGBoost predictions
xgb_predictions = mod.predict(X_test)
print(mean_absolute_error(y_test, xgb_predictions))

In [None]:
# XGBoost accuracy on the training split, as a percentage
xgb_train_accuracy = mod.score(X_train, y_train) * 100
print("the accuracy of the train data is ", xgb_train_accuracy)

In [None]:
# XGBoost accuracy on the test split, as a percentage
xgb_test_accuracy = mod.score(X_test, y_test) * 100
print("the accuracy of the test data is", xgb_test_accuracy)

## Support Vector Machine

In [None]:
# Fit a support vector classifier on the training split.
# SVMs are not scale invariant, and the selected features are left in their raw
# units, so the features are standardised first. The scaler sits inside the
# pipeline, so it is fitted on the training split only and re-applied
# automatically by predict() and score().
model2 = make_pipeline(StandardScaler(), SVC())
model2.fit(X_train,y_train)

In [None]:
# Mean absolute error between the test labels and the SVC predictions
svc_predictions = model2.predict(X_test)
print("the mean absolute error is", mean_absolute_error(y_test, svc_predictions))

In [None]:
# SVC accuracy on the training split, as a percentage
svc_train_accuracy = model2.score(X_train, y_train) * 100
print("the accuracy of the train data is ", svc_train_accuracy)

In [None]:
# SVC accuracy on the test split, as a percentage
svc_test_accuracy = model2.score(X_test, y_test) * 100
print("the accuracy of the test data is", svc_test_accuracy)

# Dumping the best model into a pickle file
Of the three models above, Random Forest gives the best accuracy, so we use Random Forest for this project.

In [None]:
#import pickle
#file = open("mainBookpickle.pkl","wb")
#pickle.dump(model,file)

In [None]:
#checking the test accuracy with the model in the pickle file
#mod1 = pickle.load(open("mainBookpickle.pkl","rb"))
#print(mod1.score(X_test,y_test)*100)

In [None]:
nan