# Project Chronic_Kidney_Disease

[Chronic_Kidney_Disease Data Set](https://archive.ics.uci.edu/ml/datasets/Chronic_Kidney_Disease)

### Introduction

> Chronic kidney disease is one of the most common and most dangerous diseases affecting humans. It is defined as a long-term condition in which the kidneys do not work as well as they should. It is a common condition, often associated with aging. It can affect anyone, but it is more common in some countries, such as those in South Asia.

> We used a dataset whose features are clinical indicators associated with chronic kidney disease, and applied machine learning algorithms to classify whether or not a person has the disease.

### Loading the libraries

In [None]:
# Move the working directory one level up. The path is relative, so re-running
# this cell moves up again. Presumably the notebook lives in a sub-folder and the
# later relative path 'Dataset/kidney_disease.csv' is meant to resolve from the
# parent folder — TODO confirm.
import os
os.chdir('../')
# IPython shell escape: run the `dir` command to list the new working directory.
!dir

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px

# to read .arff files
from scipy.io import arff

import warnings
warnings.filterwarnings('ignore')

# Display all the columns of the dataframes (None removes the column limit).
# Use the public `pd.set_option`; `pd.pandas` is not a documented attribute and
# only resolves as a side effect of pandas' own internal imports.
pd.set_option('display.max_columns', None)

In [None]:
# Load the kidney-disease CSV (path is relative to the current working directory)
# into the notebook-wide DataFrame `df`, then preview the first rows.
df = pd.read_csv('Dataset/kidney_disease.csv')
df.head()

In [None]:
# Show the shape of the dataset as (number of rows, number of columns)
df.shape

In [None]:
# Summary statistics of the dataset as reported by DataFrame.describe()
df.describe()

In [None]:
# Show information about the data: column names, non-null counts and dtypes
df.info()

In [None]:
# The 'id' column is removed from `df` in place.
df.drop(columns=['id'], inplace=True)

In [None]:
# Rename the columns to more user-friendly names.
# The assignment is positional: `df` must have exactly these 25 columns in this
# order (i.e. after the 'id' column has been dropped), with the target last.
# NOTE(review): 'peda_edema' and 'aanemia' look like typos of 'pedal_edema' and
# 'anaemia'; they are left untouched because other code may rely on these exact
# names — confirm before renaming.

df.columns = ['age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar', 'red_blood_cells', 'pus_cell',
              'pus_cell_clumps', 'bacteria', 'blood_glucose_random', 'blood_urea', 'serum_creatinine', 'sodium',
              'potassium', 'haemoglobin', 'packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count',
              'hypertension', 'diabetes_mellitus', 'coronary_artery_disease', 'appetite', 'peda_edema',
              'aanemia', 'class']
df.head()

In [None]:
# Convert the columns that should be numeric to a numeric dtype.
# errors='coerce' turns any value that cannot be parsed into NaN.
for numeric_name in ('packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count'):
    df[numeric_name] = pd.to_numeric(df[numeric_name], errors='coerce')

# Show the dtypes again after the conversion
df.info()

In [None]:
# Print the distinct values of every column together with how many there are
for col in df.columns:
    distinct = df[col].unique()
    print(f"{col} has {distinct} | \n{len(distinct)} values\n")

In [None]:
# Replace incorrectly typed labels (stray tabs / leading spaces) with the
# canonical values.

# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the in-place form operates on an intermediate object and is
# not guaranteed to update `df` (it has no effect under pandas Copy-on-Write).
df['diabetes_mellitus'] = df['diabetes_mellitus'].replace(
    to_replace={'\tno': 'no', '\tyes': 'yes', ' yes': 'yes'})

df['coronary_artery_disease'] = df['coronary_artery_disease'].replace(to_replace='\tno', value='no')

df['class'] = df['class'].replace(to_replace={'ckd\t': 'ckd', 'notckd': 'not ckd'})

# Encode the target: ckd == 0 and not ckd == 1.
# Any label missing from the mapping becomes NaN.
df['class'] = df['class'].map({'ckd': 0, 'not ckd': 1})
df['class'] = pd.to_numeric(df['class'], errors='coerce')

In [None]:
# Re-check the three cleaned columns: distinct values and their count
cols = ['diabetes_mellitus', 'coronary_artery_disease', 'class']

for col in cols:
    values = df[col].unique()
    print(f"{col} has {values} | \n{len(values)} values\n")

### Plots to explore the data

In [None]:
# Split the column names by dtype: object columns are treated as categorical,
# everything else as numerical.

cat_cols = [name for name, kind in df.dtypes.items() if kind == 'object']
num_cols = [name for name, kind in df.dtypes.items() if kind != 'object']

In [None]:
# Print the distinct values of each categorical column

for col, series in df[cat_cols].items():
    print(f"{col} has {series.unique()} values\n")

In [None]:
# Check the distribution of every numerical feature (one subplot per column).
# The grid is sized from the number of columns instead of a fixed 5x3, so the
# cell keeps working when the set of numerical columns changes.
n_plot_cols = 3
n_plot_rows = -(-len(num_cols) // n_plot_cols)  # ceiling division

plt.figure(figsize=(30, 25))
for plotnumber, column in enumerate(num_cols, start=1):
    ax = plt.subplot(n_plot_rows, n_plot_cols, plotnumber)
    # histplot(kde=True, stat='density') is the replacement for the deprecated
    # sns.distplot: a density-normalised histogram with a KDE curve on top.
    sns.histplot(df[column], kde=True, stat='density', color='blue', ax=ax)
    plt.xlabel(column)

plt.tight_layout()
plt.show()

In [None]:
# Look at the value counts of every categorical column (one subplot per column).
# The grid is sized from the number of columns instead of a fixed 6x2.
n_plot_cols = 2
n_plot_rows = -(-len(cat_cols) // n_plot_cols)  # ceiling division

plt.figure(figsize=(30, 35))
for plotnumber, column in enumerate(cat_cols, start=1):
    ax = plt.subplot(n_plot_rows, n_plot_cols, plotnumber)
    # Pass the column as `x=`: recent seaborn versions read the first
    # positional argument as `data`, not as the variable to count.
    # `color` is dropped because the palette already determines the bar colors.
    sns.countplot(x=df[column], palette='Set2', ax=ax)
    plt.xlabel(column)

plt.tight_layout()
plt.show()

In [None]:
# Calculate the correlation matrix and draw it as an annotated heatmap.
# Only numeric columns are used: at this point `df` still contains string
# columns, and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
cor = df.select_dtypes(include='number').corr()
plt.figure(figsize = (20, 15))
sns.heatmap(cor, annot = True, linewidths = 1, linecolor = 'lightgrey')
plt.show()

In [None]:
# Scatter plot of age against blood pressure, colored by the encoded class
# (0 = ckd, 1 = not ckd).
px.scatter(df, x="age", y="blood_pressure", color="class")

In [None]:
# Scatter plot of age against sugar, colored by the encoded class
# (0 = ckd, 1 = not ckd).
px.scatter(df, x="age", y="sugar", color="class")

In [None]:
# Scatter plot of age against haemoglobin, colored by class, with a box plot on
# the x margin and a violin plot on the y margin.
px.scatter(df, x="age", y="haemoglobin", color="class", marginal_y="violin", marginal_x="box")

In [None]:
# Scatter plot of age against bacteria (still a text label here; label
# encoding happens in a later cell), colored by class, with marginal plots.
px.scatter(df, x="age", y="bacteria", color="class", marginal_y="violin", marginal_x="box")

In [None]:
# Violin plot of age for each sugar level, split by class, with an inner box
# plot and every individual point drawn.
px.violin(df, y="age", x="sugar", color="class", box=True, points="all")

In [None]:
# 3-D scatter plot of age, hypertension and sugar, colored by class.
px.scatter_3d(df, x='age', y='hypertension', z='sugar', color='class')

### Data processing

In [None]:
# Count the missing values per column, most-affected columns first.
# Nothing is corrected in this cell; it only reports the counts.

df.isna().sum().sort_values(ascending = False)

In [None]:
# Missing-value count for each numerical column
df[num_cols].isnull().sum()

In [None]:
# Missing-value count for each categorical column
df[cat_cols].isnull().sum()

In [None]:
def random_value_imputation(feature, data=None, random_state=None):
    """Fill the missing values of ``feature`` with randomly sampled observed values.

    Each missing entry is replaced, in place, by a value drawn from the
    non-missing entries of the same column, so the imputed column keeps roughly
    the same distribution as the observed data.

    Parameters
    ----------
    feature : str
        Name of the column to impute.
    data : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level ``df``.
    random_state : int or numpy random generator, optional
        Forwarded to ``Series.sample`` so the imputation can be reproduced.

    Raises
    ------
    ValueError
        If the column has missing values but no observed value to sample from.
    """
    frame = df if data is None else data
    missing = frame[feature].isna()
    n_missing = int(missing.sum())
    if n_missing == 0:
        return  # nothing to impute

    observed = frame[feature].dropna()
    if observed.empty:
        raise ValueError(f"cannot impute {feature!r}: the column has no observed values")

    # Sampling without replacement fails when there are more gaps than observed
    # values, so fall back to sampling with replacement in that case only.
    random_sample = observed.sample(
        n_missing, replace=n_missing > len(observed), random_state=random_state)

    # Assign positionally so the result does not depend on index alignment
    # (safe even when the frame's index contains duplicate labels).
    frame.loc[missing, feature] = random_sample.to_numpy()
    
def impute_mode(feature, data=None):
    """Fill the missing values of ``feature`` with the column's most frequent value.

    Parameters
    ----------
    feature : str
        Name of the column to impute.
    data : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level ``df``.

    Raises
    ------
    ValueError
        If the column contains no observed value, so no mode exists.
    """
    frame = df if data is None else data
    modes = frame[feature].mode()
    if modes.empty:
        # mode() ignores missing values, so an all-missing column has no mode.
        raise ValueError(f"cannot impute {feature!r}: the column has no observed values")
    # When several values tie, mode() returns them sorted; use the first one.
    frame[feature] = frame[feature].fillna(modes.iloc[0])

In [None]:
# Fill the missing values of every numerical column by random sampling from
# that column's observed values, then confirm that no nulls remain.
# NOTE(review): num_cols is built from dtype alone, so it includes the encoded
# 'class' target when the cells are run in order; any missing label would be
# imputed at random here — confirm this is intended.

for col in num_cols:
    random_value_imputation(col)
df[num_cols].isnull().sum()

In [None]:
# Fill "red_blood_cells" and "pus_cell" by random sampling and the rest of
# cat_cols by mode imputation.

random_value_imputation('red_blood_cells')
random_value_imputation('pus_cell')

# The loop also visits the two columns filled above; they have no nulls left,
# so the mode fill leaves them unchanged.
for col in cat_cols:
    impute_mode(col)
df[cat_cols].isnull().sum()

### Feature Encoding

In [None]:
# Report how many distinct categories each categorical column holds
for col, series in df[cat_cols].items():
    print(f"{col} has {series.nunique()} categories\n")

### Since all of our categorical columns have two categories, we can use the label encoder

In [None]:
# Replace the text labels of every categorical column with integer codes.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# The same encoder object is re-fitted on each column, so after the loop `le`
# only remembers the label-to-code mapping of the last column in cat_cols.
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

df.head()

### Using the machine learning model

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics

import pickle

In [None]:
# Split the data into features (every column except the last) and target (the
# last column, which is 'class' after the rename cell).
x = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Hold out 30% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [None]:
# Report the shape of each part of the split. The last two lines print the
# target shapes, so they are labelled y_train / y_test rather than repeating
# the X labels.
print("The shape of X_train",X_train.shape)
print("The shape of X_test",X_test.shape)
print("The shape of y_train",y_train.shape)
print("The shape of y_test",y_test.shape)

### Random Forest Classifier

In [None]:
# Fit a Random Forest classifier (10 trees, entropy split criterion, fixed
# seed) on the training set.
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 10)
classifier.fit(X_train, y_train)

In [None]:
# Predict the test set with the Random Forest and show the first ten predictions
y_pred = classifier.predict(X_test)
y_pred[:10]

In [None]:
# Mean accuracy of the Random Forest on the training and the test set; a large
# gap between the two would point to over-fitting.
print('model_RF Train Score is : ' , classifier.score(X_train, y_train))
print('model_RF Test Score is : ' , classifier.score(X_test, y_test))

In [None]:
print("Accuracy_score:", round((accuracy_score(y_test, y_pred))*100,2),'%')

print("Loss:", round((1-accuracy_score(y_test, y_pred))*100,2),'%')

print("Cohen_kappa_score:", round((cohen_kappa_score(y_test, y_pred))*100,2),'%')

print("Classification_report:\n",metrics.classification_report(y_test, y_pred))

# print("confusion_matrix:\n", confusion_matrix(y_test, y_pred))
print("confusion_matrix:\n", confusion_matrix(y_test, y_pred))


fig, ax = plt.subplots()
fig.set_size_inches(8,6) # WH
sns.heatmap(confusion_matrix(y_test, y_pred), 
           annot=True,
                 linewidths = 2,
                linecolor = "blue",
                center=0)

In [None]:
# Show the feature names next to the raw values of the first test row
X_test.columns, X_test.iloc[0,:].values

In [None]:
# Predict the class of the first test row with the Random Forest.
# iloc[[0]] keeps a one-row DataFrame, so the column names the model was fitted
# with are passed along (a bare array of values drops them and makes
# scikit-learn warn about missing feature names).
# Note: this overwrites y_pred, which previously held the full test-set predictions.
y_pred = classifier.predict(X_test.iloc[[0]])
y_pred

In [None]:
# True label of the first test row, for comparison with the prediction
y_test.iloc[0]

In [None]:
# Feature values of the first test row, labelled by column name
X_test.iloc[0,:]

In [None]:
# Predict a hand-written sample with the Random Forest. The 24 values must
# follow the order of the feature columns (every column except 'class');
# categorical features are given as their integer codes.
# NOTE(review): which text label each code stands for is not recorded anywhere
# in this notebook — confirm the codes against the label encoding.
input_list = np.array([27,60,1.009,2,2,1,1,0,0,42,102,45,5.5,10,10,25,2500,3.6,0,0,0,0,0,0])

y_pred = classifier.predict([input_list])
y_pred

In [None]:
# Save the Random Forest model to disk.
# The `with` block closes the file even if pickling fails; the original left
# the handle open, which can leave the pickle unflushed.
filename = 'rf_Classifier.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [None]:
# Show the feature names next to the raw values of the second test row
X_test.columns, X_test.iloc[1,:].values

In [None]:
# Second test row (labelled features) together with its true label
X_test.iloc[1,:], y_test.iloc[1]

In [None]:
# An SVM was tried here originally (kept below for reference):
# from sklearn.svm import SVC
# svm_classifier = SVC()

# NOTE(review): despite its name, `svm_classifier` holds a Decision Tree
# (entropy criterion, fixed seed). The name is kept because the following
# cells refer to it.
from sklearn.tree import DecisionTreeClassifier
svm_classifier = DecisionTreeClassifier(criterion='entropy', random_state=101)

svm_classifier.fit(X_train, y_train)

In [None]:
# Predict the test set with the model stored in `svm_classifier` (a Decision
# Tree) and show the first ten predictions
y_pred = svm_classifier.predict(X_test)
y_pred[:10]

In [None]:
# Mean accuracy of the Decision Tree on the training and the test set.
# The output is labelled model_DT because `svm_classifier` holds a
# DecisionTreeClassifier, not an SVM.
print('model_DT Train Score is : ' , svm_classifier.score(X_train, y_train))
print('model_DT Test Score is : ' , svm_classifier.score(X_test, y_test))

In [None]:
# Evaluate the Decision Tree predictions (y_pred) against the test labels.
# Accuracy and the confusion matrix are computed once and reused.
dt_accuracy = accuracy_score(y_test, y_pred)
dt_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy_score:", round(dt_accuracy*100,2),'%')

# "Loss" here is the misclassification rate (1 - accuracy), not the log-loss.
print("Loss:", round((1-dt_accuracy)*100,2),'%')

print("Cohen_kappa_score:", round((cohen_kappa_score(y_test, y_pred))*100,2),'%')

print("Classification_report:\n",metrics.classification_report(y_test, y_pred))

print("confusion_matrix:\n", dt_confusion)

# Draw the confusion matrix; fmt='d' prints the cell counts as plain integers
# instead of the default '.2g' format (which shows e.g. 120 as 1.2e+02).
fig, ax = plt.subplots()
fig.set_size_inches(8,6) # width, height
sns.heatmap(dt_confusion,
            annot=True,
            fmt='d',
            linewidths = 2,
            linecolor = "blue",
            center=0,
            ax=ax)

In [None]:
# Predict the first test row with the Decision Tree and show it next to the
# true label. This section evaluates `svm_classifier`; the original call used
# `classifier`, which is the Random Forest fitted earlier.
y_pred = svm_classifier.predict([X_test.iloc[0,:].values])
y_pred, y_test.iloc[0]

In [None]:
# Predict a hand-written sample with the Decision Tree. The 24 values must
# follow the order of the feature columns (every column except 'class').
input_list = np.array([27,60,1.009,2,2,1,1,0,0,42,102,45,5.5,10,10,25,2500,3.6,0,0,0,0,0,0])

# Use the Decision Tree held in `svm_classifier`; the original call used
# `classifier`, which is the Random Forest fitted earlier.
y_pred = svm_classifier.predict([input_list])
y_pred

In [None]:
# Save the Decision Tree model to disk.
# The original dumped `classifier` (the Random Forest) under the decision-tree
# file name; the tree lives in `svm_classifier`. The `with` block also closes
# the file, which the original left open.
filename = 'dt_Classifier.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(svm_classifier, model_file)