In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn import metrics

Importing the dataset

In [None]:
# Load the chronic kidney disease table from the Kaggle input directory.
DATA_PATH = '/kaggle/input/chronic-kidney-disease/new_model.csv'
kidney_df = pd.read_csv(DATA_PATH)

Exploratory Data Analysis (EDA)

In [None]:
# Print pandas' concise summary of the frame (columns, non-null counts, dtypes).
kidney_df.info()

In [None]:
# Preview the first five rows of the dataset.
kidney_df.head(n=5)

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) from pandas' describe().
kidney_df.describe()

In [None]:
# Structure of the data: dimensions plus the dtype of every column.
# (str(kidney_df) would only return the frame's printed form as one long
# string, not a structural summary.)
print(f"Shape: {kidney_df.shape}")
kidney_df.dtypes

In [None]:
# Column labels of the DataFrame, shown as the cell output.
kidney_df.columns

In [None]:
# Range of the blood-pressure ("Bp") column: smallest and largest values.
bp = kidney_df['Bp']
min_bp, max_bp = bp.min(), bp.max()

# Report both extremes on a single line
print(f"Min Bp: {min_bp}, Max Bp: {max_bp}")

In [None]:
# Locate the row labels of the lowest and highest "Bp" readings, then read the
# corresponding values back through those labels.
bp_column = kidney_df['Bp']
min_bp_index, max_bp_index = bp_column.idxmin(), bp_column.idxmax()

min_bp_value = bp_column.loc[min_bp_index]
max_bp_value = bp_column.loc[max_bp_index]

# Report the two extremes
print(f"Lowest BP: {min_bp_value}")
print(f"Highest BP: {max_bp_value}")

In [None]:
# Extremes of the hemoglobin ("Hemo") column, highest value reported first.
hemo = kidney_df['Hemo']

max_hemo = hemo.max()
print(f"Maximum Hemoglobin: {max_hemo}")

min_hemo = hemo.min()
print(f"Minimum Hemoglobin: {min_hemo}")

In [None]:
# Central tendency of the blood-pressure ("Bp") column.
bp = kidney_df['Bp']

mean_bp = bp.mean()
print(f"Mean of Blood Pressure: {mean_bp}")

median_bp = bp.median()
print(f"Median of Blood Pressure: {median_bp}")

# mode() returns a Series because several values can tie for most frequent
mode_bp = bp.mode()
print(f"Mode of Blood Pressure:\n{mode_bp}")

In [None]:
# First (Q1) and third (Q3) quartiles of the hemoglobin column, computed in one call.
q1, q3 = kidney_df['Hemo'].quantile([0.25, 0.75])

print(f"First Quartile: {q1}")
print(f"Third Quartile: {q3}")


In [None]:
from scipy.stats import iqr

# Interquartile range for Hemoglobin (IQR = Q3 - Q1).
# nan_policy='omit' ignores missing values, matching pandas' quantile(), which
# skips NaN; scipy's default ('propagate') returns NaN as soon as a single
# value is missing.
iqr_hemo = iqr(kidney_df['Hemo'], nan_policy='omit')
print(f"Interquartile Range for Hemoglobin: {iqr_hemo}")

In [None]:
# Spread of the Hemoglobin column.
# ddof=0 (divide by N) is what np.std / np.var use by default, so the values
# are the population standard deviation and variance.
hemo = kidney_df['Hemo']

std_hemo = hemo.std(ddof=0)
print(f"Standard Deviation for Hemoglobin Column: {std_hemo}")

var_hemo = hemo.var(ddof=0)
print(f"Variance for Hemoglobin Column: {var_hemo}")

In [None]:
# Histograms of six lab measurements on a 3x2 grid.
# Each spec is (column, panel title, bin width, bar colour); a bin width of
# None lets seaborn choose the bins automatically.
hist_specs = [
    ('Bp', "Blood Pressure", 1, 'blue'),
    ('Hemo', "Hemoglobin", 2, 'green'),
    ('Sc', "SC", 20, 'red'),
    ('Sod', "SOD", 4, 'yellow'),
    ('Wbcc', "WBCC", None, 'purple'),
    ('Rbcc', "RBCC", 1, 'lightblue'),
]

fig, axes = plt.subplots(3, 2, figsize=(12, 8))
for ax, (column, title, binwidth, color) in zip(axes.flat, hist_specs):
    # stat="percent" scales the bars to sum to 100 so that the "Percentage"
    # axis label is accurate (histplot shows raw counts by default).
    sns.histplot(
        kidney_df[column],
        binwidth=binwidth,
        color=color,
        stat="percent",
        ax=ax,
    )
    ax.set_title(title)
    ax.set_ylabel("Percentage")

fig.tight_layout()
plt.show()

In [None]:
# Restrict to numeric columns and compute their pairwise correlation matrix
# (pandas' default correlation method).
numeric_vars = kidney_df.select_dtypes(include=np.number)
corr_matrix = numeric_vars.corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Plot for Numerical Variables")
plt.show()

In [None]:
numerical_columns = ['Bp', 'Bu', 'Sc', 'Sod', 'Pot', 'Hemo', 'Wbcc', 'Rbcc']

# Two figures over the same columns: histograms with a KDE overlay first,
# then box plots. Each entry pairs a seaborn plotter with the extra keyword
# arguments that plotter needs.
figure_specs = (
    (sns.histplot, {'kde': True, 'bins': 20}),
    (sns.boxplot, {}),
)

for draw_panel, extra_kwargs in figure_specs:
    sns.set(style='whitegrid')
    plt.figure(figsize=(10, 6))
    for position, column in enumerate(numerical_columns, start=1):
        # 3x3 grid; only the first eight slots are filled
        ax = plt.subplot(3, 3, position)
        draw_panel(data=kidney_df, x=column, color='Teal', **extra_kwargs)
        plt.xticks(fontsize=6)
        plt.yticks(fontsize=6)
        ax.set_xlabel(column, fontsize=8)
        ax.set_ylabel('')
    plt.tight_layout()
    plt.show()

In [None]:
categorical_columns = ['Sg', 'Al', 'Su', 'Rbc', 'Htn', 'Class']

sns.set(style='whitegrid')

# One count plot per categorical column, arranged on a 2x3 grid.
plt.figure(figsize=(10, 6))
for position, cat_var in enumerate(categorical_columns, start=1):
    ax = plt.subplot(2, 3, position)
    sns.countplot(data=kidney_df, x=cat_var, color='Teal', alpha=0.7)
    plt.xticks(fontsize=6)
    plt.yticks(fontsize=6)
    ax.set_xlabel(cat_var, fontsize=8)
    ax.set_ylabel('')

plt.tight_layout()
plt.show()

In [None]:
# Class balance of the target: number of rows per distinct "Class" value.
class_counts = kidney_df['Class'].value_counts()
class_counts

In [None]:
# Separate the label from the predictors, then hold out 30% of the rows for
# testing; random_state fixes the shuffle so the split is repeatable.
target = kidney_df['Class']
features = kidney_df.drop(columns='Class')
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
)

MODEL BUILDING

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Hyperparameter values searched exhaustively (3 x 3 x 3 = 27 combinations)
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

# Base XGBoost classifier; the grid overrides the three tuned settings
xgb_model = XGBClassifier()

# 3-fold cross-validated search scored on accuracy, using all available cores
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Winning parameter combination
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Predict on the held-out set with the searched model and score it
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
from sklearn.metrics import classification_report

# Recompute the held-out predictions, then summarise precision / recall / F1 per class
y_pred = grid_search.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:\n", report)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix of the held-out predictions (rows: true, columns: predicted)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Same matrix drawn as an annotated heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
plt.show()

Explainability via LIME

In [None]:
!pip install lime

In [None]:
# importing lime
import lime
from lime import lime_tabular

In [None]:
# LIME explanation of a single test-set prediction.
# random_state pins LIME's random perturbation sampling so the explanation is
# the same on every run.
# NOTE(review): class_names assumes Class 0 = "Not CKD" and 1 = "CKD" — confirm
# against the dataset's encoding.
explainer = lime_tabular.LimeTabularExplainer(
    X_train.values,
    feature_names=list(X_train.columns),
    class_names=['Not CKD', 'CKD'],
    mode='classification',
    random_state=42,
)
i = 0  # Example index, choose any index from X_test
exp = explainer.explain_instance(
    X_test.values[i],
    grid_search.predict_proba,
    num_features=len(X_train.columns),
)
exp.show_in_notebook()

Explainability via SHAP

In [None]:
!pip install shap

In [None]:
import shap

In [None]:
# SHAP explanation for the best estimator found by the grid search.
best_model = grid_search.best_estimator_
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)

# Bar-style summary plot of the SHAP values
shap.summary_plot(
    shap_values,
    X_test,
    plot_type="bar",
    class_names=['Not CKD', 'CKD'],
)