In [15]:
#importing the Libraies
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle as pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [16]:
# Loading the dataset from the CSV file named 'CKD.csv'
dataset=pd.read_csv("CKD.csv")

In [17]:
dataset

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,2.000000,76.459948,c,3.0,0.0,normal,abnormal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,yes,no,yes
1,3.000000,76.459948,c,2.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,34.000000,12300.000000,4.705597,no,no,no,yes,poor,no,yes
2,4.000000,76.459948,a,1.0,0.0,normal,normal,notpresent,notpresent,99.000000,...,34.000000,8408.191126,4.705597,no,no,no,yes,poor,no,yes
3,5.000000,76.459948,d,1.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,yes,yes
4,5.000000,50.000000,c,0.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,36.000000,12400.000000,4.705597,no,no,no,yes,poor,no,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,219.000000,...,37.000000,9800.000000,4.400000,no,no,no,yes,poor,no,yes
395,51.492308,70.000000,c,0.0,2.0,normal,normal,notpresent,notpresent,220.000000,...,27.000000,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes
396,51.492308,70.000000,c,3.0,0.0,normal,normal,notpresent,notpresent,110.000000,...,26.000000,9200.000000,3.400000,yes,yes,no,poor,poor,no,yes
397,51.492308,90.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,207.000000,...,38.868902,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes


In [18]:
# 1. Clean up “?” and whitespace
for col in ['sg','rbc','pc','pcc','ba','htn','dm','cad','appet','pe','ane','classification']:
    dataset[col] = dataset[col].astype(str).str.strip().replace('?', np.nan)
    
# 2. Map sg, al, su
# Define a mapping from specific gravity categories (a–e) to ordinal integers (1–5)
sg_map = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}

# Apply the mapping to the 'sg' column, converting each letter to its corresponding numeric rank
dataset['sg'] = dataset['sg'].map(sg_map)
# Convert the ‘al’ column from strings to numeric values.
# Any non-numeric entries (e.g., "?", blanks) will be set to NaN for later handling.
dataset['al'] = pd.to_numeric(dataset['al'], errors='coerce')

# Convert the ‘su’ column from strings to numeric values.
# With errors='coerce', invalid parsing (like stray text or "?") becomes NaN.
dataset['su'] = pd.to_numeric(dataset['su'], errors='coerce')

# 3. Impute or drop NaNs as you prefer
dataset['sg'].fillna(dataset['sg'].median(), inplace=True)
# …handle others…

# 4. One-hot encode only the nominal features, drop_first=True
nominals = ['rbc','pc','pcc','ba','htn','dm','cad','appet','pe','ane','classification']
dataset = pd.get_dummies(dataset, columns=nominals, drop_first=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset['sg'].fillna(dataset['sg'].median(), inplace=True)


In [19]:
dataset

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.000000,76.459948,3,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,...,False,False,False,False,False,False,True,True,False,True
1,3.000000,76.459948,3,2.0,0.0,148.112676,22.000000,0.700000,137.528754,4.627244,...,True,False,False,False,False,False,True,False,False,True
2,4.000000,76.459948,1,1.0,0.0,99.000000,23.000000,0.600000,138.000000,4.400000,...,True,False,False,False,False,False,True,False,False,True
3,5.000000,76.459948,4,1.0,0.0,148.112676,16.000000,0.700000,138.000000,3.200000,...,True,False,False,False,False,False,True,False,True,True
4,5.000000,50.000000,3,0.0,0.0,148.112676,25.000000,0.600000,137.528754,4.627244,...,True,False,False,False,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,1,0.0,0.0,219.000000,36.000000,1.300000,139.000000,3.700000,...,True,False,False,False,False,False,True,False,False,True
395,51.492308,70.000000,3,0.0,2.0,220.000000,68.000000,2.800000,137.528754,4.627244,...,True,False,False,True,True,False,True,False,True,True
396,51.492308,70.000000,3,3.0,0.0,110.000000,115.000000,6.000000,134.000000,2.700000,...,True,False,False,True,True,False,False,False,False,True
397,51.492308,90.000000,1,0.0,0.0,207.000000,80.000000,6.800000,142.000000,5.500000,...,True,False,False,True,True,False,True,False,True,True


In [20]:
# Display the count of each class in the target variable 'Purchased' (0 = Not Purchased, 1 = Purchased)
dataset["classification_yes"].value_counts()

classification_yes
True     249
False    150
Name: count, dtype: int64

In [21]:
# Define independent variables/features for the model
indep = dataset[["age", "bp", "sg", "al", "su", "bgr", "bu", "sc", "sod", "pot", "hrmo",
 "pcv", "wc", "rc", "rbc_normal", "pc_normal", "pcc_present", "ba_present",
 "htn_yes", "dm_yes", "cad_yes", "appet_yes", "pe_yes", "ane_yes"]]

# Define dependent variable/target for prediction
dep = dataset["classification_yes"]

In [22]:
# Check the shape (rows, columns) of the independent variables dataframe
indep.shape

(399, 24)

In [23]:
# Display the target variable 'Purchased' values
dep

0       True
1       True
2       True
3       True
4       True
       ...  
394     True
395     True
396     True
397     True
398    False
Name: classification_yes, Length: 399, dtype: bool

In [24]:
# Split data into training and test sets
# test_size=1/3 means 33% data for testing, 67% for training
# random_state=0 ensures reproducibility of the split
X_train, X_test, y_train, y_test = train_test_split(indep, dep, test_size=1/3, random_state=0)

In [25]:
# Instantiate a StandardScaler to standardize features by removing the mean and scaling to unit variance
sc = StandardScaler()

# Fit the scaler to the training data and transform it (apply scaling)
X_train = sc.fit_transform(X_train)

# Use the same scaler to transform the test data (use the mean and variance learned from training data)
X_test = sc.transform(X_test)

In [26]:
param_grid = {
    # Number of trees in the forest
    'n_estimators': [10, 100, 200],  
    
    # Function to measure the quality of a split
    'criterion': ['gini', 'entropy'],  
    
    # Number of features to consider when looking for the best split
    # 'auto': all features; 'sqrt': square root of total features; 'log2': log base 2 of total features
    'max_features': ['auto', 'sqrt', 'log2'],
    
    # Maximum depth of the tree (None means nodes are expanded until all leaves are pure)
    'max_depth': [None, 5, 10, 20],
    
    # Minimum number of samples required to split an internal node
    'min_samples_split': [2, 5, 10],
    
    # Minimum number of samples required to be at a leaf node
    'min_samples_leaf': [1, 2, 4],
    
    # Whether bootstrap samples are used when building trees
    'bootstrap': [True, False],
}


# Set up GridSearchCV for Random Forest
grid = GridSearchCV(
    RandomForestClassifier(),  # The base Random Forest model
    param_grid,                # Hyperparameter grid with comments
    refit=True,                # Refit the best model on the full training set
    verbose=3,                 # Show detailed progress logs
    n_jobs=-1,                 # Use all available CPU cores
    scoring='f1'               # Optimize for F1 score (good for balanced classes)
)


# Fit the grid search to the training data (this will try all combinations in param_grid)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 1296 candidates, totalling 6480 fits


2160 fits failed out of a total of 6480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
977 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\mukil\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\mukil\anaconda3\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\mukil\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\mukil\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidPara

In [27]:
# Print the best hyperparameter combination found during grid search
print(grid.best_params_)

# Get the full cross-validation results as a dictionary (contains scores, params, etc.)
re = grid.cv_results_

# Optionally, you can print all cross-validation results for analysis
#print(re)

# Use the best estimator found by grid search to make predictions on the test data
grid_predictions = grid.predict(X_test)

{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 10}


In [28]:
# Compute the confusion matrix to evaluate the performance of the best logistic regression model on the test set
cm = confusion_matrix(y_test, grid_predictions)

In [29]:
# Generate a detailed classification report (precision, recall, f1-score, support) 
# to evaluate the performance of the best logistic regression model on the test set
clf_report = classification_report(y_test, grid_predictions)

In [30]:
# Calculate the weighted F1-score for the test set predictions made by the best model
f1_macro = f1_score(y_test, grid_predictions, average='weighted')

# Print the best hyperparameters and the corresponding F1-score
print("The weighted F1-score for the best parameters {}:".format(grid.best_params_), f1_macro)

The weighted F1-score for the best parameters {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 10}: 0.9849624060150376


In [31]:
# Print the confusion matrix for the model's predictions on the test set
print("The confusion Matrix:\n", cm)

The confusion Matrix:
 [[50  1]
 [ 1 81]]


In [32]:
# Print the detailed classification report showing precision, recall, f1-score, and support for each class
print("The report:\n", clf_report)

The report:
               precision    recall  f1-score   support

       False       0.98      0.98      0.98        51
        True       0.99      0.99      0.99        82

    accuracy                           0.98       133
   macro avg       0.98      0.98      0.98       133
weighted avg       0.98      0.98      0.98       133



In [33]:
# Calculate the ROC AUC score using the predicted probabilities for the positive class (class 1)
# This measures the model's ability to distinguish between classes (higher is better)
roc_auc_score(y_test,grid.predict_proba(X_test)[:,1])

0.9992826398852224

In [34]:
# Convert the cross-validation results dictionary (from grid search) into a pandas DataFrame for easier analysis and visualization
table = pd.DataFrame.from_dict(re)
# Display the DataFrame containing all cross-validation results from the grid search
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002309,0.000509,0.000000,0.000000,True,gini,,auto,1,2,10,"{'bootstrap': True, 'criterion': 'gini', 'max_...",,,,,,,,865
1,0.002004,0.000010,0.000000,0.000000,True,gini,,auto,1,2,100,"{'bootstrap': True, 'criterion': 'gini', 'max_...",,,,,,,,865
2,0.000802,0.000401,0.000000,0.000000,True,gini,,auto,1,2,200,"{'bootstrap': True, 'criterion': 'gini', 'max_...",,,,,,,,865
3,0.001205,0.000979,0.000000,0.000000,True,gini,,auto,1,5,10,"{'bootstrap': True, 'criterion': 'gini', 'max_...",,,,,,,,865
4,0.001511,0.000652,0.000000,0.000000,True,gini,,auto,1,5,100,"{'bootstrap': True, 'criterion': 'gini', 'max_...",,,,,,,,865
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1291,0.170451,0.004991,0.012524,0.002155,False,entropy,20,log2,4,5,100,"{'bootstrap': False, 'criterion': 'entropy', '...",1.000000,0.957746,0.968750,0.969697,1.0,0.979239,0.017464,589
1292,0.409024,0.015658,0.024587,0.007292,False,entropy,20,log2,4,5,200,"{'bootstrap': False, 'criterion': 'entropy', '...",1.000000,0.957746,0.984615,0.969697,1.0,0.982412,0.016695,310
1293,0.021463,0.004066,0.006118,0.002706,False,entropy,20,log2,4,10,10,"{'bootstrap': False, 'criterion': 'entropy', '...",0.985075,0.957746,0.918033,0.969697,1.0,0.966110,0.027937,855
1294,0.194127,0.006302,0.014030,0.002517,False,entropy,20,log2,4,10,100,"{'bootstrap': False, 'criterion': 'entropy', '...",1.000000,0.957746,0.968750,0.969697,1.0,0.979239,0.017464,589


In [35]:
# 1. Define test samples
test_samples = pd.DataFrame([
    {
        "age": 51.492308, "bp": 90, "sg": 1, "al": 0, "su": 0, "bgr": 207, "bu": 80, "sc": 6.8,
        "sod": 142, "pot": 5.5, "hrmo": 9.5, "pcv": 30, "wc": 10000, "rc": 3.9,
        "rbc_normal": 0, "pc_normal": 0, "pcc_present": 1, "ba_present": 1,
        "htn_yes": 1, "dm_yes": 1, "cad_yes": 1, "appet_yes": 0, "pe_yes": 1, "ane_yes": 1
    },
    {
        "age": 45, "bp": 70, "sg": 1.020, "al": 0, "su": 0, "bgr": 120, "bu": 20, "sc": 1.2,
        "sod": 140, "pot": 4.0, "hrmo": 15.0, "pcv": 44, "wc": 7500, "rc": 5.2,
        "rbc_normal": 1, "pc_normal": 1, "pcc_present": 0, "ba_present": 0,
        "htn_yes": 0, "dm_yes": 0, "cad_yes": 0, "appet_yes": 1, "pe_yes": 0, "ane_yes": 0
    }
])

# 2. Scale test data using the same StandardScaler used in training
scaled_test_samples = sc.transform(test_samples)

# 3. Predict with the best model
best_grid = grid.best_estimator_
print("✅ Best Model Found by GridSearchCV:\n", best_grid)

test_predictions = best_grid.predict(scaled_test_samples)

# 4. Display formatted results
for i, pred in enumerate(test_predictions):
    status = "🟢 CKD Detected" if pred == 1 else "🔵 No CKD"
    print(f"🔹 Test Case {i+1}: {status}")

✅ Best Model Found by GridSearchCV:
 RandomForestClassifier(criterion='entropy', max_depth=10, min_samples_split=5,
                       n_estimators=10)
🔹 Test Case 1: 🟢 CKD Detected
🔹 Test Case 2: 🔵 No CKD
