In [2]:
#importing the Libraies
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle as pickle
import ipywidgets as widgets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from IPython.display import display

In [3]:
# Loading the dataset from the CSV file named 'CKD.csv'
dataset=pd.read_csv("CKD.csv")

In [4]:
dataset

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,2.000000,76.459948,c,3.0,0.0,normal,abnormal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,yes,no,yes
1,3.000000,76.459948,c,2.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,34.000000,12300.000000,4.705597,no,no,no,yes,poor,no,yes
2,4.000000,76.459948,a,1.0,0.0,normal,normal,notpresent,notpresent,99.000000,...,34.000000,8408.191126,4.705597,no,no,no,yes,poor,no,yes
3,5.000000,76.459948,d,1.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,yes,yes
4,5.000000,50.000000,c,0.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,36.000000,12400.000000,4.705597,no,no,no,yes,poor,no,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,219.000000,...,37.000000,9800.000000,4.400000,no,no,no,yes,poor,no,yes
395,51.492308,70.000000,c,0.0,2.0,normal,normal,notpresent,notpresent,220.000000,...,27.000000,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes
396,51.492308,70.000000,c,3.0,0.0,normal,normal,notpresent,notpresent,110.000000,...,26.000000,9200.000000,3.400000,yes,yes,no,poor,poor,no,yes
397,51.492308,90.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,207.000000,...,38.868902,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes


In [5]:
# you can list all column names like this:
print(dataset.columns.tolist())

['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hrmo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'classification']


In [6]:
# 1. Clean up “?” and whitespace
for col in ['sg','rbc','pc','pcc','ba','htn','dm','cad','appet','pe','ane','classification']:
    dataset[col] = dataset[col].astype(str).str.strip().replace('?', np.nan)
    
# 2. Map sg, al, su
# Define a mapping from specific gravity categories (a–e) to ordinal integers (1–5)
sg_map = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}

# Apply the mapping to the 'sg' column, converting each letter to its corresponding numeric rank
dataset['sg'] = dataset['sg'].map(sg_map)
# Convert the ‘al’ column from strings to numeric values.
# Any non-numeric entries (e.g., "?", blanks) will be set to NaN for later handling.
dataset['al'] = pd.to_numeric(dataset['al'], errors='coerce')

# Convert the ‘su’ column from strings to numeric values.
# With errors='coerce', invalid parsing (like stray text or "?") becomes NaN.
dataset['su'] = pd.to_numeric(dataset['su'], errors='coerce')

# 3. Impute or drop NaNs as you prefer
dataset['sg'].fillna(dataset['sg'].median(), inplace=True)
# …handle others…

# 4. One-hot encode only the nominal features, drop_first=True
nominals = ['rbc','pc','pcc','ba','htn','dm','cad','appet','pe','ane','classification']
dataset = pd.get_dummies(dataset, columns=nominals, drop_first=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset['sg'].fillna(dataset['sg'].median(), inplace=True)


In [7]:
dataset

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.000000,76.459948,3,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,...,False,False,False,False,False,False,True,True,False,True
1,3.000000,76.459948,3,2.0,0.0,148.112676,22.000000,0.700000,137.528754,4.627244,...,True,False,False,False,False,False,True,False,False,True
2,4.000000,76.459948,1,1.0,0.0,99.000000,23.000000,0.600000,138.000000,4.400000,...,True,False,False,False,False,False,True,False,False,True
3,5.000000,76.459948,4,1.0,0.0,148.112676,16.000000,0.700000,138.000000,3.200000,...,True,False,False,False,False,False,True,False,True,True
4,5.000000,50.000000,3,0.0,0.0,148.112676,25.000000,0.600000,137.528754,4.627244,...,True,False,False,False,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,1,0.0,0.0,219.000000,36.000000,1.300000,139.000000,3.700000,...,True,False,False,False,False,False,True,False,False,True
395,51.492308,70.000000,3,0.0,2.0,220.000000,68.000000,2.800000,137.528754,4.627244,...,True,False,False,True,True,False,True,False,True,True
396,51.492308,70.000000,3,3.0,0.0,110.000000,115.000000,6.000000,134.000000,2.700000,...,True,False,False,True,True,False,False,False,False,True
397,51.492308,90.000000,1,0.0,0.0,207.000000,80.000000,6.800000,142.000000,5.500000,...,True,False,False,True,True,False,True,False,True,True


In [8]:
# you can list all column names like this:
print(dataset.columns.tolist())

['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hrmo', 'pcv', 'wc', 'rc', 'rbc_normal', 'pc_normal', 'pcc_present', 'ba_present', 'htn_yes', 'dm_yes', 'cad_yes', 'appet_yes', 'pe_yes', 'ane_yes', 'classification_yes']


In [9]:
# Display the count of each class in the target variable 'Purchased' (0 = Not Purchased, 1 = Purchased)
dataset["classification_yes"].value_counts()

classification_yes
True     249
False    150
Name: count, dtype: int64

In [10]:
# Define independent variables/features for the model
indep = dataset[["age", "bp", "sg", "al", "su", "bgr", "bu", "sc", "sod", "pot", "hrmo",
 "pcv", "wc", "rc", "rbc_normal", "pc_normal", "pcc_present", "ba_present",
 "htn_yes", "dm_yes", "cad_yes", "appet_yes", "pe_yes", "ane_yes"]]

# Define dependent variable/target for prediction
dep = dataset["classification_yes"]

In [11]:
# Check the shape (rows, columns) of the independent variables dataframe
indep.shape

(399, 24)

In [12]:
# Display the target variable 'classification_yes' values
dep

0       True
1       True
2       True
3       True
4       True
       ...  
394     True
395     True
396     True
397     True
398    False
Name: classification_yes, Length: 399, dtype: bool

In [13]:
# Split data into training and test sets
# test_size=1/3 means 33% data for testing, 67% for training
# random_state=0 ensures reproducibility of the split
X_train, X_test, y_train, y_test = train_test_split(indep, dep, test_size=1/3, random_state=0)

In [14]:
# Instantiate a StandardScaler to standardize features by removing the mean and scaling to unit variance
sc = StandardScaler()

# Fit the scaler to the training data and transform it (apply scaling)
X_train = sc.fit_transform(X_train)

# Use the same scaler to transform the test data (use the mean and variance learned from training data)
X_test = sc.transform(X_test)

In [15]:
param_grid = {
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': ['auto', 'scale'],
    'C': [10, 100, 1000, 2000, 3000],
    'degree': [2, 3, 4],            # For 'poly' kernel
    'coef0': [0.0, 0.1, 0.5, 1.0]  # For 'poly' and 'sigmoid' kernels
    # You can add others if needed
}


grid = GridSearchCV(
    SVC(probability=True),  # <-- ADD probability=True here!
    param_grid,        # The dictionary of parameters to try (from your previous message)
    refit=True,        # After finding the best parameters, refit the model on the whole dataset
    verbose=3,         # Print detailed progress messages during the search (higher = more info)
    n_jobs=-1,         # Use all available CPU cores to speed up the search
    scoring='f1_weighted'  # Use the weighted F1 score to evaluate performance for each parameter set
)

# Fit the grid search to the training data (this will try all combinations in param_grid)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 480 candidates, totalling 2400 fits


In [16]:
# Print the best hyperparameter combination found during grid search
print(grid.best_params_)

# Get the full cross-validation results as a dictionary (contains scores, params, etc.)
re = grid.cv_results_

# Optionally, you can print all cross-validation results for analysis
#print(re)

# Use the best estimator found by grid search to make predictions on the test data
grid_predictions = grid.predict(X_test)

{'C': 10, 'coef0': 0.0, 'degree': 2, 'gamma': 'auto', 'kernel': 'linear'}


In [17]:
# Compute the confusion matrix to evaluate the performance of the best logistic regression model on the test set
cm = confusion_matrix(y_test, grid_predictions)

In [18]:
# Generate a detailed classification report (precision, recall, f1-score, support) 
# to evaluate the performance of the best logistic regression model on the test set
clf_report = classification_report(y_test, grid_predictions)

In [19]:
# Calculate the weighted F1-score for the test set predictions made by the best model
f1_macro = f1_score(y_test, grid_predictions, average='weighted')

# Print the best hyperparameters and the corresponding F1-score
print("The weighted F1-score for the best parameters {}:".format(grid.best_params_), f1_macro)

The weighted F1-score for the best parameters {'C': 10, 'coef0': 0.0, 'degree': 2, 'gamma': 'auto', 'kernel': 'linear'}: 0.9624731911379497


In [20]:
# Print the confusion matrix for the model's predictions on the test set
print("The confusion Matrix:\n", cm)

The confusion Matrix:
 [[49  2]
 [ 3 79]]


In [21]:
# Print the detailed classification report showing precision, recall, f1-score, and support for each class
print("The report:\n", clf_report)

The report:
               precision    recall  f1-score   support

       False       0.94      0.96      0.95        51
        True       0.98      0.96      0.97        82

    accuracy                           0.96       133
   macro avg       0.96      0.96      0.96       133
weighted avg       0.96      0.96      0.96       133



In [22]:

# Calculate the ROC AUC score using the predicted probabilities for the positive class (class 1)
# This measures the model's ability to distinguish between classes (higher is better)
roc_auc_score(y_test,grid.predict_proba(X_test)[:,1])

0.996652319464371

In [23]:
# Convert the cross-validation results dictionary (from grid search) into a pandas DataFrame for easier analysis and visualization
table = pd.DataFrame.from_dict(re)
# Display the DataFrame containing all cross-validation results from the grid search
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_coef0,param_degree,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.018678,0.011891,0.012865,0.007805,10,0.0,2,auto,linear,"{'C': 10, 'coef0': 0.0, 'degree': 2, 'gamma': ...",0.926978,1.000000,0.981217,1.000000,1.000000,0.981639,0.028282,1
1,0.012665,0.004819,0.011523,0.001696,10,0.0,2,auto,rbf,"{'C': 10, 'coef0': 0.0, 'degree': 2, 'gamma': ...",0.926978,0.981014,0.962573,0.981031,1.000000,0.970319,0.024692,291
2,0.026581,0.013268,0.013319,0.007946,10,0.0,2,auto,poly,"{'C': 10, 'coef0': 0.0, 'degree': 2, 'gamma': ...",0.872586,0.888646,0.907035,0.962264,0.869925,0.900091,0.033797,461
3,0.019107,0.005426,0.020026,0.007864,10,0.0,2,auto,sigmoid,"{'C': 10, 'coef0': 0.0, 'degree': 2, 'gamma': ...",0.908877,1.000000,0.981217,0.981217,0.981217,0.970506,0.031661,277
4,0.016263,0.003005,0.010256,0.002085,10,0.0,2,scale,linear,"{'C': 10, 'coef0': 0.0, 'degree': 2, 'gamma': ...",0.926978,1.000000,0.981217,1.000000,1.000000,0.981639,0.028282,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,0.005013,0.001519,0.004441,0.000583,3000,1.0,4,auto,sigmoid,"{'C': 3000, 'coef0': 1.0, 'degree': 4, 'gamma'...",0.926978,0.903610,0.962573,0.981217,0.961826,0.947241,0.027987,434
476,0.006822,0.001514,0.004311,0.000511,3000,1.0,4,scale,linear,"{'C': 3000, 'coef0': 1.0, 'degree': 4, 'gamma'...",0.926978,1.000000,0.981217,1.000000,1.000000,0.981639,0.028282,1
477,0.005722,0.000693,0.004610,0.001074,3000,1.0,4,scale,rbf,"{'C': 3000, 'coef0': 1.0, 'degree': 4, 'gamma'...",0.926978,0.981014,0.981217,0.981031,1.000000,0.974048,0.024649,181
478,0.004437,0.000352,0.003600,0.000799,3000,1.0,4,scale,poly,"{'C': 3000, 'coef0': 1.0, 'degree': 4, 'gamma'...",0.926978,1.000000,0.981217,0.981217,0.981217,0.974126,0.024671,145


In [39]:
# 1. Define test samples

test_samples = pd.DataFrame([
    {
        "age": 51.492308, "bp": 90, "sg": 1, "al": 0, "su": 0, "bgr": 207, "bu": 80, "sc": 6.8,
        "sod": 142, "pot": 5.5, "hrmo": 9.5, "pcv": 30, "wc": 10000, "rc": 3.9,
        "rbc_normal": 0, "pc_normal": 0, "pcc_present": 1, "ba_present": 1,
        "htn_yes": 1, "dm_yes": 1, "cad_yes": 1, "appet_yes": 0, "pe_yes": 1, "ane_yes": 1
    },
    {
        "age": 45, "bp": 70, "sg": 1.020, "al": 0, "su": 0, "bgr": 120, "bu": 20, "sc": 1.2,
        "sod": 140, "pot": 4.0, "hrmo": 15.0, "pcv": 44, "wc": 7500, "rc": 5.2,
        "rbc_normal": 1, "pc_normal": 1, "pcc_present": 0, "ba_present": 0,
        "htn_yes": 0, "dm_yes": 0, "cad_yes": 0, "appet_yes": 1, "pe_yes": 0, "ane_yes": 0
    }
])



# 2. Scale test data using the same StandardScaler used in training
scaled_test_samples = sc.transform(test_samples)

# 3. Predict with the best model
best_grid = grid.best_estimator_
#print("✅ Best Model Found by GridSearchCV:\n", best_grid)

test_predictions = best_grid.predict(scaled_test_samples)

# 4. Display formatted results
print("📋 CKD Prediction Results".center(60, "="))
for i, pred in enumerate(test_predictions):
    status = "🟢 CKD Detected" if pred == 1 else "🔵 No CKD"
    print(f"🔹 Test Case {i+1:>2}: {status}")
print("=" * 60)

🔹 Test Case  1: 🟢 CKD Detected
🔹 Test Case  2: 🔵 No CKD
