In [1]:
import pandas as pd

In [52]:
dataset = pd.read_csv("CKD.csv")

In [54]:
dataset.shape

(399, 25)

In [3]:
dataset['classification'] = dataset['classification'].map({'yes':1,'no':0})#since the output column is a yes/no value , it should be converted to 1 or 0 format for processing.. This can be done by map method

In [4]:
#check if the dataset has classification values as 0 and 1
dataset.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,2.0,76.459948,c,3.0,0.0,normal,abnormal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,yes,no,1
1,3.0,76.459948,c,2.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,34.0,12300.0,4.705597,no,no,no,yes,poor,no,1
2,4.0,76.459948,a,1.0,0.0,normal,normal,notpresent,notpresent,99.0,...,34.0,8408.191126,4.705597,no,no,no,yes,poor,no,1
3,5.0,76.459948,d,1.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,yes,1
4,5.0,50.0,c,0.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,36.0,12400.0,4.705597,no,no,no,yes,poor,no,1


In [5]:
dataset[['classification']].value_counts() #check balanced or imbalanced through output target variable


classification
1                 249
0                 150
Name: count, dtype: int64

In [6]:
dataset = pd.get_dummies(dataset,drop_first=True)

In [7]:
dataset.head()

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,rbc_normal,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes
0,2.0,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,True,False,False,False,False,False,False,True,True,False
1,3.0,76.459948,2.0,0.0,148.112676,22.0,0.7,137.528754,4.627244,10.7,...,True,True,False,False,False,False,False,True,False,False
2,4.0,76.459948,1.0,0.0,99.0,23.0,0.6,138.0,4.4,12.0,...,True,True,False,False,False,False,False,True,False,False
3,5.0,76.459948,1.0,0.0,148.112676,16.0,0.7,138.0,3.2,8.1,...,True,True,False,False,False,False,False,True,False,True
4,5.0,50.0,0.0,0.0,148.112676,25.0,0.6,137.528754,4.627244,11.8,...,True,True,False,False,False,False,False,True,False,False


In [8]:
independent = dataset[['age', 'bp', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hrmo', 'pcv',
       'wc', 'rc', 'sg_b', 'sg_c', 'sg_d', 'sg_e',
       'rbc_normal', 'pc_normal', 'pcc_present', 'ba_present', 'htn_yes',
       'dm_yes', 'cad_yes', 'appet_yes', 'pe_yes', 'ane_yes']]

In [9]:
dependent = dataset[['classification']].values.ravel()

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test  = train_test_split(independent,dependent,test_size=0.3)

In [12]:
from sklearn.svm import  SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix

In [13]:
pipeline = Pipeline([
('sc',StandardScaler()),
('svc',SVC(random_state=42,probability=True))
]
)
#SVCs aren’t probabilistic by nature. When you use SVC(probability=True), 
#scikit-learn produce probability estimates, that can generate ROC curves using these probabilities.So for SVC, dont forget to set probability to True

In [14]:
param_grid = {
    'svc__C': [0.1, 1, 5, 10, 100],
    'svc__kernel': ['linear', 'rbf', 'poly'],
    'svc__gamma': ['scale', 'auto'],
    
    'svc__class_weight': [None, 'balanced']
}

In [15]:
model = GridSearchCV(estimator=pipeline,param_grid=param_grid,n_jobs=-1,refit=True,verbose=3,scoring='roc_auc')

In [16]:
model.fit(X_train,y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


In [17]:
model = GridSearchCV(estimator=pipeline,param_grid=param_grid,n_jobs=-1,refit=True,verbose=3,scoring='roc_auc')

In [18]:
model.fit(X_train,y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


In [19]:
model.best_params_

{'svc__C': 0.1,
 'svc__class_weight': None,
 'svc__gamma': 'scale',
 'svc__kernel': 'rbf'}

In [20]:
y_pred = model.predict(X_test)

In [21]:
confusion_matrix(y_test,y_pred)

array([[48,  1],
       [ 2, 69]], dtype=int64)

In [22]:
y_proba = model.predict_proba(X_test)[:,1]

In [23]:
from sklearn.metrics import classification_report, roc_auc_score

In [24]:
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97        49
           1       0.99      0.97      0.98        71

    accuracy                           0.97       120
   macro avg       0.97      0.98      0.97       120
weighted avg       0.98      0.97      0.98       120



In [25]:
print("ROC AUC Score:", roc_auc_score(y_test, y_proba))

ROC AUC Score: 0.999137683242311
