In [20]:
#importing libraries
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler



In [21]:
#loading the data
kidney_data = pd.read_csv("..\dataset\kidney.csv")
#printing first 5 rows
kidney_data.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [22]:
#shape of the data(rows and columns)
kidney_data.shape

(400, 26)

In [23]:
#list of columns to retain
column_to_retain = ['sg', 'al', 'sc', 'hemo', 'pcv', 'wbcc', 'rbcc', 'htn', 'classification']

#droping the rest columns
kidney_data = kidney_data.drop([col for col in kidney_data.columns if not col in column_to_retain] , axis=1)

#drop the rows with na or missing values
kidney_data = kidney_data.dropna(axis=0)

In [24]:
#transform the non-numeric data in the columns
for column in kidney_data.columns:
    if kidney_data[column].dtype == np.number:
        continue
    kidney_data[column] = LabelEncoder().fit_transform(kidney_data[column])

  if kidney_data[column].dtype == np.number:
  if kidney_data[column].dtype == np.number:
  if kidney_data[column].dtype == np.number:


In [25]:
#printing the first 5 rows of the cleaned data
kidney_data.head()

Unnamed: 0,sg,al,sc,hemo,pcv,htn,classification
0,1.02,1.0,1.2,15.4,28,1,0
1,1.02,4.0,0.8,11.3,22,0,0
2,1.01,2.0,1.8,9.6,15,0,0
3,1.005,4.0,3.8,11.2,16,1,0
4,1.01,2.0,1.4,11.6,19,0,0


In [26]:
kidney_data.shape

(287, 7)

In [27]:
#split the data into independent (X) data set (the features) and dependent (Y) data set (the target)
X = kidney_data.drop(['classification'], axis=1)
Y = kidney_data['classification']

In [28]:
#feature scaling
#min-max scaler method scales the dataset so that all the input features lie between 0 and 1
x_scaler = MinMaxScaler()
x_scaler.fit(X)
column_names = X.columns
X[column_names] = x_scaler.transform(X)

In [29]:
#splitting the dataset into 80% training and 20% testing and shuffle
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42 ,shuffle=True)

In [30]:
# Initialize SVM classifier
model = SVC(kernel='linear', random_state=42)

In [31]:
# Train the SVM classifier
model.fit(X_train, Y_train)

In [32]:
# Predict labels for training and testing sets
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)

# Compute training and testing accuracy
train_accuracy = accuracy_score(Y_train, train_predictions)
test_accuracy = accuracy_score(Y_test, test_predictions)

print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

Training Accuracy: 0.9781659388646288
Testing Accuracy: 0.9827586206896551


In [33]:
import pickle

In [34]:
#saving the model
filename = 'kidney_prediction_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [35]:
#shape of the training and testing data
print("Shape of the training data:", X_train.shape)
print("Shape of the testing data:", X_test.shape)

Shape of the training data: (229, 6)
Shape of the testing data: (58, 6)


In [36]:
#show the actual and predicted values
predict = model.predict(X_test)
predict = [1 if y>=0.5 else 0 for y in predict]

print('Original : {0}'.format(",".join(str(x) for x in Y_test)))
print('Predicted : {0}'.format(",".join(str(x) for x in predict)))

Original : 0,1,0,1,1,1,0,0,0,0,0,1,0,1,0,1,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,1,0,1,1,0,1,1,1,1,0,0,1,0
Predicted : 0,1,0,1,1,1,0,0,0,0,0,1,0,1,0,1,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,1,0,1,1,1,1,1,1,1,0,0,1,0


In [37]:
Y_test

9      0
368    1
235    0
318    1
341    1
300    1
155    0
111    0
173    0
44     0
225    0
311    1
234    0
379    1
5      0
333    1
348    1
367    1
63     0
382    1
349    1
80     0
249    0
250    1
179    0
172    0
246    0
239    0
177    0
56     0
178    0
240    0
269    1
317    1
65     0
130    0
94     0
185    0
115    0
260    1
241    0
40     0
27     0
31     0
284    1
366    1
133    0
272    1
381    1
150    0
309    1
267    1
275    1
259    1
6      0
123    0
296    1
91     0
Name: classification, dtype: int32

In [38]:
input_data = (1.02,0,0.7,13.2,28,0)
# change the input data to a numpy array
input_data_as_numpy_array= np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0]== 0):
  print('The Person does not have a Kidney Disease')
else:
  print('The Person has Kidney Disease')

[1]
The Person has Kidney Disease




In [39]:
# loading the saved model
loaded_model = pickle.load(open('kidney_prediction_model.sav', 'rb'))

In [40]:
for column in X.columns:
  print(column)

sg
al
sc
hemo
pcv
htn
