In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

#
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
dt = pd.read_csv('kidney_disease.csv')

In [3]:
cols_to_retain = ['age','al','sc','hemo','sg','pcv','bu','htn','rbc','pc','classification']
dt = dt.drop( [col for col in dt.columns if not col in cols_to_retain] , axis = 1 )
dt = dt.dropna(axis = 0 , how = 'any')

In [4]:
dt['rbc'] = LabelEncoder().fit_transform( dt['rbc'])
dt['pc'] = LabelEncoder().fit_transform( dt['pc'])
dt['htn'] = LabelEncoder().fit_transform( dt['htn'])
dt['classification'] = LabelEncoder().fit_transform( dt['classification'])

In [5]:
X = dt.drop(['classification'] , axis = 1)
Y = dt['classification']

In [6]:
X_train,X_test,Y_train,Y_test=train_test_split(X , Y , test_size = 0.2)

In [7]:
k_range = range(1,20)
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors = k)
    knn.fit(X_train,Y_train)
    y_pred=knn.predict(X_test)
    scores.append(accuracy_score(Y_test,y_pred))
    
import operator
index, value = max(enumerate(scores), key=operator.itemgetter(1))
k_value = index + 1  

In [8]:
models =[]
models.append(('RFC',RandomForestClassifier()))
models.append(('DTC',DecisionTreeClassifier()))
models.append(('KNN',KNeighborsClassifier(n_neighbors = k_value )))
models.append(('LR', LogisticRegression(solver='liblinear')))
models.append(('SVM',SVC(kernel='linear')))

In [9]:
results = []
names = []
accuracy_values = []
function_name = []

for name,model in models:
    accuracy = cross_val_score(model, X_train, Y_train,scoring='accuracy')
    results.append(accuracy)   
    msg = ("%s: %f" % (name, accuracy.mean()))
    print(msg)
    accuracy_values.append(accuracy.mean())
    function_name.append(name)

RFC: 0.993750
DTC: 0.975000
KNN: 0.932576
LR: 0.969129
SVM: 0.987689


In [10]:
import operator
index, value = max(enumerate(accuracy_values), key=operator.itemgetter(1))

In [11]:
print(index)
print(value)
print(function_name[index])

0
0.99375
RFC


In [12]:
model = ""

if function_name[index] == "RFC" :
    model = RandomForestClassifier()
    print(function_name[index])
        
elif function_name[index] == "DTC" :
    model = DecisionTreeClassifier()
    print(function_name[index])
                
elif function_name[index] == "KNN" :
    model = KNeighborsClassifier(n_neighbors = k_value)
    print(function_name[index])
    
elif function_name[index] == "SVM" :
    model = SVC(kernel='linear')
    print(function_name[index])
        
elif function_name[index] == "LR" :
    model = LogisticRegression(solver='liblinear')
    print(function_name[index])
        
        
    
    

RFC


In [13]:
model = model.fit(X,Y)

In [14]:
X_test

Unnamed: 0,age,sg,al,rbc,pc,bu,sc,hemo,pcv,htn
229,59.0,1.01,3.0,1,0,191.0,12.0,9.6,31,0
332,34.0,1.025,0.0,1,1,33.0,1.0,15.3,44,0
225,60.0,1.01,3.0,0,1,95.0,2.7,11.5,35,1
371,28.0,1.025,0.0,1,1,50.0,0.5,17.6,51,0
149,65.0,1.02,1.0,0,0,29.0,1.0,10.5,32,1
338,62.0,1.02,0.0,1,1,34.0,0.8,17.8,44,0
376,58.0,1.025,0.0,1,1,16.0,1.1,16.4,53,0
9,53.0,1.02,2.0,0,0,107.0,7.2,9.5,29,1
339,25.0,1.02,0.0,1,1,42.0,0.5,13.3,48,0
239,34.0,1.015,2.0,1,1,50.0,1.6,11.9,39,0


In [15]:
y_pred = model.predict(X_test)

In [16]:
y_pred

array([0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1])

In [17]:
import joblib as joblib

In [18]:
joblib.dump(model,'train_model.joblib')

['train_model.joblib']

In [19]:
loadModel = joblib.load('train_model.joblib')

In [20]:
y_pred = model.predict(X_test)

In [21]:
y_pred

array([0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1])