In [147]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

#
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [148]:
dt = pd.read_csv('kidney_disease.csv')

In [149]:
cols_to_retain = ['age','al','sc','hemo','sg','pcv','bu','htn','rbc','pc','classification']
dt = dt.drop( [col for col in dt.columns if not col in cols_to_retain] , axis = 1 )# keep cols_to_retain and drop other rows
dt = dt.dropna(axis = 0 , how = 'any')# if one column is null remove entier row

In [150]:
dt['rbc'] = LabelEncoder().fit_transform( dt['rbc'])
dt['pc'] = LabelEncoder().fit_transform( dt['pc'])
dt['htn'] = LabelEncoder().fit_transform( dt['htn'])
dt['classification'] = LabelEncoder().fit_transform( dt['classification'])

In [151]:
X = dt.drop(['classification'] , axis = 1)
Y = dt['classification']

In [152]:
dt

Unnamed: 0,age,sg,al,rbc,pc,bu,sc,hemo,pcv,htn,classification
2,62.0,1.010,2.0,1,1,53.0,1.8,9.6,31,0,0
3,48.0,1.005,4.0,1,0,56.0,3.8,11.2,32,1,0
4,51.0,1.010,2.0,1,1,26.0,1.4,11.6,35,0,0
7,24.0,1.015,2.0,1,0,31.0,1.1,12.4,44,0,0
8,52.0,1.015,3.0,1,0,60.0,1.9,10.8,33,1,0
...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,1.020,0.0,1,1,49.0,0.5,15.7,47,0,1
396,42.0,1.025,0.0,1,1,31.0,1.2,16.5,54,0,1
397,12.0,1.020,0.0,1,1,26.0,0.6,15.8,49,0,1
398,17.0,1.025,0.0,1,1,50.0,1.0,14.2,51,0,1


In [153]:
X_train,X_test,Y_train,Y_test=train_test_split(X , Y , test_size = 0.2)#split test data and train data

In [154]:
#find best K value for KNeighborsClassifier algorithom
k_range = range(1,20)
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors = k)
    knn.fit(X_train,Y_train)
    y_pred=knn.predict(X_test)
    scores.append(accuracy_score(Y_test,y_pred))
    
print(scores)   
import operator
index, value = max(enumerate(scores), key=operator.itemgetter(1))
k_value = index + 1  
print(k_value)

[0.9024390243902439, 0.8780487804878049, 0.9024390243902439, 0.8536585365853658, 0.9024390243902439, 0.9024390243902439, 0.9024390243902439, 0.9024390243902439, 0.9024390243902439, 0.9024390243902439, 0.9024390243902439, 0.9024390243902439, 0.9024390243902439, 0.9024390243902439, 0.9024390243902439, 0.8780487804878049, 0.9024390243902439, 0.8780487804878049, 0.9024390243902439]
1


In [155]:
models =[]
models.append(('RFC',RandomForestClassifier()))
models.append(('DTC',DecisionTreeClassifier()))
models.append(('KNN',KNeighborsClassifier(n_neighbors = k_value )))
models.append(('LR', LogisticRegression(solver='liblinear')))
models.append(('SVM',SVC(kernel='linear')))

In [156]:
results = []
names = []
accuracy_values = []
function_name = []

for name,model in models:
    accuracy = cross_val_score(model, X_train, Y_train,scoring='accuracy')
    results.append(accuracy)   
    msg = ("%s: %f" % (name, accuracy.mean()))
    print(msg)
    accuracy_values.append(accuracy.mean())
    function_name.append(name)

RFC: 0.993750
DTC: 0.987500
KNN: 0.907386
LR: 0.975189
SVM: 0.987500


In [157]:
import operator
index, value = max(enumerate(accuracy_values), key=operator.itemgetter(1))

In [158]:
print(index)
print(value)
print(function_name[index])

0
0.99375
RFC


In [159]:
model = ""

if function_name[index] == "RFC" :
    model = RandomForestClassifier()
    print(function_name[index])
        
elif function_name[index] == "DTC" :
    model = DecisionTreeClassifier()
    print(function_name[index])
                
elif function_name[index] == "KNN" :
    model = KNeighborsClassifier(n_neighbors = k_value)
    print(function_name[index])
    
elif function_name[index] == "SVM" :
    model = SVC(kernel='linear')
    print(function_name[index])
        
elif function_name[index] == "LR" :
    model = LogisticRegression(solver='liblinear')
    print(function_name[index])
        
        
    
    

RFC


In [160]:
model = model.fit(X,Y)

In [161]:
X_test

Unnamed: 0,age,sg,al,rbc,pc,bu,sc,hemo,pcv,htn
336,25.0,1.02,0.0,1,1,27.0,0.5,15.2,40,0
311,56.0,1.025,0.0,1,1,18.0,1.1,13.7,45,0
374,79.0,1.025,0.0,1,1,44.0,1.2,16.3,40,0
281,55.0,1.025,0.0,1,1,50.0,1.2,15.5,41,0
373,61.0,1.025,0.0,1,1,38.0,1.0,13.7,47,0
265,50.0,1.02,0.0,1,1,40.0,0.6,14.2,48,0
4,51.0,1.01,2.0,1,1,26.0,1.4,11.6,35,0
153,55.0,1.01,2.0,0,0,235.0,14.2,8.3,22,1
196,49.0,1.01,3.0,0,0,158.0,11.8,8.1,24,1
225,60.0,1.01,3.0,0,1,95.0,2.7,11.5,35,1


In [162]:
y_pred = model.predict(X_test)

In [163]:
y_pred

array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1])

In [164]:
import joblib as joblib

In [165]:
joblib.dump(model,'train_model.joblib')

['train_model.joblib']

In [166]:
loadModel = joblib.load('train_model.joblib')

In [167]:
y_pred = model.predict(X_test)

In [168]:
y_pred

array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1])