In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

#
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [4]:
dt = pd.read_csv('kidney_disease.csv')

In [5]:
cols_to_retain = ['age','al','sc','hemo','sg','pcv','bu','htn','rbc','pc','classification']
dt = dt.drop( [col for col in dt.columns if not col in cols_to_retain] , axis = 1 )
dt = dt.dropna(axis = 0 , how = 'any')

In [6]:
dt['rbc'] = LabelEncoder().fit_transform( dt['rbc'])
dt['pc'] = LabelEncoder().fit_transform( dt['pc'])
dt['htn'] = LabelEncoder().fit_transform( dt['htn'])
dt['classification'] = LabelEncoder().fit_transform( dt['classification'])

In [7]:
X = dt.drop(['classification'] , axis = 1)
Y = dt['classification']

In [8]:
X_train,X_test,Y_train,Y_test=train_test_split(X , Y , test_size = 0.2)

In [9]:
k_range = range(1,20)
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors = k)
    knn.fit(X_train,Y_train)
    y_pred=knn.predict(X_test)
    scores.append(accuracy_score(Y_test,y_pred))
    
import operator
index, value = max(enumerate(scores), key=operator.itemgetter(1))
k_value = index + 1  

In [10]:
models =[]
models.append(('RFC',RandomForestClassifier()))
models.append(('DTC',DecisionTreeClassifier()))
models.append(('KNN',KNeighborsClassifier(n_neighbors = k_value )))
models.append(('LR', LogisticRegression(solver='liblinear')))
models.append(('SVM',SVC(kernel='linear')))

In [11]:
results = []
names = []
accuracy_values = []
function_name = []

for name,model in models:
    accuracy = cross_val_score(model, X_train, Y_train,scoring='accuracy')
    results.append(accuracy)   
    msg = ("%s: %f" % (name, accuracy.mean()))
    print(msg)
    accuracy_values.append(accuracy.mean())
    function_name.append(name)

RFC: 0.987689
DTC: 0.975189
KNN: 0.919886
LR: 0.981439
SVM: 0.987689


In [12]:
import operator
index, value = max(enumerate(accuracy_values), key=operator.itemgetter(1))

In [13]:
print(index)
print(value)
print(function_name[index])

0
0.9876893939393939
RFC


In [14]:
model = ""

if function_name[index] == "RFC" :
    model = RandomForestClassifier()
    print(function_name[index])
        
elif function_name[index] == "DTC" :
    model = DecisionTreeClassifier()
    print(function_name[index])
                
elif function_name[index] == "KNN" :
    model = KNeighborsClassifier(n_neighbors = k_value)
    print(function_name[index])
    
elif function_name[index] == "SVM" :
    model = SVC(kernel='linear')
    print(function_name[index])
        
elif function_name[index] == "LR" :
    model = LogisticRegression(solver='liblinear')
    print(function_name[index])
        
        
    
    

RFC


In [15]:
model = model.fit(X,Y)

In [23]:
X_test

Unnamed: 0,age,sg,al,rbc,pc,bu,sc,hemo,pcv,htn
49,60.0,1.01,2.0,1,0,72.0,3.0,9.7,29,1
344,64.0,1.02,0.0,1,1,27.0,0.7,14.4,42,0
71,46.0,1.01,1.0,1,1,92.0,3.3,9.8,28,1
44,54.0,1.01,3.0,0,0,77.0,6.3,9.7,28,1
325,58.0,1.02,0.0,1,1,50.0,1.2,14.0,50,0
284,33.0,1.025,0.0,1,1,37.0,1.2,16.9,52,0
271,30.0,1.025,0.0,1,1,25.0,0.5,13.8,42,0
294,75.0,1.02,0.0,1,1,50.0,0.7,14.3,40,0
254,51.0,1.025,0.0,1,1,38.0,0.8,13.0,49,0
225,60.0,1.01,3.0,0,1,95.0,2.7,11.5,35,1


In [16]:
y_pred = model.predict(X_test)

In [17]:
y_pred

array([0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0])

In [18]:
import joblib as joblib

In [19]:
joblib.dump(model,'train_model.joblib')

['train_model.joblib']

In [20]:
loadModel = joblib.load('train_model.joblib')

In [21]:
y_pred = model.predict(X_test)

In [22]:
y_pred

array([0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0])