In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split

In [15]:
data = pd.read_csv("Desktop/desktop files/ML/Datasets/kidney_disease.csv")


In [16]:
data[['htn','dm','cad','pe','ane']] = data[['htn','dm','cad','pe','ane']].replace(to_replace={'yes':1,'no':0})
data[['rbc','pc']] = data[['rbc','pc']].replace(to_replace={'abnormal':1,'normal':0})
data[['pcc','ba']] = data[['pcc','ba']].replace(to_replace={'present':1,'notpresent':0})
data[['appet']] = data[['appet']].replace(to_replace={'good':1,'poor':0,'no':np.nan})
data['classification'] = data['classification'].replace(to_replace={'ckd':1.0,'ckd\t':1.0,'notckd':0.0,'no':0.0})
data.rename(columns={'classification':'class'},inplace=True)

In [17]:
data['pe'] = data['pe'].replace(to_replace='good',value=0) # Not having pedal edema is good
data['appet'] = data['appet'].replace(to_replace='no',value=0)
data['cad'] = data['cad'].replace(to_replace='\tno',value=0)
data['dm'] = data['dm'].replace(to_replace={'\tno':0,'\tyes':1,' yes':1, '':np.nan})
data.drop('id',axis=1,inplace=True)

In [19]:
data = data.dropna()

In [21]:
X = data.iloc[:, 0:24]
y = data.iloc[:, -1]

In [26]:
best_features = SelectKBest(score_func=chi2, k =10 )

In [27]:
fit = best_features.fit(X, y)

In [36]:
vals = pd.DataFrame(fit.scores_,columns=['Score'])

In [55]:
cols = pd.DataFrame(X.columns,columns = ['Attributes'])

In [65]:
df = pd.concat([cols,vals],axis=1)

In [66]:
df = df.sort_values(by='Score',ascending=False)

In [67]:
df = df.head(15)

In [71]:
lst = df['Attributes'].tolist()

In [72]:
x = X[lst]

In [117]:
trainx,testx,trainy,testy = train_test_split(x,y,test_size = 0.33)

In [118]:

from sklearn.tree import DecisionTreeClassifier

In [119]:
sk_tree = DecisionTreeClassifier(criterion='gini',max_depth=5)

In [120]:
sk_tree.fit(trainx,trainy)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [121]:
sk_tree.predict(testx)

array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0.,
       1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1.,
       0., 0.])

In [122]:
sk_tree.score(testx,testy)


1.0

In [123]:

from sklearn.ensemble import RandomForestClassifier

In [124]:
rf = RandomForestClassifier(n_estimators=10,criterion='entropy',max_depth = 5)
rf.fit(trainx,trainy)
rf.score(testx,testy)

0.9811320754716981

In [125]:
import multiprocessing
from sklearn.model_selection import GridSearchCV
from sklearn import svm


In [129]:
cpu_cnt = multiprocessing.cpu_count()


params = [
    {
    'kernel':['linear','rbf','poly','sigmoid'],
    'C' : [0.1,0.2,0.4,0.5,1.0,2.0,5.0]
    }
]
svc = svm.SVC()
gs = GridSearchCV(estimator=svc,param_grid = params,scoring='accuracy',cv=5,n_jobs=cpu_cnt)

In [130]:
gs.fit(trainx,trainy)
gs.best_estimator_

SVC(C=0.1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [131]:
gs.best_score_

0.9714285714285715

In [132]:
from sklearn import svm


In [133]:
clf = svm.SVC()
clf.fit(trainx,trainy)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [135]:
clf.score(testx,testy)

0.8113207547169812

### Without Dropping nan

In [3]:
#data = pd.read_csv("Desktop/desktop files/ML/Datasets/kidney_disease_edited.csv")


In [6]:
data['rc'].unique()

array(['5.2', nan, '3.9', '4.6', '4.4', '5', '4.0', '3.7', '3.8', '3.4',
       '2.6', '2.8', '4.3', '3.2', '3.6', '4', '4.1', '4.9', '2.5', '4.2',
       '4.5', '3.1', '4.7', '3.5', '6.0', '5.0', '2.1', '5.6', '2.3',
       '2.9', '2.7', '8.0', '3.3', '3.0', '3', '2.4', '4.8', '\t?', '5.4',
       '6.1', '6.2', '6.3', '5.1', '5.8', '5.5', '5.3', '6.4', '5.7',
       '5.9', '6.5'], dtype=object)

In [7]:
data[['htn','dm','cad','pe','ane']] = data[['htn','dm','cad','pe','ane']].replace(to_replace={'yes':1,'no':0})
data[['rbc','pc']] = data[['rbc','pc']].replace(to_replace={'abnormal':1,'normal':0})
data[['pcc','ba']] = data[['pcc','ba']].replace(to_replace={'present':1,'notpresent':0})
data[['appet']] = data[['appet']].replace(to_replace={'good':1,'poor':0,'no':np.nan})
data['classification'] = data['classification'].replace(to_replace={'ckd':1.0,'ckd\t':1.0,'notckd':0.0,'no':0.0})
data.rename(columns={'classification':'class'},inplace=True)

In [8]:
data['pe'] = data['pe'].replace(to_replace='good',value=0) # Not having pedal edema is good
data['appet'] = data['appet'].replace(to_replace='no',value=0)
data['cad'] = data['cad'].replace(to_replace='\tno',value=0)
data['dm'] = data['dm'].replace(to_replace={'\tno':0,'\tyes':1,' yes':1, '':np.nan})
data.drop('id',axis=1,inplace=True)

In [11]:
d_col = data.columns.tolist()

In [19]:
for d in d_col:
    if data[d].isnull().values.any():
        data[d] = data[d].fillna(data[d].mode()[0])
        data[d] = data[d].replace('\t?',data[d].mean()[0])

In [20]:
X = data.iloc[:, 0:24]
y = data.iloc[:, -1]

In [21]:
best_features = SelectKBest(score_func=chi2, k =10 )


In [22]:
for d in d_col:
    print(d)
    print(data[d].dtype)

age
float64
bp
float64
sg
float64
al
float64
su
float64
rbc
float64
pc
float64
pcc
float64
ba
float64
bgr
float64
bu
float64
sc
float64
sod
float64
pot
float64
hemo
float64
pcv
object
wc
object
rc
object
htn
float64
dm
float64
cad
float64
appet
float64
pe
float64
ane
float64
class
float64


In [23]:
data['wc'].unique()

array(['7800', '6000', '7500', '6700', '7300', '9800', '6900', '9600',
       '12100', '4500', '12200', '11000', '3800', '11400', '5300', '9200',
       '6200', '8300', '8400', '10300', '9100', '7900', '6400', '8600',
       '18900', '21600', '4300', '8500', '11300', '7200', '7700', '14600',
       '6300', '\t6200', '7100', '11800', '9400', '5500', '5800', '13200',
       '12500', '5600', '7000', '11900', '10400', '10700', '12700',
       '6800', '6500', '13600', '10200', '9000', '14900', '8200', '15200',
       '5000', '16300', '12400', '\t8400', '10500', '4200', '4700',
       '10900', '8100', '9500', '2200', '12800', '11200', '19100', '\t?',
       '12300', '16700', '2600', '26400', '8800', '7400', '4900', '8000',
       '12000', '15700', '4100', '5700', '11500', '5400', '10800', '9900',
       '5200', '5900', '9300', '9700', '5100', '6600'], dtype=object)