# XGBoost

### Boosting

In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split




In [2]:
iris = datasets.load_iris()
X,y = iris.data,iris.target

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=123)

xg_cl = xgb.XGBClassifier(objective='binary:logistic',n_estimators=10,seed=123)
xg_cl.fit(X_train,y_train)
preds=xg_cl.predict(X_test)
accuracy =float(np.sum(preds==y_test))/y_test.shape[0]
print('accuracy',accuracy)

accuracy 1.0


# Trees as Base Learners in Scikit API

In [3]:
data=pd.read_csv(r'C:\Users\AMIT\Desktop\MSIT_ML_CLASS-master-5\MSIT_ML_CLASS-master\datasets\boston_housing.csv')

X,y = data.iloc[:,:-1],data.iloc[:,-1]

X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.2,random_state=123)

xg_reg=xgb.XGBRegressor(objective='reg:linear',n_estimators=10,seed=123)

xg_reg.fit(X_train,y_train)
preds=xg_reg.predict(X_test)

from sklearn.metrics import mean_squared_error
rmse=np.sqrt(mean_squared_error(y_test,preds))
print("RMSE %f"%(rmse))



RMSE 173308.249990


# Linear base learner in XGBoost API

In [4]:
data = pd.read_csv(r'C:\Users\AMIT\Desktop\MSIT_ML_CLASS-master-5\MSIT_ML_CLASS-master\datasets\boston_housing.csv')

X,y = data.iloc[:,:-1],data.iloc[:,-1]

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=123)

DM_train = xgb.DMatrix(data=X_train,label=y_train)
DM_test = xgb.DMatrix(data=X_test,label=y_test)
params={'booster':'gblinear','objective':'reg:linear'}

xg_reg = xgb.train(params=params,dtrain=DM_train,num_boost_round=10)
preds = xg_reg.predict(DM_test)
rmse=np.sqrt(mean_squared_error(y_test,preds))
print("RMSE: %f"%(rmse))

RMSE: 90071.656144


# Tuning the model

### Untuned model

In [5]:
data = pd.read_csv(r"C:\Users\AMIT\Desktop\MSIT_ML_CLASS-master-5\MSIT_ML_CLASS-master\datasets\boston_housing.csv")

X = data[data.columns.tolist()[:-1]]
print(data.columns.tolist()[:-1])
print(X)

y=data[data.columns.tolist()[-1]]
print(data.columns.tolist()[-1])
print(y)

data_dmatrix = xgb.DMatrix(data=X,label=y)

untuned_params = {'objective':'reg:linear'}
tuned_cv_results_rmse = xgb.cv(dtrain = data_dmatrix, params= untuned_params, nfold =4,num_boost_round =200,metrics='rmse',as_pandas =True, seed =123)
print(tuned_cv_results_rmse)
print(type(tuned_cv_results_rmse))

print("Tuned RMSE: %f"%tuned_cv_results_rmse['test-rmse-mean'].tail(1))

['RM', 'LSTAT', 'PTRATIO']
        RM  LSTAT  PTRATIO
0    6.575   4.98     15.3
1    6.421   9.14     17.8
2    7.185   4.03     17.8
3    6.998   2.94     18.7
4    7.147   5.33     18.7
5    6.430   5.21     18.7
6    6.012  12.43     15.2
7    6.172  19.15     15.2
8    5.631  29.93     15.2
9    6.004  17.10     15.2
10   6.377  20.45     15.2
11   6.009  13.27     15.2
12   5.889  15.71     15.2
13   5.949   8.26     21.0
14   6.096  10.26     21.0
15   5.834   8.47     21.0
16   5.935   6.58     21.0
17   5.990  14.67     21.0
18   5.456  11.69     21.0
19   5.727  11.28     21.0
20   5.570  21.02     21.0
21   5.965  13.83     21.0
22   6.142  18.72     21.0
23   5.813  19.88     21.0
24   5.924  16.30     21.0
25   5.599  16.51     21.0
26   5.813  14.81     21.0
27   6.047  17.28     21.0
28   6.495  12.80     21.0
29   6.674  11.98     21.0
..     ...    ...      ...
459  6.484  18.68     20.2
460  5.304  24.91     20.2
461  6.185  18.03     20.2
462  6.229  13.11     20.2
4

In [6]:
# starting three lines of this cell is same as above cell 



data_dmatrix = xgb.DMatrix(data=X,label=y)

tuned_params={'objective':'reg:linear','colsample_bytree':0.3,'learning_rate':0.1,'max_depth':5}
tuned_cv_results_rmse= xgb.cv(dtrain= data_dmatrix ,params= tuned_params, nfold=4, num_boost_round=200, metrics ='rmse', as_pandas=True, seed=123)

print("tuned RMSE: %f"%tuned_cv_results_rmse['test-rmse-mean']).tail(1)

XGBoostError: b'[23:26:43] src/tree/updater_colmaker.cc:161: Check failed: (n) > (0) colsample_bytree=0.3 is too small that no feature can be included'

# Assignment : kidney disease dataset

In [7]:
data= pd.read_csv(r'C:\Users\AMIT\Desktop\MSIT_ML_CLASS-master-6\MSIT_ML_CLASS-master\datasets\chronic_kidney_with_class.csv')

In [8]:
data.head()

Unnamed: 0.1,Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,...,pc,pcc,ba,htn,dm,cad,appet,pe,ane,class
0,0,48,80,1.02,1,0,?,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7,50,1.02,4,0,?,normal,notpresent,notpresent,...,38,6000,?,no,no,no,good,no,no,ckd
2,2,62,80,1.01,2,3,normal,normal,notpresent,notpresent,...,31,7500,?,no,yes,no,poor,no,yes,ckd
3,3,48,70,1.005,4,0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51,80,1.01,2,0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
Unnamed: 0    400 non-null int64
age           400 non-null object
bp            400 non-null object
sg            400 non-null object
al            400 non-null object
su            400 non-null object
bgr           400 non-null object
bu            400 non-null object
sc            400 non-null object
sod           400 non-null object
pot           400 non-null object
hemo          400 non-null object
pcv           400 non-null object
wc            400 non-null object
rc            400 non-null object
rbc           400 non-null object
pc            400 non-null object
pcc           400 non-null object
ba            400 non-null object
htn           400 non-null object
dm            400 non-null object
cad           400 non-null object
appet         400 non-null object
pe            400 non-null object
ane           400 non-null object
class         400 non-null object
dtypes: int6

In [10]:
data.columns

Index(['Unnamed: 0', 'age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod',
       'pot', 'hemo', 'pcv', 'wc', 'rc', 'rbc', 'pc', 'pcc', 'ba', 'htn', 'dm',
       'cad', 'appet', 'pe', 'ane', 'class'],
      dtype='object')

In [11]:
data['Unnamed: 0']

0        0
1        1
2        2
3        3
4        4
5        5
6        6
7        7
8        8
9        9
10      10
11      11
12      12
13      13
14      14
15      15
16      16
17      17
18      18
19      19
20      20
21      21
22      22
23      23
24      24
25      25
26      26
27      27
28      28
29      29
      ... 
370    370
371    371
372    372
373    373
374    374
375    375
376    376
377    377
378    378
379    379
380    380
381    381
382    382
383    383
384    384
385    385
386    386
387    387
388    388
389    389
390    390
391    391
392    392
393    393
394    394
395    395
396    396
397    397
398    398
399    399
Name: Unnamed: 0, Length: 400, dtype: int64

In [12]:
data.drop('Unnamed: 0',axis=1,inplace= True)


In [13]:
data.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,...,pc,pcc,ba,htn,dm,cad,appet,pe,ane,class
0,48,80,1.02,1,0,?,normal,notpresent,notpresent,121,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7,50,1.02,4,0,?,normal,notpresent,notpresent,?,...,38,6000,?,no,no,no,good,no,no,ckd
2,62,80,1.01,2,3,normal,normal,notpresent,notpresent,423,...,31,7500,?,no,yes,no,poor,no,yes,ckd
3,48,70,1.005,4,0,normal,abnormal,present,notpresent,117,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51,80,1.01,2,0,normal,normal,notpresent,notpresent,106,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [14]:
data['bgr'].unique()

array(['?', 'normal', 'abnormal'], dtype=object)

In [15]:
data['bgr'].replace("?",np.nan,inplace=True)

In [16]:
data.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,...,pc,pcc,ba,htn,dm,cad,appet,pe,ane,class
0,48,80,1.02,1,0,,normal,notpresent,notpresent,121,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7,50,1.02,4,0,,normal,notpresent,notpresent,?,...,38,6000,?,no,no,no,good,no,no,ckd
2,62,80,1.01,2,3,normal,normal,notpresent,notpresent,423,...,31,7500,?,no,yes,no,poor,no,yes,ckd
3,48,70,1.005,4,0,normal,abnormal,present,notpresent,117,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51,80,1.01,2,0,normal,normal,notpresent,notpresent,106,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [17]:
data['pot'].unique()

array(['121', '?', '423', '117', '106', '74', '100', '410', '138', '70',
       '490', '380', '208', '98', '157', '76', '99', '114', '263', '173',
       '95', '108', '156', '264', '123', '93', '107', '159', '140', '171',
       '270', '92', '137', '204', '79', '207', '124', '144', '91', '162',
       '246', '253', '141', '182', '86', '150', '146', '425', '112', '250',
       '360', '163', '129', '133', '102', '158', '165', '132', '104',
       '127', '415', '169', '251', '109', '280', '210', '219', '295', '94',
       '172', '101', '298', '153', '88', '226', '143', '115', '89', '297',
       '233', '294', '323', '125', '90', '308', '118', '224', '128', '122',
       '214', '213', '268', '256', '84', '105', '288', '139', '78', '273',
       '242', '424', '303', '148', '160', '192', '307', '220', '447',
       '309', '22', '111', '261', '215', '234', '131', '352', '80', '239',
       '110', '130', '184', '252', '113', '230', '341', '255', '103',
       '238', '248', '120', '241', '269',

In [18]:
data['pot'].replace('?',np.nan,inplace=True)
data.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,...,pc,pcc,ba,htn,dm,cad,appet,pe,ane,class
0,48,80,1.02,1,0,,normal,notpresent,notpresent,121.0,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7,50,1.02,4,0,,normal,notpresent,notpresent,,...,38,6000,?,no,no,no,good,no,no,ckd
2,62,80,1.01,2,3,normal,normal,notpresent,notpresent,423.0,...,31,7500,?,no,yes,no,poor,no,yes,ckd
3,48,70,1.005,4,0,normal,abnormal,present,notpresent,117.0,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51,80,1.01,2,0,normal,normal,notpresent,notpresent,106.0,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [19]:
data['pot'].unique()

array(['121', nan, '423', '117', '106', '74', '100', '410', '138', '70',
       '490', '380', '208', '98', '157', '76', '99', '114', '263', '173',
       '95', '108', '156', '264', '123', '93', '107', '159', '140', '171',
       '270', '92', '137', '204', '79', '207', '124', '144', '91', '162',
       '246', '253', '141', '182', '86', '150', '146', '425', '112', '250',
       '360', '163', '129', '133', '102', '158', '165', '132', '104',
       '127', '415', '169', '251', '109', '280', '210', '219', '295', '94',
       '172', '101', '298', '153', '88', '226', '143', '115', '89', '297',
       '233', '294', '323', '125', '90', '308', '118', '224', '128', '122',
       '214', '213', '268', '256', '84', '105', '288', '139', '78', '273',
       '242', '424', '303', '148', '160', '192', '307', '220', '447',
       '309', '22', '111', '261', '215', '234', '131', '352', '80', '239',
       '110', '130', '184', '252', '113', '230', '341', '255', '103',
       '238', '248', '120', '241', '269',

In [20]:
#data['ba'].replace('?',np.nan,inplace= True)

In [21]:
data.replace('?',np.nan,inplace = True)

In [22]:
data.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,...,pc,pcc,ba,htn,dm,cad,appet,pe,ane,class
0,48,80,1.02,1,0,,normal,notpresent,notpresent,121.0,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7,50,1.02,4,0,,normal,notpresent,notpresent,,...,38,6000,,no,no,no,good,no,no,ckd
2,62,80,1.01,2,3,normal,normal,notpresent,notpresent,423.0,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,48,70,1.005,4,0,normal,abnormal,present,notpresent,117.0,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51,80,1.01,2,0,normal,normal,notpresent,notpresent,106.0,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [23]:
data

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,...,pc,pcc,ba,htn,dm,cad,appet,pe,ane,class
0,48,80,1.020,1,0,,normal,notpresent,notpresent,121,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7,50,1.020,4,0,,normal,notpresent,notpresent,,...,38,6000,,no,no,no,good,no,no,ckd
2,62,80,1.010,2,3,normal,normal,notpresent,notpresent,423,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,48,70,1.005,4,0,normal,abnormal,present,notpresent,117,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51,80,1.010,2,0,normal,normal,notpresent,notpresent,106,...,35,7300,4.6,no,no,no,good,no,no,ckd
5,60,90,1.015,3,0,,,notpresent,notpresent,74,...,39,7800,4.4,yes,yes,no,good,yes,no,ckd
6,68,70,1.010,0,0,,normal,notpresent,notpresent,100,...,36,,,no,no,no,good,no,no,ckd
7,24,,1.015,2,4,normal,abnormal,notpresent,notpresent,410,...,44,6900,5,no,yes,no,good,yes,no,ckd
8,52,100,1.015,3,0,normal,abnormal,present,notpresent,138,...,33,9600,4.0,yes,yes,no,good,no,yes,ckd
9,53,90,1.020,2,0,abnormal,abnormal,present,notpresent,70,...,29,12100,3.7,yes,yes,no,poor,no,yes,ckd


In [24]:
print(type(data['age'].values))




<class 'numpy.ndarray'>


In [25]:
data['age'].unique()

array(['48', '7', '62', '51', '60', '68', '24', '52', '53', '50', '63',
       '40', '47', '61', '21', '42', '75', '69', nan, '73', '70', '65',
       '76', '72', '82', '46', '45', '35', '54', '11', '59', '67', '15',
       '55', '44', '26', '64', '56', '5', '74', '38', '58', '71', '34',
       '17', '12', '43', '41', '57', '8', '39', '66', '81', '14', '27',
       '83', '30', '4', '3', '6', '32', '80', '49', '90', '78', '19', '2',
       '33', '36', '37', '23', '25', '20', '29', '28', '22', '79'], dtype=object)

In [26]:
data



Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,...,pc,pcc,ba,htn,dm,cad,appet,pe,ane,class
0,48,80,1.020,1,0,,normal,notpresent,notpresent,121,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7,50,1.020,4,0,,normal,notpresent,notpresent,,...,38,6000,,no,no,no,good,no,no,ckd
2,62,80,1.010,2,3,normal,normal,notpresent,notpresent,423,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,48,70,1.005,4,0,normal,abnormal,present,notpresent,117,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51,80,1.010,2,0,normal,normal,notpresent,notpresent,106,...,35,7300,4.6,no,no,no,good,no,no,ckd
5,60,90,1.015,3,0,,,notpresent,notpresent,74,...,39,7800,4.4,yes,yes,no,good,yes,no,ckd
6,68,70,1.010,0,0,,normal,notpresent,notpresent,100,...,36,,,no,no,no,good,no,no,ckd
7,24,,1.015,2,4,normal,abnormal,notpresent,notpresent,410,...,44,6900,5,no,yes,no,good,yes,no,ckd
8,52,100,1.015,3,0,normal,abnormal,present,notpresent,138,...,33,9600,4.0,yes,yes,no,good,no,yes,ckd
9,53,90,1.020,2,0,abnormal,abnormal,present,notpresent,70,...,29,12100,3.7,yes,yes,no,poor,no,yes,ckd


In [27]:
data['age'].astype(float)

0      48.0
1       7.0
2      62.0
3      48.0
4      51.0
5      60.0
6      68.0
7      24.0
8      52.0
9      53.0
10     50.0
11     63.0
12     68.0
13     68.0
14     68.0
15     40.0
16     47.0
17     47.0
18     60.0
19     62.0
20     61.0
21     60.0
22     48.0
23     21.0
24     42.0
25     61.0
26     75.0
27     69.0
28     75.0
29     68.0
       ... 
370    69.0
371    28.0
372    72.0
373    61.0
374    79.0
375    70.0
376    58.0
377    64.0
378    71.0
379    62.0
380    59.0
381    71.0
382    48.0
383    80.0
384    57.0
385    63.0
386    46.0
387    15.0
388    51.0
389    41.0
390    52.0
391    36.0
392    57.0
393    43.0
394    50.0
395    55.0
396    42.0
397    12.0
398    17.0
399    58.0
Name: age, Length: 400, dtype: float64

In [28]:
col_list=['age','bp','sg','al','su','pot','pc','pcc','ba','hemo','pcv','wc','rc','rbc']
df=data[col_list].astype(float)

In [29]:
df

Unnamed: 0,age,bp,sg,al,su,pot,pc,pcc,ba,hemo,pcv,wc,rc,rbc
0,48.0,80.0,1.020,1.0,0.0,121.0,44.0,7800.0,5.2,36.0,1.2,,,15.4
1,7.0,50.0,1.020,4.0,0.0,,38.0,6000.0,,18.0,0.8,,,11.3
2,62.0,80.0,1.010,2.0,3.0,423.0,31.0,7500.0,,53.0,1.8,,,9.6
3,48.0,70.0,1.005,4.0,0.0,117.0,32.0,6700.0,3.9,56.0,3.8,111.0,2.5,11.2
4,51.0,80.0,1.010,2.0,0.0,106.0,35.0,7300.0,4.6,26.0,1.4,,,11.6
5,60.0,90.0,1.015,3.0,0.0,74.0,39.0,7800.0,4.4,25.0,1.1,142.0,3.2,12.2
6,68.0,70.0,1.010,0.0,0.0,100.0,36.0,,,54.0,24.0,104.0,4.0,12.4
7,24.0,,1.015,2.0,4.0,410.0,44.0,6900.0,5.0,31.0,1.1,,,12.4
8,52.0,100.0,1.015,3.0,0.0,138.0,33.0,9600.0,4.0,60.0,1.9,,,10.8
9,53.0,90.0,1.020,2.0,0.0,70.0,29.0,12100.0,3.7,107.0,7.2,114.0,3.7,9.5


In [30]:
data.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo',
       'pcv', 'wc', 'rc', 'rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'class'],
      dtype='object')

In [31]:
new_list=[   'bgr', 'bu', 'sc', 'sod'
       , 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'class']
df[new_list]=data[new_list]

In [32]:
df

Unnamed: 0,age,bp,sg,al,su,pot,pc,pcc,ba,hemo,...,bu,sc,sod,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.020,1.0,0.0,121.0,44.0,7800.0,5.2,36.0,...,normal,notpresent,notpresent,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.020,4.0,0.0,,38.0,6000.0,,18.0,...,normal,notpresent,notpresent,no,no,no,good,no,no,ckd
2,62.0,80.0,1.010,2.0,3.0,423.0,31.0,7500.0,,53.0,...,normal,notpresent,notpresent,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,117.0,32.0,6700.0,3.9,56.0,...,abnormal,present,notpresent,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.010,2.0,0.0,106.0,35.0,7300.0,4.6,26.0,...,normal,notpresent,notpresent,no,no,no,good,no,no,ckd
5,60.0,90.0,1.015,3.0,0.0,74.0,39.0,7800.0,4.4,25.0,...,,notpresent,notpresent,yes,yes,no,good,yes,no,ckd
6,68.0,70.0,1.010,0.0,0.0,100.0,36.0,,,54.0,...,normal,notpresent,notpresent,no,no,no,good,no,no,ckd
7,24.0,,1.015,2.0,4.0,410.0,44.0,6900.0,5.0,31.0,...,abnormal,notpresent,notpresent,no,yes,no,good,yes,no,ckd
8,52.0,100.0,1.015,3.0,0.0,138.0,33.0,9600.0,4.0,60.0,...,abnormal,present,notpresent,yes,yes,no,good,no,yes,ckd
9,53.0,90.0,1.020,2.0,0.0,70.0,29.0,12100.0,3.7,107.0,...,abnormal,present,notpresent,yes,yes,no,poor,no,yes,ckd


In [33]:
df=pd.get_dummies(df)
df.head()

Unnamed: 0,age,bp,sg,al,su,pot,pc,pcc,ba,hemo,...,cad_no,cad_yes,appet_good,appet_poor,pe_no,pe_yes,ane_no,ane_yes,class_ckd,class_notckd
0,48.0,80.0,1.02,1.0,0.0,121.0,44.0,7800.0,5.2,36.0,...,1,0,1,0,1,0,1,0,1,0
1,7.0,50.0,1.02,4.0,0.0,,38.0,6000.0,,18.0,...,1,0,1,0,1,0,1,0,1,0
2,62.0,80.0,1.01,2.0,3.0,423.0,31.0,7500.0,,53.0,...,1,0,0,1,1,0,0,1,1,0
3,48.0,70.0,1.005,4.0,0.0,117.0,32.0,6700.0,3.9,56.0,...,1,0,0,1,0,1,0,1,1,0
4,51.0,80.0,1.01,2.0,0.0,106.0,35.0,7300.0,4.6,26.0,...,1,0,1,0,1,0,1,0,1,0


In [34]:
df.iloc[:,18:].head()

Unnamed: 0,sc_notpresent,sc_present,sod_notpresent,sod_present,htn_no,htn_yes,dm_ yes,dm_no,dm_yes,cad_no,cad_yes,appet_good,appet_poor,pe_no,pe_yes,ane_no,ane_yes,class_ckd,class_notckd
0,1,0,1,0,0,1,0,0,1,1,0,1,0,1,0,1,0,1,0
1,1,0,1,0,1,0,0,1,0,1,0,1,0,1,0,1,0,1,0
2,1,0,1,0,1,0,0,0,1,1,0,0,1,1,0,0,1,1,0
3,0,1,1,0,0,1,0,1,0,1,0,0,1,0,1,0,1,1,0
4,1,0,1,0,1,0,0,1,0,1,0,1,0,1,0,1,0,1,0


In [35]:
drop_list=['bgr_abnormal','bu_abnormal','sc_notpresent','sod_notpresent','htn_no','dm_no','cad_no','appet_poor','pe_no','ane_no','class_notckd']
df.drop(drop_list,axis=1,inplace =True)

In [36]:
df.head()

Unnamed: 0,age,bp,sg,al,su,pot,pc,pcc,ba,hemo,...,sc_present,sod_present,htn_yes,dm_ yes,dm_yes,cad_yes,appet_good,pe_yes,ane_yes,class_ckd
0,48.0,80.0,1.02,1.0,0.0,121.0,44.0,7800.0,5.2,36.0,...,0,0,1,0,1,0,1,0,0,1
1,7.0,50.0,1.02,4.0,0.0,,38.0,6000.0,,18.0,...,0,0,0,0,0,0,1,0,0,1
2,62.0,80.0,1.01,2.0,3.0,423.0,31.0,7500.0,,53.0,...,0,0,0,0,1,0,0,0,1,1
3,48.0,70.0,1.005,4.0,0.0,117.0,32.0,6700.0,3.9,56.0,...,1,0,1,0,0,0,0,1,1,1
4,51.0,80.0,1.01,2.0,0.0,106.0,35.0,7300.0,4.6,26.0,...,0,0,0,0,0,0,1,0,0,1


In [37]:
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

X=df.iloc[:,:-1]
y=df.iloc[:,-1]

steps = [('imp',Imputer(missing_values='NaN',strategy='most_frequent',axis=0)),
        ('clf',KNeighborsClassifier(n_neighbors = 6))]

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=24)

pipeline=Pipeline(steps)

pipeline.fit(X_train,y_train)
pipeline.predict(X_test)
pipeline.score(X_test,y_test)

0.70833333333333337

In [38]:
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split



steps = [('imp',Imputer(missing_values='NaN',strategy='most_frequent',axis=0)),
        ('clf',DecisionTreeClassifier())]


pipeline=Pipeline(steps)

pipeline.fit(X_train,y_train)
pipeline.predict(X_test)
pipeline.score(X_test,y_test)

0.96666666666666667

In [39]:
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split



steps = [('imp',Imputer(missing_values='NaN',strategy='most_frequent',axis=0)),
        ('clf',RandomForestClassifier(n_estimators=100))]



pipeline=Pipeline(steps)

pipeline.fit(X_train,y_train)
pipeline.predict(X_test)
pipeline.score(X_test,y_test)

1.0

In [40]:
from sklearn.cross_validation import cross_val_score

X=df.iloc[:,:-1]
y=df.iloc[:,-1]

imp=Imputer(missing_values = 'NaN',strategy='most_frequent',axis=0)
imp.fit(X)
X = imp.transform(X)

clf =RandomForestClassifier(n_estimators=100)

scores = cross_val_score(clf,X,y,cv=10,scoring="accuracy")
print(scores)
scores.mean()

[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]


1.0

In [42]:




steps = [('imp',Imputer(missing_values='NaN',strategy='mean',axis=0)),
        ('clf',RandomForestClassifier(n_estimators=100))]

pipeline = Pipeline(steps)

scores = cross_val_score(pipeline,X,y,cv=10,scoring='accuracy')
print(scores)
scores.mean()



[ 1.     1.     1.     1.     0.975  1.     1.     1.     1.     1.   ]


0.99749999999999994