# Imports

In [53]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [54]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from scipy.io import arff

from algo_implementation import logistic_regression
from preprocessor import Preprocessor

# Dataset 1 preparation

In [55]:
def decode_bytes(x):
    """
    Decode a single value if it is a ``bytes`` object.

    Values of type ``bytes`` are decoded with ``bytes.decode()`` (default
    encoding); any other value is returned unchanged.
    """
    return x.decode() if isinstance(x, bytes) else x

In [56]:
# Pre-clean the raw ARFF file into a second file before parsing it
# (presumably strips stray spaces that the ARFF reader rejects -- confirm in
# the preprocessor module).
preprocessor = Preprocessor()
preprocessor.remove_spaces('chronic_kidney_disease.arff', 'chronic_kidney_disease2.arff')

data, meta = arff.loadarff('chronic_kidney_disease2.arff')

# Build the frame in one chain: decode bytes cells to str, then turn the
# '?' placeholder into NaN.
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map; switch once the minimum supported pandas version allows it.
dataset1 = (
    pd.DataFrame(data)
    .applymap(decode_bytes)
    .replace('?', np.nan)
)

categorical_cols, numerical_cols = preprocessor.get_cat_num_colnames(dataset1)



In [57]:
# Run the project preprocessing step, then treat the last column as the
# target and every column before it as a feature.
processed_data = preprocessor.data_preprocess(dataset1, categorical_cols, numerical_cols)
X = processed_data.iloc[:, :-1]
y = processed_data.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Collinearity detection

### VIF

In [73]:
# Compute a variance inflation factor per training feature and display only
# the rows whose VIF is at or above the threshold of 7.
p = Preprocessor()
vif_coefs = p.vif(X_train)
vif_coefs.loc[vif_coefs['VIF'] >= 7]

  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.centered_tss


Unnamed: 0,variables,VIF
11,al_0,inf
12,al_1,inf
13,al_2,inf
14,al_3,inf
15,al_4,inf
17,al_nan,inf
18,su_0,inf
19,su_1,inf
20,su_2,inf
21,su_3,inf


### Correlation

In [59]:
# Pairwise Pearson correlation between the training features.
corr_matrix = X_train.corr(method='pearson')
corr_matrix



Unnamed: 0,age,bp,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,...,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
age,1.0,0.152199,0.236323,0.195399,0.140063,-0.069579,0.043421,-0.184362,-0.187031,0.096293,...,-0.037435,-0.13241,0.169893,0.060294,0.395772,0.4073,0.237916,0.143978,0.108481,0.045511
bp,0.152199,1.0,0.206005,0.14652,0.111594,-0.10527,0.071787,-0.263191,-0.302335,0.01016,...,-0.119272,-0.140634,0.064362,0.060926,0.275136,0.233867,0.065825,0.167141,0.028995,0.164663
bgr,0.236323,0.206005,1.0,0.14069,0.102463,-0.148187,0.071314,-0.291373,-0.280311,0.10049,...,-0.189727,-0.277872,0.197297,0.054702,0.364005,0.50544,0.230908,0.22973,0.09188,0.141118
bu,0.195399,0.14652,0.14069,1.0,0.573234,-0.284353,0.371213,-0.517756,-0.498231,0.067299,...,-0.15833,-0.345455,0.202964,0.124102,0.335019,0.303527,0.174674,0.257183,0.319522,0.400471
sc,0.140063,0.111594,0.102463,0.573234,1.0,-0.641833,0.219745,-0.313843,-0.314718,-0.014338,...,-0.085235,-0.148174,0.051734,0.040131,0.263962,0.198009,0.20371,0.156525,0.161721,0.198469
sod,-0.069579,-0.10527,-0.148187,-0.284353,-0.641833,1.0,0.075369,0.321203,0.329839,0.015646,...,0.111963,0.155426,-0.131046,-0.063453,-0.298546,-0.283913,-0.231388,-0.153113,-0.149995,-0.182446
pot,0.043421,0.071787,0.071314,0.371213,0.219745,0.075369,1.0,-0.103645,-0.129761,-0.085081,...,0.020492,-0.168392,-0.026567,-0.004777,0.061357,0.063677,-0.009521,-0.026297,0.062154,0.112296
hemo,-0.184362,-0.263191,-0.291373,-0.517756,-0.313843,0.321203,-0.103645,1.0,0.853551,-0.139558,...,0.263103,0.41801,-0.265093,-0.182825,-0.554404,-0.476939,-0.259377,-0.382935,-0.369874,-0.545979
pcv,-0.187031,-0.302335,-0.280311,-0.498231,-0.314718,0.329839,-0.129761,0.853551,1.0,-0.174392,...,0.265172,0.444125,-0.283935,-0.158721,-0.542449,-0.476929,-0.261583,-0.382634,-0.395165,-0.497155
wbcc,0.096293,0.01016,0.10049,0.067299,-0.014338,0.015646,-0.085081,-0.139558,-0.174392,1.0,...,-0.010013,-0.130245,0.178629,0.124502,0.097674,0.130672,-0.020628,0.141528,0.126905,0.043194


## Feature selection

### Chi2 selector

In [60]:
# Keep the 10 features with the highest chi-squared score against the target.
# The selector is fitted on the training split only and then applied to the
# test split, so no test information leaks into the selection.
selector = SelectKBest(score_func=chi2, k=10)
X_chi2_train = selector.fit_transform(X_train, y_train)
X_chi2_test = selector.transform(X_test)

### PCA

In [70]:

# Project onto the first 10 principal components, fitting on the training
# split only and reusing the fitted projection for the test split.
pca = PCA(n_components=10)
X_pca_train = pca.fit_transform(X_train)
X_pca_test = pca.transform(X_test)

# Show the variance captured by each component and the first ten projected rows.
print(pca.explained_variance_)
print(X_pca_train[:10])

[0.96874205 0.43055454 0.24752179 0.2029795  0.17968111 0.16080584
 0.14222498 0.12611737 0.12339236 0.11393265]
[[-1.04865965e+00  8.62620750e-02  6.41237540e-01 -7.17348888e-02
   2.56536518e-02  3.97770065e-02  3.31227419e-02  2.35597485e-03
  -4.79370976e-02 -7.11930065e-02]
 [-6.12027074e-01 -1.80084277e-01 -2.25618813e-01 -5.95629881e-01
   3.08078694e-01 -2.18487656e-01 -1.16268345e-01  4.92871440e-01
  -3.79639104e-01 -2.76446002e-01]
 [ 8.13457028e-02 -5.07508658e-01  4.56847116e-02  4.92307177e-01
   2.93181687e-01 -9.81656871e-02 -7.62794217e-01 -5.18470055e-01
  -1.09571147e-01 -7.13906659e-01]
 [ 1.67598660e+00  1.35486998e+00  3.10384491e-02  2.87033596e-01
   3.52328262e-01  2.64225677e-01  6.79366696e-02  4.15995260e-01
   3.37079708e-01 -5.66564225e-01]
 [-1.04789720e+00  9.58249912e-02  6.42633899e-01 -8.38794476e-02
  -1.43057914e-03  4.33850321e-02  4.38726765e-02 -1.77709405e-02
  -3.75750649e-02 -4.83633974e-02]
 [ 7.42729115e-01 -8.74983429e-01 -1.78046733e-01 -4

## Fit IRLS model

In [71]:
# Fit the project's logistic regression on the PCA-reduced training data and
# report its fitted coefficients and its accuracy on the PCA-reduced test data.
# NOTE(review): the recorded output also shows shapes, matrices and a warning
# that this cell does not print itself -- presumably debug output from inside
# fit(); confirm in algo_implementation.
model = logistic_regression()
model.fit(X_pca_train, y_train)
print('beta:', model.beta)
accuracy = model.accuracy(X_pca_test, y_test)
print('pca_accuracy: {:.2f}'.format(accuracy))

(320, 11)
(11,)
[[-8.00000000e+01  8.88178420e-16 -3.99680289e-15  2.55351296e-15
   1.99840144e-15  1.55431223e-15 -2.22044605e-16 -2.22044605e-16
  -1.66533454e-15  2.66453526e-15 -7.77156117e-16]
 [ 8.88178420e-16 -7.72571786e+01  2.33146835e-15 -3.66373598e-15
  -7.54951657e-15  4.44089210e-15  5.99520433e-15 -3.99680289e-15
  -1.06581410e-14  3.33066907e-16  2.49800181e-15]
 [-3.99680289e-15  2.33146835e-15 -3.43367248e+01  3.38618023e-15
   4.44089210e-16 -1.11022302e-15  4.82947016e-15  4.32986980e-15
   3.88578059e-16  1.52655666e-15 -5.66213743e-15]
 [ 2.55351296e-15 -3.66373598e-15  3.38618023e-15 -1.97398629e+01
   2.85882429e-15 -1.07691633e-14  2.94209102e-15  5.13478149e-16
   7.77156117e-16  4.99600361e-16 -3.33066907e-15]
 [ 1.99840144e-15 -7.54951657e-15  4.44089210e-16  2.85882429e-15
  -1.61876149e+01 -1.08801856e-14  3.33066907e-15  4.99600361e-15
  -2.22044605e-16  1.49880108e-15 -2.22044605e-16]
 [ 1.55431223e-15  4.44089210e-15 -1.11022302e-15 -1.07691633e-14
  -

  return 1 / (1 + np.exp(-x))


[[-4.82776226e-16  4.80114544e-16 -3.48878840e-17 -3.09266091e-16
   3.71486150e-17 -1.14802931e-17 -1.99555618e-17 -2.19186315e-17
  -1.14053918e-17  1.07810226e-17  3.27193801e-17]
 [ 4.80114544e-16 -4.80449943e-16  3.59678676e-17  3.09248403e-16
  -3.78312741e-17  1.19721578e-17  2.09231778e-17  2.24952198e-17
   1.27857850e-17 -1.10927821e-17 -3.27817042e-17]
 [-3.48878840e-17  3.59678676e-17 -3.47051891e-18 -2.30050806e-17
   3.52153646e-18 -1.00253402e-18 -2.24196441e-18 -1.80307887e-18
  -1.48186180e-18  8.66955379e-19  1.97168111e-18]
 [-3.09266091e-16  3.09248403e-16 -2.30050806e-17 -2.00089423e-16
   2.46390613e-17 -8.13761102e-18 -1.35118060e-17 -1.45590620e-17
  -7.60927564e-18  6.50538573e-18  2.13997320e-17]
 [ 3.71486150e-17 -3.78312741e-17  3.52153646e-18  2.46390613e-17
  -3.91863079e-18  1.25742899e-18  2.23051838e-18  1.80927742e-18
   1.06279531e-18 -5.70517275e-19 -1.96721860e-18]
 [-1.14802931e-17  1.19721578e-17 -1.00253402e-18 -8.13761102e-18
   1.25742899e-18 -

In [65]:
# Baseline: fit the same logistic regression on the full (non-PCA) feature set
# so its accuracy can be compared with the PCA-reduced model.
# NOTE(review): the recorded output shows an all-zero beta for this fit, which
# suggests the solver made no progress on the raw features -- investigate in
# algo_implementation before trusting this accuracy.
model = logistic_regression()
model.fit(X_train.values, y_train.values)
print('beta:', model.beta)
accuracy = model.accuracy(X_test, y_test)
# Label the metric for what it is: this model never saw the PCA features.
print('full_features_accuracy: {:.2f}'.format(accuracy))

(320, 42)
(42,)
[[-80.         -44.54654441 -16.07350317 ... -16.75       -15.25
  -11.        ]
 [-44.54654441 -27.87061698  -9.20002608 ... -10.24431818  -9.15899645
   -6.37059695]
 [-16.07350317  -9.20002608  -4.10806962 ...  -3.93551745  -3.15949643
   -2.68551745]
 ...
 [-16.75       -10.24431818  -3.93551745 ... -16.75        -8.5
   -4.5       ]
 [-15.25        -9.15899645  -3.15949643 ...  -8.5        -15.25
   -4.25      ]
 [-11.          -6.37059695  -2.68551745 ...  -4.5         -4.25
  -11.        ]]
0.0
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
pca_accuracy: 0.65


## Export dataset to `.csv`

In [None]:
# Re-attach the target to the feature matrix and write the result to disk.
# The target is renamed on the Series itself: wrapping a named Series in
# pd.DataFrame(..., columns=["class"]) selects by name, so a Series whose name
# is not already "class" would end up as a column with no data.
kidney_disease = pd.concat([X, y.rename("class")], axis=1)
kidney_disease.to_csv('kidney_disease.csv', index=False)