In [2]:
# 데이터 처리
import numpy as np
import pandas as pd
# 머신러닝 알고리즘 및 평가
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
import sklearn.preprocessing as preprocessing
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
# 시각화
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# 데이터 불러오기
dat = pd.read_csv('./data/classification_parkinsons.csv')

# 데이터 확인해보기 상위 10개를 통해서

print(dat.head(10))



             name  MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
0  phon_R01_S01_1      119.992       157.302        74.997         0.00784   
1  phon_R01_S01_2      122.400       148.650       113.819         0.00968   
2  phon_R01_S01_3      116.682       131.111       111.555         0.01050   
3  phon_R01_S01_4      116.676       137.871       111.366         0.00997   
4  phon_R01_S01_5      116.014       141.781       110.655         0.01284   
5  phon_R01_S01_6      120.552       131.162       113.787         0.00968   
6  phon_R01_S02_1      120.267       137.244       114.820         0.00333   
7  phon_R01_S02_2      107.332       113.840       104.315         0.00290   
8  phon_R01_S02_3       95.730       132.068        91.754         0.00551   
9  phon_R01_S02_4       95.056       120.103        91.226         0.00532   

   MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer    ...     \
0           0.00007   0.00370   0.00554     0.01109       0.04

In [4]:
# 데이터의 형태 확인해보기

print(dat.shape)


(195, 24)


In [5]:
# 의미없는 변수 제거: name

# name 변수 제거하기
dat_processing = dat.drop(['name'], axis=1, inplace=False)

print(dat_processing.columns)




Index(['MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)',
       'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP',
       'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
       'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'status', 'RPDE', 'DFA',
       'spread1', 'spread2', 'D2', 'PPE'],
      dtype='object')


In [6]:
# Normalization
dat_processing_norm = preprocessing.minmax_scale(dat_processing)
dat_processed = pd.DataFrame(dat_processing_norm)
dat_processed.columns = dat_processing.columns

  


In [7]:
print(dat_processed.shape)


(195, 23)


In [8]:
# 상수항 추가 : sm.add_constant()

dat_processed = sm.add_constant(dat_processed, has_constant='add')
print(dat_processed.head(20))


    const  MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
0     1.0     0.184308      0.112592      0.054815        0.195680   
1     1.0     0.198327      0.094930      0.278323        0.254130   
2     1.0     0.165039      0.059128      0.265288        0.280178   
3     1.0     0.165004      0.072927      0.264200        0.263342   
4     1.0     0.161150      0.080909      0.260107        0.354511   
5     1.0     0.187568      0.059232      0.278139        0.254130   
6     1.0     0.185909      0.071647      0.284086        0.052414   
7     1.0     0.110606      0.023873      0.223606        0.038755   
8     1.0     0.043063      0.061082      0.151289        0.121665   
9     1.0     0.039139      0.036658      0.148249        0.115629   
10    1.0     0.000000      0.020607      0.107062        0.107052   
11    1.0     0.020789      0.028019      0.119843        0.118170   
12    1.0     0.282892      0.117826      0.378827        0.039708   
13    1.0     0.2959

In [9]:
# 전체 데이터 변수 확인
print(dat_processed.columns)

Index(['const', 'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)',
       'MDVP:Jitter(%)', 'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ',
       'Jitter:DDP', 'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3',
       'Shimmer:APQ5', 'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'status',
       'RPDE', 'DFA', 'spread1', 'spread2', 'D2', 'PPE'],
      dtype='object')


In [10]:
# Target 변수: 'status'를 제외한 데이터 학습데이터 구성하기
dat_processed.columns.difference(["status"])

Index(['D2', 'DFA', 'HNR', 'Jitter:DDP', 'MDVP:APQ', 'MDVP:Fhi(Hz)',
       'MDVP:Flo(Hz)', 'MDVP:Fo(Hz)', 'MDVP:Jitter(%)', 'MDVP:Jitter(Abs)',
       'MDVP:PPQ', 'MDVP:RAP', 'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'NHR',
       'PPE', 'RPDE', 'Shimmer:APQ3', 'Shimmer:APQ5', 'Shimmer:DDA', 'const',
       'spread1', 'spread2'],
      dtype='object')

In [11]:
feature_columns = list(dat_processed.columns.difference(["status"]))
feature_columns

['D2',
 'DFA',
 'HNR',
 'Jitter:DDP',
 'MDVP:APQ',
 'MDVP:Fhi(Hz)',
 'MDVP:Flo(Hz)',
 'MDVP:Fo(Hz)',
 'MDVP:Jitter(%)',
 'MDVP:Jitter(Abs)',
 'MDVP:PPQ',
 'MDVP:RAP',
 'MDVP:Shimmer',
 'MDVP:Shimmer(dB)',
 'NHR',
 'PPE',
 'RPDE',
 'Shimmer:APQ3',
 'Shimmer:APQ5',
 'Shimmer:DDA',
 'const',
 'spread1',
 'spread2']

In [12]:
X = dat_processed[feature_columns]
y = dat_processed['status'] # 질환여부: 1 or 0

In [13]:
# train_test_split 함수를 이용하여 학습데이터와 검증데이터로 9:1로 나누어 데이터를 구분해보자.

train_x, test_x, train_y, test_y = train_test_split(X, y, stratify=y, test_size=0.1, random_state=2017010500)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)


(175, 23) (20, 23) (175,) (20,)


In [14]:
print(dat_processed.dtypes)


const               float64
MDVP:Fo(Hz)         float64
MDVP:Fhi(Hz)        float64
MDVP:Flo(Hz)        float64
MDVP:Jitter(%)      float64
MDVP:Jitter(Abs)    float64
MDVP:RAP            float64
MDVP:PPQ            float64
Jitter:DDP          float64
MDVP:Shimmer        float64
MDVP:Shimmer(dB)    float64
Shimmer:APQ3        float64
Shimmer:APQ5        float64
MDVP:APQ            float64
Shimmer:DDA         float64
NHR                 float64
HNR                 float64
status              float64
RPDE                float64
DFA                 float64
spread1             float64
spread2             float64
D2                  float64
PPE                 float64
dtype: object


In [15]:
print(train_y.head(10))
print(train_x.head(10))


178    1.0
89     1.0
126    1.0
8      1.0
193    0.0
174    0.0
186    0.0
92     1.0
37     1.0
47     0.0
Name: status, dtype: float64
           D2       DFA       HNR  Jitter:DDP  MDVP:APQ  MDVP:Fhi(Hz)  \
178  0.368467  0.749886  0.649069    0.040456  0.045180      0.114749   
89   1.000000  0.682231  0.139194    0.155402  0.209511      0.252682   
126  0.177061  0.653223  0.395716    0.109006  0.117926      0.194113   
8    0.400034  0.794025  0.543404    0.108525  0.076422      0.061082   
193  0.318222  0.277579  0.429936    0.145288  0.066544      0.601807   
174  0.200038  0.725879  0.691051    0.052015  0.045869      0.086390   
186  0.328600  0.371991  0.630619    0.089581  0.059116      1.000000   
92   0.409743  0.638319  0.420182    0.087655  0.178574      0.128283   
37   0.495243  0.834020  0.670975    0.066142  0.059116      0.170364   
47   0.269288  0.242711  0.746322    0.011559  0.010797      0.325264   

     MDVP:Flo(Hz)  MDVP:Fo(Hz)  MDVP:Jitter(%)  MDVP:Jitt

In [16]:
# 모델 구축 및 학습
# statsmodels의 Logit함수를 이용하여 모델을 적합해보자

model = sm.Logit(train_y, train_x)


In [17]:
# 모델을 실제로 적합해보자

results = model.fit(method='bfgs')


         Current function value: 0.281354
         Iterations: 35
         Function evaluations: 36
         Gradient evaluations: 36




In [18]:
# performance measure
print("model AIC: ","{:.5f}".format(results.aic))
print("model BIC: ","{:.5f}".format(results.bic))
results.summary()

model AIC:  144.47407
model BIC:  217.26414


0,1,2,3
Dep. Variable:,status,No. Observations:,175.0
Model:,Logit,Df Residuals:,152.0
Method:,MLE,Df Model:,22.0
Date:,"Mon, 08 Jul 2019",Pseudo R-squ.:,0.4954
Time:,16:19:11,Log-Likelihood:,-49.237
converged:,False,LL-Null:,-97.576
,,LLR p-value:,2.442e-11

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
D2,4.2639,3.099,1.376,0.169,-1.811,10.339
DFA,0.0542,2.101,0.026,0.979,-4.063,4.171
HNR,-0.9245,5.160,-0.179,0.858,-11.037,9.188
Jitter:DDP,0.6214,2451.293,0.000,1.000,-4803.824,4805.067
MDVP:APQ,1.2534,40.232,0.031,0.975,-77.600,80.107
MDVP:Fhi(Hz),-1.4637,1.705,-0.858,0.391,-4.806,1.879
MDVP:Flo(Hz),-0.3667,1.652,-0.222,0.824,-3.605,2.872
MDVP:Fo(Hz),-2.2135,3.195,-0.693,0.488,-8.475,4.048
MDVP:Jitter(%),-0.1070,31.153,-0.003,0.997,-61.166,60.952


In [19]:
# 실제 Train 학습의 예측값을 출력해보자.

train_y_pred = results.predict(train_x)
print(train_y_pred.head(10))


178    0.754379
89     0.999763
126    0.473022
8      0.965055
193    0.376182
174    0.850443
186    0.151406
92     0.893159
37     0.908429
47     0.012054
dtype: float64


In [20]:
# 실제 Test데이터의 예측값을 출력해보자.

test_y_pred = results.predict(test_x)
print(test_y_pred.head(20))


139    0.896587
70     0.921018
167    0.032148
143    0.710512
157    0.999480
95     0.855700
26     0.866141
25     0.990939
192    0.821703
108    0.609142
123    0.688141
172    0.845136
43     0.136289
67     0.968164
164    0.999518
180    0.878394
190    0.832943
125    0.451190
101    0.999631
144    0.731930
dtype: float64


In [21]:
# Cut-off 정의
def cut_off(y,threshold):
    Y = y.copy() # copy함수를 사용하여 이전의 y값이 변화지 않게 함
    Y[Y>threshold]=1
    Y[Y<=threshold]=0
    return(Y.astype(int))

In [22]:
train_y_pred_prob = results.predict(train_x)
train_y_pred = cut_off(train_y_pred_prob,0.5)
print( train_y_pred.head(10) )

178    1
89     1
126    0
8      1
193    0
174    1
186    0
92     1
37     1
47     0
dtype: int32


In [23]:
test_y_pred_prob = results.predict(test_x)
test_y_pred = cut_off(test_y_pred_prob,0.5)
print( test_y_pred.head(20) )

139    1
70     1
167    0
143    1
157    1
95     1
26     1
25     1
192    1
108    1
123    1
172    1
43     0
67     1
164    1
180    1
190    1
125    0
101    1
144    1
dtype: int32


In [24]:
test_y_pred_prob = results.predict(test_x)
test_y_pred = cut_off(test_y_pred_prob,0.5)
print( test_y_pred.head(10) )

139    1
70     1
167    0
143    1
157    1
95     1
26     1
25     1
192    1
108    1
dtype: int32


In [25]:
# confusion matrix: Training set
cm_train = confusion_matrix(train_y,train_y_pred)
print( cm_train )

[[ 27  16]
 [ 10 122]]


In [26]:
# confusion matrix: Test set
cm_test = confusion_matrix(test_y,test_y_pred)
print( cm_test)

[[ 2  3]
 [ 1 14]]


In [27]:
# performance evaluation
def perf_eval(cm):
    # True positive rate: TPR
    TPR = cm[1, 1] / sum(cm[1]) # recall
    # True negative rate: TNR
    TNR = cm[0, 0] / sum(cm[0])
    # Simple Accuracy
    ACC = (cm[0, 0] + cm[1, 1]) / sum(cm.reshape(-1,))
    # Balanced Correction Rate
    BCR = np.sqrt(TPR * TNR)
    # F1-measure
    Precision = cm[1,1] /sum(cm[:,1])
    F1 = 2*TPR*Precision/(TPR+Precision)
    return ([TPR, TNR, ACC, BCR, F1])

In [28]:

print('Test performance of Logistic Regression')
print('TPR:',perf_eval(cm_test)[0])
print('TNR:',perf_eval(cm_test)[1])
print('ACC:',perf_eval(cm_test)[2])
print('BCR:',perf_eval(cm_test)[3])
print('F1:',perf_eval(cm_test)[4])

Test performance of Logistic Regression
TPR: 0.9333333333333333
TNR: 0.4
ACC: 0.8
BCR: 0.6110100926607787
F1: 0.8749999999999999


In [29]:
# 임계값에 따른 비교
threshold = np.arange(0, 1, 0.1)
threshold

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

In [30]:
table = pd.DataFrame(columns=['TPR','TNR','ACC','BCR','F1'])
print(table)

Empty DataFrame
Columns: [TPR, TNR, ACC, BCR, F1]
Index: []


In [31]:
for i in threshold:
    test_y_pred_tmp = cut_off(test_y_pred_prob,i)
    cfmat = confusion_matrix(test_y, test_y_pred_tmp)
    table.loc[i] = perf_eval(cfmat)

In [32]:
table.index.name='threshold'
table.columns.name='performance'
print(table)

performance       TPR  TNR   ACC       BCR        F1
threshold                                           
0.0          1.000000  0.0  0.75  0.000000  0.857143
0.1          1.000000  0.2  0.80  0.447214  0.882353
0.2          1.000000  0.4  0.85  0.632456  0.909091
0.3          1.000000  0.4  0.85  0.632456  0.909091
0.4          1.000000  0.4  0.85  0.632456  0.909091
0.5          0.933333  0.4  0.80  0.611010  0.875000
0.6          0.933333  0.4  0.80  0.611010  0.875000
0.7          0.800000  0.4  0.70  0.565685  0.800000
0.8          0.666667  0.4  0.60  0.516398  0.714286
0.9          0.400000  1.0  0.55  0.632456  0.571429
