In [4]:
import pandas as pd
import numpy as np
import os

os.listdir('./data')

['TCGA_binary.csv',
 'TCGA_data.csv',
 'urine_binary.csv',
 'urine_k15_0419.csv',
 'urine_K15_0420.csv',
 'urine_k15_0423.csv',
 'urine_k15_0428.csv']

In [5]:
TCGA = pd.read_csv('./data/TCGA_data.csv')
urine = pd.read_csv('./data/urine_k15_0428.csv')

In [6]:
TCGA.y.value_counts()

Normal    1706
KIRC       528
PRAD       492
BLCA       408
KIRP       288
KICH        66
Name: y, dtype: int64

In [7]:
TCGA.shape

(3488, 787)

In [8]:
urine.shape

(123, 789)

In [5]:
TCGA.y.value_counts()

Normal    1706
KIRC       528
PRAD       492
BLCA       408
KIRP       288
KICH        66
Name: y, dtype: int64

In [12]:
urine.y.value_counts()

BLCA        51
NL          28
PRAD        23
RCC_100k    21
Name: y, dtype: int64

In [9]:
TCGA = TCGA[(TCGA.y == 'BLCA') | (TCGA.y == 'Normal')]
TCGA = TCGA.replace(['BLCA','Normal'],[ 1,0 ])
TCGA = TCGA.reset_index(drop = True)

In [10]:
urine = urine[(urine.y == 'BLCA') | (urine.y == 'NL')]
urine = urine.replace(['BLCA','NL'],[ 1,0 ])
urine = urine.reset_index(drop = True)

In [11]:
TCGA.y.value_counts()

0    1706
1     408
Name: y, dtype: int64

In [12]:
urine.y.value_counts()

1    51
0    28
Name: y, dtype: int64

In [13]:
urine = urine.drop(['Unnamed: 0','X'], axis = 1)

In [14]:
print('TCGA shape : ',TCGA.shape)
print('urine shape : ',urine.shape)

TCGA shape :  (2114, 787)
urine shape :  (79, 787)


# scaling

In [15]:
X = TCGA.drop('y', axis = 1)
y = TCGA['y']

urine_X = urine.drop('y', axis = 1)
urine_y = urine['y']

In [16]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler

standardsc = StandardScaler()
mms = MinMaxScaler()
mas = MaxAbsScaler()

standardsc.fit(X)
mms.fit(X)
mas.fit(X)

MaxAbsScaler(copy=True)

In [17]:
X_standard = pd.DataFrame(standardsc.transform(X), columns = X.columns)
X_mms = pd.DataFrame(mms.transform(X), columns = X.columns)
X_mas = pd.DataFrame(mas.transform(X), columns = X.columns)

urine_X_standard = standardsc.transform(urine_X)
urine_X_mms = mms.transform(urine_X)
urine_X_mas = mas.transform(urine_X)

  """


In [18]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve

def get_eval(y_test, pred) :
    
    print('accuracy : {0:.4f} precision : {1:.4f} recall : {2:.4f} F1 : {3:.4f} AUC : {4:.4f}'.format(
                                                        accuracy_score(y_test, pred), 
                                                        precision_score(y_test, pred),
                                                        recall_score(y_test, pred),
                                                        f1_score(y_test,pred),
                                                        roc_auc_score(y_test, pred)
                                                                      ))
    
    
    
    #print('accuracy : {0:.4f}'.format(accuracy_score(y_test,pred)), 'roc_auc_score : ', roc_auc_score(y_test,pred))
    #print(classification_report(y_test, pred))
    print(confusion_matrix(y_test, pred))

In [19]:
from sklearn.preprocessing import Binarizer

def get_eval_by_threshold(y_test, pred_prba_c1, thresholds) :
    
    for custom_threshold in thresholds :
        binarizer = Binarizer(threshold=custom_threshold).fit(pred_prba_c1)
        custom_predict = binarizer.transform(pred_prba_c1)
        
        print('threshold ; ', custom_threshold)
        get_eval(y_test, custom_predict)
        print('\n')

In [20]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits = 5, random_state=0, shuffle = True)

# logistic regression

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
import warnings
warnings.filterwarnings('ignore')

from sklearn.pipeline import Pipeline

In [22]:
def model_pipeline_2(model) :
    
    cv_acc = []
    cv_acc_sc = []
    cv_acc_mms = []
    cv_acc_mas = []

    for i, (tr_ind, te_ind) in enumerate(skf.split(X,y)) :

        X_train, X_test = X.iloc[tr_ind], X.iloc[te_ind]
        X_train_sc, X_test_sc = X_standard.iloc[tr_ind], X_standard.iloc[te_ind]
        X_train_mms, X_test_mms = X_mms.iloc[tr_ind], X_mms.iloc[te_ind]
        X_train_mas, X_test_mas = X_mas.iloc[tr_ind], X_mas.iloc[te_ind]
        
        y_train, y_test = y[tr_ind], y[te_ind]

        model.fit(X_train, y_train)
        cv_acc.append(accuracy_score(y[te_ind], model.predict(X_test)))
        
        model.fit(X_train_sc, y_train)
        cv_acc_sc.append(accuracy_score(y[te_ind], model.predict(standardsc.transform(X_test_sc))))
        
        model.fit(X_train_mms, y_train)
        cv_acc_mms.append(accuracy_score(y[te_ind], model.predict(mms.transform(X_test_mms))))
        
        model.fit(X_train_mas, y_train)
        cv_acc_mas.append(accuracy_score(y[te_ind], model.predict(mas.transform(X_test_mas))))
        
        print('{0} 번째 accuracy non_scale : {1:.4f}% StandardScale : {2:.4f}% MinMax : {3:.4f}% MaxAbs : {4:.4f}%'.format(i,cv_acc[i],cv_acc_sc[i],cv_acc_mms[i],cv_acc_mas[i]))

    print('\n mean accuracy non_scale : {0:.4f}% StandardScale : {1:.4f}% MinMax : {2:.4f}% MaxAbs : {3:.4f}%'.format(np.mean(cv_acc),np.mean(cv_acc_sc),np.mean(cv_acc_mms),np.mean(cv_acc_mas)))

In [23]:
model_pipeline_2(lr)

0 번째 accuracy non_scale : 0.9764% StandardScale : 0.9575% MinMax : 0.8868% MaxAbs : 0.9717%
1 번째 accuracy non_scale : 0.9905% StandardScale : 0.9693% MinMax : 0.8913% MaxAbs : 0.9882%
2 번째 accuracy non_scale : 0.9764% StandardScale : 0.9645% MinMax : 0.8700% MaxAbs : 0.9716%
3 번째 accuracy non_scale : 0.9834% StandardScale : 0.9597% MinMax : 0.8863% MaxAbs : 0.9810%
4 번째 accuracy non_scale : 0.9763% StandardScale : 0.9787% MinMax : 0.8981% MaxAbs : 0.9739%

 mean accuracy non_scale : 0.9806% StandardScale : 0.9659% MinMax : 0.8865% MaxAbs : 0.9773%


In [31]:
def model_pipeline(model) :
    
    
    pipeline1 = Pipeline([('StandardScaler', standardsc), ('Logistic', model)])
    pipeline2 = Pipeline([('MinMaxScaler', mms), ('Logistic', model)])
    pipeline3 = Pipeline([('MaxAbsScaler', mas), ('Logistic', model)])

    cv_acc = []
    pip1_acc = []
    pip2_acc = []
    pip3_acc = []

    for i, (tr_ind, te_ind) in enumerate(skf.split(X,y)) :

        X_train, X_test = X.iloc[tr_ind], X.iloc[te_ind]
        y_train, y_test = y[tr_ind], y[te_ind]

        model.fit(X_train, y_train)
        pipeline1.fit(X_train, y_train)
        pipeline2.fit(X_train, y_train)
        pipeline3.fit(X_train, y_train)

        cv_acc.append(accuracy_score(y[te_ind], model.predict(X_test)))
        pip1_acc.append(accuracy_score(y[te_ind], pipeline1.predict(X_test)))
        pip2_acc.append(accuracy_score(y[te_ind], pipeline2.predict(X_test)))
        pip3_acc.append(accuracy_score(y[te_ind], pipeline3.predict(X_test)))

        print('{0} 번째 accuracy non_scale : {1:.4f}% StandardScale : {2:.4f}% MinMax : {3:.4f}% MaxAbs : {4:.4f}%'.format(i,cv_acc[i],pip1_acc[i],pip2_acc[i],pip3_acc[i]))

    print('\n mean accuracy non_scale : {0:.4f}% StandardScale : {1:.4f}% MinMax : {2:.4f}% MaxAbs : {3:.4f}%'.format(np.mean(cv_acc),np.mean(pip1_acc),np.mean(pip2_acc),np.mean(pip3_acc)))

In [24]:
#predict urine data using pipeline

def pipeline_urine(model) :
    
    model.fit(X,y)
    print('###non-scaled###')
    get_eval(urine_y, model.predict(urine_X))

    model.fit(X_standard,y)
    print('###StandardScaler###')
    get_eval(urine_y, model.predict(urine_X_standard))
    
    model.fit(X_mms,y)
    print('###MinMaxScaler###')
    get_eval(urine_y, model.predict(urine_X_mms))
    
    model.fit(X_mas,y)
    print('###MaxAbsScaler###')
    get_eval(urine_y, model.predict(urine_X_mas))

In [33]:
model_pipeline(lr)

0 번째 accuracy non_scale : 0.9788% StandardScale : 0.9764% MinMax : 0.8137% MaxAbs : 0.9741%
1 번째 accuracy non_scale : 0.9905% StandardScale : 0.9882% MinMax : 0.8132% MaxAbs : 0.9905%
2 번째 accuracy non_scale : 0.9787% StandardScale : 0.9787% MinMax : 0.8132% MaxAbs : 0.9716%
3 번째 accuracy non_scale : 0.9858% StandardScale : 0.9858% MinMax : 0.8128% MaxAbs : 0.9834%
4 번째 accuracy non_scale : 0.9763% StandardScale : 0.9787% MinMax : 0.8128% MaxAbs : 0.9739%

 mean accuracy non_scale : 0.9820% StandardScale : 0.9816% MinMax : 0.8131% MaxAbs : 0.9787%


In [25]:
pipeline_urine(lr)

###non-scaled###
accuracy : 0.4937 precision : 0.9231 recall : 0.2353 F1 : 0.3750 AUC : 0.5998
[[27  1]
 [39 12]]
###StandardScaler###
accuracy : 0.4937 precision : 0.7200 recall : 0.3529 F1 : 0.4737 AUC : 0.5515
[[21  7]
 [33 18]]
###MinMaxScaler###
accuracy : 0.4430 precision : 1.0000 recall : 0.1373 F1 : 0.2414 AUC : 0.5686
[[28  0]
 [44  7]]
###MaxAbsScaler###
accuracy : 0.5063 precision : 0.8750 recall : 0.2745 F1 : 0.4179 AUC : 0.6015
[[26  2]
 [37 14]]


In [19]:
lr.fit(X_mas,y)
thresholds = [0.3,0.4,0.5,0.6,0.7]
get_eval_by_threshold(urine_y, lr.predict_proba(urine_X_mas)[:,1].reshape(-1,1), thresholds)

threshold ;  0.3
accuracy : 0.5063 precision : 0.8750 recall : 0.2745 F1 : 0.4179 AUC : 0.6015
[[26  2]
 [37 14]]


threshold ;  0.4
accuracy : 0.5063 precision : 0.8750 recall : 0.2745 F1 : 0.4179 AUC : 0.6015
[[26  2]
 [37 14]]


threshold ;  0.5
accuracy : 0.5063 precision : 0.8750 recall : 0.2745 F1 : 0.4179 AUC : 0.6015
[[26  2]
 [37 14]]


threshold ;  0.6
accuracy : 0.4810 precision : 0.8571 recall : 0.2353 F1 : 0.3692 AUC : 0.5819
[[26  2]
 [39 12]]


threshold ;  0.7
accuracy : 0.4810 precision : 0.8571 recall : 0.2353 F1 : 0.3692 AUC : 0.5819
[[26  2]
 [39 12]]




# randomforest

In [26]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [38]:
rf = RandomForestClassifier(random_state=0, n_estimators=20)

In [39]:
model_pipeline_2(rf)

0 번째 accuracy non_scale : 0.9835% StandardScale : 0.1981% MinMax : 0.1934% MaxAbs : 0.9811%
1 번째 accuracy non_scale : 0.9905% StandardScale : 0.9787% MinMax : 0.1939% MaxAbs : 0.9905%
2 번째 accuracy non_scale : 0.9835% StandardScale : 0.9669% MinMax : 0.1939% MaxAbs : 0.9787%
3 번째 accuracy non_scale : 0.9858% StandardScale : 0.9787% MinMax : 0.1919% MaxAbs : 0.9834%
4 번째 accuracy non_scale : 0.9787% StandardScale : 0.9313% MinMax : 0.1919% MaxAbs : 0.9834%

 mean accuracy non_scale : 0.9844% StandardScale : 0.8107% MinMax : 0.1930% MaxAbs : 0.9834%


In [40]:
pipeline_urine(rf)

###non-scaled###
accuracy : 0.6709 precision : 0.6712 recall : 0.9608 F1 : 0.7903 AUC : 0.5518
[[ 4 24]
 [ 2 49]]
###StandardScaler###
accuracy : 0.6709 precision : 0.6712 recall : 0.9608 F1 : 0.7903 AUC : 0.5518
[[ 4 24]
 [ 2 49]]
###MinMaxScaler###
accuracy : 0.6709 precision : 0.6712 recall : 0.9608 F1 : 0.7903 AUC : 0.5518
[[ 4 24]
 [ 2 49]]
###MaxAbsScaler###
accuracy : 0.6709 precision : 0.6712 recall : 0.9608 F1 : 0.7903 AUC : 0.5518
[[ 4 24]
 [ 2 49]]


In [41]:
rf.fit(X,y)
thresholds = [0.45,0.5,0.53,0.55, 0.58,0.6, 0.65]
get_eval_by_threshold(urine_y, rf.predict_proba(urine_X)[:,1].reshape(-1,1), thresholds)

threshold ;  0.45
accuracy : 0.6709 precision : 0.6667 recall : 0.9804 F1 : 0.7937 AUC : 0.5438
[[ 3 25]
 [ 1 50]]


threshold ;  0.5
accuracy : 0.6709 precision : 0.6712 recall : 0.9608 F1 : 0.7903 AUC : 0.5518
[[ 4 24]
 [ 2 49]]


threshold ;  0.53
accuracy : 0.6709 precision : 0.6712 recall : 0.9608 F1 : 0.7903 AUC : 0.5518
[[ 4 24]
 [ 2 49]]


threshold ;  0.55
accuracy : 0.6835 precision : 0.6970 recall : 0.9020 F1 : 0.7863 AUC : 0.5938
[[ 8 20]
 [ 5 46]]


threshold ;  0.58
accuracy : 0.6835 precision : 0.6970 recall : 0.9020 F1 : 0.7863 AUC : 0.5938
[[ 8 20]
 [ 5 46]]


threshold ;  0.6
accuracy : 0.6582 precision : 0.6935 recall : 0.8431 F1 : 0.7611 AUC : 0.5823
[[ 9 19]
 [ 8 43]]


threshold ;  0.65
accuracy : 0.5443 precision : 0.6471 recall : 0.6471 F1 : 0.6471 AUC : 0.5021
[[10 18]
 [18 33]]




# SVM

In [42]:
from sklearn.svm import SVC
svm = SVC(probability=True)

In [43]:
model_pipeline_2(svm)

0 번째 accuracy non_scale : 0.9623% StandardScale : 0.1934% MinMax : 0.8066% MaxAbs : 0.9505%
1 번째 accuracy non_scale : 0.9740% StandardScale : 0.1939% MinMax : 0.8085% MaxAbs : 0.9574%
2 번째 accuracy non_scale : 0.9527% StandardScale : 0.1939% MinMax : 0.8109% MaxAbs : 0.9385%
3 번째 accuracy non_scale : 0.9645% StandardScale : 0.1919% MinMax : 0.8128% MaxAbs : 0.9526%
4 번째 accuracy non_scale : 0.9692% StandardScale : 0.1919% MinMax : 0.8081% MaxAbs : 0.9550%

 mean accuracy non_scale : 0.9645% StandardScale : 0.1930% MinMax : 0.8094% MaxAbs : 0.9508%


In [44]:
pipeline_urine(svm)

###non-scaled###
accuracy : 0.6203 precision : 0.8182 recall : 0.5294 F1 : 0.6429 AUC : 0.6576
[[22  6]
 [24 27]]
###StandardScaler###
accuracy : 0.7468 precision : 0.7627 recall : 0.8824 F1 : 0.8182 AUC : 0.6912
[[14 14]
 [ 6 45]]
###MinMaxScaler###
accuracy : 0.5570 precision : 0.8077 recall : 0.4118 F1 : 0.5455 AUC : 0.6166
[[23  5]
 [30 21]]
###MaxAbsScaler###
accuracy : 0.6203 precision : 0.8182 recall : 0.5294 F1 : 0.6429 AUC : 0.6576
[[22  6]
 [24 27]]


In [45]:
svm.fit(X,y)
thresholds = [0.4,0.45,0.5,0.55,0.6,0.63,0.65,0.7]
get_eval_by_threshold(urine_y, svm.predict_proba(urine_X)[:,1].reshape(-1,1), thresholds)

threshold ;  0.4
accuracy : 0.6709 precision : 0.8049 recall : 0.6471 F1 : 0.7174 AUC : 0.6807
[[20  8]
 [18 33]]


threshold ;  0.45
accuracy : 0.6709 precision : 0.8049 recall : 0.6471 F1 : 0.7174 AUC : 0.6807
[[20  8]
 [18 33]]


threshold ;  0.5
accuracy : 0.6709 precision : 0.8049 recall : 0.6471 F1 : 0.7174 AUC : 0.6807
[[20  8]
 [18 33]]


threshold ;  0.55
accuracy : 0.6582 precision : 0.8000 recall : 0.6275 F1 : 0.7033 AUC : 0.6709
[[20  8]
 [19 32]]


threshold ;  0.6
accuracy : 0.6582 precision : 0.8000 recall : 0.6275 F1 : 0.7033 AUC : 0.6709
[[20  8]
 [19 32]]


threshold ;  0.63
accuracy : 0.6456 precision : 0.7949 recall : 0.6078 F1 : 0.6889 AUC : 0.6611
[[20  8]
 [20 31]]


threshold ;  0.65
accuracy : 0.6456 precision : 0.7949 recall : 0.6078 F1 : 0.6889 AUC : 0.6611
[[20  8]
 [20 31]]


threshold ;  0.7
accuracy : 0.6456 precision : 0.7949 recall : 0.6078 F1 : 0.6889 AUC : 0.6611
[[20  8]
 [20 31]]




# adaboost

In [46]:
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier()

In [47]:
model_pipeline_2(ada)

0 번째 accuracy non_scale : 0.9858% StandardScale : 0.7736% MinMax : 0.1934% MaxAbs : 0.9670%
1 번째 accuracy non_scale : 0.9882% StandardScale : 0.9835% MinMax : 0.1939% MaxAbs : 0.9882%
2 번째 accuracy non_scale : 0.9716% StandardScale : 0.9716% MinMax : 0.1939% MaxAbs : 0.9740%
3 번째 accuracy non_scale : 0.9787% StandardScale : 0.1943% MinMax : 0.1919% MaxAbs : 0.9763%
4 번째 accuracy non_scale : 0.9834% StandardScale : 0.9692% MinMax : 0.1919% MaxAbs : 0.9858%

 mean accuracy non_scale : 0.9815% StandardScale : 0.7784% MinMax : 0.1930% MaxAbs : 0.9782%


In [50]:
ada.fit(X_mas,y)
thresholds = [0.3,0.4,0.45,0.47,0.5,0.55,0.6]
get_eval_by_threshold(urine_y, ada.predict_proba(urine_X_mas)[:,1].reshape(-1,1), thresholds)

threshold ;  0.3
accuracy : 0.6456 precision : 0.6456 recall : 1.0000 F1 : 0.7846 AUC : 0.5000
[[ 0 28]
 [ 0 51]]


threshold ;  0.4
accuracy : 0.6329 precision : 0.6410 recall : 0.9804 F1 : 0.7752 AUC : 0.4902
[[ 0 28]
 [ 1 50]]


threshold ;  0.45
accuracy : 0.6709 precision : 0.6812 recall : 0.9216 F1 : 0.7833 AUC : 0.5679
[[ 6 22]
 [ 4 47]]


threshold ;  0.47
accuracy : 0.6962 precision : 0.7288 recall : 0.8431 F1 : 0.7818 AUC : 0.6359
[[12 16]
 [ 8 43]]


threshold ;  0.5
accuracy : 0.6962 precision : 0.8293 recall : 0.6667 F1 : 0.7391 AUC : 0.7083
[[21  7]
 [17 34]]


threshold ;  0.55
accuracy : 0.5190 precision : 0.9333 recall : 0.2745 F1 : 0.4242 AUC : 0.6194
[[27  1]
 [37 14]]


threshold ;  0.6
accuracy : 0.4304 precision : 0.8750 recall : 0.1373 F1 : 0.2373 AUC : 0.5508
[[27  1]
 [44  7]]




# GBM

In [51]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier()

In [52]:
model_pipeline_2(gb)

0 번째 accuracy non_scale : 0.9858% StandardScale : 0.9764% MinMax : 0.1934% MaxAbs : 0.9764%
1 번째 accuracy non_scale : 0.9882% StandardScale : 0.9504% MinMax : 0.1939% MaxAbs : 0.9882%
2 번째 accuracy non_scale : 0.9764% StandardScale : 0.9764% MinMax : 0.1939% MaxAbs : 0.9740%
3 번째 accuracy non_scale : 0.9858% StandardScale : 0.9834% MinMax : 0.1919% MaxAbs : 0.9810%
4 번째 accuracy non_scale : 0.9834% StandardScale : 0.9858% MinMax : 0.1919% MaxAbs : 0.9858%

 mean accuracy non_scale : 0.9839% StandardScale : 0.9745% MinMax : 0.1930% MaxAbs : 0.9811%


In [53]:
pipeline_urine(gb)

###non-scaled###
accuracy : 0.6709 precision : 0.6812 recall : 0.9216 F1 : 0.7833 AUC : 0.5679
[[ 6 22]
 [ 4 47]]
###StandardScaler###
accuracy : 0.6582 precision : 0.6765 recall : 0.9020 F1 : 0.7731 AUC : 0.5581
[[ 6 22]
 [ 5 46]]
###MinMaxScaler###
accuracy : 0.6456 precision : 0.6667 recall : 0.9020 F1 : 0.7667 AUC : 0.5403
[[ 5 23]
 [ 5 46]]
###MaxAbsScaler###
accuracy : 0.6709 precision : 0.6812 recall : 0.9216 F1 : 0.7833 AUC : 0.5679
[[ 6 22]
 [ 4 47]]


In [54]:
gb2 = GradientBoostingClassifier(n_estimators=500,learning_rate=0.01)
model_pipeline_2(gb2)

0 번째 accuracy non_scale : 0.9835% StandardScale : 0.9741% MinMax : 0.1934% MaxAbs : 0.9811%
1 번째 accuracy non_scale : 0.9882% StandardScale : 0.9882% MinMax : 0.1939% MaxAbs : 0.9858%
2 번째 accuracy non_scale : 0.9740% StandardScale : 0.9740% MinMax : 0.1939% MaxAbs : 0.9716%
3 번째 accuracy non_scale : 0.9882% StandardScale : 0.9858% MinMax : 0.1919% MaxAbs : 0.9834%
4 번째 accuracy non_scale : 0.9858% StandardScale : 0.9834% MinMax : 0.1919% MaxAbs : 0.9834%

 mean accuracy non_scale : 0.9839% StandardScale : 0.9811% MinMax : 0.1930% MaxAbs : 0.9811%


In [55]:
pipeline_urine(gb2)

###non-scaled###
accuracy : 0.6582 precision : 0.6765 recall : 0.9020 F1 : 0.7731 AUC : 0.5581
[[ 6 22]
 [ 5 46]]
###StandardScaler###
accuracy : 0.6709 precision : 0.6866 recall : 0.9020 F1 : 0.7797 AUC : 0.5760
[[ 7 21]
 [ 5 46]]
###MinMaxScaler###
accuracy : 0.6582 precision : 0.6765 recall : 0.9020 F1 : 0.7731 AUC : 0.5581
[[ 6 22]
 [ 5 46]]
###MaxAbsScaler###
accuracy : 0.6709 precision : 0.6866 recall : 0.9020 F1 : 0.7797 AUC : 0.5760
[[ 7 21]
 [ 5 46]]


In [60]:
gb.fit(X_mas,y)
thresholds = [0.35,0.4,0.45,0.5,0.55,0.6]
get_eval_by_threshold(urine_y, gb.predict_proba(urine_X_mas)[:,1].reshape(-1,1), thresholds)

threshold ;  0.35
accuracy : 0.6582 precision : 0.6667 recall : 0.9412 F1 : 0.7805 AUC : 0.5420
[[ 4 24]
 [ 3 48]]


threshold ;  0.4
accuracy : 0.6582 precision : 0.6714 recall : 0.9216 F1 : 0.7769 AUC : 0.5501
[[ 5 23]
 [ 4 47]]


threshold ;  0.45
accuracy : 0.6709 precision : 0.6812 recall : 0.9216 F1 : 0.7833 AUC : 0.5679
[[ 6 22]
 [ 4 47]]


threshold ;  0.5
accuracy : 0.6709 precision : 0.6812 recall : 0.9216 F1 : 0.7833 AUC : 0.5679
[[ 6 22]
 [ 4 47]]


threshold ;  0.55
accuracy : 0.6582 precision : 0.6765 recall : 0.9020 F1 : 0.7731 AUC : 0.5581
[[ 6 22]
 [ 5 46]]


threshold ;  0.6
accuracy : 0.6582 precision : 0.6765 recall : 0.9020 F1 : 0.7731 AUC : 0.5581
[[ 6 22]
 [ 5 46]]




# decision tree

In [54]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()

In [55]:
model_pipeline_2(tree)

0 번째 accuracy non_scale : 0.9646% StandardScale : 0.1675% MinMax : 0.1934% MaxAbs : 0.9599%
1 번째 accuracy non_scale : 0.9622% StandardScale : 0.8038% MinMax : 0.1939% MaxAbs : 0.9527%
2 번째 accuracy non_scale : 0.9716% StandardScale : 0.1797% MinMax : 0.1939% MaxAbs : 0.9622%
3 번째 accuracy non_scale : 0.9739% StandardScale : 0.1659% MinMax : 0.1919% MaxAbs : 0.9716%
4 번째 accuracy non_scale : 0.9716% StandardScale : 0.9692% MinMax : 0.1919% MaxAbs : 0.9645%

 mean accuracy non_scale : 0.9688% StandardScale : 0.4572% MinMax : 0.1930% MaxAbs : 0.9622%


In [56]:
pipeline_urine(tree)

###non-scaled###
accuracy : 0.5696 precision : 0.6349 recall : 0.7843 F1 : 0.7018 AUC : 0.4814
[[ 5 23]
 [11 40]]
###StandardScaler###
accuracy : 0.6076 precision : 0.6613 recall : 0.8039 F1 : 0.7257 AUC : 0.5270
[[ 7 21]
 [10 41]]
###MinMaxScaler###
accuracy : 0.6456 precision : 0.6716 recall : 0.8824 F1 : 0.7627 AUC : 0.5483
[[ 6 22]
 [ 6 45]]
###MaxAbsScaler###
accuracy : 0.6582 precision : 0.6714 recall : 0.9216 F1 : 0.7769 AUC : 0.5501
[[ 5 23]
 [ 4 47]]


In [114]:
tree.fit(X_mas,y)
thresholds = [0.3,0.4,0.5,0.6,0.7]
get_eval_by_threshold(urine_y, gb.predict_proba(urine_X_mas)[:,1].reshape(-1,1), thresholds)

threshold ;  0.3
accuracy : 0.6582 precision : 0.6622 recall : 0.9608 F1 : 0.7840 AUC : 0.5340
[[ 3 25]
 [ 2 49]]


threshold ;  0.4
accuracy : 0.6709 precision : 0.6761 recall : 0.9412 F1 : 0.7869 AUC : 0.5599
[[ 5 23]
 [ 3 48]]


threshold ;  0.5
accuracy : 0.6582 precision : 0.6765 recall : 0.9020 F1 : 0.7731 AUC : 0.5581
[[ 6 22]
 [ 5 46]]


threshold ;  0.6
accuracy : 0.6709 precision : 0.6866 recall : 0.9020 F1 : 0.7797 AUC : 0.5760
[[ 7 21]
 [ 5 46]]


threshold ;  0.7
accuracy : 0.6709 precision : 0.6923 recall : 0.8824 F1 : 0.7759 AUC : 0.5840
[[ 8 20]
 [ 6 45]]




# XGBoost

# LightGBM

In [61]:
from lightgbm import LGBMClassifier
lgb = LGBMClassifier()

In [62]:
model_pipeline_2(lgb)

0 번째 accuracy non_scale : 0.9835% StandardScale : 0.9788% MinMax : 0.1934% MaxAbs : 0.9788%
1 번째 accuracy non_scale : 0.9882% StandardScale : 0.9882% MinMax : 0.1939% MaxAbs : 0.9882%
2 번째 accuracy non_scale : 0.9787% StandardScale : 0.9716% MinMax : 0.1939% MaxAbs : 0.9764%
3 번째 accuracy non_scale : 0.9882% StandardScale : 0.9834% MinMax : 0.1919% MaxAbs : 0.9858%
4 번째 accuracy non_scale : 0.9858% StandardScale : 0.9858% MinMax : 0.1919% MaxAbs : 0.9834%

 mean accuracy non_scale : 0.9849% StandardScale : 0.9816% MinMax : 0.1930% MaxAbs : 0.9825%


In [63]:
pipeline_urine(lgb)

###non-scaled###
accuracy : 0.6709 precision : 0.6667 recall : 0.9804 F1 : 0.7937 AUC : 0.5438
[[ 3 25]
 [ 1 50]]
###StandardScaler###
accuracy : 0.6962 precision : 0.6800 recall : 1.0000 F1 : 0.8095 AUC : 0.5714
[[ 4 24]
 [ 0 51]]
###MinMaxScaler###
accuracy : 0.6582 precision : 0.6622 recall : 0.9608 F1 : 0.7840 AUC : 0.5340
[[ 3 25]
 [ 2 49]]
###MaxAbsScaler###
accuracy : 0.6709 precision : 0.6667 recall : 0.9804 F1 : 0.7937 AUC : 0.5438
[[ 3 25]
 [ 1 50]]


## parameter grid-search

In [65]:
from sklearn.model_selection import GridSearchCV

## non-scale

In [84]:
lgb = LGBMClassifier(n_jobs = -1, verbose_eval = 10)

params = {
    'max_depth' : [5,7,9],
    'min_child_weight' : [1,3],
    'colsample_bytree' : [0.5,0.75],
    'n_estimators' : [100,300,500]
}

gridcv = GridSearchCV(lgb, param_grid=params)
gridcv.fit(X, y, early_stopping_rounds = 30, eval_metric = 'acc',
                     eval_set = [(X, y), (urine_X, urine_y)], verbose=50)

Training until validation scores don't improve for 30 rounds.
[50]	valid_0's binary_logloss: 0.028733	valid_1's binary_logloss: 0.812329
Early stopping, best iteration is:
[23]	valid_0's binary_logloss: 0.0609068	valid_1's binary_logloss: 0.627913
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[14]	valid_0's binary_logloss: 0.111846	valid_1's binary_logloss: 0.62775
Training until validation scores don't improve for 30 rounds.
[50]	valid_0's binary_logloss: 0.0249513	valid_1's binary_logloss: 0.570383
Early stopping, best iteration is:
[58]	valid_0's binary_logloss: 0.0233345	valid_1's binary_logloss: 0.567097
Training until validation scores don't improve for 30 rounds.
[50]	valid_0's binary_logloss: 0.028733	valid_1's binary_logloss: 0.812329
Early stopping, best iteration is:
[23]	valid_0's binary_logloss: 0.0609068	valid_1's binary_logloss: 0.627913
Training until validation scores don't improve for 30 rounds.
Early stopping, best i

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[14]	valid_0's binary_logloss: 0.11073	valid_1's binary_logloss: 0.625743
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[17]	valid_0's binary_logloss: 0.0867545	valid_1's binary_logloss: 0.637387
Training until validation scores don't improve for 30 rounds.
[50]	valid_0's binary_logloss: 0.0286156	valid_1's binary_logloss: 0.826008
Early stopping, best iteration is:
[23]	valid_0's binary_logloss: 0.0585736	valid_1's binary_logloss: 0.636301
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[14]	valid_0's binary_logloss: 0.11073	valid_1's binary_logloss: 0.625743
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[17]	valid_0's binary_logloss: 0.0867545	valid_1's binary_logloss: 0.637387
Training until validation scores don't improve for 30 rounds.
[

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[15]	valid_0's binary_logloss: 0.104102	valid_1's binary_logloss: 0.636165
Training until validation scores don't improve for 30 rounds.
[50]	valid_0's binary_logloss: 0.024856	valid_1's binary_logloss: 0.632373
Early stopping, best iteration is:
[46]	valid_0's binary_logloss: 0.0258421	valid_1's binary_logloss: 0.62042
Training until validation scores don't improve for 30 rounds.
[50]	valid_0's binary_logloss: 0.0277602	valid_1's binary_logloss: 0.813544
Early stopping, best iteration is:
[30]	valid_0's binary_logloss: 0.0412693	valid_1's binary_logloss: 0.64136
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[15]	valid_0's binary_logloss: 0.104102	valid_1's binary_logloss: 0.636165
Training until validation scores don't improve for 30 rounds.
[50]	valid_0's binary_logloss: 0.024856	valid_1's binary_logloss: 0.632373
Early stopping, best ite

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0,
        verbose_eval=10),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [5, 7, 9], 'min_child_weight': [1, 3], 'colsample_bytree': [0.5, 0.75], 'n_estimators': [100, 300, 500]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [85]:
print('GridSearchCV best parameter : ', gridcv.best_params_)

GridSearchCV best parameter :  {'colsample_bytree': 0.75, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 100}


In [86]:
thresholds = [0.25,0.3,0.35,0.4,0.45,0.5]
get_eval_by_threshold(urine_y, gridcv.predict_proba(urine_X)[:,1].reshape(-1,1), thresholds)

threshold ;  0.25
accuracy : 0.6835 precision : 0.6711 recall : 1.0000 F1 : 0.8031 AUC : 0.5536
[[ 3 25]
 [ 0 51]]


threshold ;  0.3
accuracy : 0.6962 precision : 0.6800 recall : 1.0000 F1 : 0.8095 AUC : 0.5714
[[ 4 24]
 [ 0 51]]


threshold ;  0.35
accuracy : 0.6582 precision : 0.6714 recall : 0.9216 F1 : 0.7769 AUC : 0.5501
[[ 5 23]
 [ 4 47]]


threshold ;  0.4
accuracy : 0.6835 precision : 0.6970 recall : 0.9020 F1 : 0.7863 AUC : 0.5938
[[ 8 20]
 [ 5 46]]


threshold ;  0.45
accuracy : 0.6076 precision : 0.6667 recall : 0.7843 F1 : 0.7207 AUC : 0.5350
[[ 8 20]
 [11 40]]


threshold ;  0.5
accuracy : 0.5443 precision : 0.6471 recall : 0.6471 F1 : 0.6471 AUC : 0.5021
[[10 18]
 [18 33]]




## standardscale

In [87]:
#속도를 향상시키기 위해 n_estimators = 100으로 감소
lgb = LGBMClassifier( n_jobs = -1, verbose_eval = 10)

params = {
    'max_depth' : [5,7,9],
    'min_child_weight' : [1,3],
    'colsample_bytree' : [0.5,0.75],
    'n_estimators' : [100,300,500]
}

gridcv = GridSearchCV(lgb, param_grid=params)
gridcv.fit(X_standard, y, early_stopping_rounds = 30, eval_metric = 'acc',
                     eval_set = [(X_standard, y), (urine_X_standard, urine_y)], verbose=50)

Training until validation scores don't improve for 30 rounds.
[50]	valid_0's binary_logloss: 0.0284114	valid_1's binary_logloss: 0.765055
Early stopping, best iteration is:
[27]	valid_0's binary_logloss: 0.0496992	valid_1's binary_logloss: 0.593888
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[13]	valid_0's binary_logloss: 0.119025	valid_1's binary_logloss: 0.624558
Training until validation scores don't improve for 30 rounds.
[50]	valid_0's binary_logloss: 0.0245067	valid_1's binary_logloss: 0.650167
Early stopping, best iteration is:
[34]	valid_0's binary_logloss: 0.0367376	valid_1's binary_logloss: 0.622938
Training until validation scores don't improve for 30 rounds.
[50]	valid_0's binary_logloss: 0.0284114	valid_1's binary_logloss: 0.765055
Early stopping, best iteration is:
[27]	valid_0's binary_logloss: 0.0496992	valid_1's binary_logloss: 0.593888
Training until validation scores don't improve for 30 rounds.
Early stopping, bes

Training until validation scores don't improve for 30 rounds.
[50]	valid_0's binary_logloss: 0.0233182	valid_1's binary_logloss: 0.699412
Early stopping, best iteration is:
[25]	valid_0's binary_logloss: 0.0513789	valid_1's binary_logloss: 0.627708
Training until validation scores don't improve for 30 rounds.
[50]	valid_0's binary_logloss: 0.0280538	valid_1's binary_logloss: 0.810185
Early stopping, best iteration is:
[22]	valid_0's binary_logloss: 0.0632796	valid_1's binary_logloss: 0.600367
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[15]	valid_0's binary_logloss: 0.103086	valid_1's binary_logloss: 0.622191
Training until validation scores don't improve for 30 rounds.
[50]	valid_0's binary_logloss: 0.0233182	valid_1's binary_logloss: 0.699412
Early stopping, best iteration is:
[25]	valid_0's binary_logloss: 0.0513789	valid_1's binary_logloss: 0.627708
Training until validation scores don't improve for 30 rounds.
[50]	valid_0's bina

Training until validation scores don't improve for 30 rounds.
[50]	valid_0's binary_logloss: 0.0278124	valid_1's binary_logloss: 0.815424
Early stopping, best iteration is:
[20]	valid_0's binary_logloss: 0.0702028	valid_1's binary_logloss: 0.675189
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[13]	valid_0's binary_logloss: 0.118294	valid_1's binary_logloss: 0.618907
Training until validation scores don't improve for 30 rounds.
[50]	valid_0's binary_logloss: 0.0240698	valid_1's binary_logloss: 0.668566
Early stopping, best iteration is:
[44]	valid_0's binary_logloss: 0.0257899	valid_1's binary_logloss: 0.63854
Training until validation scores don't improve for 30 rounds.
[50]	valid_0's binary_logloss: 0.0278124	valid_1's binary_logloss: 0.815424
Early stopping, best iteration is:
[20]	valid_0's binary_logloss: 0.0702028	valid_1's binary_logloss: 0.675189
Training until validation scores don't improve for 30 rounds.
Early stopping, best

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0,
        verbose_eval=10),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [5, 7, 9], 'min_child_weight': [1, 3], 'colsample_bytree': [0.5, 0.75], 'n_estimators': [100, 300, 500]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [88]:
print('GridSearchCV best parameter : ', gridcv.best_params_)

GridSearchCV best parameter :  {'colsample_bytree': 0.75, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 100}


In [89]:
thresholds = [0.3,0.4,0.5,0.6,0.7]
get_eval_by_threshold(urine_y, gridcv.predict_proba(urine_X_standard)[:,1].reshape(-1,1), thresholds)

threshold ;  0.3
accuracy : 0.6835 precision : 0.6711 recall : 1.0000 F1 : 0.8031 AUC : 0.5536
[[ 3 25]
 [ 0 51]]


threshold ;  0.4
accuracy : 0.6076 precision : 0.6562 recall : 0.8235 F1 : 0.7304 AUC : 0.5189
[[ 6 22]
 [ 9 42]]


threshold ;  0.5
accuracy : 0.6076 precision : 0.6852 recall : 0.7255 F1 : 0.7048 AUC : 0.5592
[[11 17]
 [14 37]]


threshold ;  0.6
accuracy : 0.5443 precision : 0.6596 recall : 0.6078 F1 : 0.6327 AUC : 0.5182
[[12 16]
 [20 31]]


threshold ;  0.7
accuracy : 0.5316 precision : 0.7059 recall : 0.4706 F1 : 0.5647 AUC : 0.5567
[[18 10]
 [27 24]]




## minmaxscale
- 똥

In [94]:
params = {
    'max_depth' : [5,7,9],
    'min_child_weight' : [1,3],
    'colsample_bytree' : [0.5,0.75],
    'n_estimators' : [100,300,500]
}

gridcv = GridSearchCV(lgb, param_grid=params)
gridcv.fit(X_mms, y, early_stopping_rounds = 30, eval_metric = 'acc',
                     eval_set = [(X_mms, y), (urine_X_mms, urine_y)], verbose=100)

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[18]	valid_0's binary_logloss: 0.0823816	valid_1's binary_logloss: 0.63433
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[11]	valid_0's binary_logloss: 0.139827	valid_1's binary_logloss: 0.624891
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[15]	valid_0's binary_logloss: 0.10349	valid_1's binary_logloss: 0.624566
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[18]	valid_0's binary_logloss: 0.0823816	valid_1's binary_logloss: 0.63433
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[11]	valid_0's binary_logloss: 0.139827	valid_1's binary_logloss: 0.624891
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[15]	valid_0's binary_logloss: 0.10349	valid

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[23]	valid_0's binary_logloss: 0.0629243	valid_1's binary_logloss: 0.617253
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[12]	valid_0's binary_logloss: 0.129016	valid_1's binary_logloss: 0.628602
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[15]	valid_0's binary_logloss: 0.101496	valid_1's binary_logloss: 0.621269
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[23]	valid_0's binary_logloss: 0.0629243	valid_1's binary_logloss: 0.617253
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[12]	valid_0's binary_logloss: 0.129016	valid_1's binary_logloss: 0.628602
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[15]	valid_0's binary_logloss: 0.101496	v

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[28]	valid_0's binary_logloss: 0.0457115	valid_1's binary_logloss: 0.599532
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[13]	valid_0's binary_logloss: 0.119033	valid_1's binary_logloss: 0.62996
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[33]	valid_0's binary_logloss: 0.035197	valid_1's binary_logloss: 0.594568
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[25]	valid_0's binary_logloss: 0.0571256	valid_1's binary_logloss: 0.621581
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[12]	valid_0's binary_logloss: 0.129383	valid_1's binary_logloss: 0.620125
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[42]	valid_0's binary_logloss: 0.0323893	v

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0,
        verbose_eval=10),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [5, 7, 9], 'min_child_weight': [1, 3], 'colsample_bytree': [0.5, 0.75], 'n_estimators': [100, 300, 500]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [95]:
print('GridSearchCV best parameter : ', gridcv.best_params_)

GridSearchCV best parameter :  {'colsample_bytree': 0.5, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 100}


In [96]:
thresholds = [0.3,0.4,0.5,0.6,0.7]
get_eval_by_threshold(urine_y, gridcv.predict_proba(urine_X_mms)[:,1].reshape(-1,1), thresholds)

threshold ;  0.3
accuracy : 0.6456 precision : 0.6494 recall : 0.9804 F1 : 0.7812 AUC : 0.5081
[[ 1 27]
 [ 1 50]]


threshold ;  0.4
accuracy : 0.6709 precision : 0.6812 recall : 0.9216 F1 : 0.7833 AUC : 0.5679
[[ 6 22]
 [ 4 47]]


threshold ;  0.5
accuracy : 0.6203 precision : 0.6780 recall : 0.7843 F1 : 0.7273 AUC : 0.5529
[[ 9 19]
 [11 40]]


threshold ;  0.6
accuracy : 0.5949 precision : 0.7021 recall : 0.6471 F1 : 0.6735 AUC : 0.5735
[[14 14]
 [18 33]]


threshold ;  0.7
accuracy : 0.4430 precision : 0.6207 recall : 0.3529 F1 : 0.4500 AUC : 0.4800
[[17 11]
 [33 18]]




## mas

In [97]:
params = {
    'max_depth' : [5,7,9],
    'min_child_weight' : [1,3],
    'colsample_bytree' : [0.5,0.75],
    'n_estimators' : [100,300,500]
}

gridcv = GridSearchCV(lgb, param_grid=params)
gridcv.fit(X_mas, y, early_stopping_rounds = 30, eval_metric = 'acc',
                     eval_set = [(X_mas, y), (urine_X_mas, urine_y)], verbose=100)

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[23]	valid_0's binary_logloss: 0.0609068	valid_1's binary_logloss: 0.627913
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[14]	valid_0's binary_logloss: 0.111846	valid_1's binary_logloss: 0.62775
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[58]	valid_0's binary_logloss: 0.0233345	valid_1's binary_logloss: 0.567097
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[23]	valid_0's binary_logloss: 0.0609068	valid_1's binary_logloss: 0.627913
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[14]	valid_0's binary_logloss: 0.111846	valid_1's binary_logloss: 0.62775
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[58]	valid_0's binary_logloss: 0.0233345	v

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[23]	valid_0's binary_logloss: 0.0624349	valid_1's binary_logloss: 0.61086
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[14]	valid_0's binary_logloss: 0.112411	valid_1's binary_logloss: 0.632925
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[58]	valid_0's binary_logloss: 0.027093	valid_1's binary_logloss: 0.604112
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[23]	valid_0's binary_logloss: 0.0624349	valid_1's binary_logloss: 0.61086
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[14]	valid_0's binary_logloss: 0.112411	valid_1's binary_logloss: 0.632925
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[58]	valid_0's binary_logloss: 0.027093	val

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[30]	valid_0's binary_logloss: 0.0412123	valid_1's binary_logloss: 0.64682
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[15]	valid_0's binary_logloss: 0.103865	valid_1's binary_logloss: 0.636023
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[32]	valid_0's binary_logloss: 0.0361969	valid_1's binary_logloss: 0.64517
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[29]	valid_0's binary_logloss: 0.0472056	valid_1's binary_logloss: 0.64148
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[12]	valid_0's binary_logloss: 0.128847	valid_1's binary_logloss: 0.637633
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[32]	valid_0's binary_logloss: 0.04146	vali

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0,
        verbose_eval=10),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [5, 7, 9], 'min_child_weight': [1, 3], 'colsample_bytree': [0.5, 0.75], 'n_estimators': [100, 300, 500]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [98]:
print('GridSearchCV best parameter : ', gridcv.best_params_)

GridSearchCV best parameter :  {'colsample_bytree': 0.75, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 100}


In [99]:
thresholds = [0.3,0.4,0.5,0.6,0.7]
get_eval_by_threshold(urine_y, gridcv.predict_proba(urine_X_mas)[:,1].reshape(-1,1), thresholds)

threshold ;  0.3
accuracy : 0.6962 precision : 0.6800 recall : 1.0000 F1 : 0.8095 AUC : 0.5714
[[ 4 24]
 [ 0 51]]


threshold ;  0.4
accuracy : 0.6835 precision : 0.6970 recall : 0.9020 F1 : 0.7863 AUC : 0.5938
[[ 8 20]
 [ 5 46]]


threshold ;  0.5
accuracy : 0.5443 precision : 0.6471 recall : 0.6471 F1 : 0.6471 AUC : 0.5021
[[10 18]
 [18 33]]


threshold ;  0.6
accuracy : 0.5190 precision : 0.6667 recall : 0.5098 F1 : 0.5778 AUC : 0.5228
[[15 13]
 [25 26]]


threshold ;  0.7
accuracy : 0.4810 precision : 0.6667 recall : 0.3922 F1 : 0.4938 AUC : 0.5175
[[18 10]
 [31 20]]


