In [1]:
import pandas as pd
import numpy as np
import os
os.listdir('../my/data')

['BC_32ea_k100_cyto.csv',
 'BC_32ea_k15_cyto.csv',
 'K15_common.csv',
 'Normal_21ea_k100_cyto.csv',
 'Normal_21ea_k15_cyto.csv',
 'PC_20ea_k100_cyto.csv',
 'PC_20ea_k15_cyto.csv',
 'RC_20ea_k100_cyto.csv',
 'TCGA_common.csv',
 'TCGA_data.csv']

In [2]:
tcga = pd.read_csv('../my/data/TCGA_common.csv')
urine = pd.read_csv('../my/data/K15_common.csv')

urine_BC_Norm=urine.loc[urine['cancer'].isin(['Normal','BC'])]
tcga_BC_Norm=tcga.loc[tcga['y'].isin(['Normal','BC'])]
X_train = tcga_BC_Norm.drop('y', axis = 1)
y_train = tcga_BC_Norm['y']
X_test = urine_BC_Norm.drop('cancer',axis=1)
y_test= urine_BC_Norm['cancer']

print('X_train shape : , ', X_train.shape, 'y_train : ', y_train.shape)
print('X_test shape : , ', X_test.shape, 'y_test : ', y_test.shape)

X_train shape : ,  (2114, 763) y_train :  (2114,)
X_test shape : ,  (53, 763) y_test :  (53,)


In [3]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

def get_eval(y_test, pred) :
    '''
    print('accuracy : {0:.4f} precision : {1:.4f} recall : {2:.4f} F1 : {3:.4f} AUC : {4:.4f}'.format(
                                                        accuracy_score(y_test, pred), 
                                                        precision_score(y_test, pred),
                                                        recall_score(y_test, pred),
                                                        f1_score(y_test,pred),
                                                        roc_auc_score(y_test, pred)
                                                                      ))
    '''
    
    print('accuracy : {0:.4f}'.format(accuracy_score(y_test,pred)))
    print(classification_report(y_test, pred))
    print(confusion_matrix(y_test, pred))

# scaling

In [4]:
from sklearn.preprocessing import MinMaxScaler,MaxAbsScaler
#MinMaxScaler
X_train_mms = MinMaxScaler().fit_transform(X_train)
X_test_mms = MinMaxScaler().fit_transform(X_test)

#MaxAbsScaler
X_train_mas = MaxAbsScaler().fit_transform(X_train)
X_test_mas = MaxAbsScaler().fit_transform(X_test)

# default tree

In [5]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()

print('###original data result###')
dt.fit(X_train, y_train)
get_eval(y_test, dt.predict(X_test))

print('###MinMaxScaler data result###')
dt.fit(X_train_mms, y_train)
get_eval(y_test, dt.predict(X_test_mms))

print('###MaxAbsScaler data result###')
dt.fit(X_train_mas, y_train)
get_eval(y_test, dt.predict(X_test_mas))

###original data result###
accuracy : 0.5660
              precision    recall  f1-score   support

          BC       0.61      0.78      0.68        32
      Normal       0.42      0.24      0.30        21

   micro avg       0.57      0.57      0.57        53
   macro avg       0.51      0.51      0.49        53
weighted avg       0.53      0.57      0.53        53

[[25  7]
 [16  5]]
###MinMaxScaler data result###
accuracy : 0.5849
              precision    recall  f1-score   support

          BC       0.61      0.84      0.71        32
      Normal       0.44      0.19      0.27        21

   micro avg       0.58      0.58      0.58        53
   macro avg       0.53      0.52      0.49        53
weighted avg       0.55      0.58      0.53        53

[[27  5]
 [17  4]]
###MaxAbsScaler data result###
accuracy : 0.5283
              precision    recall  f1-score   support

          BC       0.59      0.75      0.66        32
      Normal       0.33      0.19      0.24        21

 

In [6]:
dt.get_params()

{'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': False,
 'random_state': None,
 'splitter': 'best'}

In [7]:
from sklearn.model_selection import GridSearchCV

prams = {
    'max_depth' : [6,8,10,12,16,20,24]
}

grid_cv = GridSearchCV(dt, param_grid = prams, scoring = 'accuracy', cv = 3)
grid_cv.fit(X_train_mas, y_train)
print('GridSearchCV best accuracy : {0:.4f}'.format(grid_cv.best_score_))
print('GridSearchCV best hyperparameter', grid_cv.best_params_)

GridSearchCV best accuracy : 0.9697
GridSearchCV best hyperparameter {'max_depth': 10}


In [8]:
cv_result = pd.DataFrame(grid_cv.cv_results_)
cv_result



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.806499,0.009434,0.01716,0.000378,6,{'max_depth': 6},0.963121,0.970213,0.941761,0.958373,0.01209,6,0.995742,0.996451,0.996454,0.996216,0.000335
1,0.796784,0.20359,0.000675,0.000955,8,{'max_depth': 8},0.965957,0.97305,0.943182,0.960738,0.012739,4,0.997161,0.998581,0.997163,0.997635,0.000669
2,0.676488,0.036057,0.003357,0.004748,10,{'max_depth': 10},0.971631,0.967376,0.97017,0.969726,0.001766,1,0.998581,0.99929,0.997872,0.998581,0.000579
3,1.15378,0.044638,0.005042,0.003593,12,{'max_depth': 12},0.961702,0.968794,0.977273,0.969253,0.006364,3,0.99929,1.0,0.999291,0.999527,0.000334
4,1.146623,0.06967,0.006298,0.006227,16,{'max_depth': 16},0.967376,0.974468,0.96733,0.969726,0.003355,1,1.0,1.0,1.0,1.0,0.0
5,0.716045,0.014659,0.009587,0.005595,20,{'max_depth': 20},0.967376,0.961702,0.93892,0.956008,0.012294,7,1.0,1.0,1.0,1.0,0.0
6,1.027154,0.189414,0.002463,0.003484,24,{'max_depth': 24},0.968794,0.965957,0.943182,0.959319,0.011461,5,1.0,1.0,1.0,1.0,0.0


In [9]:
max_depth = [4,6,7,8,10]

for depth in max_depth :
    dt = DecisionTreeClassifier(random_state = 156, max_depth = depth)
    dt.fit(X_train_mas, y_train)
    pred = dt.predict(X_test_mas)
    accuracy = accuracy_score(y_test, pred)
    print('max_depth = {0} accuracy : {1:.4f}'.format(depth, accuracy))

max_depth = 4 accuracy : 0.5472
max_depth = 6 accuracy : 0.5849
max_depth = 7 accuracy : 0.6415
max_depth = 8 accuracy : 0.6226
max_depth = 10 accuracy : 0.4717
