In [1]:
import pandas as pd
import numpy as np

In [2]:
import os
os.listdir('../my/data')

['BC_32ea_k100_cyto.csv',
 'BC_32ea_k15_cyto.csv',
 'K15_common.csv',
 'Normal_21ea_k100_cyto.csv',
 'Normal_21ea_k15_cyto.csv',
 'PC_20ea_k100_cyto.csv',
 'PC_20ea_k15_cyto.csv',
 'RC_20ea_k100_cyto.csv',
 'TCGA_common.csv',
 'TCGA_data.csv']

In [4]:
tcga = pd.read_csv('../my/data/TCGA_common.csv')
urine = pd.read_csv('../my/data/K15_common.csv')

In [5]:
print(tcga.shape)
print(urine.shape)

(2606, 764)
(72, 764)


In [7]:
tcga.y.value_counts()

Normal    1706
PC         492
BC         408
Name: y, dtype: int64

In [8]:
urine.cancer.value_counts()

BC        32
Normal    21
PC        19
Name: cancer, dtype: int64

# BC-Normal

In [9]:
urine_BC_Norm=urine.loc[urine['cancer'].isin(['Normal','BC'])]
tcga_BC_Norm=tcga.loc[tcga['y'].isin(['Normal','BC'])]

In [10]:
print(tcga_BC_Norm.shape)
print(urine_BC_Norm.shape)

(2114, 764)
(53, 764)


In [11]:
X_train = tcga_BC_Norm.drop('y', axis = 1)
y_train = tcga_BC_Norm['y']
X_test = urine_BC_Norm.drop('cancer',axis=1)
y_test= urine_BC_Norm['cancer']

In [12]:
print('X_train shape : , ', X_train.shape, 'y_train : ', y_train.shape)
print('X_test shape : , ', X_test.shape, 'y_test : ', y_test.shape)

X_train shape : ,  (2114, 763) y_train :  (2114,)
X_test shape : ,  (53, 763) y_test :  (53,)


# Scaling
## MaxAbsScaler

In [37]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler,MaxAbsScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

pipeline_ada = Pipeline([('scaler', MaxAbsScaler()), ('classifier', AdaBoostClassifier())])
pipeline_tree = Pipeline([('scaler', MaxAbsScaler()), ('classifier', DecisionTreeClassifier())])

pipeline_ada.fit(X_train, y_train)
pipeline_tree.fit(X_train, y_train)

ada_pred = pipeline_ada.predict(X_test)
tree_pred = pipeline_tree.predict(X_test)

In [38]:
print("adaboost Accuracy",(accuracy_score(y_test, ada_pred))*100,"%")
print("tree Accuracy",(accuracy_score(y_test, tree_pred))*100,"%")

adaboost Accuracy 62.264150943396224 %
tree Accuracy 56.60377358490566 %


In [39]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

def get_eval(y_test, pred) :
    '''
    print('accuracy : {0:.4f} precision : {1:.4f} recall : {2:.4f} F1 : {3:.4f} AUC : {4:.4f}'.format(
                                                        accuracy_score(y_test, pred), 
                                                        precision_score(y_test, pred),
                                                        recall_score(y_test, pred),
                                                        f1_score(y_test,pred),
                                                        roc_auc_score(y_test, pred)
                                                                      ))
    '''
    
    print('accuracy : {0:.4f}'.format(accuracy_score(y_test,pred)))
    print(classification_report(y_test, pred))
    print(confusion_matrix(y_test, pred))

In [40]:
get_eval(y_test, ada_pred)
get_eval(y_test, tree_pred)

accuracy : 0.6226
              precision    recall  f1-score   support

          BC       0.66      0.78      0.71        32
      Normal       0.53      0.38      0.44        21

   micro avg       0.62      0.62      0.62        53
   macro avg       0.60      0.58      0.58        53
weighted avg       0.61      0.62      0.61        53

[[25  7]
 [13  8]]
accuracy : 0.5660
              precision    recall  f1-score   support

          BC       0.62      0.75      0.68        32
      Normal       0.43      0.29      0.34        21

   micro avg       0.57      0.57      0.57        53
   macro avg       0.52      0.52      0.51        53
weighted avg       0.54      0.57      0.54        53

[[24  8]
 [15  6]]


## MinMaxScaler

In [41]:
pipeline_ada = Pipeline([('scaler', MinMaxScaler()), ('classifier', AdaBoostClassifier())])
pipeline_tree = Pipeline([('scaler', MinMaxScaler()), ('classifier', DecisionTreeClassifier())])

pipeline_ada.fit(X_train, y_train)
pipeline_tree.fit(X_train, y_train)

ada_pred = pipeline_ada.predict(X_test)
tree_pred = pipeline_tree.predict(X_test)

In [42]:
get_eval(y_test, ada_pred)
get_eval(y_test, tree_pred)

accuracy : 0.6038
              precision    recall  f1-score   support

          BC       0.63      0.81      0.71        32
      Normal       0.50      0.29      0.36        21

   micro avg       0.60      0.60      0.60        53
   macro avg       0.57      0.55      0.54        53
weighted avg       0.58      0.60      0.57        53

[[26  6]
 [15  6]]
accuracy : 0.5849
              precision    recall  f1-score   support

          BC       0.61      0.88      0.72        32
      Normal       0.43      0.14      0.21        21

   micro avg       0.58      0.58      0.58        53
   macro avg       0.52      0.51      0.47        53
weighted avg       0.54      0.58      0.52        53

[[28  4]
 [18  3]]
