In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# 데이터 불러오기 및 전처리

## TCGA

In [2]:
TCGA = pd.read_csv('TCGA_data.csv')

In [3]:
TCGA.loc[(TCGA.y == 'BLCA'),'y'] = 'BC'
TCGA.loc[(TCGA.y == 'PRAD'),'y'] = 'PC'
TCGA.loc[(TCGA.y == 'KICH'),'y'] = 'RC'
TCGA.loc[(TCGA.y == 'KIRC'),'y'] = 'RC'
TCGA.loc[(TCGA.y == 'KIRP'),'y'] = 'RC'

In [4]:
TCGA = TCGA[~ (TCGA.y== 'RC')]

In [5]:
TCGA.y.value_counts()

Normal    1706
PC         492
BC         408
Name: y, dtype: int64

## K15

In [6]:
BC_15 = pd.read_csv('BC_32ea_k15_cyto.csv')
Normal_15 = pd.read_csv('Normal_21ea_k15_cyto.csv')
PC_15 = pd.read_csv('PC_20ea_k15_cyto.csv')

#drop person id
BC_15.drop('Unnamed: 0', axis = 1, inplace = True)
Normal_15.drop('Unnamed: 0', axis = 1, inplace = True)
PC_15.drop('Unnamed: 0', axis = 1, inplace = True)

BC_15['cancer'] = 'BC'
Normal_15['cancer'] = 'normal'
PC_15['cancer'] = 'PC'

K15 = pd.concat([BC_15,Normal_15,PC_15], axis = 0)
K15.dropna(inplace=True, axis=0)

In [7]:
K15.cancer.value_counts()

BC        32
normal    21
PC        19
Name: cancer, dtype: int64

## K100

## 공통 COLUMN

In [8]:
TCGA2 = TCGA.drop(list(set(TCGA.columns[:-1]) - set(K15.columns)), axis = 1)
K15_2 = K15.drop(list(set(K15.columns[:-1]) - set(TCGA.columns)), axis = 1)

### BC-Normal

#### TCGA data

In [9]:
TCGA_BC_Norm = TCGA2.loc[(TCGA2.y == 'BC') | (TCGA2.y == 'Normal') ]
#reset_index
TCGA_BC_Norm.reset_index(inplace = True, drop=True)
TCGA_BC_Norm['y'].value_counts()

Normal    1706
BC         408
Name: y, dtype: int64

In [10]:
print('BC비율 : ',len(TCGA_BC_Norm[TCGA_BC_Norm.y == 'BC']) / len(TCGA_BC_Norm))
print('Normal비율 : ',len(TCGA_BC_Norm[TCGA_BC_Norm.y == 'Normal']) / len(TCGA_BC_Norm))

BC비율 :  0.19299905392620625
Normal비율 :  0.8070009460737938


#### urine data

In [11]:
K15_2.cancer.value_counts()

BC        32
normal    21
PC        19
Name: cancer, dtype: int64

In [11]:
K15_2.loc[K15_2.cancer == 'normal','cancer'] = 'Normal'

In [12]:
K15_BC_Norm = K15_2.loc[(K15_2.cancer == 'BC') | (K15_2.cancer == 'Normal') ]
#reset_index
K15_BC_Norm.reset_index(inplace = True, drop=True)
K15_BC_Norm['cancer'].value_counts()

BC        32
Normal    21
Name: cancer, dtype: int64

In [13]:
print('BC비율 : ',len(K15_BC_Norm[K15_BC_Norm.cancer == 'BC']) / len(K15_BC_Norm))
print('Normal비율 : ',len(K15_BC_Norm[K15_BC_Norm.cancer == 'Normal']) / len(K15_BC_Norm))

BC비율 :  0.6037735849056604
Normal비율 :  0.39622641509433965


### BC-Others

#### TCGA data

In [76]:
TCGA_BC_Others = TCGA2.copy()
TCGA_BC_Others['y'].replace(['PC','Normal'], 'Others', inplace = True)
TCGA_BC_Others.reset_index(inplace = True, drop=True)

In [77]:
TCGA_BC_Others.y.value_counts()

Others    2198
BC         408
Name: y, dtype: int64

In [16]:
print('BC 비율 : ',len(TCGA_BC_Others[TCGA_BC_Others.y == 'BC']) / len(TCGA_BC_Others))
print('Others 비율 : ',len(TCGA_BC_Others[TCGA_BC_Others.y == 'Others']) / len(TCGA_BC_Others))

BC 비율 :  0.15656178050652342
Others 비율 :  0.8434382194934766


#### urine data

In [17]:
K15_2.cancer.value_counts()

BC        32
Normal    21
PC        19
Name: cancer, dtype: int64

In [79]:
K15_BC_Others = K15_2.copy()
K15_BC_Others['cancer'].replace(['PC','Normal'], 'Others', inplace = True)

In [80]:
K15_BC_Others.cancer.value_counts()

Others    40
BC        32
Name: cancer, dtype: int64

In [81]:
print('BC비율 : ',len(K15_BC_Others[K15_BC_Others.cancer == "BC"]) / len(K15_BC_Others))
print('Others비율 : ',len(K15_BC_Others[K15_BC_Others.cancer == 'Others']) / len(K15_BC_Others))

BC비율 :  0.4444444444444444
Others비율 :  0.5555555555555556


# Modeling

In [21]:
from sklearn.ensemble import RandomForestClassifier
rf1 = RandomForestClassifier()
from sklearn.svm import SVC
svm1 = SVC(gamma='auto')
from sklearn.linear_model import LogisticRegression
lr1 = LogisticRegression()
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()

In [22]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

## BC-Normal

In [23]:
x_train = TCGA_BC_Norm.drop('y', axis = 1)
y_train = TCGA_BC_Norm['y']
x_test = K15_BC_Norm.drop('cancer', axis = 1)
y_test = K15_BC_Norm['cancer']

### randomforest

In [24]:
rf1.fit(x_train, y_train)
rf1_pred = rf1.predict(x_test)
print(classification_report(y_test, rf1_pred, target_names=['BC', 'Normal']))



              precision    recall  f1-score   support

          BC       0.60      1.00      0.75        32
      Normal       0.00      0.00      0.00        21

   micro avg       0.60      0.60      0.60        53
   macro avg       0.30      0.50      0.38        53
weighted avg       0.36      0.60      0.45        53



  'precision', 'predicted', average, warn_for)


In [25]:
confusion_matrix(y_test, rf1_pred)

array([[32,  0],
       [21,  0]], dtype=int64)

### svm

In [26]:
svm1.fit(x_train, y_train)
svm1_pred = svm1.predict(x_test)
print(classification_report(y_test, svm1_pred, target_names=['BC', 'Normal']))

              precision    recall  f1-score   support

          BC       1.00      0.25      0.40        32
      Normal       0.47      1.00      0.64        21

   micro avg       0.55      0.55      0.55        53
   macro avg       0.73      0.62      0.52        53
weighted avg       0.79      0.55      0.49        53



In [27]:
confusion_matrix(y_test, svm1_pred)

array([[ 8, 24],
       [ 0, 21]], dtype=int64)

### logistic regression

In [28]:
lr1.fit(x_train, y_train)
lr1_pred =  lr1.predict(x_test)
print(classification_report(y_test,lr1_pred, target_names=['BC', 'Normal']))



              precision    recall  f1-score   support

          BC       1.00      0.31      0.48        32
      Normal       0.49      1.00      0.66        21

   micro avg       0.58      0.58      0.58        53
   macro avg       0.74      0.66      0.57        53
weighted avg       0.80      0.58      0.55        53



In [29]:
confusion_matrix(y_test, lr1_pred)

array([[10, 22],
       [ 0, 21]], dtype=int64)

### LDA

In [30]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()

In [31]:
lda.fit(x_train, y_train)
lda_pred = lda.predict(x_test)
print(classification_report(y_test, lda_pred, target_names=['BC', 'Normal']))

              precision    recall  f1-score   support

          BC       0.61      0.44      0.51        32
      Normal       0.40      0.57      0.47        21

   micro avg       0.49      0.49      0.49        53
   macro avg       0.50      0.50      0.49        53
weighted avg       0.53      0.49      0.49        53





In [32]:
confusion_matrix(y_test, lda_pred)

array([[14, 18],
       [ 9, 12]], dtype=int64)

### tree

In [33]:
tree.fit(x_train, y_train)
tree_pred = tree.predict(x_test)
print(classification_report(y_test, tree_pred, target_names=['BC', 'Normal']))

              precision    recall  f1-score   support

          BC       0.63      0.75      0.69        32
      Normal       0.47      0.33      0.39        21

   micro avg       0.58      0.58      0.58        53
   macro avg       0.55      0.54      0.54        53
weighted avg       0.57      0.58      0.57        53



In [34]:
confusion_matrix(y_test, tree_pred)

array([[24,  8],
       [14,  7]], dtype=int64)

In [35]:
pd.DataFrame({'y_true' : y_test,
             'rf_pred' : rf1_pred,'svm_pred' : svm1_pred,
              'lr_pred' : lr1_pred, 'lda_pred' : lda_pred, 'tree_pred' : tree_pred})

Unnamed: 0,y_true,rf_pred,svm_pred,lr_pred,lda_pred,tree_pred
0,BC,BC,Normal,Normal,BC,BC
1,BC,BC,Normal,Normal,BC,BC
2,BC,BC,Normal,Normal,BC,BC
3,BC,BC,Normal,Normal,Normal,BC
4,BC,BC,BC,BC,BC,BC
5,BC,BC,Normal,BC,BC,Normal
6,BC,BC,Normal,Normal,BC,Normal
7,BC,BC,Normal,BC,Normal,BC
8,BC,BC,Normal,Normal,Normal,BC
9,BC,BC,Normal,Normal,BC,Normal


### stacking

In [36]:
from sklearn.model_selection import KFold
cv = KFold(5, shuffle=True, random_state=0)

In [39]:
import warnings
warnings.filterwarnings(action='ignore')

In [52]:
#stacking function
def stacking_3_models(train_data, test_data, model1, model2, model3) :
    train_data['M1'] = np.NaN
    train_data['M2'] = np.NaN

    test_data['M1']= np.NaN
    test_data['M2']= np.NaN
    
    #train data
    for i, (idx_train, idx_cv) in enumerate(cv.split(train_data)):
        df_train = train_data.iloc[idx_train]
        df_cv = train_data.iloc[idx_cv]

        x_train = df_train.drop(['y','M1','M2'], axis=1)
        y_train = df_train['y']

        x_cv = df_cv.drop(['y','M1','M2'], axis=1)
        #y_cv = df_cv['y']

        #rf
        model1.fit(x_train, y_train)
        train_data.loc[idx_cv, 'M1'] = model1.predict(x_cv)

        #lr
        model2.fit(x_train, y_train)
        train_data.loc[idx_cv, 'M2'] = model2.predict(x_cv)
        
        
    x_train = train_data.drop(['y','M1','M2'], axis=1)
    y_train = train_data['y']
    x_test = test_data.drop(['cancer','M1','M2'], axis = 1)
    
    model1.fit(x_train, y_train)
    test_data['M1'] = model1.predict(x_test)
    model2.fit(x_train, y_train)
    test_data['M2'] = model2.predict(x_test)
    
    #M1, M2의 BC -> 0, Normal -> 1
    train_data['M1'] = train_data['M1'].astype('category')
    train_data['M1_cat'] = train_data['M1'].cat.codes
    train_data['M2'] = train_data['M2'].astype('category')
    train_data['M2_cat'] = train_data['M2'].cat.codes

    test_data['M1'] = test_data['M1'].astype('category')
    test_data['M1_cat'] = test_data['M1'].cat.codes
    test_data['M2'] = test_data['M2'].astype('category')
    test_data['M2_cat'] = test_data['M2'].cat.codes

    #M1, M2를 숫자로 바꾼 M1_cat, M2_cat을 이용하여 final Model logistic regression 
    x_train = train_data.drop(['y','M1','M2'], axis=1)
    y_train = train_data['y']
    x_test = test_data.drop(['cancer','M1','M2'], axis = 1)
    y_test = test_data['cancer']

    model3.fit(x_train, y_train)
    
    return classification_report(y_test, model3.predict(x_test)), confusion_matrix(y_test, model3.predict(x_test))

In [None]:
#함수를 이용해서 여러가지 모델 조합으로 stacing해볼 수 있을 듯

In [54]:
confus, report = stacking_3_models(TCGA_BC_Norm, K15_BC_Norm, rf1, lr1, lda)
print(confus)
print(report)

              precision    recall  f1-score   support

          BC       0.59      0.41      0.48        32
      Normal       0.39      0.57      0.46        21

   micro avg       0.47      0.47      0.47        53
   macro avg       0.49      0.49      0.47        53
weighted avg       0.51      0.47      0.47        53

[[14 18]
 [11 10]]


In [56]:
confus, report = stacking_3_models(TCGA_BC_Norm, K15_BC_Norm, rf1, lr1, tree)
print(confus)
print(report)

              precision    recall  f1-score   support

          BC       0.56      0.78      0.65        32
      Normal       0.12      0.05      0.07        21

   micro avg       0.49      0.49      0.49        53
   macro avg       0.34      0.41      0.36        53
weighted avg       0.38      0.49      0.42        53

[[26  6]
 [20  1]]


In [92]:
confus, report = stacking_3_models(TCGA_BC_Norm, K15_BC_Norm, svm1, lr1, tree)
print(confus)
print(report)

              precision    recall  f1-score   support

          BC       0.88      0.44      0.58        32
      Normal       0.51      0.90      0.66        21

   micro avg       0.62      0.62      0.62        53
   macro avg       0.69      0.67      0.62        53
weighted avg       0.73      0.62      0.61        53

[[14 18]
 [ 2 19]]


## BC-Others

In [59]:
x_train = TCGA_BC_Others.drop(['target'], axis = 1)
y_train = TCGA_BC_Others['target']
x_test = K15_BC_Others.drop(['target'], axis = 1)
y_test = K15_BC_Others['target']

In [60]:
TCGA_BC_Others.columns

Index(['cyto.1p36.32', 'cyto.1p36.31', 'cyto.1p36.23', 'cyto.1p36.22',
       'cyto.1p36.21', 'cyto.1p36.13', 'cyto.1p36.12', 'cyto.1p36.11',
       'cyto.1p35.3', 'cyto.1p35.2',
       ...
       'cyto.9q32', 'cyto.9q33.1', 'cyto.9q33.2', 'cyto.9q33.3',
       'cyto.9q34.11', 'cyto.9q34.12', 'cyto.9q34.13', 'cyto.9q34.2',
       'cyto.9q34.3', 'target'],
      dtype='object', length=764)

In [61]:
K15_BC_Others.columns

Index(['cyto.10p11.1', 'cyto.10p11.21', 'cyto.10p11.22', 'cyto.10p11.23',
       'cyto.10p12.1', 'cyto.10p12.2', 'cyto.10p12.31', 'cyto.10p12.32',
       'cyto.10p12.33', 'cyto.10p13',
       ...
       'cyto.9q32', 'cyto.9q33.1', 'cyto.9q33.2', 'cyto.9q33.3',
       'cyto.9q34.11', 'cyto.9q34.12', 'cyto.9q34.13', 'cyto.9q34.2',
       'cyto.9q34.3', 'target'],
      dtype='object', length=764)

### randomforest

In [62]:
#랜포 할 때마다 바뀜 값이..
rf1.fit(x_train, y_train)
rf1_pred = rf1.predict(x_test)
print(classification_report(y_test, rf1_pred, target_names=['BC', 'Others']))

              precision    recall  f1-score   support

          BC       0.46      0.91      0.61        32
      Others       0.67      0.15      0.24        40

   micro avg       0.49      0.49      0.49        72
   macro avg       0.56      0.53      0.43        72
weighted avg       0.57      0.49      0.41        72



In [120]:
confusion_matrix(y_test, rf1_pred)

array([[29,  3],
       [33,  7]], dtype=int64)

### svm

In [63]:
svm1.fit(x_train, y_train)
svm1_pred = svm1.predict(x_test)
print(classification_report(y_test, svm1_pred, target_names=['BC', 'Others']))

              precision    recall  f1-score   support

          BC       1.00      0.16      0.27        32
      Others       0.60      1.00      0.75        40

   micro avg       0.62      0.62      0.62        72
   macro avg       0.80      0.58      0.51        72
weighted avg       0.78      0.62      0.54        72



In [122]:
confusion_matrix(y_test, svm1_pred)

array([[ 5, 27],
       [ 0, 40]], dtype=int64)

### logistic regression

In [64]:
lr1.fit(x_train, y_train)
lr1_pred = lr1.predict(x_test)
print(classification_report(y_test, lr1_pred, target_names=['BC', 'Others']))

              precision    recall  f1-score   support

          BC       1.00      0.28      0.44        32
      Others       0.63      1.00      0.78        40

   micro avg       0.68      0.68      0.68        72
   macro avg       0.82      0.64      0.61        72
weighted avg       0.80      0.68      0.63        72



In [124]:
confusion_matrix(y_test, lr1.predict(x_test))

array([[ 9, 23],
       [ 0, 40]], dtype=int64)

### lda

In [65]:
lda.fit(x_train, y_train)
lda_pred = lda.predict(x_test)
print(classification_report(y_test, lda_pred, target_names=['BC', 'Others']))

              precision    recall  f1-score   support

          BC       0.34      0.34      0.34        32
      Others       0.47      0.47      0.48        40

   micro avg       0.42      0.42      0.42        72
   macro avg       0.41      0.41      0.41        72
weighted avg       0.42      0.42      0.42        72



In [126]:
confusion_matrix(y_test,  lda_pred)

array([[11, 21],
       [21, 19]], dtype=int64)

### tree

In [66]:
tree.fit(x_train, y_train)
tree_pred = tree.predict(x_test)
print(classification_report(y_test, tree_pred, target_names=['BC', 'Others']))

              precision    recall  f1-score   support

          BC       0.52      0.75      0.62        32
      Others       0.69      0.45      0.55        40

   micro avg       0.58      0.58      0.58        72
   macro avg       0.61      0.60      0.58        72
weighted avg       0.62      0.58      0.58        72



In [67]:
confusion_matrix(y_test,  tree_pred)

array([[24,  8],
       [22, 18]], dtype=int64)

In [68]:
pd.DataFrame({'y_true' : y_test,
             'rf_pred' : rf1_pred,'svm_pred' : svm1_pred,
              'lr_pred' : lr1_pred, 'lda_pred' : lda_pred, 'tree_pred' : tree_pred})

Unnamed: 0,y_true,rf_pred,svm_pred,lr_pred,lda_pred,tree_pred
0,BC,BC,Others,Others,Others,Others
1,BC,BC,Others,Others,Others,BC
2,BC,BC,Others,Others,Others,BC
3,BC,BC,Others,Others,BC,BC
4,BC,BC,BC,BC,Others,BC
5,BC,BC,Others,Others,Others,BC
6,BC,BC,Others,Others,Others,Others
7,BC,BC,Others,BC,BC,BC
8,BC,BC,Others,Others,Others,Others
9,BC,Others,Others,Others,Others,BC


### stacking

In [45]:
from sklearn.model_selection import KFold
cv = KFold(5, shuffle=True, random_state=0)

In [88]:
report,confus = stacking_3_models(TCGA_BC_Others, K15_BC_Others, rf1, lr1, lda)
print(report)
print(confus)

              precision    recall  f1-score   support

          BC       0.36      0.50      0.42        32
      Others       0.43      0.30      0.35        40

   micro avg       0.39      0.39      0.39        72
   macro avg       0.40      0.40      0.39        72
weighted avg       0.40      0.39      0.38        72

[[16 16]
 [28 12]]


In [89]:
report,confus = stacking_3_models(TCGA_BC_Others, K15_BC_Others, rf1, lr1, tree)
print(report)
print(confus)

              precision    recall  f1-score   support

          BC       0.52      0.44      0.47        32
      Others       0.60      0.68      0.64        40

   micro avg       0.57      0.57      0.57        72
   macro avg       0.56      0.56      0.55        72
weighted avg       0.56      0.57      0.56        72

[[14 18]
 [13 27]]


In [94]:
report,confus = stacking_3_models(TCGA_BC_Others, K15_BC_Others, svm1, lr1, tree)
print(report)
print(confus)

              precision    recall  f1-score   support

          BC       0.56      0.47      0.51        32
      Others       0.62      0.70      0.66        40

   micro avg       0.60      0.60      0.60        72
   macro avg       0.59      0.58      0.58        72
weighted avg       0.59      0.60      0.59        72

[[15 17]
 [12 28]]


In [275]:
pd.DataFrame({'y_true' : y_test, 'tree_stack' : tree.predict(x_test),
            'svm' : svm1_pred, 'lr' : lr1_pred, 'rf': rf1_pred,'lda' : lda_pred})

Unnamed: 0,y_true,tree_stack,svm,lr,rf,lda
0,BC,BC,Others,Others,BC,Others
1,BC,Others,Others,Others,BC,Others
2,BC,BC,Others,Others,BC,Others
3,BC,BC,Others,Others,BC,BC
4,BC,Others,BC,BC,BC,Others
5,BC,Others,Others,Others,BC,Others
6,BC,BC,Others,Others,BC,Others
7,BC,BC,Others,BC,BC,BC
8,BC,BC,Others,Others,BC,Others
9,BC,Others,Others,Others,Others,Others
