In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# 데이터 불러오기 및 전처리

## TCGA

In [22]:
TCGA = pd.read_csv('TCGA_data.csv')

In [23]:
TCGA.loc[(TCGA.y == 'BLCA'),'y'] = 'BC'
TCGA.loc[(TCGA.y == 'PRAD'),'y'] = 'PC'
TCGA.loc[(TCGA.y == 'KICH'),'y'] = 'RC'
TCGA.loc[(TCGA.y == 'KIRC'),'y'] = 'RC'
TCGA.loc[(TCGA.y == 'KIRP'),'y'] = 'RC'

In [24]:
TCGA.y.value_counts()

Normal    1706
RC         882
PC         492
BC         408
Name: y, dtype: int64

## K15

In [None]:
BC_15 = pd.read_csv('BC_32ea_k15_cyto.csv')
Normal_15 = pd.read_csv('Normal_21ea_k15_cyto.csv')
PC_15 = pd.read_csv('PC_20ea_k15_cyto.csv')

#drop person id
BC_15.drop('Unnamed: 0', axis = 1, inplace = True)
Normal_15.drop('Unnamed: 0', axis = 1, inplace = True)
PC_15.drop('Unnamed: 0', axis = 1, inplace = True)

BC_15['cancer'] = 'BC'
Normal_15['cancer'] = 'normal'
PC_15['cancer'] = 'PC'

K15 = pd.concat([BC_15,Normal_15,PC_15], axis = 0)
K15.dropna(inplace=True, axis=0)

## K100

## 공통 COLUMN

In [61]:
TCGA2 = TCGA.drop(list(set(TCGA.columns[:-1]) - set(K15.columns)), axis = 1)
K15_2 = K15.drop(list(set(K15.columns[:-1]) - set(TCGA.columns)), axis = 1)

### BC-Normal

#### TCGA data

In [39]:
TCGA_BC_Norm = TCGA2.loc[(TCGA2.y == 'BC') | (TCGA2.y == 'Normal') ]
#reset_index
TCGA_BC_Norm.reset_index(inplace = True, drop=True)
TCGA_BC_Norm['y'].value_counts()

Normal    1706
BC         408
Name: y, dtype: int64

In [42]:
print('BC비율 : ',len(TCGA_BC_Norm[TCGA_BC_Norm.y == 'BC']) / len(TCGA_BC_Norm))
print('Normal비율 : ',len(TCGA_BC_Norm[TCGA_BC_Norm.y == 'Normal']) / len(TCGA_BC_Norm))

BC비율 :  0.19299905392620625
Normal비율 :  0.8070009460737938


#### urine data

In [62]:
K15_2.cancer.value_counts()

BC        32
normal    21
PC        19
Name: cancer, dtype: int64

In [63]:
K15_2.loc[K15_2.cancer == 'normal','cancer'] = 'Normal'

In [174]:
K15_BC_Norm = K15_2.loc[(K15_2.cancer == 'BC') | (K15_2.cancer == 'Normal') ]
#reset_index
K15_BC_Norm.reset_index(inplace = True, drop=True)
K15_BC_Norm['cancer'].value_counts()

BC        32
Normal    21
Name: cancer, dtype: int64

In [66]:
print('BC비율 : ',len(K15_BC_Norm[K15_BC_Norm.cancer == 'BC']) / len(K15_BC_Norm))
print('Normal비율 : ',len(K15_BC_Norm[K15_BC_Norm.cancer == 'Normal']) / len(K15_BC_Norm))

BC비율 :  0.6037735849056604
Normal비율 :  0.39622641509433965


### BC-Others

#### TCGA data

In [192]:
TCGA_BC_Others = TCGA2.copy()
TCGA_BC_Others['target'] =0
TCGA_BC_Others.loc[TCGA_BC_Others.y == 'BC', 'target'] = 1

In [193]:
TCGA_BC_Others.target.value_counts()

0    3080
1     408
Name: target, dtype: int64

In [197]:
print('BC 비율 : ',len(TCGA_BC_Others[TCGA_BC_Others.target == 1]) / len(TCGA_BC_Others))
print('Others 비율 : ',len(TCGA_BC_Others[TCGA_BC_Others.target == 0]) / len(TCGA_BC_Others))

BC 비율 :  0.11697247706422019
Others 비율 :  0.8830275229357798


#### urine data

In [62]:
K15_2.cancer.value_counts()

BC        32
normal    21
PC        19
Name: cancer, dtype: int64

In [194]:
K15_BC_Others = K15_2.copy()
K15_BC_Others['target'] =0
K15_BC_Others.loc[K15_BC_Others.cancer == 'BC', 'target'] = 1

In [198]:
print('BC비율 : ',len(K15_BC_Others[K15_BC_Others.target == 1]) / len(K15_BC_Others))
print('Others비율 : ',len(K15_BC_Others[K15_BC_Others.target == 0]) / len(K15_BC_Others))

BC비율 :  0.4444444444444444
Others비율 :  0.5555555555555556


# Modeling

In [44]:
from sklearn.ensemble import RandomForestClassifier
rf1 = RandomForestClassifier()
from sklearn.svm import SVC
svm1 = SVC(gamma='auto')
from sklearn.linear_model import LogisticRegression
lr1 = LogisticRegression()
from sklearn.ensemble import VotingClassifier

In [93]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

## BC-Normal

In [88]:
x_train = TCGA_BC_Norm.drop('y', axis = 1)
y_train = TCGA_BC_Norm['y']
x_test = K15_BC_Norm.drop('cancer', axis = 1)
y_test = K15_BC_Norm['cancer']

### randomforest

In [96]:
rf1.fit(x_train, y_train)
print(classification_report(y_test, rf1.predict(x_test), target_names=['BC', 'Normal']))

              precision    recall  f1-score   support

          BC       0.60      0.97      0.74        32
      Normal       0.00      0.00      0.00        21

   micro avg       0.58      0.58      0.58        53
   macro avg       0.30      0.48      0.37        53
weighted avg       0.36      0.58      0.45        53



In [98]:
confusion_matrix(y_test, rf1.predict(x_test))

array([[31,  1],
       [21,  0]], dtype=int64)

### svm

In [90]:
svm1.fit(x_train, y_train)
print(classification_report(y_test, svm1.predict(x_test), target_names=['BC', 'Normal']))

              precision    recall  f1-score   support

          BC       1.00      0.25      0.40        32
      Normal       0.47      1.00      0.64        21

   micro avg       0.55      0.55      0.55        53
   macro avg       0.73      0.62      0.52        53
weighted avg       0.79      0.55      0.49        53



In [99]:
confusion_matrix(y_test, svm1.predict(x_test))

array([[ 8, 24],
       [ 0, 21]], dtype=int64)

### logistic regression

In [91]:
lr1.fit(x_train, y_train)
print(classification_report(y_test, lr1.predict(x_test), target_names=['BC', 'Normal']))



              precision    recall  f1-score   support

          BC       1.00      0.31      0.48        32
      Normal       0.49      1.00      0.66        21

   micro avg       0.58      0.58      0.58        53
   macro avg       0.74      0.66      0.57        53
weighted avg       0.80      0.58      0.55        53



In [100]:
confusion_matrix(y_test, lr1.predict(x_test))

array([[10, 22],
       [ 0, 21]], dtype=int64)

### stacking

In [101]:
pd.DataFrame({'y_true' : y_test,
             'rf_pred' : rf1.predict(x_test),'svm_pred' : svm1.predict(x_test),'lr_pred' : lr1.predict(x_test)})

Unnamed: 0,y_true,rf_pred,svm_pred,lr_pred
0,BC,BC,Normal,Normal
1,BC,BC,Normal,Normal
2,BC,BC,Normal,Normal
3,BC,BC,Normal,Normal
4,BC,BC,BC,BC
5,BC,BC,Normal,BC
6,BC,BC,Normal,Normal
7,BC,BC,Normal,BC
8,BC,Normal,Normal,Normal
9,BC,BC,Normal,Normal


In [103]:
from sklearn.model_selection import KFold
cv = KFold(5, shuffle=True, random_state=0)

In [175]:
TCGA_BC_Norm['M1'] = np.NaN
TCGA_BC_Norm['M2'] = np.NaN

K15_BC_Norm['M1']= np.NaN
K15_BC_Norm['M2']= np.NaN

In [163]:
import warnings
warnings.filterwarnings(action='ignore')

In [176]:
#train data
for i, (idx_train, idx_cv) in enumerate(cv.split(TCGA_BC_Norm)):
    df_train = TCGA_BC_Norm.iloc[idx_train]
    df_cv = TCGA_BC_Norm.iloc[idx_cv]
    
    x_train = df_train.drop(['y','M1','M2'], axis=1)
    y_train = df_train['y']
    
    x_cv = df_cv.drop(['y','M1','M2'], axis=1)
    #y_cv = df_cv['y']
    
    #randomforest
    rf1.fit(x_train, y_train)
    TCGA_BC_Norm.loc[idx_cv, 'M1'] = rf1.predict(x_cv)
    
    #svm
    svm1.fit(x_train, y_train)
    TCGA_BC_Norm.loc[idx_cv, 'M2'] = svm1.predict(x_cv)

In [177]:
#cv accuracy usning train data
print('randomforest accuracy : ',sum(TCGA_BC_Norm['y'] == TCGA_BC_Norm['M1']) / len(TCGA_BC_Norm) )
print('svm accuracy : ',sum(TCGA_BC_Norm['y'] == TCGA_BC_Norm['M2']) / len(TCGA_BC_Norm) )

randomforest accuracy :  0.9815515610217597
svm accuracy :  0.9654683065279092


In [180]:
x_train = TCGA_BC_Norm.drop(['y','M1','M2'], axis=1)
y_train = TCGA_BC_Norm['y']
x_test = K15_BC_Norm.drop(['cancer','M1','M2'], axis = 1)

In [183]:
rf1.fit(x_train, y_train)
K15_BC_Norm['M1'] = rf1.predict(x_test)
svm1.fit(x_train, y_train)
K15_BC_Norm['M2'] = svm1.predict(x_test)

In [184]:
TCGA_BC_Norm['M1'] = TCGA_BC_Norm['M1'].astype('category')
TCGA_BC_Norm['M1_cat'] = TCGA_BC_Norm['M1'].cat.codes
TCGA_BC_Norm['M2'] = TCGA_BC_Norm['M2'].astype('category')
TCGA_BC_Norm['M2_cat'] = TCGA_BC_Norm['M2'].cat.codes

K15_BC_Norm['M1'] = K15_BC_Norm['M1'].astype('category')
K15_BC_Norm['M1_cat'] = K15_BC_Norm['M1'].cat.codes
K15_BC_Norm['M2'] = K15_BC_Norm['M2'].astype('category')
K15_BC_Norm['M2_cat'] = K15_BC_Norm['M2'].cat.codes

In [185]:
#final Model logistic regression using M1, M2
x_train = TCGA_BC_Norm.drop(['y','M1','M2'], axis=1)
y_train = TCGA_BC_Norm['y']
x_test = K15_BC_Norm.drop(['cancer','M1','M2'], axis = 1)
y_test = K15_BC_Norm['cancer']

In [186]:
lr1.fit(x_train, y_train)
print(classification_report(y_test, lr1.predict(x_test)))

              precision    recall  f1-score   support

          BC       1.00      0.34      0.51        32
      Normal       0.50      1.00      0.67        21

   micro avg       0.60      0.60      0.60        53
   macro avg       0.75      0.67      0.59        53
weighted avg       0.80      0.60      0.57        53



In [188]:
confusion_matrix(y_test, lr1.predict(x_test))

array([[11, 21],
       [ 0, 21]], dtype=int64)

In [187]:
pd.DataFrame({'y_true' : y_test, 'y_pred' : lr1.predict(x_test)})

Unnamed: 0,y_true,y_pred
0,BC,Normal
1,BC,BC
2,BC,Normal
3,BC,Normal
4,BC,BC
5,BC,BC
6,BC,Normal
7,BC,BC
8,BC,Normal
9,BC,Normal


## BC-Others

In [88]:
x_train = TCGA_BC_Norm.drop('y', axis = 1)
y_train = TCGA_BC_Norm['y']
x_test = K15_BC_Norm.drop('cancer', axis = 1)
y_test = K15_BC_Norm['cancer']

### randomforest

In [96]:
rf1.fit(x_train, y_train)
print(classification_report(y_test, rf1.predict(x_test), target_names=['BC', 'Normal']))

              precision    recall  f1-score   support

          BC       0.60      0.97      0.74        32
      Normal       0.00      0.00      0.00        21

   micro avg       0.58      0.58      0.58        53
   macro avg       0.30      0.48      0.37        53
weighted avg       0.36      0.58      0.45        53



In [98]:
confusion_matrix(y_test, rf1.predict(x_test))

array([[31,  1],
       [21,  0]], dtype=int64)

### svm

In [90]:
svm1.fit(x_train, y_train)
print(classification_report(y_test, svm1.predict(x_test), target_names=['BC', 'Normal']))

              precision    recall  f1-score   support

          BC       1.00      0.25      0.40        32
      Normal       0.47      1.00      0.64        21

   micro avg       0.55      0.55      0.55        53
   macro avg       0.73      0.62      0.52        53
weighted avg       0.79      0.55      0.49        53



In [99]:
confusion_matrix(y_test, svm1.predict(x_test))

array([[ 8, 24],
       [ 0, 21]], dtype=int64)

### logistic regression

In [91]:
lr1.fit(x_train, y_train)
print(classification_report(y_test, lr1.predict(x_test), target_names=['BC', 'Normal']))



              precision    recall  f1-score   support

          BC       1.00      0.31      0.48        32
      Normal       0.49      1.00      0.66        21

   micro avg       0.58      0.58      0.58        53
   macro avg       0.74      0.66      0.57        53
weighted avg       0.80      0.58      0.55        53



In [100]:
confusion_matrix(y_test, lr1.predict(x_test))

array([[10, 22],
       [ 0, 21]], dtype=int64)

### stacking

In [101]:
pd.DataFrame({'y_true' : y_test,
             'rf_pred' : rf1.predict(x_test),'svm_pred' : svm1.predict(x_test),'lr_pred' : lr1.predict(x_test)})

Unnamed: 0,y_true,rf_pred,svm_pred,lr_pred
0,BC,BC,Normal,Normal
1,BC,BC,Normal,Normal
2,BC,BC,Normal,Normal
3,BC,BC,Normal,Normal
4,BC,BC,BC,BC
5,BC,BC,Normal,BC
6,BC,BC,Normal,Normal
7,BC,BC,Normal,BC
8,BC,Normal,Normal,Normal
9,BC,BC,Normal,Normal


In [103]:
from sklearn.model_selection import KFold
cv = KFold(5, shuffle=True, random_state=0)

In [175]:
TCGA_BC_Norm['M1'] = np.NaN
TCGA_BC_Norm['M2'] = np.NaN

K15_BC_Norm['M1']= np.NaN
K15_BC_Norm['M2']= np.NaN

In [163]:
import warnings
warnings.filterwarnings(action='ignore')

In [176]:
#train data
for i, (idx_train, idx_cv) in enumerate(cv.split(TCGA_BC_Norm)):
    df_train = TCGA_BC_Norm.iloc[idx_train]
    df_cv = TCGA_BC_Norm.iloc[idx_cv]
    
    x_train = df_train.drop(['y','M1','M2'], axis=1)
    y_train = df_train['y']
    
    x_cv = df_cv.drop(['y','M1','M2'], axis=1)
    #y_cv = df_cv['y']
    
    #randomforest
    rf1.fit(x_train, y_train)
    TCGA_BC_Norm.loc[idx_cv, 'M1'] = rf1.predict(x_cv)
    
    #svm
    svm1.fit(x_train, y_train)
    TCGA_BC_Norm.loc[idx_cv, 'M2'] = svm1.predict(x_cv)

In [177]:
#cv accuracy usning train data
print('randomforest accuracy : ',sum(TCGA_BC_Norm['y'] == TCGA_BC_Norm['M1']) / len(TCGA_BC_Norm) )
print('svm accuracy : ',sum(TCGA_BC_Norm['y'] == TCGA_BC_Norm['M2']) / len(TCGA_BC_Norm) )

randomforest accuracy :  0.9815515610217597
svm accuracy :  0.9654683065279092


In [180]:
x_train = TCGA_BC_Norm.drop(['y','M1','M2'], axis=1)
y_train = TCGA_BC_Norm['y']
x_test = K15_BC_Norm.drop(['cancer','M1','M2'], axis = 1)

In [183]:
rf1.fit(x_train, y_train)
K15_BC_Norm['M1'] = rf1.predict(x_test)
svm1.fit(x_train, y_train)
K15_BC_Norm['M2'] = svm1.predict(x_test)

In [184]:
TCGA_BC_Norm['M1'] = TCGA_BC_Norm['M1'].astype('category')
TCGA_BC_Norm['M1_cat'] = TCGA_BC_Norm['M1'].cat.codes
TCGA_BC_Norm['M2'] = TCGA_BC_Norm['M2'].astype('category')
TCGA_BC_Norm['M2_cat'] = TCGA_BC_Norm['M2'].cat.codes

K15_BC_Norm['M1'] = K15_BC_Norm['M1'].astype('category')
K15_BC_Norm['M1_cat'] = K15_BC_Norm['M1'].cat.codes
K15_BC_Norm['M2'] = K15_BC_Norm['M2'].astype('category')
K15_BC_Norm['M2_cat'] = K15_BC_Norm['M2'].cat.codes

In [185]:
#final Model logistic regression using M1, M2
x_train = TCGA_BC_Norm.drop(['y','M1','M2'], axis=1)
y_train = TCGA_BC_Norm['y']
x_test = K15_BC_Norm.drop(['cancer','M1','M2'], axis = 1)
y_test = K15_BC_Norm['cancer']

In [186]:
lr1.fit(x_train, y_train)
print(classification_report(y_test, lr1.predict(x_test)))

              precision    recall  f1-score   support

          BC       1.00      0.34      0.51        32
      Normal       0.50      1.00      0.67        21

   micro avg       0.60      0.60      0.60        53
   macro avg       0.75      0.67      0.59        53
weighted avg       0.80      0.60      0.57        53



In [188]:
confusion_matrix(y_test, lr1.predict(x_test))

array([[11, 21],
       [ 0, 21]], dtype=int64)

In [187]:
pd.DataFrame({'y_true' : y_test, 'y_pred' : lr1.predict(x_test)})

Unnamed: 0,y_true,y_pred
0,BC,Normal
1,BC,BC
2,BC,Normal
3,BC,Normal
4,BC,BC
5,BC,BC
6,BC,Normal
7,BC,BC
8,BC,Normal
9,BC,Normal
