In [40]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# K15

In [68]:
BC_15 = pd.read_csv('BC_32ea_k15_cyto.csv')
Normal_15 = pd.read_csv('Normal_21ea_k15_cyto.csv')
PC_15 = pd.read_csv('PC_20ea_k15_cyto.csv')

#drop person id
BC_15.drop('Unnamed: 0', axis = 1, inplace = True)
Normal_15.drop('Unnamed: 0', axis = 1, inplace = True)
PC_15.drop('Unnamed: 0', axis = 1, inplace = True)

BC_15['cancer'] = 'BC'
Normal_15['cancer'] = 'normal'
PC_15['cancer'] = 'PC'

K15 = pd.concat([BC_15,Normal_15,PC_15], axis = 0)

In [69]:
K15.shape

(73, 770)

In [70]:
K15.dropna(axis = 0).shape

(72, 770)

In [71]:
K15.dropna(inplace=True, axis=0)

## BC - Normal

In [73]:
BC_Norm_15 = K15.loc[(K15.cancer == 'BC') | (K15.cancer == 'normal')]
BC_Norm_15.shape

(53, 770)

In [74]:
#reset_index
BC_Norm_15.reset_index(inplace = True, drop=True)

In [75]:
print('BC비율 : ',len(BC_Norm_15[BC_Norm_15.cancer == 'BC']) / len(BC_Norm_15))
print('Normal비율 : ',len(BC_Norm_15[BC_Norm_15.cancer == 'normal']) / len(BC_Norm_15))

BC비율 :  0.6037735849056604
Normal비율 :  0.39622641509433965


In [48]:
from sklearn.ensemble import RandomForestClassifier
rf1 = RandomForestClassifier()
from sklearn.svm import SVC
svm1 = SVC(gamma='auto')
from sklearn.linear_model import LogisticRegression
lr1 = LogisticRegression()
from sklearn.ensemble import VotingClassifier


In [49]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
cv = KFold(5, shuffle=True, random_state=0)

In [50]:
BC_Norm_15.columns

Index(['id', 'cyto.10p11.1', 'cyto.10p11.21', 'cyto.10p11.22', 'cyto.10p11.23',
       'cyto.10p12.1', 'cyto.10p12.2', 'cyto.10p12.31', 'cyto.10p12.32',
       'cyto.10p12.33',
       ...
       'cyto.9q33.1', 'cyto.9q33.2', 'cyto.9q33.3', 'cyto.9q34.11',
       'cyto.9q34.12', 'cyto.9q34.13', 'cyto.9q34.2', 'cyto.9q34.3', '<NA>',
       'cancer'],
      dtype='object', length=770)

In [76]:
BC_Norm_15_result = pd.DataFrame(columns = ['i','K', 'true', 'lr_predict','svm_predict','rf_predict','ensemble'])
BC_Norm_15_acc = pd.DataFrame(columns = ['K', 'lr_acc','svm_acc','rf_acc'])

In [52]:
import warnings
warnings.filterwarnings(action='ignore')
#warnings.filterwarnings(action='default')

In [77]:
for i, (idx_train, idx_cv) in enumerate(cv.split(BC_Norm_15)):
    df_train = BC_Norm_15.iloc[idx_train]
    df_cv = BC_Norm_15.iloc[idx_cv]
    
    x_train = df_train.drop(['cancer','id'], axis=1)
    y_train = df_train['cancer']
    
    x_cv = df_cv.drop(['cancer','id'], axis=1)
    y_cv = df_cv['cancer']
    
    print('[K =',i,']')
    print('<logistic regression>')
    lr1.fit(x_train, y_train,)
    print("train accurcacy = {}, cv accurcacy = {}".format(accuracy_score(y_train, lr1.predict(x_train)), accuracy_score(y_cv, lr1.predict(x_cv))))
    
    print('<svm>')
    svm1.fit(x_train, y_train,)
    print("train accurcacy = {}, cv accurcacy = {}".format(accuracy_score(y_train, svm1.predict(x_train)), accuracy_score(y_cv, svm1.predict(x_cv))))
    
    print('<rf>')
    rf1.fit(x_train, y_train)
    print("train accurcacy = {}, cv accurcacy = {}".format(accuracy_score(y_train, rf1.predict(x_train)), accuracy_score(y_cv, rf1.predict(x_cv))))
    
    
    print('<ensemble>')
    model = VotingClassifier(estimators=[('lr', lr1), ('svm', svm1), ('rf',rf1)], voting = 'hard')
    model.fit(x_train, y_train)
    print("train accurcacy = {}, cv accurcacy = {}".format(accuracy_score(y_train, model.predict(x_train)), accuracy_score(y_cv, model.predict(x_cv))))
    
    BC_Norm_15_result = BC_Norm_15_result.append(pd.DataFrame(data = {'i':idx_cv,'K': [i]*len(y_cv), 'true': y_cv, 
                                               'lr_predict' : lr1.predict(x_cv),'svm_predict' : svm1.predict(x_cv), 'rf_predict' : rf1.predict(x_cv),
                                                               'ensemble' : model.predict(x_cv)}))
    BC_Norm_15_acc = BC_Norm_15_acc.append(pd.DataFrame({'K' : [i], 'lr_acc' : accuracy_score(y_cv, lr1.predict(x_cv)),
                                                   'svm_acc' : accuracy_score(y_cv, svm1.predict(x_cv)), 'rf_acc' : accuracy_score(y_cv, rf1.predict(x_cv))}
    ))
    print('\n')

[K = 0 ]
<logistic regression>
train accurcacy = 1.0, cv accurcacy = 0.7272727272727273
<svm>
train accurcacy = 0.6190476190476191, cv accurcacy = 0.5454545454545454
<rf>
train accurcacy = 1.0, cv accurcacy = 0.7272727272727273
<ensemble>
train accurcacy = 1.0, cv accurcacy = 0.7272727272727273


[K = 1 ]
<logistic regression>
train accurcacy = 0.9761904761904762, cv accurcacy = 0.5454545454545454
<svm>
train accurcacy = 0.5952380952380952, cv accurcacy = 0.6363636363636364
<rf>
train accurcacy = 0.9523809523809523, cv accurcacy = 0.5454545454545454
<ensemble>
train accurcacy = 0.9523809523809523, cv accurcacy = 0.6363636363636364


[K = 2 ]
<logistic regression>
train accurcacy = 1.0, cv accurcacy = 0.7272727272727273
<svm>
train accurcacy = 0.5952380952380952, cv accurcacy = 0.6363636363636364
<rf>
train accurcacy = 0.9285714285714286, cv accurcacy = 0.6363636363636364
<ensemble>
train accurcacy = 1.0, cv accurcacy = 0.7272727272727273


[K = 3 ]
<logistic regression>
train accurcacy

### ensemble

In [54]:
BC_Norm_15_result.reset_index(drop=True, inplace=True)

In [55]:
BC_Norm_15_result['manual_ensemble'] = np.NAN
for i in range(len(BC_Norm_15_result)) :
    BC_Norm_15_result['manual_ensemble'][i] = BC_Norm_15_result[['lr_predict','svm_predict','rf_predict']].iloc[i].value_counts().index[0]

In [56]:
BC_Norm_15_result

Unnamed: 0,i,K,true,lr_predict,svm_predict,rf_predict,ensemble,manual_ensemble
0,2,0,BC,BC,BC,BC,BC,BC
1,4,0,BC,BC,BC,BC,BC,BC
2,11,0,BC,BC,BC,BC,BC,BC
3,26,0,BC,normal,BC,normal,BC,normal
4,29,0,BC,BC,BC,normal,BC,BC
5,31,0,BC,BC,BC,normal,BC,BC
6,32,0,normal,normal,BC,normal,BC,normal
7,33,0,normal,BC,BC,BC,BC,BC
8,38,0,normal,BC,BC,normal,BC,BC
9,41,0,normal,normal,BC,normal,normal,normal


In [59]:
for i in range(5) :
    print('[K = {}] accuracy : {}'.format(i, 
    sum(BC_Norm_15_result.loc[BC_Norm_15_result.K == i, 'true'] == BC_Norm_15_result.loc[BC_Norm_15_result.K == i, 'manual_ensemble']) / sum(BC_Norm_15_result.K == i)))
    

[K = 0] accuracy : 0.6363636363636364
[K = 1] accuracy : 0.5454545454545454
[K = 2] accuracy : 0.7272727272727273
[K = 3] accuracy : 0.5
[K = 4] accuracy : 0.5


In [60]:
BC_Norm_15_acc

Unnamed: 0,K,lr_acc,svm_acc,rf_acc
0,0,0.727273,0.545455,0.545455
0,1,0.545455,0.636364,0.636364
0,2,0.727273,0.636364,0.727273
0,3,0.3,0.7,0.7
0,4,0.6,0.5,0.4


In [61]:
BC_Norm_15_acc.mean()

K          2.000000
lr_acc     0.580000
svm_acc    0.603636
rf_acc     0.601818
dtype: float64

# BC-others

In [80]:
BC_others_15 = K15.copy()

In [81]:
BC_others_15.columns

Index(['id', 'cyto.10p11.1', 'cyto.10p11.21', 'cyto.10p11.22', 'cyto.10p11.23',
       'cyto.10p12.1', 'cyto.10p12.2', 'cyto.10p12.31', 'cyto.10p12.32',
       'cyto.10p12.33',
       ...
       'cyto.9q33.1', 'cyto.9q33.2', 'cyto.9q33.3', 'cyto.9q34.11',
       'cyto.9q34.12', 'cyto.9q34.13', 'cyto.9q34.2', 'cyto.9q34.3', '<NA>',
       'cancer'],
      dtype='object', length=770)

In [82]:
BC_others_15['target'] = 0
BC_others_15.loc[BC_others_15.cancer == 'BC', 'target'] = 1

In [83]:
BC_others_15.columns

Index(['id', 'cyto.10p11.1', 'cyto.10p11.21', 'cyto.10p11.22', 'cyto.10p11.23',
       'cyto.10p12.1', 'cyto.10p12.2', 'cyto.10p12.31', 'cyto.10p12.32',
       'cyto.10p12.33',
       ...
       'cyto.9q33.2', 'cyto.9q33.3', 'cyto.9q34.11', 'cyto.9q34.12',
       'cyto.9q34.13', 'cyto.9q34.2', 'cyto.9q34.3', '<NA>', 'cancer',
       'target'],
      dtype='object', length=771)

In [84]:
BC_others_15_result = pd.DataFrame(columns = ['i','K', 'true', 'lr_predict','svm_predict','rf_predict','ensemble'])
BC_others_15_acc = pd.DataFrame(columns = ['K', 'lr_acc','svm_acc','rf_acc'])

In [86]:
for i, (idx_train, idx_cv) in enumerate(cv.split(BC_others_15)):
    df_train = BC_others_15.iloc[idx_train]
    df_cv = BC_others_15.iloc[idx_cv]
    
    x_train = df_train.drop(['cancer','id'], axis=1)
    y_train = df_train['cancer']
    
    x_cv = df_cv.drop(['cancer','id'], axis=1)
    y_cv = df_cv['cancer']
    
    print('[K =',i,']')
    print('<logistic regression>')
    lr1.fit(x_train, y_train,)
    print("train accurcacy = {}, cv accurcacy = {}".format(accuracy_score(y_train, lr1.predict(x_train)), accuracy_score(y_cv, lr1.predict(x_cv))))
    
    print('<svm>')
    svm1.fit(x_train, y_train,)
    print("train accurcacy = {}, cv accurcacy = {}".format(accuracy_score(y_train, svm1.predict(x_train)), accuracy_score(y_cv, svm1.predict(x_cv))))
    
    print('<rf>')
    rf1.fit(x_train, y_train)
    print("train accurcacy = {}, cv accurcacy = {}".format(accuracy_score(y_train, rf1.predict(x_train)), accuracy_score(y_cv, rf1.predict(x_cv))))
    
    
    print('<ensemble>')
    model = VotingClassifier(estimators=[('lr', lr1), ('svm', svm1), ('rf',rf1)], voting = 'hard')
    model.fit(x_train, y_train)
    print("train accurcacy = {}, cv accurcacy = {}".format(accuracy_score(y_train, model.predict(x_train)), accuracy_score(y_cv, model.predict(x_cv))))
    
    BC_others_15_result = BC_others_15_result.append(pd.DataFrame(data = {'i':idx_cv,'K': [i]*len(y_cv), 'true': y_cv, 
                                               'lr_predict' : lr1.predict(x_cv),'svm_predict' : svm1.predict(x_cv), 'rf_predict' : rf1.predict(x_cv),
                                                               'ensemble' : model.predict(x_cv)}))
    BC_others_15_acc = BC_others_15_acc.append(pd.DataFrame({'K' : [i], 'lr_acc' : accuracy_score(y_cv, lr1.predict(x_cv)),
                                                   'svm_acc' : accuracy_score(y_cv, svm1.predict(x_cv)), 'rf_acc' : accuracy_score(y_cv, rf1.predict(x_cv))}
    ))
    print('\n')

[K = 0 ]
<logistic regression>
train accurcacy = 1.0, cv accurcacy = 0.8666666666666667
<svm>
train accurcacy = 0.43859649122807015, cv accurcacy = 0.4666666666666667
<rf>
train accurcacy = 0.9824561403508771, cv accurcacy = 0.4
<ensemble>
train accurcacy = 1.0, cv accurcacy = 0.6666666666666666


[K = 1 ]
<logistic regression>
train accurcacy = 1.0, cv accurcacy = 0.6666666666666666
<svm>
train accurcacy = 0.47368421052631576, cv accurcacy = 0.3333333333333333
<rf>
train accurcacy = 1.0, cv accurcacy = 0.6
<ensemble>
train accurcacy = 0.9824561403508771, cv accurcacy = 0.5333333333333333


[K = 2 ]
<logistic regression>
train accurcacy = 1.0, cv accurcacy = 0.9285714285714286
<svm>
train accurcacy = 0.43103448275862066, cv accurcacy = 0.5
<rf>
train accurcacy = 1.0, cv accurcacy = 0.7142857142857143
<ensemble>
train accurcacy = 1.0, cv accurcacy = 0.6428571428571429


[K = 3 ]
<logistic regression>
train accurcacy = 1.0, cv accurcacy = 0.7142857142857143
<svm>
train accurcacy = 0.4310

### ensemble

In [87]:
BC_others_15_result.reset_index(drop=True, inplace=True)

In [88]:
BC_others_15_result['manual_ensemble'] = np.NAN
for i in range(len(BC_others_15_result)) :
    BC_others_15_result['manual_ensemble'][i] = BC_others_15_result[['lr_predict','svm_predict','rf_predict']].iloc[i].value_counts().index[0]

In [89]:
BC_others_15_result

Unnamed: 0,i,K,true,lr_predict,svm_predict,rf_predict,ensemble,manual_ensemble
0,7,0,BC,BC,BC,BC,BC,BC
1,22,0,BC,BC,BC,PC,BC,BC
2,26,0,BC,BC,BC,normal,BC,BC
3,27,0,BC,BC,BC,normal,BC,BC
4,28,0,BC,BC,BC,normal,BC,BC
5,30,0,BC,BC,BC,BC,BC,BC
6,31,0,BC,BC,BC,PC,BC,BC
7,33,0,normal,normal,BC,PC,BC,PC
8,34,0,normal,PC,BC,BC,PC,BC
9,42,0,normal,normal,BC,normal,BC,normal


In [95]:
for i in range(5) :
    k_len = sum(BC_others_15_result.K == i)
    k_true = BC_others_15_result.loc[BC_others_15_result.K == i, 'true']
    print('[K = {}]'.format(i)) 
    print('accuracy lr : {} svm : {} rf : {} ensemble : {} manual : {}'.
          format(
    sum(k_true == BC_others_15_result.loc[BC_others_15_result.K == i, 'lr_predict']) / k_len,
    sum(k_true == BC_others_15_result.loc[BC_others_15_result.K == i, 'svm_predict']) / k_len,
    sum(k_true == BC_others_15_result.loc[BC_others_15_result.K == i, 'rf_predict']) / k_len,
    sum(k_true == BC_others_15_result.loc[BC_others_15_result.K == i, 'ensemble']) / k_len,
    sum(k_true == BC_others_15_result.loc[BC_others_15_result.K == i, 'manual_ensemble']) / k_len
    ))

[K = 0]
accuracy lr : 0.8666666666666667 svm : 0.4666666666666667 rf : 0.4 ensemble : 0.6666666666666666 manual : 0.7333333333333333
[K = 1]
accuracy lr : 0.6666666666666666 svm : 0.3333333333333333 rf : 0.6 ensemble : 0.5333333333333333 manual : 0.5333333333333333
[K = 2]
accuracy lr : 0.9285714285714286 svm : 0.5 rf : 0.7142857142857143 ensemble : 0.6428571428571429 manual : 0.7857142857142857
[K = 3]
accuracy lr : 0.7142857142857143 svm : 0.5 rf : 0.5714285714285714 ensemble : 0.5 manual : 0.5714285714285714
[K = 4]
accuracy lr : 0.7857142857142857 svm : 0.42857142857142855 rf : 0.35714285714285715 ensemble : 0.7142857142857143 manual : 0.7142857142857143


In [96]:
BC_others_15_acc

Unnamed: 0,K,lr_acc,svm_acc,rf_acc
0,0,0.866667,0.466667,0.4
0,1,0.666667,0.333333,0.6
0,2,0.928571,0.5,0.714286
0,3,0.714286,0.5,0.571429
0,4,0.785714,0.428571,0.357143


In [97]:
BC_others_15_acc.mean()

K          2.000000
lr_acc     0.792381
svm_acc    0.445714
rf_acc     0.528571
dtype: float64

# K100

In [3]:
BC_100 = pd.read_csv('BC_32ea_k100_cyto.csv')
Normal_100 = pd.read_csv('Normal_21ea_k100_cyto.csv')
PC_100 = pd.read_csv('PC_20ea_k100_cyto.csv')

#drop person id
BC_100.drop('Unnamed: 0', axis = 1, inplace = True)
Normal_100.drop('Unnamed: 0', axis = 1, inplace = True)
PC_100.drop('Unnamed: 0', axis = 1, inplace = True)

BC_100['cancer'] = 'BC'
Normal_100['cancer'] = 'normal'
PC_100['cancer'] = 'PC'

K100 = pd.concat([BC_100,Normal_100,PC_100], axis = 0)

In [4]:
K100.shape

(73, 767)

In [6]:
K100.dropna(axis = 0).shape

(72, 767)

In [7]:
K100.dropna(axis = 0, inplace=True)

In [8]:
BC_Norm_100 = K100.loc[(K100.cancer == 'BC') | (K100.cancer == 'Norm_100al')]
BC_Norm_100.shape

(53, 767)

In [9]:
#reset_index
BC_Norm_100.reset_index(inplace = True, drop=True)

In [10]:
print('BC비율 : ',len(BC_Norm_100[BC_Norm_100.cancer == 'BC']) / len(BC_Norm_100))
print('Norm_100al비율 : ',len(BC_Norm_100[BC_Norm_100.cancer == 'Norm_100al']) / len(BC_Norm_100))

BC비율 :  0.6037735849056604
Normal비율 :  0.39622641509433965


In [11]:
from sklearn.ensemble import RandomForestClassifier
rf1 = RandomForestClassifier()
from sklearn.svm import SVC
svm1 = SVC(gamma='auto')
from sklearn.linear_model import LogisticRegression
lr1 = LogisticRegression()
from sklearn.ensemble import VotingClassifier


In [12]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
cv = KFold(5, shuffle=True, random_state=0)

## BC-Norm_100al

In [13]:
BC_Norm_100.columns

Index(['id', 'cyto.10p11.1', 'cyto.10p11.21', 'cyto.10p11.22', 'cyto.10p11.23',
       'cyto.10p12.1', 'cyto.10p12.2', 'cyto.10p12.31', 'cyto.10p12.32',
       'cyto.10p12.33',
       ...
       'cyto.9q33.1', 'cyto.9q33.2', 'cyto.9q33.3', 'cyto.9q34.11',
       'cyto.9q34.12', 'cyto.9q34.13', 'cyto.9q34.2', 'cyto.9q34.3', '<NA>',
       'cancer'],
      dtype='object', length=767)

In [14]:
BC_Norm_100_result = pd.DataFrame(columns = ['i','K', 'true', 'lr_predict','svm_predict','rf_predict','ensemble'])
BC_Norm_100_acc = pd.DataFrame(columns = ['K', 'lr_acc','svm_acc','rf_acc'])

In [15]:
import warnings
warnings.filterwarnings(action='ignore')
#warnings.filterwarnings(action='default')

In [16]:
for i, (idx_train, idx_cv) in enumerate(cv.split(BC_Norm_100)):
    df_train = BC_Norm_100.iloc[idx_train]
    df_cv = BC_Norm_100.iloc[idx_cv]
    
    x_train = df_train.drop(['cancer','id'], axis=1)
    y_train = df_train['cancer']
    
    x_cv = df_cv.drop(['cancer','id'], axis=1)
    y_cv = df_cv['cancer']
    
    print('[K =',i,']')
    print('<logistic regression>')
    lr1.fit(x_train, y_train,)
    print("train accurcacy = {}, cv accurcacy = {}".format(accuracy_score(y_train, lr1.predict(x_train)), accuracy_score(y_cv, lr1.predict(x_cv))))
    
    print('<svm>')
    svm1.fit(x_train, y_train,)
    print("train accurcacy = {}, cv accurcacy = {}".format(accuracy_score(y_train, svm1.predict(x_train)), accuracy_score(y_cv, svm1.predict(x_cv))))
    
    print('<rf>')
    rf1.fit(x_train, y_train)
    print("train accurcacy = {}, cv accurcacy = {}".format(accuracy_score(y_train, rf1.predict(x_train)), accuracy_score(y_cv, rf1.predict(x_cv))))
    
    
    print('<ensemble>')
    model = VotingClassifier(estimators=[('lr', lr1), ('svm', svm1), ('rf',rf1)], voting = 'hard')
    model.fit(x_train, y_train)
    print("train accurcacy = {}, cv accurcacy = {}".format(accuracy_score(y_train, model.predict(x_train)), accuracy_score(y_cv, model.predict(x_cv))))
    
    BC_Norm_100_result = BC_Norm_100_result.append(pd.DataFrame(data = {'i':idx_cv,'K': [i]*len(y_cv), 'true': y_cv, 
                                               'lr_predict' : lr1.predict(x_cv),'svm_predict' : svm1.predict(x_cv), 'rf_predict' : rf1.predict(x_cv),
                                                               'ensemble' : model.predict(x_cv)}))
    BC_Norm_100_acc = BC_Norm_100_acc.append(pd.DataFrame({'K' : [i], 'lr_acc' : accuracy_score(y_cv, lr1.predict(x_cv)),
                                                   'svm_acc' : accuracy_score(y_cv, svm1.predict(x_cv)), 'rf_acc' : accuracy_score(y_cv, rf1.predict(x_cv))}
    ))
    print('\n')

[K = 0 ]
<logistic regression>
train accurcacy = 1.0, cv accurcacy = 0.7272727272727273
<svm>
train accurcacy = 0.6190476190476191, cv accurcacy = 0.5454545454545454
<rf>
train accurcacy = 1.0, cv accurcacy = 0.45454545454545453
<ensemble>
train accurcacy = 1.0, cv accurcacy = 0.5454545454545454


[K = 1 ]
<logistic regression>
train accurcacy = 0.9761904761904762, cv accurcacy = 0.7272727272727273
<svm>
train accurcacy = 0.5952380952380952, cv accurcacy = 0.6363636363636364
<rf>
train accurcacy = 1.0, cv accurcacy = 0.8181818181818182
<ensemble>
train accurcacy = 1.0, cv accurcacy = 0.6363636363636364


[K = 2 ]
<logistic regression>
train accurcacy = 0.9761904761904762, cv accurcacy = 0.7272727272727273
<svm>
train accurcacy = 0.5952380952380952, cv accurcacy = 0.6363636363636364
<rf>
train accurcacy = 1.0, cv accurcacy = 0.7272727272727273
<ensemble>
train accurcacy = 0.9761904761904762, cv accurcacy = 0.7272727272727273


[K = 3 ]
<logistic regression>
train accurcacy = 0.976744186

### ensemble

In [17]:
BC_Norm_100_result.reset_index(drop=True, inplace=True)

In [18]:
BC_Norm_100_result['manual_ensemble'] = np.NAN
for i in range(len(BC_Norm_100_result)) :
    BC_Norm_100_result['manual_ensemble'][i] = BC_Norm_100_result[['lr_predict','svm_predict','rf_predict']].iloc[i].value_counts().index[0]

In [19]:
BC_Norm_100_result

Unnamed: 0,i,K,true,lr_predict,svm_predict,rf_predict,ensemble,manual_ensemble
0,2,0,BC,BC,BC,BC,BC,BC
1,4,0,BC,BC,BC,BC,BC,BC
2,11,0,BC,BC,BC,BC,BC,BC
3,26,0,BC,normal,BC,BC,normal,BC
4,29,0,BC,BC,BC,normal,BC,BC
5,31,0,BC,BC,BC,BC,BC,BC
6,32,0,normal,normal,BC,BC,BC,BC
7,33,0,normal,BC,BC,BC,BC,BC
8,38,0,normal,BC,BC,BC,BC,BC
9,41,0,normal,normal,BC,BC,BC,BC


In [21]:
for i in range(5) :
    print('[K = {}] accuracy : {}'.format(i, 
    sum(BC_Norm_100_result.loc[BC_Norm_100_result.K == i, 'true'] == BC_Norm_100_result.loc[BC_Norm_100_result.K == i, 'manual_ensemble']) / sum(BC_Norm_100_result.K == i)))
    

[K = 0] accuracy : 0.5454545454545454
[K = 1] accuracy : 0.7272727272727273
[K = 2] accuracy : 0.7272727272727273
[K = 3] accuracy : 0.5
[K = 4] accuracy : 0.7


In [22]:
BC_Norm_100_acc

Unnamed: 0,K,lr_acc,svm_acc,rf_acc
0,0,0.727273,0.545455,0.454545
0,1,0.727273,0.636364,0.818182
0,2,0.727273,0.636364,0.727273
0,3,0.5,0.7,0.5
0,4,0.7,0.5,0.5


In [272]:
BC_Norm_100_acc.mean()

K          2.000000
lr_acc     0.580000
svm_acc    0.603636
rf_acc     0.734545
dtype: float64

# BC-others

In [23]:
BC_others_100 = K100.copy()

In [24]:
BC_others_100.columns

Index(['id', 'cyto.10p11.1', 'cyto.10p11.21', 'cyto.10p11.22', 'cyto.10p11.23',
       'cyto.10p12.1', 'cyto.10p12.2', 'cyto.10p12.31', 'cyto.10p12.32',
       'cyto.10p12.33',
       ...
       'cyto.9q33.1', 'cyto.9q33.2', 'cyto.9q33.3', 'cyto.9q34.11',
       'cyto.9q34.12', 'cyto.9q34.13', 'cyto.9q34.2', 'cyto.9q34.3', '<NA>',
       'cancer'],
      dtype='object', length=767)

In [25]:
BC_others_100['target'] = 0
BC_others_100.loc[BC_others_100.cancer == 'BC', 'target'] = 1

In [26]:
BC_others_100.columns

Index(['id', 'cyto.10p11.1', 'cyto.10p11.21', 'cyto.10p11.22', 'cyto.10p11.23',
       'cyto.10p12.1', 'cyto.10p12.2', 'cyto.10p12.31', 'cyto.10p12.32',
       'cyto.10p12.33',
       ...
       'cyto.9q33.2', 'cyto.9q33.3', 'cyto.9q34.11', 'cyto.9q34.12',
       'cyto.9q34.13', 'cyto.9q34.2', 'cyto.9q34.3', '<NA>', 'cancer',
       'target'],
      dtype='object', length=768)

In [28]:
BC_others_100_result = pd.DataFrame(columns = ['i','K', 'true', 'lr_predict','svm_predict','rf_predict','ensemble'])
BC_others_100_acc = pd.DataFrame(columns = ['K', 'lr_acc','svm_acc','rf_acc'])

In [29]:
for i, (idx_train, idx_cv) in enumerate(cv.split(BC_others_100)):
    df_train = BC_others_100.iloc[idx_train]
    df_cv = BC_others_100.iloc[idx_cv]
    
    x_train = df_train.drop(['cancer','id','<NA>','target'], axis=1)
    y_train = df_train['target']
    
    x_cv = df_cv.drop(['cancer','id','<NA>','target'], axis=1)
    y_cv = df_cv['target']
    
    print('[K =',i,']')
    print('<logistic regression>')
    lr1.fit(x_train, y_train,)
    print("train accurcacy = {}, cv accurcacy = {}".format(accuracy_score(y_train, lr1.predict(x_train)), accuracy_score(y_cv, lr1.predict(x_cv))))
    
    print('<svm>')
    svm1.fit(x_train, y_train,)
    print("train accurcacy = {}, cv accurcacy = {}".format(accuracy_score(y_train, svm1.predict(x_train)), accuracy_score(y_cv, svm1.predict(x_cv))))
    
    print('<rf>')
    rf1.fit(x_train, y_train)
    print("train accurcacy = {}, cv accurcacy = {}".format(accuracy_score(y_train, rf1.predict(x_train)), accuracy_score(y_cv, rf1.predict(x_cv))))
    

    print('<ensemble>')
    model = VotingClassifier(estimators=[('lr', lr1), ('svm', svm1), ('rf',rf1)], voting = 'hard')
    model.fit(x_train, y_train)
    print("train accurcacy = {}, cv accurcacy = {}".format(accuracy_score(y_train, model.predict(x_train)), accuracy_score(y_cv, model.predict(x_cv))))
    
    BC_others_100_result = BC_others_100_result.append(pd.DataFrame(data = {'i':idx_cv,'K': [i]*len(y_cv), 'true': y_cv, 
                                               'lr_predict' : lr1.predict(x_cv),'svm_predict' : svm1.predict(x_cv), 'rf_predict' : rf1.predict(x_cv),
                                                               'ensemble' : model.predict(x_cv)}))
    BC_others_100_acc = BC_others_100_acc.append(pd.DataFrame({'K' : [i], 'lr_acc' : accuracy_score(y_cv, lr1.predict(x_cv)),
                                                   'svm_acc' : accuracy_score(y_cv, svm1.predict(x_cv)), 'rf_acc' : accuracy_score(y_cv, rf1.predict(x_cv))}
    ))

    print('\n')

[K = 0 ]
<logistic regression>
train accurcacy = 0.9298245614035088, cv accurcacy = 0.6
<svm>
train accurcacy = 0.5614035087719298, cv accurcacy = 0.5333333333333333
<rf>
train accurcacy = 0.9824561403508771, cv accurcacy = 0.6666666666666666
<ensemble>
train accurcacy = 0.9298245614035088, cv accurcacy = 0.6


[K = 1 ]
<logistic regression>
train accurcacy = 0.9473684210526315, cv accurcacy = 0.8
<svm>
train accurcacy = 0.5263157894736842, cv accurcacy = 0.6666666666666666
<rf>
train accurcacy = 0.9473684210526315, cv accurcacy = 0.8
<ensemble>
train accurcacy = 0.9473684210526315, cv accurcacy = 0.8666666666666667


[K = 2 ]
<logistic regression>
train accurcacy = 0.9137931034482759, cv accurcacy = 0.7857142857142857
<svm>
train accurcacy = 0.5689655172413793, cv accurcacy = 0.5
<rf>
train accurcacy = 0.9655172413793104, cv accurcacy = 0.8571428571428571
<ensemble>
train accurcacy = 0.9137931034482759, cv accurcacy = 0.7857142857142857


[K = 3 ]
<logistic regression>
train accurcacy

### ensemble

In [30]:
BC_others_100_result.reset_index(drop=True, inplace=True)

In [31]:
BC_others_100_result['manual_ensemble'] = np.NAN
for i in range(len(BC_others_100_result)) :
    BC_others_100_result['manual_ensemble'][i] = BC_others_100_result[['lr_predict','svm_predict','rf_predict']].iloc[i].value_counts().index[0]

In [32]:
BC_others_100_result

Unnamed: 0,i,K,true,lr_predict,svm_predict,rf_predict,ensemble,manual_ensemble
0,7,0,1,1,0,1,1,1.0
1,22,0,1,0,0,0,0,0.0
2,26,0,1,0,0,0,0,0.0
3,27,0,1,0,0,0,0,0.0
4,28,0,1,0,0,0,0,0.0
5,30,0,1,0,0,1,0,0.0
6,31,0,1,0,0,0,0,0.0
7,33,0,0,0,0,0,0,0.0
8,34,0,0,0,0,0,0,0.0
9,42,0,0,0,0,0,0,0.0


In [37]:
for i in range(5) :
    print('[K = {}] accuracy : {}'.format(i, 
    sum(BC_others_100_result.loc[BC_others_100_result.K == i, 'true'] == BC_others_100_result.loc[BC_others_100_result.K == i, 'manual_ensemble']) / sum(BC_others_100_result.K == i)))
    

[K = 0] accuracy : 0.6
[K = 1] accuracy : 0.8
[K = 2] accuracy : 0.7857142857142857
[K = 3] accuracy : 0.7857142857142857
[K = 4] accuracy : 0.5714285714285714


In [38]:
BC_others_100_acc

Unnamed: 0,K,lr_acc,svm_acc,rf_acc
0,0,0.6,0.533333,0.666667
0,1,0.8,0.666667,0.8
0,2,0.785714,0.5,0.857143
0,3,0.785714,0.5,0.857143
0,4,0.571429,0.571429,0.642857


In [39]:
BC_others_100_acc.mean()

K          2.000000
lr_acc     0.708571
svm_acc    0.554286
rf_acc     0.764762
dtype: float64