In [1]:
import os
import random
import itertools
import re

# 基本的なライブラリ
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats

# 描画ライブラリ
import matplotlib.pyplot as plt
import seaborn as sns
from seaborn_analyzer import CustomPairPlot
import graphviz
import pydotplus
from IPython.display import Image
from IPython.display import HTML
from six import StringIO
from ipywidgets import interact, FloatSlider

# 前処理
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import VarianceThreshold

# 補完
from sklearn.experimental import (
    enable_iterative_imputer,
)  # IterativeImputerをimportするために必要
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

# エンコード
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder

# データセット分割
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# 特徴量選択
from sklearn.feature_selection import (
    GenericUnivariateSelect,
    f_classif,
    mutual_info_classif,
    chi2,
)
from boruta import BorutaPy

# 学習中
from tqdm import tqdm
from sklearn.model_selection import learning_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import warnings


# config python file
import config

SEED = config.SEED


from functions import *

fix_seed(SEED)


# 最大表示列数の指定（ここでは50列を指定）N
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 50)

%matplotlib inline

  y: pd.Series(),


# 目的
遺伝子学的分類に基づいた、予後の2値分類を実施する。  
分類はCLAUDIN_SUBTYPEに基づいて実施。  
予後は5年、10年、15年の3つの年次に分けている。Trueで死亡であることに注意すること。

# データ読み込み
読み込み元：
    config.INTERIM_PICKLE_PREPROCESSED_PROGNOSIS_CROSS_DIR + "/claudin_subtype_chi2"

サブタイプ毎のデータを使用

データの種類が多いので、辞書型で表現する  
```        
+/X_dict
    +/train
        +/X5
            +.X_chi2
            +.X_subtypes
        +/X10
            +.X_chi2
            +.X_subtypes
        +/X15
            +.X_chi2
            +.X_subtypes
    +/test
        +/X5
            +.X_chi2
            +.X_subtypes
        +/X10
            +.X_chi2
            +.X_subtypes
        +/X15
            +.X_chi2
            +.X_subtypes

+/y_dict
    +/train
        +.y5
            +.y_subtypes
        +.y10
            +.y_subtypes
        +.y15
            +.y_subtypes
    +/test
        +.y5
            +.y_subtypes
        +.y10
            +.y_subtypes
        +.y15
            +.y_subtypes
        
        
```

In [2]:
# データが複数種類あるので、辞書に保存する
X_dict = {}
y_dict = {}

# 前処理済みのデータの保存ディレクトリ（テーブル・遺伝子混合）
dir_path = config.INTERIM_PICKLE_PREPROCESSED_PROGNOSIS_CROSS_DIR
# データのカテゴリごとのループ
for dir_cat in [
    d for d in os.listdir(dir_path) if os.path.isdir(os.path.join(dir_path, d))
]:
    if not dir_cat in X_dict:
        X_dict[dir_cat] = dict()  # ネストの辞書なので、キーに対応する辞書がなければ生成する
        y_dict[dir_cat] = dict()
    # train、testのステップ毎のループ
    for dir_step in [
        d for d in os.listdir(dir_path + "/" + dir_cat) if not d.startswith(".")
    ]:
        if not dir_step in X_dict[dir_cat]:
            X_dict[dir_cat][dir_step] = dict()
            y_dict[dir_cat][dir_step] = dict()
        for f_name in tqdm(os.listdir(dir_path + "/" + dir_cat + "/" + dir_step)):
            if f_name[0] == "X":
                X_dict[dir_cat][dir_step][f_name] = pd.read_pickle(
                    dir_path + "/" + dir_cat + "/" + dir_step + "/" + f_name
                )
            if f_name[0] == "y":
                y_dict[dir_cat][dir_step][f_name] = pd.read_pickle(
                    dir_path + "/" + dir_cat + "/" + dir_step + "/" + f_name
                )
        X_dict[dir_cat][dir_step] = dict(sorted(X_dict[dir_cat][dir_step].items()))
        y_dict[dir_cat][dir_step] = dict(sorted(y_dict[dir_cat][dir_step].items()))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 1073.95it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 815.56it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36/36 [00:00<00:00, 4248.83it/s]
100%|███████████████████████████████████████████████████████████

# モデルのトレーニング

## データ全体のベースライン・学習

In [3]:
for (kx, vx), (ky, vy) in zip(
    X_dict["chi2"]["train"].items(),
    y_dict["chi2"]["train"].items(),
):  # X, yを同時にループ
    assert kx[1:3] == ky[1:3]  # カテゴリが同じかの確認
    if accuracy_score(vy, np.zeros(len(vy))) >= 0.5:
        print('"0">"1"', ky, round(accuracy_score(vy, np.zeros(len(vy))), 3))
    else:
        print('"1">"0"', ky, round(accuracy_score(vy, np.ones(len(vy))), 3))

"0">"1" y05.pickle 0.812
"0">"1" y10.pickle 0.636
"1">"0" y15.pickle 0.533


In [4]:
# 各予後毎の、ベースモデルの学習
for (kx, vx), (ky, vy) in zip(
    X_dict["chi2"]["train"].items(),
    y_dict["chi2"]["train"].items(),
):
    print("-----" * 10)
    assert kx[1:3] == ky[1:3]
    print(kx)
    print("使用特徴量", vx.columns)
    print("学習サンプルサイズ：", vx.shape)
    print(vy.value_counts())
    display(compare_bcms(vx, vy))

--------------------------------------------------
X05.pickle
使用特徴量 Index(['FGD3', 'LRIG1', 'ANXA9', 'PDZK1', 'CCNE1', 'STAC2', 'PTTG1', 'AIF1L',
       'KIAA1467', 'SEC14L2',
       ...
       'SYBU', 'FLNB', 'TTYH1', 'PI15', 'RBBP8', 'ARSG', 'BTG2', 'MCM10',
       'DIO1', 'SLC7A8'],
      dtype='object', length=355)
学習サンプルサイズ： (1306, 355)
0    1060
1     246
Name: OS_05years, dtype: int64


11it [00:41,  3.74s/it]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naive Bayes,0.705887,0.705937,0.448629,0.442233
Linear SVM,0.95142,0.728949,0.86115,0.319061
Polynomial SVM,1.0,0.727405,1.0,0.307344
Logistic Regression,0.927769,0.736577,0.786245,0.306263
AdaBoost,0.8968,0.774874,0.680265,0.266679
Nearest Neighbors,0.834525,0.772584,0.430911,0.231033
Decision Tree,0.881658,0.740446,0.643464,0.22944
Random Forest,0.848137,0.807804,0.322525,0.008333
Quadratic Discriminant Analysis,0.969117,0.81165,0.910602,0.0
RBF SVM,0.811979,0.81165,0.003564,0.0


--------------------------------------------------
X10.pickle
使用特徴量 Index(['FGD3', 'LRIG1', 'ANXA9', 'PDZK1', 'CCNE1', 'STAC2', 'PTTG1', 'AIF1L',
       'KIAA1467', 'SEC14L2',
       ...
       'FLNB', 'TTYH1', 'PI15', 'RBBP8', 'HIST1H2BG', 'ARSG', 'BTG2', 'MCM10',
       'DIO1', 'SLC7A8'],
      dtype='object', length=384)
学習サンプルサイズ： (1048, 384)
0    667
1    381
Name: OS_10years, dtype: int64


11it [00:38,  3.49s/it]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naive Bayes,0.656701,0.649817,0.562034,0.548122
AdaBoost,0.856551,0.64978,0.794919,0.494966
Logistic Regression,0.911048,0.63174,0.875205,0.488242
Polynomial SVM,1.0,0.626987,1.0,0.486947
Linear SVM,0.944763,0.620311,0.922982,0.477744
Nearest Neighbors,0.748623,0.627912,0.640221,0.469255
Decision Tree,0.823473,0.631612,0.739205,0.446622
RBF SVM,0.784351,0.674643,0.646573,0.443723
Random Forest,0.87606,0.668855,0.80118,0.410713
Quadratic Discriminant Analysis,1.0,0.636493,1.0,0.0


--------------------------------------------------
X15.pickle
使用特徴量 Index(['FGD3', 'LRIG1', 'ANXA9', 'PDZK1', 'CCNE1', 'STAC2', 'PTTG1', 'AIF1L',
       'KIAA1467', 'SEC14L2',
       ...
       'FLNB', 'TTYH1', 'PI15', 'RBBP8', 'HIST1H2BG', 'ARSG', 'BTG2', 'MCM10',
       'DIO1', 'SLC7A8'],
      dtype='object', length=395)
学習サンプルサイズ： (811, 395)
1    432
0    379
Name: OS_15years, dtype: int64


11it [00:28,  2.59s/it]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RBF SVM,0.800384,0.670837,0.823735,0.705152
Sigmoid SVM,0.532677,0.532746,0.695061,0.692588
Random Forest,0.866419,0.646161,0.878101,0.692261
AdaBoost,0.8856,0.627582,0.893471,0.655209
Nearest Neighbors,0.748187,0.624044,0.771799,0.651356
Naive Bayes,0.644472,0.637504,0.647227,0.637349
Logistic Regression,0.962736,0.584463,0.964937,0.604978
Polynomial SVM,1.0,0.590681,1.0,0.604849
Linear SVM,0.989725,0.583273,0.990342,0.602943
Decision Tree,0.811755,0.57841,0.824002,0.600451


# Borutaによる特徴量選択

In [5]:
for i in range(5, 16, 5):  # 全部の特徴量で学習
    print("----------" * 10)
    print("index: ", format(str(i).zfill(2)))
    rf1 = RandomForestClassifier(n_jobs=-1, max_depth=5, random_state=SEED)
    rf1.fit(
        X_dict["chi2"]["train"]["X{0}.pickle".format(str(i).zfill(2))],
        y_dict["chi2"]["train"]["y{0}.pickle".format(str(i).zfill(2))],
    )
    rf1_pred = rf1.predict(
        X_dict["chi2"]["train"]["X{0}.pickle".format(str(i).zfill(2))]
    )
    print("train score")
    print(
        show_scores(
            y_dict["chi2"]["train"]["y{0}.pickle".format(str(i).zfill(2))], rf1_pred
        )
    )
    rf1_pred = rf1.predict(
        X_dict["chi2"]["test"]["X{0}.pickle".format(str(i).zfill(2))]
    )
    print("test score")
    print(
        show_scores(
            y_dict["chi2"]["test"]["y{0}.pickle".format(str(i).zfill(2))], rf1_pred
        )
    )

    # RandomForestRegressorでBorutaを実行
    rf = RandomForestClassifier(n_jobs=-1, max_depth=5)
    feat_selector = BorutaPy(rf, n_estimators="auto", verbose=0, random_state=SEED)
    feat_selector.fit(
        X_dict["chi2"]["train"]["X{0}.pickle".format(str(i).zfill(2))].values,
        y_dict["chi2"]["train"]["y{0}.pickle".format(str(i).zfill(2))].values,
    )

    # 選択された特徴量を確認
    selected = feat_selector.support_
    print("選択された特徴量の数:{0}".format(np.sum(selected)))
    print(selected)
    print(
        X_dict["chi2"]["train"]["X{0}.pickle".format(str(i).zfill(2))].columns[selected]
    )

    # 選択した特徴量で学習
    X_selected = X_dict["chi2"]["train"]["X{0}.pickle".format(str(i).zfill(2))][
        X_dict["chi2"]["train"]["X{0}.pickle".format(str(i).zfill(2))].columns[selected]
    ]
    rf2 = RandomForestClassifier(n_jobs=-1, max_depth=5, random_state=SEED)
    rf2.fit(X_selected, y_dict["chi2"]["train"]["y{0}.pickle".format(str(i).zfill(2))])
    rf2_pred = rf2.predict(
        X_dict["chi2"]["train"]["X{0}.pickle".format(str(i).zfill(2))][
            X_selected.columns
        ]
    )
    print(
        show_scores(
            y_dict["chi2"]["train"]["y{0}.pickle".format(str(i).zfill(2))], rf2_pred
        )
    )
    rf2_pred = rf2.predict(
        X_dict["chi2"]["test"]["X{0}.pickle".format(str(i).zfill(2))][
            X_selected.columns
        ]
    )
    print(
        show_scores(
            y_dict["chi2"]["test"]["y{0}.pickle".format(str(i).zfill(2))], rf2_pred
        )
    )

----------------------------------------------------------------------------------------------------
index:  05
train score
accuracy:  0.8522205206738132
precision:  1.0
recall:  0.21544715447154472
f1 score:  0.3545150501672241
None
test score
accuracy:  0.8211009174311926
precision:  0.0
recall:  0.0
f1 score:  0.0
None


  _warn_prf(average, modifier, msg_start, len(result))
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.support_ = np.zeros(n_feat, dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.support_weak_ = np.zeros(n_feat, dtype=np.bool)


選択された特徴量の数:26
[ True False False False False False False False False False False False
 False False False False False False False False False  True False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False  True  True False False False False False False False  True
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False  True False False False False False False False False False
 False False False False False False False False False False False False
 False False  True False False  True False False False False False False
 False False False False False False False False False False False False
 False False False False False  True False False False False False False
 False False False False False False 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.support_ = np.zeros(n_feat, dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.support_weak_ = np.zeros(n_feat, dtype=np.bool)


選択された特徴量の数:41
[ True False False False False False  True False False False False False
 False False  True False False False False False False  True False False
  True False False False False False False False  True False False False
 False False False False False False  True False  True False False False
 False False False False False False False False False False False  True
 False False False False False False False  True False False False False
 False False False False  True False False False False False False False
 False False False False False False False False False False False False
 False False False  True False False False False False False False False
 False False False  True False False False False False False False  True
 False False False False False False False False False False False False
 False False False  True False False False False False False False False
 False False False False False False False False False False False False
 False False False False False  True 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.support_ = np.zeros(n_feat, dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.support_weak_ = np.zeros(n_feat, dtype=np.bool)


## subtype毎のベースライン・学習

In [6]:
for (kx, vx), (ky, vy) in zip(
    X_dict["claudin_subtype_chi2"]["train"].items(),
    y_dict["claudin_subtype_chi2"]["train"].items(),
):
    assert kx.split("_")[1] == ky.split("_")[1]
    if accuracy_score(vy, np.zeros(len(vy))) >= 0.5:
        print('"0">"1"', ky, round(accuracy_score(vy, np.zeros(len(vy))), 3))
    else:
        print('"1">"0"', ky, round(accuracy_score(vy, np.ones(len(vy))), 3))

"0">"1" y05_Basal.pickle 0.655
"0">"1" y05_Her2.pickle 0.667
"0">"1" y05_LumA.pickle 0.923
"0">"1" y05_LumB.pickle 0.788
"0">"1" y05_Normal.pickle 0.821
"0">"1" y05_claudin-low.pickle 0.812
"0">"1" y10_Basal.pickle 0.557
"1">"0" y10_Her2.pickle 0.558
"0">"1" y10_LumA.pickle 0.783
"0">"1" y10_LumB.pickle 0.54
"0">"1" y10_Normal.pickle 0.646
"0">"1" y10_claudin-low.pickle 0.692
"1">"0" y15_Basal.pickle 0.577
"1">"0" y15_Her2.pickle 0.684
"0">"1" y15_LumA.pickle 0.626
"1">"0" y15_LumB.pickle 0.675
"1">"0" y15_Normal.pickle 0.571
"0">"1" y15_claudin-low.pickle 0.575


In [7]:
for (kx, vx), (ky, vy) in zip(
    X_dict["claudin_subtype_chi2"]["train"].items(),
    y_dict["claudin_subtype_chi2"]["train"].items(),
):
    print("-----" * 10)
    assert kx.split("_")[1] == ky.split("_")[1]
    print(kx)
    print("使用特徴量", vx.columns)
    print("学習サンプルサイズ：", vx.shape)
    print(vy.value_counts())
    display(compare_bcms(vx, vy))

--------------------------------------------------
X05_Basal.pickle
使用特徴量 Index(['CXCL13', 'CSN3', 'SCGB2A2', 'IGHG1'], dtype='object')
学習サンプルサイズ： (148, 4)
0    97
1    51
Name: OS_05years, dtype: int64


11it [00:01,  9.43it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naive Bayes,0.722209,0.694762,0.626209,0.574322
Quadratic Discriminant Analysis,0.728212,0.681905,0.634223,0.563464
Linear SVM,0.727488,0.681905,0.566543,0.50681
Logistic Regression,0.716205,0.701905,0.537419,0.50308
Polynomial SVM,0.740989,0.632381,0.586607,0.476721
AdaBoost,0.939165,0.642381,0.908858,0.446189
Random Forest,0.768028,0.627143,0.612353,0.388636
Decision Tree,0.757491,0.60619,0.623461,0.370668
Nearest Neighbors,0.758983,0.634286,0.610418,0.343997
RBF SVM,0.706419,0.599524,0.407957,0.240404


--------------------------------------------------
X05_Her2.pickle
使用特徴量 Index(['U79293', 'KRT81', 'GFRA1', 'ESR1', 'SCUBE2', 'CLCA2'], dtype='object')
学習サンプルサイズ： (153, 6)
0    102
1     51
Name: OS_05years, dtype: int64


11it [00:01,  7.95it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naive Bayes,0.645647,0.630833,0.583513,0.566265
Quadratic Discriminant Analysis,0.720449,0.64375,0.620832,0.498205
Decision Tree,0.787263,0.695,0.668107,0.487929
AdaBoost,0.982582,0.6475,0.973693,0.412994
Random Forest,0.76691,0.70125,0.525727,0.400267
Logistic Regression,0.689934,0.641667,0.40273,0.364747
Nearest Neighbors,0.736433,0.603333,0.565481,0.340657
Polynomial SVM,0.753126,0.622917,0.557608,0.287309
RBF SVM,0.7139,0.655417,0.385193,0.261587
Linear SVM,0.676864,0.647917,0.134062,0.12381


--------------------------------------------------
X05_LumA.pickle
使用特徴量 Index(['S100P'], dtype='object')
学習サンプルサイズ： (466, 1)
0    430
1     36
Name: OS_05years, dtype: int64


11it [01:14,  6.75s/it]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AdaBoost,0.931569,0.916142,0.216022,0.0
Decision Tree,0.922745,0.922618,0.0,0.0
Linear SVM,0.922745,0.922618,0.0,0.0
Logistic Regression,0.922745,0.922618,0.0,0.0
Naive Bayes,0.922745,0.922618,0.0,0.0
Nearest Neighbors,0.924653,0.916189,0.117282,0.0
Polynomial SVM,0.922745,0.922618,0.0,0.0
Quadratic Discriminant Analysis,0.922745,0.922618,0.0,0.0
RBF SVM,0.922745,0.922618,0.0,0.0
Random Forest,0.922745,0.922618,0.0,0.0


--------------------------------------------------
X05_LumB.pickle
使用特徴量 Index(['TMEM26', 'PDZK1', 'KCNK1', 'ANKRD30A', 'IGKC', 'DB005376', 'SCGB2A2',
       'BEX1', 'FBN2', 'SCGB2A1', 'GP2'],
      dtype='object')
学習サンプルサイズ： (306, 11)
0    241
1     65
Name: OS_05years, dtype: int64


11it [00:01,  7.85it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naive Bayes,0.767599,0.747204,0.563335,0.525057
Quadratic Discriminant Analysis,0.830416,0.754194,0.657291,0.447791
Polynomial SVM,0.876166,0.73828,0.670226,0.352661
AdaBoost,0.957153,0.751075,0.894907,0.348157
Logistic Regression,0.7952,0.780645,0.388285,0.327273
Decision Tree,0.843495,0.731398,0.584431,0.259242
Nearest Neighbors,0.832241,0.738387,0.48826,0.221663
Linear SVM,0.789391,0.774086,0.074345,0.036508
RBF SVM,0.793386,0.790645,0.054911,0.025
Random Forest,0.799195,0.777634,0.114073,0.0


--------------------------------------------------
X05_Normal.pickle
使用特徴量 Index(['CFB', 'CPB1'], dtype='object')
学習サンプルサイズ： (95, 2)
0    78
1    17
Name: OS_05years, dtype: int64


11it [00:00, 11.91it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naive Bayes,0.843256,0.831111,0.536462,0.325
Quadratic Discriminant Analysis,0.845595,0.82,0.55312,0.316667
Polynomial SVM,0.857278,0.817778,0.492775,0.266667
Logistic Regression,0.852599,0.816667,0.452276,0.145238
Nearest Neighbors,0.852627,0.786667,0.469014,0.116667
Linear SVM,0.829179,0.816667,0.13992,0.033333
AdaBoost,1.0,0.704444,1.0,0.0
Decision Tree,0.820999,0.816667,0.0,0.0
RBF SVM,0.845595,0.795556,0.275998,0.0
Random Forest,0.820999,0.816667,0.0,0.0


--------------------------------------------------
X05_claudin-low.pickle
使用特徴量 Index(['SNAR-A3', 'S100A9', 'SFRP1', 'C4orf7', 'GABRP'], dtype='object')
学習サンプルサイズ： (138, 5)
0    112
1     26
Name: OS_05years, dtype: int64


11it [00:01,  9.30it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naive Bayes,0.851045,0.833516,0.536386,0.43
Quadratic Discriminant Analysis,0.855858,0.84011,0.543654,0.410476
AdaBoost,1.0,0.797802,1.0,0.349048
Nearest Neighbors,0.846213,0.833516,0.466509,0.340476
RBF SVM,0.868755,0.826374,0.551463,0.273333
Polynomial SVM,0.871987,0.819231,0.568798,0.206667
Linear SVM,0.8462,0.826374,0.407098,0.2
Logistic Regression,0.842987,0.819231,0.427146,0.186667
Decision Tree,0.825277,0.790659,0.203027,0.0
Random Forest,0.814813,0.813187,0.03125,0.0


--------------------------------------------------
X10_Basal.pickle
使用特徴量 Index(['CSN3', 'IGKC', 'S100A8', 'CXCL13', 'IGHG1', 'SCGB2A2'], dtype='object')
学習サンプルサイズ： (122, 6)
0    68
1    54
Name: OS_10years, dtype: int64


11it [00:01, 10.76it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Linear SVM,0.729491,0.703846,0.684535,0.645867
Naive Bayes,0.709458,0.687179,0.677577,0.632133
Polynomial SVM,0.785063,0.696154,0.758693,0.623936
Logistic Regression,0.711284,0.695513,0.666559,0.618395
Quadratic Discriminant Analysis,0.751326,0.655128,0.72807,0.59511
Random Forest,0.745922,0.648077,0.703627,0.584329
Decision Tree,0.731301,0.614103,0.687152,0.548535
RBF SVM,0.714003,0.621795,0.675344,0.548139
Nearest Neighbors,0.721301,0.598718,0.669251,0.476631
AdaBoost,0.974529,0.476923,0.971344,0.445012


--------------------------------------------------
X10_Her2.pickle
使用特徴量 Index(['U79293', 'KRT81', 'SERPINA3', 'CAPN8', 'SERPINA5', 'C19orf33', 'GFRA1',
       'ESR1', 'S100A9', 'SCUBE2', 'CLCA2', 'ATHL1'],
      dtype='object')
学習サンプルサイズ： (129, 12)
1    72
0    57
Name: OS_10years, dtype: int64


11it [00:01,  9.93it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RBF SVM,0.751098,0.698718,0.800862,0.750198
Random Forest,0.749374,0.667949,0.788303,0.721336
Sigmoid SVM,0.558134,0.557692,0.716317,0.708039
Naive Bayes,0.665812,0.652564,0.71114,0.700568
Nearest Neighbors,0.730416,0.629487,0.771059,0.686281
Logistic Regression,0.699403,0.612821,0.750089,0.670424
Linear SVM,0.699418,0.605128,0.74823,0.664722
Quadratic Discriminant Analysis,0.828581,0.605128,0.853956,0.661903
Polynomial SVM,0.894916,0.582692,0.91006,0.622575
Decision Tree,0.738159,0.590385,0.758663,0.618937


--------------------------------------------------
X10_LumA.pickle
使用特徴量 Index(['SLC30A8', 'S100P', 'C8orf4', 'GRIA2', 'VTCN1', 'C6orf126', 'MYBPC1',
       'CLIC6', 'MKX', 'BEX1', 'PVALB', 'KRT15', 'SLC7A2', 'CLEC3A', 'NFKBIZ',
       'CPB1'],
      dtype='object')
学習サンプルサイズ： (359, 16)
0    281
1     78
Name: OS_10years, dtype: int64


11it [00:02,  4.14it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naive Bayes,0.767256,0.727302,0.55917,0.470421
Quadratic Discriminant Analysis,0.816778,0.721746,0.615983,0.370669
Logistic Regression,0.798521,0.766349,0.379004,0.300306
Decision Tree,0.830396,0.749365,0.53491,0.289496
Polynomial SVM,0.948314,0.696429,0.874631,0.269663
Nearest Neighbors,0.823893,0.743889,0.514,0.235556
AdaBoost,0.92572,0.707698,0.81148,0.215961
Linear SVM,0.782731,0.782857,0.0,0.0
RBF SVM,0.79666,0.774524,0.124493,0.0
Random Forest,0.78954,0.777302,0.061477,0.0


--------------------------------------------------
X10_LumB.pickle
使用特徴量 Index(['TMEM26', 'PDZK1', 'KCNK1', 'TFAP2B', 'ANKRD30A', 'IGKC', 'DB005376',
       'SCGB2A2', 'BEX1', 'FBN2', 'SCGB2A1', 'GP2'],
      dtype='object')
学習サンプルサイズ： (252, 12)
0    136
1    116
Name: OS_10years, dtype: int64


11it [00:01,  7.72it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RBF SVM,0.728841,0.615692,0.702011,0.581171
Polynomial SVM,0.832455,0.599846,0.823682,0.579535
Naive Bayes,0.6314,0.596,0.623169,0.574143
Quadratic Discriminant Analysis,0.737652,0.595538,0.728532,0.559289
AdaBoost,0.931665,0.571846,0.92648,0.550946
Logistic Regression,0.646388,0.607692,0.592313,0.53981
Linear SVM,0.649476,0.587846,0.606468,0.527845
Random Forest,0.793215,0.595385,0.758516,0.512874
Nearest Neighbors,0.635367,0.472923,0.618262,0.463472
Decision Tree,0.750004,0.519538,0.726074,0.454936


--------------------------------------------------
X10_Normal.pickle
使用特徴量 Index(['TCN1', 'CFB', 'CLIC6', 'SNAR-A3', 'CPB1'], dtype='object')
学習サンプルサイズ： (79, 5)
0    51
1    28
Name: OS_10years, dtype: int64


11it [00:00, 11.84it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Linear SVM,0.79329,0.725,0.711855,0.647857
Polynomial SVM,0.884703,0.775,0.827843,0.643333
Logistic Regression,0.807336,0.75,0.719633,0.612857
Quadratic Discriminant Analysis,0.817175,0.7625,0.712735,0.581429
Naive Bayes,0.779206,0.7625,0.651013,0.562857
RBF SVM,0.855125,0.710714,0.774624,0.46
AdaBoost,1.0,0.657143,1.0,0.438095
Decision Tree,0.777797,0.633929,0.679585,0.430476
Nearest Neighbors,0.814378,0.621429,0.694903,0.31381
Random Forest,0.774961,0.671429,0.585628,0.253333


--------------------------------------------------
X10_claudin-low.pickle
使用特徴量 Index(['PROM1', 'ROPN1', 'SNAR-A3', 'S100A9', 'SFRP1', 'C4orf7', 'ELF5',
       'GABRP'],
      dtype='object')
学習サンプルサイズ： (107, 8)
0    74
1    33
Name: OS_10years, dtype: int64


11it [00:01, 10.35it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AdaBoost,1.0,0.759091,1.0,0.52329
Nearest Neighbors,0.837983,0.756364,0.70883,0.521905
Naive Bayes,0.681164,0.653636,0.577958,0.512597
Quadratic Discriminant Analysis,0.822466,0.719091,0.675884,0.444444
Decision Tree,0.800655,0.692727,0.662706,0.425736
RBF SVM,0.823486,0.748182,0.627296,0.391905
Linear SVM,0.792311,0.737273,0.582978,0.386667
Logistic Regression,0.780885,0.719091,0.589367,0.386667
Polynomial SVM,0.886791,0.681818,0.785608,0.341667
Random Forest,0.778834,0.71,0.467441,0.241905


--------------------------------------------------
X15_Basal.pickle
使用特徴量 Index(['CSN3', 'IGKC', 'S100A8', 'CXCL13', 'IGHG1', 'SCGB2A2'], dtype='object')
学習サンプルサイズ： (97, 6)
1    56
0    41
Name: OS_15years, dtype: int64


11it [00:00, 11.26it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Quadratic Discriminant Analysis,0.754872,0.684444,0.797845,0.721612
Random Forest,0.754833,0.688889,0.797311,0.70961
Sigmoid SVM,0.57726,0.572222,0.731687,0.696954
Logistic Regression,0.722793,0.658889,0.770031,0.69
Naive Bayes,0.72052,0.668889,0.755271,0.681132
Polynomial SVM,0.784679,0.617778,0.826863,0.675145
Linear SVM,0.720559,0.617778,0.767112,0.672271
RBF SVM,0.727416,0.596667,0.784792,0.663413
Nearest Neighbors,0.691915,0.595556,0.746963,0.647727
AdaBoost,0.99313,0.536667,0.993999,0.575678


--------------------------------------------------
X15_Her2.pickle
使用特徴量 Index(['U79293', 'KRT81', 'SERPINA3', 'CAPN8', 'SERPINA5', 'C19orf33', 'GFRA1',
       'ESR1', 'S100A9', 'SCUBE2', 'CLCA2', 'ATHL1'],
      dtype='object')
学習サンプルサイズ： (114, 12)
1    78
0    36
Name: OS_15years, dtype: int64


11it [00:01, 10.35it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Sigmoid SVM,0.68419,0.682576,0.812352,0.800045
RBF SVM,0.714458,0.640152,0.826622,0.772839
Linear SVM,0.692994,0.631818,0.814536,0.768246
Random Forest,0.711536,0.631818,0.821586,0.766449
Nearest Neighbors,0.74268,0.641667,0.825124,0.750422
Quadratic Discriminant Analysis,0.875281,0.649242,0.912398,0.748678
Naive Bayes,0.686132,0.641667,0.777443,0.737519
Decision Tree,0.767047,0.613636,0.844541,0.737159
Logistic Regression,0.672501,0.587879,0.790166,0.73072
Polynomial SVM,0.915191,0.639394,0.941016,0.726961


--------------------------------------------------
X15_LumA.pickle
使用特徴量 Index(['SLC30A8', 'S100P', 'C8orf4', 'GRIA2', 'VTCN1', 'C6orf126', 'MYBPC1',
       'CLIC6', 'MKX', 'DB005376', 'BEX1', 'PVALB', 'KRT15', 'SLC7A2',
       'CLEC3A', 'CYP4F22', 'NFKBIZ', 'CPB1'],
      dtype='object')
学習サンプルサイズ： (270, 18)
0    169
1    101
Name: OS_15years, dtype: int64


11it [00:01,  7.07it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naive Bayes,0.697531,0.677778,0.643307,0.622351
Linear SVM,0.720165,0.651852,0.613114,0.506496
Decision Tree,0.769547,0.637037,0.664319,0.48492
Logistic Regression,0.713992,0.655556,0.580504,0.479874
AdaBoost,0.936626,0.6,0.913551,0.460158
Quadratic Discriminant Analysis,0.773251,0.6,0.707041,0.456753
Nearest Neighbors,0.758848,0.618519,0.666191,0.447482
RBF SVM,0.80535,0.611111,0.728761,0.43011
Polynomial SVM,0.927984,0.518519,0.902859,0.325915
Random Forest,0.787654,0.622222,0.662757,0.312642


--------------------------------------------------
X15_LumB.pickle
使用特徴量 Index(['TMEM26', 'PDZK1', 'KCNK1', 'TFAP2B', 'CELSR2', 'ANKRD30A', 'IGKC',
       'DB005376', 'SHISA2', 'SCGB2A2', 'BEX1', 'FBN2', 'SCGB2A1', 'FCRLB',
       'GP2'],
      dtype='object')
学習サンプルサイズ： (194, 15)
1    131
0     63
Name: OS_15years, dtype: int64


11it [00:01,  8.82it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RBF SVM,0.789826,0.696579,0.864403,0.80637
Logistic Regression,0.749136,0.722105,0.828127,0.804177
Sigmoid SVM,0.675264,0.675789,0.806105,0.802144
Linear SVM,0.745136,0.701053,0.824158,0.78859
Random Forest,0.751429,0.666053,0.83973,0.784739
Quadratic Discriminant Analysis,0.819018,0.690789,0.868672,0.774076
Nearest Neighbors,0.766893,0.655263,0.842491,0.767704
Naive Bayes,0.736535,0.695789,0.804149,0.765968
AdaBoost,0.983974,0.65,0.988214,0.751046
Polynomial SVM,0.947885,0.659737,0.962726,0.739925


--------------------------------------------------
X15_Normal.pickle
使用特徴量 Index(['TCN1', 'CFB', 'SCGB3A1', 'CLIC6', 'SNAR-A3', 'CALML5', 'CLEC3A',
       'CPB1'],
      dtype='object')
学習サンプルサイズ： (56, 8)
1    32
0    24
Name: OS_15years, dtype: int64


11it [00:00, 12.11it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AdaBoost,1.0,0.88,1.0,0.879762
Logistic Regression,0.863098,0.806667,0.880411,0.824206
Polynomial SVM,0.996,0.82,0.996487,0.809365
Linear SVM,0.87698,0.773333,0.888745,0.783175
Nearest Neighbors,0.86502,0.776667,0.868783,0.771984
Naive Bayes,0.847216,0.786667,0.861549,0.768254
RBF SVM,0.908745,0.73,0.916718,0.744206
Quadratic Discriminant Analysis,0.910784,0.71,0.918372,0.735873
Random Forest,0.722314,0.623333,0.800582,0.726825
Sigmoid SVM,0.571412,0.57,0.727052,0.70583


--------------------------------------------------
X15_claudin-low.pickle
使用特徴量 Index(['ROPN1B', 'PROM1', 'SOX10', 'ROPN1', 'SNAR-A3', 'S100A9', 'SFRP1',
       'C4orf7', 'ELF5', 'GABRP'],
      dtype='object')
学習サンプルサイズ： (80, 10)
0    46
1    34
Name: OS_15years, dtype: int64


11it [00:00, 11.28it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naive Bayes,0.722222,0.7,0.706563,0.585303
Random Forest,0.734722,0.6625,0.708905,0.547381
AdaBoost,1.0,0.6625,1.0,0.531984
Logistic Regression,0.761111,0.6625,0.704146,0.519365
Quadratic Discriminant Analysis,0.829167,0.6375,0.790251,0.498095
Polynomial SVM,0.916667,0.625,0.896726,0.492857
Linear SVM,0.755556,0.625,0.681739,0.475952
Nearest Neighbors,0.833333,0.675,0.791551,0.462857
RBF SVM,0.823611,0.625,0.782521,0.457143
Decision Tree,0.768056,0.6625,0.677715,0.438095
