In [1]:
import os
import random
import itertools
import re

# 基本的なライブラリ
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats

# 描画ライブラリ
import matplotlib.pyplot as plt
import seaborn as sns
from seaborn_analyzer import CustomPairPlot
import graphviz
import pydotplus
from IPython.display import Image
from IPython.display import HTML
from six import StringIO
from ipywidgets import interact, FloatSlider

# 前処理
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import VarianceThreshold

# 補完
from sklearn.experimental import (
    enable_iterative_imputer,
)  # IterativeImputerをimportするために必要
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

# エンコード
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder

# データセット分割
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# 特徴量選択
from sklearn.feature_selection import (
    GenericUnivariateSelect,
    f_classif,
    mutual_info_classif,
    chi2,
)

# 学習中
from tqdm import tqdm
from sklearn.model_selection import learning_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import warnings


# Project-level configuration module (paths, seed, ...).
import config

# Single seed shared by the whole notebook; the value lives in config.py.
SEED = config.SEED


# NOTE(review): star import hides where names come from. At least `fix_seed`
# (below) and `compare_bcms` (used later in this notebook) are not imported
# anywhere else, so they presumably come from here -- consider importing them
# explicitly.
from functions import *

# Presumably seeds the random number generators for reproducibility --
# confirm against the definition in functions.py.
fix_seed(SEED)


# Cap the pandas display at 50 columns and 50 rows.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 50)

%matplotlib inline

  y: pd.Series(),


# 目的
遺伝子学的分類に基づいた、予後の2値分類を実施する。  
分類はCLAUDIN_SUBTYPEに基づいて実施。  
予後は5年、10年、15年の3つの年次に分けている。Trueで死亡であることに注意すること。

# データ読み込み
読み込み元：
    config.INTERIM_PICKLE_PREPROCESSED_PROGNOSIS_CROSS_DIR + "/feature_selected_chi2"

サブタイプ毎のデータを使用

データの種類が多いので、辞書型で表現する  
```
+/y_dict
    +/train
        +/y5
            +.y_subtypes
        +/y10
            +.y_subtypes
        +/y15
            +.y_subtypes
    +/test
        +/y5
            +.y_subtypes
        +/y10
            +.y_subtypes
        +/y15
            +.y_subtypes
        
        
+/X_dict
    +/train
        +/X5
            +.X_subtypes
        +/X10
            +.X_subtypes
        +/X15
            +.X_subtypes
    +/test
        +/X5
            +.X_subtypes
        +/X10
            +.X_subtypes
        +/X15
            +.X_subtypes
```

In [2]:
# Read the pickled y objects for the three horizons (y5, y10, y15),
# once for the training split and once for the test split.
train_dict = {
    horizon: pd.read_pickle(
        config.INTERIM_PICKLE_PREPROCESSED_PROGNOSIS_CROSS_DIR
        + f"/{horizon}_train.pkl"
    )
    for horizon in ("y5", "y10", "y15")
}

test_dict = {
    horizon: pd.read_pickle(
        config.INTERIM_PICKLE_PREPROCESSED_PROGNOSIS_CROSS_DIR
        + f"/{horizon}_test.pkl"
    )
    for horizon in ("y5", "y10", "y15")
}

# Top-level container: split name -> {horizon key -> loaded object}.
y_dict = {"train": train_dict, "test": test_dict}

In [3]:
# Load the per-subtype train/test pickles stored under "feature_selected_chi2".
#
# Files are grouped by split ("train"/"test"), then by the prefix of their
# file name (X5, X10, X15, y5, y10, y15), and finally keyed by file name.
dir_name = (
    config.INTERIM_PICKLE_PREPROCESSED_PROGNOSIS_CROSS_DIR + "/feature_selected_chi2"
)

# File-name prefixes that identify the kind of data stored in each pickle.
_PICKLE_PREFIXES = ("X5", "X10", "X15", "y5", "y10", "y15")


def _load_split_pickles(split_dir):
    """Read every recognised pickle in ``split_dir``, grouped by name prefix.

    Args:
        split_dir: Directory holding the pickles of one split.

    Returns:
        A dict mapping each prefix in ``_PICKLE_PREFIXES`` to a dict
        ``{file name: unpickled object}`` sorted by file name. Files whose
        name does not start with ``"<prefix>_"`` are ignored.
    """
    grouped = {prefix: {} for prefix in _PICKLE_PREFIXES}
    for f_name in tqdm(os.listdir(split_dir)):
        for prefix in _PICKLE_PREFIXES:
            # Match on "<prefix>_" so that only the exact prefix is accepted.
            if f_name.startswith(prefix + "_"):
                grouped[prefix][f_name] = pd.read_pickle(
                    os.path.join(split_dir, f_name)
                )
                break
    # Sort by file name so that X and y dicts can be zipped positionally.
    return {prefix: dict(sorted(files.items())) for prefix, files in grouped.items()}


# Start from empty dicts so a missing split directory yields empty results
# rather than undefined names.
_loaded = {
    split: {prefix: {} for prefix in _PICKLE_PREFIXES} for split in ("train", "test")
}
for d_name in os.listdir(dir_name):
    if d_name in _loaded:  # only the "train" and "test" sub-directories are read
        _loaded[d_name] = _load_split_pickles(os.path.join(dir_name, d_name))

# train
X5_train_dict = _loaded["train"]["X5"]
X10_train_dict = _loaded["train"]["X10"]
X15_train_dict = _loaded["train"]["X15"]

y5_train_dict = _loaded["train"]["y5"]
y10_train_dict = _loaded["train"]["y10"]
y15_train_dict = _loaded["train"]["y15"]

# test
X5_test_dict = _loaded["test"]["X5"]
X10_test_dict = _loaded["test"]["X10"]
X15_test_dict = _loaded["test"]["X15"]

y5_test_dict = _loaded["test"]["y5"]
y10_test_dict = _loaded["test"]["y10"]
y15_test_dict = _loaded["test"]["y15"]

X_train_dict = {"X5": X5_train_dict, "X10": X10_train_dict, "X15": X15_train_dict}
y_train_dict = {"y5": y5_train_dict, "y10": y10_train_dict, "y15": y15_train_dict}

X_test_dict = {"X5": X5_test_dict, "X10": X10_test_dict, "X15": X15_test_dict}
# Keys were previously "X5"/"X10"/"X15"; they now match y_train_dict
# ("y5"/"y10"/"y15") so train and test labels are addressed the same way.
y_test_dict = {"y5": y5_test_dict, "y10": y10_test_dict, "y15": y15_test_dict}

# Bundle train and test together.
# NOTE: this rebinds the `y_dict` created in the previous cell.
X_dict = {"train": X_train_dict, "test": X_test_dict}
y_dict = {"train": y_train_dict, "test": y_test_dict}

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36/36 [00:00<00:00, 4213.97it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36/36 [00:00<00:00, 4221.86it/s]


# 各種類ベースラインの設定

In [4]:
# Majority-class baseline: for every horizon and subtype, report the accuracy
# of always predicting the more frequent label on the training labels.
for X_by_subtype, y_by_subtype in zip(X_train_dict.values(), y_train_dict.values()):
    print("-----" * 10)
    for x_name, y_name in zip(X_by_subtype, y_by_subtype):
        # The X and y file names must carry the same subtype suffix.
        assert x_name.split("_")[1] == y_name.split("_")[1]
        labels = y_by_subtype[y_name]
        # Accuracy obtained by predicting 0 for every sample.
        all_zero_acc = accuracy_score(labels, np.zeros(len(labels)))
        if all_zero_acc >= 0.5:
            print('"0">"1"', y_name, round(all_zero_acc, 3))
        else:
            all_one_acc = accuracy_score(labels, np.ones(len(labels)))
            print('"1">"0"', y_name, round(all_one_acc, 3))

--------------------------------------------------
"0">"1" y5_Basal.pickle 0.655
"0">"1" y5_Her2.pickle 0.667
"0">"1" y5_LumA.pickle 0.923
"0">"1" y5_LumB.pickle 0.788
"0">"1" y5_Normal.pickle 0.821
"0">"1" y5_claudin-low.pickle 0.812
--------------------------------------------------
"0">"1" y10_Basal.pickle 0.557
"1">"0" y10_Her2.pickle 0.558
"0">"1" y10_LumA.pickle 0.783
"0">"1" y10_LumB.pickle 0.54
"0">"1" y10_Normal.pickle 0.646
"0">"1" y10_claudin-low.pickle 0.692
--------------------------------------------------
"1">"0" y15_Basal.pickle 0.577
"1">"0" y15_Her2.pickle 0.684
"0">"1" y15_LumA.pickle 0.626
"1">"0" y15_LumB.pickle 0.675
"1">"0" y15_Normal.pickle 0.571
"0">"1" y15_claudin-low.pickle 0.575


In [7]:
# For every horizon and subtype, show the selected features, the training
# sample size, the label balance and the result of `compare_bcms`.
# NOTE(review): `compare_bcms` comes from the star import of `functions`;
# judging from the rendered output it appears to return a per-classifier
# accuracy / F1 table -- confirm against its definition.
for xt, yt in zip(X_train_dict.values(), y_train_dict.values()):
    print("-----" * 10)
    # Walk the feature and label dicts in parallel by file name.
    for kx, ky in zip(xt, yt):
        xtc, ytc = xt[kx], yt[ky]
        # Guard against pairing features and labels of different subtypes.
        assert kx.split("_")[1] == ky.split("_")[1]
        print(kx)
        print("使用特徴量", xtc.columns)
        print("学習サンプルサイズ：", xtc.shape)
        print(ytc.value_counts())
        comparison = compare_bcms(xtc, ytc)
        display(comparison)

--------------------------------------------------
X5_Basal.pickle
使用特徴量 Index(['IGHG1', 'CSN3'], dtype='object')
学習サンプルサイズ： (148, 2)
0    97
1    51
Name: OS_5years, dtype: int64


11it [00:02,  4.31it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Quadratic Discriminant Analysis,0.638138,0.588095,0.512309,0.437377
Naive Bayes,0.641151,0.580952,0.511327,0.423694
Nearest Neighbors,0.729722,0.620476,0.540085,0.32422
AdaBoost,0.863377,0.585238,0.776003,0.316168
Decision Tree,0.728212,0.627619,0.562794,0.303126
Logistic Regression,0.659887,0.626667,0.313227,0.258506
Random Forest,0.682415,0.6,0.247872,0.116667
RBF SVM,0.662883,0.619048,0.086547,0.09
Polynomial SVM,0.657631,0.625714,0.047727,0.025
Linear SVM,0.655392,0.654286,0.0,0.0


X5_Her2.pickle
使用特徴量 Index(['GFRA1', 'U79293', 'CLCA2', 'ESR1'], dtype='object')
学習サンプルサイズ： (153, 4)
0    102
1     51
Name: OS_5years, dtype: int64


11it [00:01,  7.09it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naive Bayes,0.650725,0.649583,0.587675,0.588939
AdaBoost,0.963683,0.680833,0.944919,0.491791
Quadratic Discriminant Analysis,0.677594,0.64125,0.545073,0.452122
Decision Tree,0.761806,0.622083,0.593756,0.337107
Nearest Neighbors,0.743653,0.6025,0.557835,0.305188
Random Forest,0.753845,0.6625,0.490552,0.291198
Logistic Regression,0.664503,0.635417,0.302668,0.274127
Polynomial SVM,0.689167,0.628333,0.249535,0.160714
RBF SVM,0.679028,0.62875,0.108308,0.053571
Linear SVM,0.666693,0.66875,0.0,0.0


X5_LumA.pickle
使用特徴量 Index(['CLIC6', 'KRT15', 'CPB1', 'MYBPC1', 'SLC30A8', 'CLEC3A', 'GRIA2',
       'S100P', 'VTCN1', 'PVALB', 'BEX1'],
      dtype='object')
学習サンプルサイズ： (466, 11)
0    430
1     36
Name: OS_5years, dtype: int64


11it [00:01,  6.36it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naive Bayes,0.868619,0.866883,0.178132,0.120325
Quadratic Discriminant Analysis,0.914401,0.881776,0.417623,0.062222
AdaBoost,0.968526,0.89704,0.749824,0.028571
Polynomial SVM,0.945635,0.901249,0.459124,0.025
Decision Tree,0.924651,0.914061,0.166954,0.0
Linear SVM,0.922745,0.922618,0.0,0.0
Logistic Regression,0.922745,0.922618,0.0,0.0
Nearest Neighbors,0.92656,0.916189,0.165277,0.0
RBF SVM,0.922745,0.922618,0.0,0.0
Random Forest,0.922745,0.922618,0.0,0.0


X5_LumB.pickle
使用特徴量 Index(['SHISA2', 'IGKC', 'PDZK1', 'DB005376'], dtype='object')
学習サンプルサイズ： (306, 4)
0    241
1     65
Name: OS_5years, dtype: int64


11it [00:10,  1.04it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naive Bayes,0.767974,0.757204,0.295129,0.239916
Nearest Neighbors,0.824611,0.751075,0.428523,0.220761
Decision Tree,0.821357,0.735376,0.455947,0.213586
AdaBoost,0.870731,0.712366,0.631762,0.105315
Quadratic Discriminant Analysis,0.775244,0.754409,0.229286,0.096753
Linear SVM,0.78758,0.787419,0.0,0.0
Logistic Regression,0.786854,0.784086,0.006564,0.0
Polynomial SVM,0.78758,0.787419,0.0,0.0
RBF SVM,0.78758,0.787419,0.0,0.0
Random Forest,0.792665,0.784194,0.046554,0.0


X5_Normal.pickle
使用特徴量 Index(['CALML5', 'CLIC6', 'TCN1', 'CPB1'], dtype='object')
学習サンプルサイズ： (95, 4)
0    78
1    17
Name: OS_5years, dtype: int64


11it [00:01,  9.25it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AdaBoost,1.0,0.787778,1.0,0.22381
Naive Bayes,0.801108,0.752222,0.454005,0.208571
Quadratic Discriminant Analysis,0.80461,0.754444,0.426238,0.208571
Nearest Neighbors,0.837373,0.785556,0.267106,0.128571
Polynomial SVM,0.831518,0.815556,0.173238,0.1
Decision Tree,0.835021,0.721111,0.289072,0.04
Linear SVM,0.820999,0.816667,0.0,0.0
Logistic Regression,0.835048,0.805556,0.267253,0.0
RBF SVM,0.823338,0.816667,0.022222,0.0
Random Forest,0.820999,0.816667,0.0,0.0


X5_claudin-low.pickle
使用特徴量 Index(['GABRP', 'C4orf7', 'SFRP1', 'ROPN1', 'SNAR-A3'], dtype='object')
学習サンプルサイズ： (138, 5)
0    112
1     26
Name: OS_5years, dtype: int64


11it [00:01,  9.25it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AdaBoost,1.0,0.833516,1.0,0.464286
Quadratic Discriminant Analysis,0.839006,0.803846,0.44038,0.321111
Naive Bayes,0.794665,0.747253,0.470026,0.312222
Decision Tree,0.843774,0.811538,0.432511,0.27
Logistic Regression,0.851865,0.841209,0.414817,0.25
Polynomial SVM,0.841381,0.826923,0.326193,0.206667
Nearest Neighbors,0.830116,0.782967,0.379023,0.18
Linear SVM,0.821252,0.812637,0.133576,0.05
RBF SVM,0.842194,0.798352,0.314488,0.0
Random Forest,0.811613,0.813187,0.0,0.0


--------------------------------------------------
X10_Basal.pickle
使用特徴量 Index(['IGHG1', 'CSN3'], dtype='object')
学習サンプルサイズ： (122, 2)
0    68
1    54
Name: OS_10years, dtype: int64


11it [00:01, 10.04it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Quadratic Discriminant Analysis,0.642052,0.605769,0.64107,0.599221
Naive Bayes,0.64387,0.597436,0.641711,0.59043
Linear SVM,0.643862,0.613462,0.58168,0.567576
Decision Tree,0.705822,0.582051,0.678105,0.559942
Logistic Regression,0.636606,0.612821,0.574314,0.559697
Polynomial SVM,0.645671,0.621795,0.584157,0.559264
RBF SVM,0.641134,0.613462,0.573184,0.553636
Nearest Neighbors,0.68397,0.582051,0.649596,0.516278
Random Forest,0.704896,0.589103,0.647162,0.508196
AdaBoost,0.880701,0.558333,0.862461,0.488797


X10_Her2.pickle
使用特徴量 Index(['GFRA1', 'U79293', 'CLCA2', 'ESR1'], dtype='object')
学習サンプルサイズ： (129, 4)
1    72
0    57
Name: OS_10years, dtype: int64


11it [00:01,  9.45it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Quadratic Discriminant Analysis,0.707147,0.659615,0.763947,0.719409
Polynomial SVM,0.698534,0.644231,0.756874,0.715974
Linear SVM,0.661538,0.652564,0.72544,0.715235
Sigmoid SVM,0.558134,0.557692,0.716317,0.708039
Logistic Regression,0.657221,0.644872,0.720562,0.705922
Random Forest,0.692507,0.621795,0.746919,0.690807
RBF SVM,0.679627,0.60641,0.744049,0.686975
Naive Bayes,0.631359,0.621154,0.693776,0.683431
AdaBoost,0.929384,0.567308,0.937552,0.617399
Decision Tree,0.719194,0.535256,0.745271,0.580813


X10_LumA.pickle
使用特徴量 Index(['CLIC6', 'KRT15', 'CPB1', 'MYBPC1', 'SLC30A8', 'CLEC3A', 'GRIA2',
       'S100P', 'VTCN1', 'PVALB', 'BEX1'],
      dtype='object')
学習サンプルサイズ： (359, 11)
0    281
1     78
Name: OS_10years, dtype: int64


11it [00:01,  6.49it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naive Bayes,0.743116,0.724444,0.426149,0.393407
Quadratic Discriminant Analysis,0.804705,0.732778,0.564892,0.365735
Decision Tree,0.82946,0.760317,0.502271,0.291076
Logistic Regression,0.800374,0.785714,0.345473,0.258615
AdaBoost,0.906529,0.724365,0.753273,0.252339
Polynomial SVM,0.868153,0.716032,0.627085,0.199598
Nearest Neighbors,0.823581,0.729921,0.507451,0.190823
Linear SVM,0.782731,0.782857,0.0,0.0
RBF SVM,0.79697,0.777302,0.125574,0.0
Random Forest,0.791396,0.782857,0.077512,0.0


X10_LumB.pickle
使用特徴量 Index(['SHISA2', 'IGKC', 'PDZK1', 'DB005376'], dtype='object')
学習サンプルサイズ： (252, 4)
0    136
1    116
Name: OS_10years, dtype: int64


11it [00:01,  8.53it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naive Bayes,0.606261,0.615231,0.594422,0.593363
Quadratic Discriminant Analysis,0.604507,0.568,0.59642,0.553158
RBF SVM,0.661378,0.583692,0.641604,0.551599
Linear SVM,0.600103,0.583846,0.524619,0.493969
Random Forest,0.717391,0.564154,0.68166,0.479643
AdaBoost,0.827172,0.544462,0.810753,0.479008
Nearest Neighbors,0.694909,0.556308,0.668613,0.474125
Logistic Regression,0.598778,0.571846,0.514466,0.464411
Polynomial SVM,0.6261,0.560462,0.535734,0.434256
Decision Tree,0.697979,0.488308,0.647901,0.4208


X10_Normal.pickle
使用特徴量 Index(['CALML5', 'CLIC6', 'TCN1', 'CPB1'], dtype='object')
学習サンプルサイズ： (79, 4)
0    51
1    28
Name: OS_10years, dtype: int64


11it [00:00, 11.39it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Decision Tree,0.765121,0.733929,0.670125,0.54381
Quadratic Discriminant Analysis,0.760876,0.658929,0.62287,0.474286
Naive Bayes,0.732786,0.658929,0.591656,0.439524
Linear SVM,0.752465,0.698214,0.578436,0.434524
Logistic Regression,0.742625,0.685714,0.572088,0.40119
AdaBoost,1.0,0.558929,1.0,0.35
Nearest Neighbors,0.745423,0.596429,0.581566,0.304762
RBF SVM,0.759487,0.646429,0.571271,0.296667
Polynomial SVM,0.784859,0.583929,0.629633,0.28
Random Forest,0.741217,0.646429,0.527037,0.22


X10_claudin-low.pickle
使用特徴量 Index(['GABRP', 'C4orf7', 'SFRP1', 'ROPN1', 'SNAR-A3'], dtype='object')
学習サンプルサイズ： (107, 5)
0    74
1    33
Name: OS_10years, dtype: int64


11it [00:01, 10.64it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AdaBoost,1.0,0.765455,1.0,0.58619
Naive Bayes,0.695737,0.68,0.584864,0.534026
Linear SVM,0.78503,0.773636,0.520384,0.475397
Logistic Regression,0.778791,0.735455,0.563805,0.442063
Quadratic Discriminant Analysis,0.781894,0.707273,0.601594,0.419762
Polynomial SVM,0.817225,0.708182,0.625231,0.368571
Decision Tree,0.785062,0.652727,0.630033,0.341371
Nearest Neighbors,0.791237,0.679091,0.579016,0.322143
RBF SVM,0.780885,0.728182,0.513114,0.307143
Random Forest,0.772573,0.718182,0.48329,0.268571


--------------------------------------------------
X15_Basal.pickle
使用特徴量 Index(['IGHG1', 'CSN3'], dtype='object')
学習サンプルサイズ： (97, 2)
1    56
0    41
Name: OS_15years, dtype: int64


11it [00:00, 11.83it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Sigmoid SVM,0.57726,0.572222,0.731687,0.696954
Naive Bayes,0.655238,0.622222,0.741159,0.687088
Quadratic Discriminant Analysis,0.657524,0.622222,0.742253,0.687088
Polynomial SVM,0.660959,0.6,0.749904,0.676089
Logistic Regression,0.655277,0.611111,0.729359,0.675119
Linear SVM,0.657497,0.601111,0.738892,0.673928
RBF SVM,0.659835,0.59,0.740851,0.670421
Random Forest,0.672375,0.598889,0.742312,0.64989
Nearest Neighbors,0.726293,0.581111,0.769703,0.624252
Decision Tree,0.676972,0.577778,0.717161,0.592415


X15_Her2.pickle
使用特徴量 Index(['GFRA1', 'U79293', 'CLCA2', 'ESR1'], dtype='object')
学習サンプルサイズ： (114, 4)
1    78
0    36
Name: OS_15years, dtype: int64


11it [00:01,  8.66it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Linear SVM,0.68419,0.682576,0.812352,0.800045
Sigmoid SVM,0.68419,0.682576,0.812352,0.800045
RBF SVM,0.703712,0.674242,0.821924,0.795302
Quadratic Discriminant Analysis,0.746583,0.684848,0.834281,0.789335
Polynomial SVM,0.766096,0.675,0.850507,0.783658
Logistic Regression,0.677403,0.640152,0.800368,0.770151
Random Forest,0.690063,0.613636,0.809946,0.753599
Nearest Neighbors,0.753436,0.613636,0.835623,0.735327
Decision Tree,0.731963,0.59697,0.820061,0.719169
Naive Bayes,0.644241,0.570455,0.762423,0.69818


X15_LumA.pickle
使用特徴量 Index(['CLIC6', 'KRT15', 'CPB1', 'MYBPC1', 'SLC30A8', 'CLEC3A', 'GRIA2',
       'S100P', 'VTCN1', 'PVALB', 'BEX1'],
      dtype='object')
学習サンプルサイズ： (270, 11)
0    169
1    101
Name: OS_15years, dtype: int64


11it [00:01,  6.75it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Linear SVM,0.716049,0.677778,0.607321,0.543617
Naive Bayes,0.663374,0.637037,0.576364,0.530611
Quadratic Discriminant Analysis,0.74321,0.625926,0.670602,0.500246
Logistic Regression,0.705761,0.662963,0.567897,0.492201
Decision Tree,0.753086,0.655556,0.637758,0.469192
Nearest Neighbors,0.709877,0.607407,0.593033,0.442828
Polynomial SVM,0.834156,0.585185,0.766561,0.435554
RBF SVM,0.769136,0.625926,0.663859,0.421383
AdaBoost,0.889712,0.574074,0.847247,0.409403
Random Forest,0.77284,0.633333,0.636315,0.34786


X15_LumB.pickle
使用特徴量 Index(['SHISA2', 'IGKC', 'PDZK1', 'DB005376'], dtype='object')
学習サンプルサイズ： (194, 4)
1    131
0     63
Name: OS_15years, dtype: int64


11it [00:01,  8.90it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Sigmoid SVM,0.675264,0.675789,0.806105,0.802144
Linear SVM,0.678693,0.665263,0.803743,0.795371
Logistic Regression,0.69245,0.676316,0.798698,0.786851
Polynomial SVM,0.712496,0.675789,0.813941,0.784848
RBF SVM,0.733678,0.670789,0.8286,0.783453
Quadratic Discriminant Analysis,0.702177,0.696053,0.795862,0.782663
Random Forest,0.750887,0.660526,0.834996,0.777274
Naive Bayes,0.698184,0.685526,0.790912,0.774154
AdaBoost,0.894621,0.649737,0.92435,0.742046
Decision Tree,0.75487,0.654737,0.820513,0.735903


X15_Normal.pickle
使用特徴量 Index(['CALML5', 'CLIC6', 'TCN1', 'CPB1'], dtype='object')
学習サンプルサイズ： (56, 4)
1    32
0    24
Name: OS_15years, dtype: int64


11it [00:00, 11.99it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naive Bayes,0.779647,0.75,0.803128,0.738492
Linear SVM,0.795608,0.713333,0.819952,0.709524
Logistic Regression,0.775725,0.713333,0.806814,0.709524
Polynomial SVM,0.833255,0.676667,0.845074,0.706587
Sigmoid SVM,0.571412,0.57,0.727052,0.70583
Random Forest,0.750118,0.663333,0.803237,0.700079
RBF SVM,0.833373,0.656667,0.853165,0.685714
Quadratic Discriminant Analysis,0.793608,0.66,0.810888,0.675079
AdaBoost,1.0,0.68,1.0,0.654365
Nearest Neighbors,0.795529,0.673333,0.812633,0.628968


X15_claudin-low.pickle
使用特徴量 Index(['GABRP', 'C4orf7', 'SFRP1', 'ROPN1', 'SNAR-A3'], dtype='object')
学習サンプルサイズ： (80, 5)
0    46
1    34
Name: OS_15years, dtype: int64


11it [00:00, 11.06it/s]


Unnamed: 0_level_0,acc_train,acc_test,f1_train,f1_test
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AdaBoost,1.0,0.775,1.0,0.637143
Random Forest,0.730556,0.6875,0.6886,0.551667
Linear SVM,0.727778,0.7,0.677498,0.540981
Naive Bayes,0.709722,0.675,0.687103,0.537922
Logistic Regression,0.733333,0.6875,0.674204,0.509921
Quadratic Discriminant Analysis,0.727778,0.6375,0.6714,0.496537
Nearest Neighbors,0.811111,0.675,0.76972,0.482937
RBF SVM,0.763889,0.6125,0.715524,0.456061
Decision Tree,0.758333,0.6125,0.674765,0.413333
Polynomial SVM,0.776389,0.5125,0.721529,0.413016
