In [6]:
import os
import random
import itertools
import re

# 基本的なライブラリ
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats

# 描画ライブラリ
import matplotlib.pyplot as plt
import seaborn as sns
from seaborn_analyzer import CustomPairPlot
import graphviz
import pydotplus
from IPython.display import Image
from IPython.display import HTML
from six import StringIO
from ipywidgets import interact, FloatSlider

# データセット分割
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    ShuffleSplit,
    StratifiedShuffleSplit,
)

# 補完
from sklearn.experimental import (
    enable_iterative_imputer,
)  # IterativeImputerをimportするために必要
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

# エンコード
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder

# サンプリング
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# 特徴量選択
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import (
    GenericUnivariateSelect,
    f_classif,
    mutual_info_classif,
    chi2,
)
from boruta import BorutaPy
from libraries.mrmr import mrmr

# models
from lightgbm import LGBMClassifier
import xgboost as xgb


# 学習中
import optuna
from tqdm import tqdm
from sklearn.model_selection import learning_curve, cross_validate, cross_val_score

# 評価指標
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import warnings


# config python file
import config

# Global random seed, read from the project config module.
SEED = config.SEED
# Index into config.SET_NAME_MICROARRAY used by the LightGBM / XGBoost cells below.
# NOTE(review): hard-coded to 2 here instead of reading the value from config.
# INDEX_MICROARRAY = config.INDEX_MICROARRAY
INDEX_MICROARRAY = 2
# NOTE(review): star import; helpers such as fix_seed are presumably provided by it — confirm.
from functions import *

# Presumably seeds the random number generators for reproducibility — confirm in functions.fix_seed.
fix_seed(SEED)


# Limit DataFrame display to at most 50 columns and 50 rows.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 50)

%matplotlib inline

# 目的
遺伝子学的分類に基づいた、予後の2値分類を実施する。  
分類はCLAUDIN_SUBTYPEに基づいて実施。  
予後は5年、10年、15年の3つの年次に分けている。Trueで死亡であることに注意すること。

# データ読み込み
読み込み元：
    config.INTERIM_PREPROCESSED_PROGNOSIS_CROSS_DIR + "/claudin_subtype_chi2"

サブタイプ毎のデータを使用 

In [2]:
# Load the nested feature (X) and label (y) dictionaries from the interim directory.
interim_dir = config.INTERIM_PREPROCESSED_PROGNOSIS_CROSS_DIR
X_dict, y_dict = (
    pickle_load(interim_dir + "/X_dict.pickle"),
    pickle_load(interim_dir + "/y_dict.pickle"),
)

# 単一グループでのモデルのトレーニング

## ベースモデルの学習結果

各特徴量選択されたdfを用いて基本的な2値分類モデルを学習する

In [3]:
def validate_models(mircroarray_type: str, method: str, sampling_type: str = None):
    """Print a majority-class baseline and display base-model scores for one dataset.

    For each prognosis horizon (currently only 15 years) the train / validation
    splits are read from the global ``X_dict`` / ``y_dict``, a majority-class
    accuracy baseline is printed, and ``compare_bcms`` is run; its return value
    is displayed and it is given a CSV path to save to.

    Args:
        mircroarray_type: First-level key of ``X_dict`` / ``y_dict``.
        method: Feature-selection key under the microarray set (e.g. ``"mrmr"``).
        sampling_type: ``"sm"`` passes a ``SMOTE`` instance to ``compare_bcms``;
            any other value (including ``None``) passes no sampler.

    Returns:
        None. Results are printed / displayed and written under
        ``config.TABLES_MODELS_PROGNOSIS_CROSS_DIR``.
    """
    for year in range(15, 16, 5):  # loop over prognosis horizons (years)
        prognosis_Xlabel = "X{0:0=2}".format(year)
        prognosis_ylabel = "y{0:0=2}".format(year)

        # The original note says compare_bcms does k-fold CV internally, so no
        # train_test_split is done here — NOTE(review): confirm in compare_bcms.
        X_train = X_dict[mircroarray_type][method]["train"][prognosis_Xlabel]
        y_train = y_dict[mircroarray_type][method]["train"][prognosis_ylabel]
        X_val = X_dict[mircroarray_type][method]["val"][prognosis_Xlabel]
        y_val = y_dict[mircroarray_type][method]["val"][prognosis_ylabel]

        # Majority-class accuracy baseline
        print("----------" * 10)
        print("予後年数：{0:0=2}年:".format(year))
        acc_all_zero = accuracy_score(y_train, np.zeros(len(y_train)))
        if acc_all_zero >= 0.5:
            # Class 0 is at least as frequent as class 1.
            score = ("0>1", round(acc_all_zero, 3))
        else:
            # Class 1 is the majority; the label previously (wrongly) also read "0>1".
            score = (
                "0<1",
                round(accuracy_score(y_train, np.ones(len(y_train))), 3),
            )
        print("accuracyベースライン：", score)
        print("使用特徴量：", X_train.columns)
        print("学習サンプルサイズ：", X_train.shape)
        display("ラベル比率：", y_train.value_counts())

        save_dir = (
            config.TABLES_MODELS_PROGNOSIS_CROSS_DIR
            + "/scores_base-model/{0}/{1}".format(mircroarray_type, method)
        )
        make_dir(save_dir)

        # Only "sm" selects an over-sampler; every other value means no resampling.
        sampling = SMOTE(random_state=SEED) if sampling_type == "sm" else None
        display(
            compare_bcms(
                X_train,
                y_train,
                X_val,
                y_val,
                over_sampling_class=sampling,
                save_path=save_dir + "/{0:0=2}.csv".format(year),
            )
        )

In [4]:
# Evaluate the base models on every microarray set, first without and then with SMOTE.
for microarray_name in X_dict:
    print(microarray_name)
    for sampling_choice in (None, "sm"):
        validate_models(
            mircroarray_type=microarray_name,
            method="mrmr",
            sampling_type=sampling_choice,
        )

mrna_agilent_microarray_zscores_ref_all_samples
----------------------------------------------------------------------------------------------------
予後年数：15年:
accuracyベースライン： ('0>1', 0.533)
使用特徴量： Index(['RACGAP1', 'PFN4', 'NACAP1', 'RAPGEF5', 'EMR3', 'CXorf57', 'DNAJB7',
       'ENC1', 'PCM1', 'KRTAP12-4', 'STAT5B', 'AI659947', 'TOR1B', 'BP432397',
       'RNU11', 'UHRF1', 'RBBP8', 'USP30', 'AK022229', 'FGF13', 'BE858513',
       'HS3ST1', 'DFNB59', 'GAL3ST4', 'AURKA', 'DB341438', 'C9orf95', 'S100P',
       'ANKAR', 'WDR67', 'GABRB1', 'AW572907', 'HIST1H2BF', 'APLN', 'GTPBP5',
       'SPATA4', 'BG218808', 'AW444974', 'FGD3', 'INTS10', 'CBX7', 'BC033399',
       'TSPYL6', 'TMEM26', 'DA697821', 'MST1', 'IGDCC4', 'UBE2C', 'RAB3B'],
      dtype='object')
学習サンプルサイズ： (865, 49)


'ラベル比率：'

1    461
0    404
Name: OS_15years, dtype: int64

11it [00:02,  5.18it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Random Forest,0.87052,0.649485,0.881356,0.679245
Sigmoid SVM,0.757225,0.628866,0.781705,0.678571
AdaBoost,0.854335,0.639175,0.865385,0.672897
RBF SVM,0.915607,0.597938,0.920044,0.648649
Logistic Regression,0.806936,0.608247,0.821772,0.648148
Linear SVM,0.810405,0.587629,0.825532,0.636364
Polynomial SVM,0.966474,0.57732,0.968306,0.630631
Naive Bayes,0.736416,0.56701,0.753247,0.596154
Nearest Neighbors,0.8,0.56701,0.814973,0.596154
Decision Tree,0.820809,0.56701,0.826038,0.588235


----------------------------------------------------------------------------------------------------
予後年数：15年:
accuracyベースライン： ('0>1', 0.533)
使用特徴量： Index(['RACGAP1', 'PFN4', 'NACAP1', 'RAPGEF5', 'EMR3', 'CXorf57', 'DNAJB7',
       'ENC1', 'PCM1', 'KRTAP12-4', 'STAT5B', 'AI659947', 'TOR1B', 'BP432397',
       'RNU11', 'UHRF1', 'RBBP8', 'USP30', 'AK022229', 'FGF13', 'BE858513',
       'HS3ST1', 'DFNB59', 'GAL3ST4', 'AURKA', 'DB341438', 'C9orf95', 'S100P',
       'ANKAR', 'WDR67', 'GABRB1', 'AW572907', 'HIST1H2BF', 'APLN', 'GTPBP5',
       'SPATA4', 'BG218808', 'AW444974', 'FGD3', 'INTS10', 'CBX7', 'BC033399',
       'TSPYL6', 'TMEM26', 'DA697821', 'MST1', 'IGDCC4', 'UBE2C', 'RAB3B'],
      dtype='object')
学習サンプルサイズ： (865, 49)


'ラベル比率：'

1    461
0    404
Name: OS_15years, dtype: int64

11it [00:01,  6.32it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Linear SVM,0.804772,0.618557,0.805195,0.666667
Decision Tree,0.82538,0.608247,0.825569,0.648148
Sigmoid SVM,0.752711,0.597938,0.76,0.642202
Logistic Regression,0.795011,0.597938,0.796117,0.628571
RBF SVM,0.918655,0.57732,0.917127,0.616822
Naive Bayes,0.738612,0.587629,0.740581,0.615385
Random Forest,0.867679,0.587629,0.868817,0.607843
Quadratic Discriminant Analysis,0.881779,0.57732,0.87794,0.601942
AdaBoost,0.838395,0.587629,0.837514,0.6
Nearest Neighbors,0.808026,0.608247,0.793946,0.586957


mrna_agilent_microarray
----------------------------------------------------------------------------------------------------
予後年数：15年:
accuracyベースライン： ('0>1', 0.533)
使用特徴量： Index(['RACGAP1', 'SNX24', 'TUB', 'ARRDC3', 'STAT5B', 'PTPLAD1', 'RBBP8',
       'ENC1', 'RNU11', 'UHRF1', 'PPIL3', 'S100P', 'MST1', 'WARS2', 'FGF13',
       'C9orf95', 'WDR67', 'CBX7', 'INTS10', 'SPATA18', 'HIST1H2BF', 'AURKA',
       'TFPT', 'LRRC50', 'PDK3', 'IGDCC4', 'FGD3', 'AK3', 'LRRC48', 'PSTPIP2',
       'PKMYT1', 'CATSPERB', 'TMEM26', 'STAT5A', 'CCT6B', 'C14orf139',
       'C7orf63', 'CD44', 'KIAA1967', 'ATAD2', 'N4BP2L1', 'PIGV', 'GPRC5A',
       'UBE2C', 'OGT', 'TRNP1', 'PREX1', 'GPR172A', 'NAB1'],
      dtype='object')
学習サンプルサイズ： (865, 49)


'ラベル比率：'

1    461
0    404
Name: OS_15years, dtype: int64

11it [00:08,  1.23it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Decision Tree,0.817341,0.670103,0.833333,0.709091
Sigmoid SVM,0.532948,0.536082,0.695324,0.697987
Polynomial SVM,0.921387,0.608247,0.927505,0.648148
Linear SVM,0.758382,0.628866,0.775991,0.647059
RBF SVM,0.758382,0.597938,0.778367,0.621359
Logistic Regression,0.749133,0.608247,0.76841,0.62
Random Forest,0.836994,0.57732,0.852356,0.616822
Quadratic Discriminant Analysis,0.852023,0.597938,0.85872,0.613861
Nearest Neighbors,0.771098,0.556701,0.790254,0.590476
AdaBoost,0.824277,0.57732,0.837607,0.57732


----------------------------------------------------------------------------------------------------
予後年数：15年:
accuracyベースライン： ('0>1', 0.533)
使用特徴量： Index(['RACGAP1', 'SNX24', 'TUB', 'ARRDC3', 'STAT5B', 'PTPLAD1', 'RBBP8',
       'ENC1', 'RNU11', 'UHRF1', 'PPIL3', 'S100P', 'MST1', 'WARS2', 'FGF13',
       'C9orf95', 'WDR67', 'CBX7', 'INTS10', 'SPATA18', 'HIST1H2BF', 'AURKA',
       'TFPT', 'LRRC50', 'PDK3', 'IGDCC4', 'FGD3', 'AK3', 'LRRC48', 'PSTPIP2',
       'PKMYT1', 'CATSPERB', 'TMEM26', 'STAT5A', 'CCT6B', 'C14orf139',
       'C7orf63', 'CD44', 'KIAA1967', 'ATAD2', 'N4BP2L1', 'PIGV', 'GPRC5A',
       'UBE2C', 'OGT', 'TRNP1', 'PREX1', 'GPR172A', 'NAB1'],
      dtype='object')
学習サンプルサイズ： (865, 49)


'ラベル比率：'

1    461
0    404
Name: OS_15years, dtype: int64

11it [00:04,  2.65it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Sigmoid SVM,0.5,0.536082,0.666667,0.697987
Decision Tree,0.798265,0.608247,0.798701,0.634615
Polynomial SVM,0.927332,0.608247,0.927568,0.634615
Linear SVM,0.761388,0.608247,0.763441,0.62
Logistic Regression,0.747289,0.618557,0.745911,0.618557
Quadratic Discriminant Analysis,0.85141,0.597938,0.848283,0.606061
Random Forest,0.845987,0.57732,0.848291,0.594059
AdaBoost,0.824295,0.597938,0.824295,0.589474
RBF SVM,0.75705,0.57732,0.760171,0.585859
Nearest Neighbors,0.784165,0.546392,0.77412,0.56


mrna_agilent_microarray_zscores_ref_diploid_samples
----------------------------------------------------------------------------------------------------
予後年数：15年:
accuracyベースライン： ('0>1', 0.533)
使用特徴量： Index(['RACGAP1', 'CYP2D6', 'PTX4', 'BIK', 'ENC1', 'DENND6A', 'STAT5B',
       'RNU11', 'DNAJB7', 'PPIL3', 'UHRF1', 'RBBP8', 'MIR137HG', 'GABRB1',
       'TOR1B', 'MST1', 'NMRK1', 'FGF13', 'HIST1H2BF', 'SPATA4', 'TBC1D31',
       'USP30', 'S100P', 'APLN', 'MTG2', 'GAL3ST4', 'FGD3', 'PJVK', 'INTS10',
       'CBX7', 'FCN2', 'RNASE9', 'SPATA18', 'ANKAR', 'SULT4A1', 'TSPYL6',
       'IGDCC4', 'TMEM26', 'PDK3', 'AURKA', 'PRR21', 'DNAAF1', 'GPR151',
       'GASK1A', 'NLRX1', 'RAB3B', 'HS3ST1', 'PKMYT1', 'CATSPERB'],
      dtype='object')
学習サンプルサイズ： (865, 49)


'ラベル比率：'

1    461
0    404
Name: OS_15years, dtype: int64

11it [00:01,  6.04it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.788439,0.649485,0.803437,0.673077
RBF SVM,0.893642,0.639175,0.900862,0.672897
Linear SVM,0.786127,0.618557,0.802982,0.654206
AdaBoost,0.856647,0.639175,0.865217,0.653465
Quadratic Discriminant Analysis,0.860116,0.639175,0.863585,0.653465
Sigmoid SVM,0.734104,0.618557,0.750542,0.647619
Random Forest,0.854335,0.597938,0.867925,0.635514
Nearest Neighbors,0.786127,0.597938,0.798694,0.628571
Naive Bayes,0.746821,0.597938,0.760131,0.613861
Decision Tree,0.80578,0.56701,0.826804,0.611111


----------------------------------------------------------------------------------------------------
予後年数：15年:
accuracyベースライン： ('0>1', 0.533)
使用特徴量： Index(['RACGAP1', 'CYP2D6', 'PTX4', 'BIK', 'ENC1', 'DENND6A', 'STAT5B',
       'RNU11', 'DNAJB7', 'PPIL3', 'UHRF1', 'RBBP8', 'MIR137HG', 'GABRB1',
       'TOR1B', 'MST1', 'NMRK1', 'FGF13', 'HIST1H2BF', 'SPATA4', 'TBC1D31',
       'USP30', 'S100P', 'APLN', 'MTG2', 'GAL3ST4', 'FGD3', 'PJVK', 'INTS10',
       'CBX7', 'FCN2', 'RNASE9', 'SPATA18', 'ANKAR', 'SULT4A1', 'TSPYL6',
       'IGDCC4', 'TMEM26', 'PDK3', 'AURKA', 'PRR21', 'DNAAF1', 'GPR151',
       'GASK1A', 'NLRX1', 'RAB3B', 'HS3ST1', 'PKMYT1', 'CATSPERB'],
      dtype='object')
学習サンプルサイズ： (865, 49)


'ラベル比率：'

1    461
0    404
Name: OS_15years, dtype: int64

11it [00:02,  4.99it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Quadratic Discriminant Analysis,0.869848,0.690722,0.864253,0.705882
Logistic Regression,0.780911,0.670103,0.781857,0.686275
Decision Tree,0.815618,0.639175,0.818376,0.672897
RBF SVM,0.903471,0.639175,0.901874,0.666667
Sigmoid SVM,0.732104,0.649485,0.730055,0.666667
Linear SVM,0.799349,0.628866,0.801715,0.660377
Random Forest,0.861171,0.628866,0.862069,0.64
AdaBoost,0.848156,0.618557,0.845815,0.626263
Naive Bayes,0.749458,0.597938,0.748092,0.613861
Nearest Neighbors,0.793926,0.618557,0.771635,0.602151


## lightGBM

In [11]:
# Validation features (15-year horizon, mRMR selection) of the chosen microarray set.
# Looked up explicitly so the cell does not depend on a variable assigned further down.
X_dict[config.SET_NAME_MICROARRAY[INDEX_MICROARRAY]]["mrmr"]["val"]["X15"]

Unnamed: 0,RACGAP1,CYP2D6,PTX4,BIK,ENC1,DENND6A,STAT5B,RNU11,DNAJB7,PPIL3,UHRF1,RBBP8,MIR137HG,GABRB1,TOR1B,MST1,NMRK1,FGF13,HIST1H2BF,SPATA4,TBC1D31,USP30,S100P,APLN,MTG2,GAL3ST4,FGD3,PJVK,INTS10,CBX7,FCN2,RNASE9,SPATA18,ANKAR,SULT4A1,TSPYL6,IGDCC4,TMEM26,PDK3,AURKA,PRR21,DNAAF1,GPR151,GASK1A,NLRX1,RAB3B,HS3ST1,PKMYT1,CATSPERB
503,0.2119,1.6529,-1.6855,-0.3577,1.4433,-0.6949,-1.2163,2.0433,0.1040,-0.4316,-0.9287,-0.1893,-0.9244,-0.7795,0.3221,-0.2165,1.2176,0.2797,-0.3739,-0.4375,1.2550,0.0295,0.2200,-0.4103,-0.6604,0.6354,0.4598,0.0183,-1.3312,-1.1823,-0.3141,0.8082,0.5815,-0.4098,0.5083,-1.7614,-0.6782,-0.5668,-0.9551,0.0098,0.8492,0.0100,-0.3438,-0.4686,-1.4428,-0.2313,1.1058,-0.2315,-0.1111
1619,0.1540,1.2383,-1.2778,0.4610,1.1818,-2.2326,-0.9307,-0.6721,-1.3129,0.6635,0.6451,0.3746,1.5337,-0.2018,0.6217,-0.1564,-0.3972,-0.9323,-0.5775,0.6461,0.7516,-0.4811,0.8039,-0.1072,-0.0131,0.5088,1.5258,-0.6589,0.6055,-0.2940,-0.2108,0.6357,0.8730,-0.0041,-0.3072,-0.2047,-0.4924,0.0109,-0.3500,-0.1039,2.0479,0.0358,-0.5422,-1.4356,0.8424,-0.1127,0.8175,-0.0237,0.5653
722,-1.2365,2.4154,0.5176,-2.2447,0.5665,-0.5589,1.6642,-0.6704,0.3184,1.4243,-1.0042,0.4578,-0.3716,0.5093,-0.0479,-0.5220,0.3520,0.2379,-0.6807,-0.5750,-0.4333,-0.3678,-0.9468,0.3422,-2.4815,-0.4230,-0.6113,-0.4163,0.3515,0.5409,-0.8213,-1.4515,0.3249,1.5461,-1.2862,0.3920,3.0093,0.0193,0.2767,-0.9837,0.9380,-0.7992,0.5991,-0.4121,0.3281,-0.6375,1.3134,-1.1243,-0.5217
789,0.6711,1.9597,0.9532,1.8771,0.3071,-0.8478,-1.6705,-0.4652,-0.3583,0.0861,0.0324,-1.6183,1.2612,-0.9678,-0.1074,-0.1820,1.6283,-0.4183,1.2188,-1.0779,4.1171,-0.9964,1.6918,-0.7426,4.0938,-0.0447,-2.1711,-1.6778,-1.8177,-1.7865,0.2300,0.3555,-1.2055,-1.4413,-1.0976,-0.3384,-0.7462,-1.0501,-0.8484,2.4203,-0.9967,-0.7822,-0.1686,-1.2336,0.5812,-0.7414,-1.2036,1.7510,0.6361
497,1.4885,1.5530,0.9744,0.0089,0.1938,1.3090,-1.7878,0.4877,-1.0598,-0.9903,0.6452,-1.5136,-0.4751,-0.8966,0.1184,0.6862,-1.9315,-0.7072,0.3332,-1.1547,1.3148,-0.0365,0.7017,1.0066,-1.0280,0.8721,-1.3742,-0.8728,-2.8240,-0.4807,1.0626,-0.4163,-0.2890,-1.4208,0.4496,1.4724,-0.2414,-0.3148,-0.3273,1.3388,2.2118,-0.2910,0.4340,-0.2061,-1.9057,-0.4447,0.5768,0.6725,-0.7837
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
865,-0.4633,-1.3550,-0.5411,-0.4718,0.9467,-0.4515,0.7681,-0.7827,0.1759,-0.8931,0.2157,0.8962,0.4334,0.6271,0.8521,0.3377,-1.1179,-0.5424,0.0593,-1.0078,-0.4840,0.2536,-0.0460,0.4492,0.1043,0.3842,0.6704,-0.9304,-0.3786,0.7756,0.3579,1.2485,0.5633,-0.5350,-0.9476,0.9232,0.2097,0.2471,0.7001,-0.5191,-0.2644,0.2208,-1.1700,-0.2046,0.3283,0.1034,0.0858,-0.0059,0.1257
808,-0.8221,-0.6471,-1.5889,0.9744,-1.5123,-0.0087,0.6661,-0.4318,-0.1932,-0.1695,-0.8400,-1.4407,-0.8217,0.3155,-0.8692,0.8099,-0.2108,0.3928,-0.6450,-0.7010,1.8789,-0.6556,0.4331,-0.9015,-0.4733,0.2179,0.6511,-0.1976,-1.0250,1.1881,-0.0852,0.7732,-0.7356,0.9526,1.5937,1.5990,-0.5641,0.7393,1.1776,-0.1609,-0.4536,-0.0077,0.6753,0.0342,-0.5892,-0.2515,-0.5434,1.1660,1.2291
10,0.1480,-0.5056,1.0631,1.5842,-0.9057,-1.2109,0.3117,0.8561,0.6465,-0.8311,-0.4831,2.4322,-1.0354,0.9987,0.3688,-0.2068,3.0108,-0.7361,0.2069,1.6876,0.6229,-1.0265,-0.7251,-0.4556,-0.0401,-0.5652,0.5322,-0.1461,0.0848,0.9116,0.8092,-1.2279,2.1052,-0.9137,0.2456,0.6939,-0.5060,0.1345,0.3200,-0.2325,0.0653,1.6664,-0.5129,-0.7258,-0.5341,0.2001,-0.0517,-0.9799,-0.8354
792,-1.1519,1.2391,-0.5695,-0.2012,-0.6040,0.4672,0.9425,-0.2611,0.1630,0.2460,-0.3687,0.0038,-0.1882,2.5137,-0.0245,3.0149,-0.1290,-1.2459,-0.3923,-0.7657,0.7849,-0.1015,1.3743,-0.3543,1.9273,0.9937,0.8515,0.2364,-0.5312,-0.9232,-0.3973,1.1085,-0.2612,1.5100,0.4218,0.5172,-0.2275,1.1722,-0.1942,0.2375,-0.1887,2.8834,-0.6299,0.1466,0.9487,2.3538,-0.7122,0.3911,1.7592


In [12]:
# Training features (15-year horizon, mRMR selection) of the chosen microarray set.
# Looked up explicitly so the cell does not depend on a variable assigned further down.
X_dict[config.SET_NAME_MICROARRAY[INDEX_MICROARRAY]]["mrmr"]["train"]["X15"]

Unnamed: 0,RACGAP1,CYP2D6,PTX4,BIK,ENC1,DENND6A,STAT5B,RNU11,DNAJB7,PPIL3,UHRF1,RBBP8,MIR137HG,GABRB1,TOR1B,MST1,NMRK1,FGF13,HIST1H2BF,SPATA4,TBC1D31,USP30,S100P,APLN,MTG2,GAL3ST4,FGD3,PJVK,INTS10,CBX7,FCN2,RNASE9,SPATA18,ANKAR,SULT4A1,TSPYL6,IGDCC4,TMEM26,PDK3,AURKA,PRR21,DNAAF1,GPR151,GASK1A,NLRX1,RAB3B,HS3ST1,PKMYT1,CATSPERB
1179,-1.2052,-0.5416,-0.8235,-1.8149,0.7522,-2.9003,-0.0503,-0.7594,-0.5618,0.4401,-1.0881,-0.6077,0.8412,0.3662,0.3198,2.0915,-2.6340,1.8475,-0.7582,-0.3065,-0.5412,-0.3423,-1.1845,0.4054,1.8695,0.3880,1.7838,1.3993,0.3391,0.3806,1.4520,1.0033,0.2371,0.1927,-0.8906,-0.5021,0.3457,-0.3589,0.3413,-0.4634,-0.5894,0.3656,-0.2322,0.0565,1.1899,-0.7151,-1.0535,-0.8087,0.9016
248,0.4895,-1.9093,0.8939,-0.9732,0.9143,0.8007,-0.9288,0.7774,0.1719,-0.6777,1.3973,-0.2537,2.0394,-0.0619,-0.0492,-1.1150,-1.0804,-0.8170,0.6335,-0.8617,-1.0182,1.3031,1.2350,0.0489,0.1413,2.6842,0.2350,0.0664,-0.1106,-1.5606,1.3172,2.7435,0.9777,-0.1277,-0.1269,1.9955,-0.3947,0.1357,-1.2161,0.1960,-0.4391,-0.5278,-0.3571,0.2335,-0.0419,0.0480,0.8826,0.6808,-0.8725
1010,-0.0800,-2.0482,0.0876,0.7677,0.6644,-0.1323,-0.1755,-0.7657,-0.1460,1.4507,0.1935,1.0147,-0.8546,4.4736,1.0126,-0.5071,-0.0201,-0.7656,0.2892,0.2586,-0.5527,-0.9213,-0.8523,-0.1829,0.7216,-0.8011,0.3776,0.4003,0.5547,0.5586,-0.6819,1.0261,0.1293,1.2210,-1.0506,-0.2384,-0.4237,1.7814,1.3073,0.0931,-1.3498,0.2658,-2.2203,-0.0646,0.2661,-0.5567,2.0942,-0.5139,-0.4338
621,0.9754,-0.7190,0.6587,1.1590,-0.9804,0.9223,-0.7067,0.1708,0.5949,-0.9410,0.3535,0.0891,-1.4304,-0.1579,-0.2842,0.1167,1.0170,-0.3933,0.1584,-1.3519,2.1998,-0.7876,1.7422,0.9884,-2.4693,-0.8476,-0.0737,0.5092,-1.7729,-1.4024,0.6240,-1.4034,-0.6848,-0.3898,-0.2058,1.2852,-0.2067,-0.5263,-0.9119,1.0936,1.4112,0.5773,0.2804,-0.7494,-1.7774,0.7618,1.1356,0.5246,-0.3505
1586,-0.5590,0.9540,-0.2853,0.4371,0.6550,0.3766,-1.1007,-1.1501,-0.9873,0.0706,-0.3272,1.1101,-0.2191,-0.8191,0.4516,1.0980,-0.4956,-0.4220,-0.7367,-0.1836,0.3125,-1.3372,-0.4802,0.0500,0.9575,-1.0240,2.1390,-0.0670,0.5838,-0.6923,-1.4047,1.7806,1.2351,0.7211,0.3853,-0.0810,0.4171,1.0895,-0.3048,-0.6742,0.1007,1.6567,-0.5410,0.9979,0.2817,-0.3792,-0.3749,-0.5507,0.8085
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1139,-1.9819,-0.5004,-0.3520,0.3210,-0.9905,-0.8926,0.5269,0.2579,-0.1180,-0.6313,-1.0042,-0.6637,0.5054,0.9669,-0.4197,1.3788,-0.1660,0.0268,-0.5464,1.6747,-0.6109,0.2531,0.0560,-0.7419,1.8338,-0.4417,0.5887,-0.5377,0.5541,0.0458,0.9950,-0.9646,0.2849,0.3530,1.5453,-0.5614,1.1392,-0.8010,0.6712,-1.1605,0.3506,2.3222,-1.0683,-0.4949,1.9617,-0.3372,-0.7802,-0.6478,-0.7934
846,0.5347,-0.1673,-0.9221,-0.8003,2.0579,-0.1110,-1.3246,-0.4863,-1.8789,-2.6483,-1.0150,-2.3997,0.2032,0.3169,-0.6258,-0.7072,-0.6211,-0.6492,-1.0752,-0.6281,-0.8792,0.6618,1.4519,-0.7643,0.1859,-1.2399,-0.5343,-1.3684,-2.3740,-0.1573,-1.0669,1.2588,-0.6320,-1.1254,-0.4061,-0.9063,0.6077,-1.0246,1.0623,0.5338,-0.7874,-1.0402,0.7516,-0.2885,1.4220,0.2831,0.4587,0.6277,-0.0078
1615,-0.1569,0.3820,-0.8164,-1.4429,1.1329,0.6285,0.4849,1.8331,-1.5506,0.0597,-0.1458,-0.0796,1.2602,-0.2840,-0.4032,-0.2932,0.5094,-0.6146,-0.6994,0.0936,0.3779,-0.3228,-1.4336,-0.2661,-1.7095,-0.5341,0.2066,0.1138,0.5729,1.4639,0.7047,0.3533,-0.3724,0.0305,-0.2723,-0.8876,2.7636,-0.5596,-0.7605,0.5564,-2.5788,-0.9060,-0.8240,-0.7412,1.4015,-0.2924,0.1744,-0.3437,-0.7508
563,1.0143,0.7089,0.1395,2.6576,0.8229,-0.4121,-2.0659,-0.4982,0.3784,0.4169,0.8140,-1.5891,0.3487,-0.4979,-1.9369,-0.7858,-1.2990,-0.9094,1.6310,-1.3859,1.7364,1.0635,1.1233,-0.5893,-0.4916,1.1157,-1.9027,0.5966,-1.4251,-1.1285,0.3671,0.9787,-0.7423,-0.8723,0.1512,0.4548,-0.4727,-0.7810,-0.3583,0.7437,1.7843,-0.7091,0.0713,-0.4332,0.9280,-0.4402,3.6294,0.5938,4.1227


In [13]:
# LightGBM baseline: fit on the 15-year mRMR training split, score on the validation split.
microarray_key = config.SET_NAME_MICROARRAY[INDEX_MICROARRAY]
X_splits = X_dict[microarray_key]["mrmr"]
y_splits = y_dict[microarray_key]["mrmr"]
X_train, y_train = X_splits["train"]["X15"], y_splits["train"]["y15"]
X_val, y_val = X_splits["val"]["X15"], y_splits["val"]["y15"]

params = dict(
    max_bin=100,
    num_leaves=30,
    n_estimators=800,
    class_weight="balanced",
    random_state=SEED,
)

gbm = LGBMClassifier(**params)
gbm.fit(X_train, y_train)
y_pred = gbm.predict(X_val)
show_scores(y_val, y_pred)

accuracy:  0.6391752577319587
precision:  0.6545454545454545
recall:  0.6923076923076923
f1 score:  0.6728971962616823


## xgboost

In [5]:
import xgboost as xgb

# XGBoost baseline on the same 15-year mRMR split, using the native DMatrix API.
microarray_key = config.SET_NAME_MICROARRAY[INDEX_MICROARRAY]
X_splits = X_dict[microarray_key]["mrmr"]
y_splits = y_dict[microarray_key]["mrmr"]
X_train, y_train = X_splits["train"]["X15"], y_splits["train"]["y15"]
X_val, y_val = X_splits["val"]["X15"], y_splits["val"]["y15"]

xgb_train = xgb.DMatrix(X_train, label=y_train, feature_names=X_train.columns)
xgb_val = xgb.DMatrix(X_val, label=y_val, feature_names=X_val.columns)

# Binary classification objective; predictions come back as probabilities.
param = dict(objective="binary:logistic")
clf = xgb.train(param, xgb_train)
y_pred_proba = clf.predict(xgb_val)
# Threshold the probabilities at 0.5 to obtain hard 0/1 labels.
y_pred = (y_pred_proba > 0.5).astype(int)

accuracy_score(y_val, y_pred), f1_score(y_val, y_pred)



(0.6494845360824743, 0.6730769230769231)

### 予測モデル、特徴量数、年数毎の比較

In [5]:
def model2result_list(
    clf,
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    model_name: str,
    year: int,
    k: int,
) -> list:
    """Fit ``clf`` on the training data and return one result row for the test data.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
        X_train: Training features.
        y_train: Training labels.
        X_test: Features to predict on.
        y_test: True labels for ``X_test``.
        model_name: Label stored in the result row.
        year: Prognosis horizon stored in the result row.
        k: Feature-count value stored in the result row.

    Returns:
        ``[year, model_name, accuracy, precision, recall, f1, k]``.
    """
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    acc = accuracy_score(y_test, pred)
    pre = precision_score(y_test, pred)
    rec = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    return [year, model_name, acc, pre, rec, f1, k]

In [6]:
# Cross-validated comparison of RF / LR / SVM while varying the requested number of
# mRMR features K. Every (year, K, fold, model) combination appends one row to `row`;
# `df_result` is built once after all loops have finished.
row = []
fold = 5
microarray_key = "mrna_agilent_microarray_zscores_ref_diploid_samples"

for year in tqdm(range(15, 16, 5)):  # loop over prognosis horizons (years)
    prognosis_Xlabel = "X{0:0=2}".format(year)
    prognosis_ylabel = "y{0:0=2}".format(year)
    # The training split does not depend on K, so look it up once per year.
    X = X_dict[microarray_key]["mrmr"]["train"][prognosis_Xlabel]
    y = y_dict[microarray_key]["mrmr"]["train"][prognosis_ylabel]

    # k is the number of features REQUESTED from mRMR; it is what ends up in the
    # "feature size" column. NOTE(review): the selector may return fewer features
    # when X has fewer than k columns — confirm.
    for k in range(10, 51, 10):
        skf = StratifiedKFold(n_splits=fold, shuffle=True, random_state=SEED)
        for train_index, val_index in skf.split(X, y):
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y.iloc[train_index], y.iloc[val_index]

            assert X_train.shape[0] == y_train.shape[0], "train size is incorrect"
            assert X_val.shape[0] == y_val.shape[0], "test size is incorrect"

            # Feature selection (mRMR) on the training fold only.
            features = mrmr.mrmr_classif(
                X=X_train,
                y=y_train,
                K=k,
                show_progress=False,
            )

            # Keep only the selected features in both the training and validation folds.
            X_train = X_train[features]
            X_val = X_val[features]

            # Scaling. NOTE(review): presumably standardises both folds — confirm in transform_std.
            X_train, X_val = transform_std(X_train, X_val)

            # NOTE(review): this cell used to build a RandomUnderSampler here, but the
            # resampled data was never passed to any model. That dead computation was
            # removed; every model below uses class_weight="balanced" and is trained on
            # the un-resampled fold, exactly as before.

            # Estimators are created fresh for every fold. The rf -> lr -> svm order is
            # kept because lr and svm are not given a random_state.
            rf = RandomForestClassifier(
                n_estimators=500,
                criterion="entropy",
                max_depth=5,
                max_leaf_nodes=20,
                max_features="log2",
                class_weight="balanced",
                random_state=SEED,
            )
            lr = LogisticRegression(
                penalty="l2",
                C=0.5,
                solver="saga",
                class_weight="balanced",
            )
            svm = SVC(
                C=50,
                kernel="sigmoid",
                decision_function_shape="ovr",
                class_weight="balanced",
            )

            for model_name, estimator in (("rf", rf), ("lr", lr), ("svm", svm)):
                row.append(
                    model2result_list(
                        estimator, X_train, y_train, X_val, y_val, model_name, year, k
                    )
                )

# Collect all fold results into a single DataFrame.
df_result = pd.DataFrame(
    row,
    columns=[
        "year",
        "model_name",
        "acc",
        "precision",
        "recall",
        "f1",
        "feature size",
    ],
)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [02:02<00:00, 122.37s/it]


In [7]:
# Mean cross-validation metrics of the random forest per (year, feature size),
# best accuracy first. numeric_only=True keeps the string column "model_name"
# out of the mean, which would otherwise raise on pandas >= 2.0.
(
    df_result[df_result["model_name"] == "rf"]
    .groupby(["year", "feature size"])
    .mean(numeric_only=True)
    .sort_values("acc", ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,acc,precision,recall,f1
year,feature size,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
15,40,0.720231,0.734554,0.746166,0.739993
15,50,0.720231,0.734554,0.746166,0.739993
15,30,0.713295,0.725555,0.743922,0.734229
15,20,0.683237,0.694121,0.728822,0.710235
15,10,0.677457,0.695627,0.704979,0.699879


In [8]:
# Mean cross-validation metrics of logistic regression per (year, feature size),
# best accuracy first. numeric_only=True keeps the string column "model_name"
# out of the mean, which would otherwise raise on pandas >= 2.0.
(
    df_result[df_result["model_name"] == "lr"]
    .groupby(["year", "feature size"])
    .mean(numeric_only=True)
    .sort_values("acc", ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,acc,precision,recall,f1
year,feature size,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
15,40,0.732948,0.756673,0.737564,0.746544
15,50,0.732948,0.756673,0.737564,0.746544
15,30,0.715607,0.739345,0.722394,0.73034
15,20,0.695954,0.725919,0.689808,0.707205
15,10,0.690173,0.722748,0.681043,0.701143


In [9]:
# Mean cross-validation metrics of the SVM per (year, feature size),
# best accuracy first. numeric_only=True keeps the string column "model_name"
# out of the mean, which would otherwise raise on pandas >= 2.0.
(
    df_result[df_result["model_name"] == "svm"]
    .groupby(["year", "feature size"])
    .mean(numeric_only=True)
    .sort_values("acc", ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,acc,precision,recall,f1
year,feature size,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
15,40,0.680925,0.712835,0.674638,0.692703
15,50,0.680925,0.712835,0.674638,0.692703
15,20,0.669364,0.697698,0.670266,0.683482
15,30,0.669364,0.700168,0.670196,0.683615
15,10,0.611561,0.653488,0.583567,0.615024


## 最適な特徴量数はいくつなのか実験

特徴量選択数を変化させ、学習曲線を描画する

# subtype毎のベースライン・学習

In [12]:
def validate_models_subtype(
    mircroarray_type: str,
    method: str,
    years=(15,),
):
    """Print a majority-class baseline and a classifier comparison per subtype.

    For every prognosis year in ``years`` and every CLAUDIN subtype, the
    train/validation splits are looked up in the notebook-level ``X_dict`` and
    ``y_dict`` under ``[mircroarray_type]["claudin_subtype"][method]``. The
    function then prints the majority-class accuracy baseline, the feature
    names, the training shape and the label counts, and finally displays the
    result of ``compare_bcms`` called with a seeded ``SMOTE`` instance as
    ``over_sampling_class``.

    Args:
        mircroarray_type: First-level key into ``X_dict`` / ``y_dict``. The
            (misspelled) name is kept because callers pass it by keyword.
        method: Key under ``"claudin_subtype"``; the notebook calls this with
            ``"mrmr"`` and ``"boruta"``.
        years: Iterable of prognosis years to evaluate. The default ``(15,)``
            matches the previously hard-coded ``range(15, 16, 5)``.

    Returns:
        None. All results are printed or displayed.

    Raises:
        AssertionError: If X and y of a split have different sample counts.

    Note:
        Lookup errors from ``X_dict`` / ``y_dict`` (e.g. a missing subtype
        entry) are not caught here and propagate to the caller.
    """
    subtypes = [
        "claudin-low",
        "LumA",
        "LumB",
        "Her2",
        "Normal",
        "Basal",
    ]

    # Hoist the nested lookups that do not change inside the loops.
    X_splits = X_dict[mircroarray_type]["claudin_subtype"][method]
    y_splits = y_dict[mircroarray_type]["claudin_subtype"][method]

    for year in years:  # loop over prognosis years
        print("====={0:0=2}".format(year) * 10)

        for subtype in subtypes:
            prognosis_Xlabel = "X{0:0=2}_{1}".format(year, subtype)
            prognosis_ylabel = "y{0:0=2}_{1}".format(year, subtype)
            X_train = X_splits["train"][prognosis_Xlabel]
            y_train = y_splits["train"][prognosis_ylabel]
            X_val = X_splits["val"][prognosis_Xlabel]
            y_val = y_splits["val"][prognosis_ylabel]
            assert X_train.shape[0] == y_train.shape[0], "train size is incorrect"
            assert X_val.shape[0] == y_val.shape[0], "val size is incorrect"
            if X_train.shape[1] == 0:
                # Nothing to train on when no feature column is present.
                print(
                    "**--WARNING: FEATURE NUM is 0! -> subtype {0} train is skipped!--**".format(
                        subtype
                    )
                )
                continue

            # Majority-class accuracy baseline.
            print("----------" * 10)
            print("subtype: ", subtype)
            print("予後年数：{0:0=2}年:".format(year))
            acc_all_zero = accuracy_score(y_train, np.zeros(len(y_train)))
            if acc_all_zero >= 0.5:
                # Predicting all 0 is at least as good: class 0 is the majority.
                score = ("0>1", round(acc_all_zero, 3))
            else:
                # Class 1 is the majority; the label used to read "0>1" here
                # as well, which misreported the majority class.
                score = (
                    "0<1",
                    round(accuracy_score(y_train, np.ones(len(y_train))), 3),
                )
            print("accuracyベースライン：", score)
            print("使用特徴量：", X_train.columns)
            print("学習サンプルサイズ：", X_train.shape)
            display("ラベル比率：", y_train.value_counts())

            sm = SMOTE(random_state=SEED)
            display(
                compare_bcms(
                    X_train,
                    y_train,
                    X_val,
                    y_val,
                    over_sampling_class=sm,
                )
            )

In [13]:
# For each microarray dataset, run the per-subtype comparison once per
# feature-selection method (mRMR first, then Boruta).
for k in X_dict:
    print(k)
    for selection_method in ("mrmr", "boruta"):
        validate_models_subtype(mircroarray_type=k, method=selection_method)

mrna_agilent_microarray_zscores_ref_all_samples
=====15=====15=====15=====15=====15=====15=====15=====15=====15=====15
----------------------------------------------------------------------------------------------------
subtype:  claudin-low
予後年数：15年:
accuracyベースライン： ('0>1', 0.562)
使用特徴量： Index(['BF511322', 'BIRC7', 'ZFP90', 'CHRNB3', 'IL34', 'AA939346', 'AW301012',
       'DB341932', 'MRPL53', 'CHDH', 'DA102929', 'GTPBP5', 'BF447974', 'NAV2',
       'C20orf29', 'NCRNA00160', 'PRPSAP1', 'CD511953', 'STX1B', 'COBL'],
      dtype='object')
学習サンプルサイズ： (89, 20)


'ラベル比率：'

0    50
1    39
Name: OS_15years, dtype: int64

11it [00:00, 41.34it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Decision Tree,0.83,0.6,0.852174,0.6
Quadratic Discriminant Analysis,1.0,0.6,1.0,0.5
Linear SVM,1.0,0.5,1.0,0.285714
Logistic Regression,1.0,0.4,1.0,0.25
Polynomial SVM,1.0,0.4,1.0,0.25
RBF SVM,1.0,0.4,1.0,0.25
Sigmoid SVM,0.95,0.4,0.951456,0.25
Nearest Neighbors,0.95,0.3,0.952381,0.222222
AdaBoost,1.0,0.2,1.0,0.0
Naive Bayes,0.99,0.4,0.990099,0.0


----------------------------------------------------------------------------------------------------
subtype:  LumA
予後年数：15年:
accuracyベースライン： ('0>1', 0.629)
使用特徴量： Index(['OS9', 'BX102680', 'MMP15', 'BCHE', 'OR2M7', 'PSMB11', 'ZIC2',
       'CEACAM3', 'YY2', 'S100P', 'ANGPT2', 'AW243302', 'FPGS', 'FLJ41170',
       'KLK1', 'CDH10', 'IL20RA', 'N90609', 'APOBEC3A', 'C15orf26'],
      dtype='object')
学習サンプルサイズ： (275, 20)


'ラベル比率：'

0    173
1    102
Name: OS_15years, dtype: int64

11it [00:00, 32.40it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Quadratic Discriminant Analysis,0.907514,0.83871,0.903614,0.782609
RBF SVM,0.947977,0.806452,0.947977,0.727273
Sigmoid SVM,0.812139,0.774194,0.811594,0.695652
AdaBoost,0.985549,0.774194,0.985673,0.666667
Logistic Regression,0.890173,0.774194,0.888235,0.666667
Decision Tree,0.82948,0.741935,0.820669,0.636364
Polynomial SVM,0.971098,0.741935,0.970414,0.636364
Naive Bayes,0.84104,0.774194,0.831804,0.631579
Linear SVM,0.872832,0.709677,0.871345,0.571429
Random Forest,0.916185,0.645161,0.915452,0.47619


----------------------------------------------------------------------------------------------------
subtype:  LumB
予後年数：15年:
accuracyベースライン： ('0>1', 0.648)
使用特徴量： Index(['ENC1', 'AI093492', 'C4orf32', 'SUN1', 'C14orf49', 'CYP2C19',
       'LOC645177', 'TET3', 'CELSR2', 'CR743466', 'LPAR3', 'GPR32', 'TTTY8B',
       'TP53BP2', 'AW118757', 'WDR60', 'MBD3', 'TTC1', 'LRRC50', 'BI481412'],
      dtype='object')
学習サンプルサイズ： (213, 20)


'ラベル比率：'

1    138
0     75
Name: OS_15years, dtype: int64

11it [00:00, 36.77it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Polynomial SVM,0.967391,0.625,0.968421,0.727273
Quadratic Discriminant Analysis,0.967391,0.583333,0.967033,0.705882
Random Forest,0.949275,0.583333,0.948529,0.666667
Naive Bayes,0.90942,0.541667,0.90566,0.645161
RBF SVM,0.971014,0.541667,0.970149,0.645161
Decision Tree,0.82971,0.583333,0.815686,0.642857
AdaBoost,1.0,0.5,1.0,0.625
Logistic Regression,0.945652,0.458333,0.943396,0.580645
Sigmoid SVM,0.905797,0.5,0.9,0.571429
Linear SVM,0.942029,0.458333,0.938931,0.518519


----------------------------------------------------------------------------------------------------
subtype:  Her2
予後年数：15年:
accuracyベースライン： ('0>1', 0.68)
使用特徴量： Index(['MICAL1', 'PCSK1N', 'AI797128', 'MGAT4C', 'LINC00307', 'DDX25', 'ACRV1',
       'OR13H1', 'MTTP', 'AW593287', 'AA405052', 'CA449938', 'PPIE_1', 'SETD5',
       'INSC', 'KRTAP21-1', 'LOC441617', 'GRIP1', 'WARS2', 'ESYT1'],
      dtype='object')
学習サンプルサイズ： (122, 20)


'ラベル比率：'

1    83
0    39
Name: OS_15years, dtype: int64

11it [00:00, 38.38it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Polynomial SVM,0.957831,0.785714,0.959538,0.842105
Naive Bayes,0.963855,0.714286,0.964286,0.777778
Quadratic Discriminant Analysis,0.981928,0.714286,0.982249,0.777778
RBF SVM,0.993976,0.642857,0.994012,0.705882
Sigmoid SVM,0.927711,0.642857,0.925926,0.705882
AdaBoost,1.0,0.642857,1.0,0.666667
Logistic Regression,0.96988,0.571429,0.97006,0.625
Random Forest,0.957831,0.571429,0.958084,0.625
Nearest Neighbors,0.96988,0.5,0.969325,0.533333
Linear SVM,0.96988,0.428571,0.969697,0.5


----------------------------------------------------------------------------------------------------
subtype:  Normal
予後年数：15年:
accuracyベースライン： ('0>1', 0.569)
使用特徴量： Index(['KIF13B', 'ZNF833P', 'RNF170', 'T80781', 'SLC25A15', 'ENO4', 'HCN1',
       'Z38762', 'AI797584', 'LOC389033', 'AP2A2', 'ZFAND2A', 'AI745455',
       'TBC1D14', 'JMJD6', 'NT5M', 'RPGRIP1L', 'SPTAN1', 'LOXL4', 'ARSG'],
      dtype='object')
学習サンプルサイズ： (58, 20)


'ラベル比率：'

1    33
0    25
Name: OS_15years, dtype: int64

11it [00:00, 48.24it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Decision Tree,0.833333,0.714286,0.825397,0.75
Naive Bayes,0.969697,0.714286,0.969697,0.75
Nearest Neighbors,1.0,0.714286,1.0,0.75
Random Forest,0.954545,0.714286,0.953846,0.75
Quadratic Discriminant Analysis,1.0,0.571429,1.0,0.727273
AdaBoost,1.0,0.571429,1.0,0.666667
Logistic Regression,1.0,0.571429,1.0,0.666667
Polynomial SVM,0.984848,0.571429,0.985075,0.666667
RBF SVM,1.0,0.571429,1.0,0.666667
Sigmoid SVM,1.0,0.571429,1.0,0.666667


----------------------------------------------------------------------------------------------------
subtype:  Basal
予後年数：15年:
accuracyベースライン： ('0>1', 0.61)
使用特徴量： Index(['FBXO31', 'AW572907', 'AI939551', 'FAM24A', 'BX099468', 'TUBB3',
       'DB312513', 'OR51I2', 'PAPOLA', 'TRIM77P', 'CXCL13', 'BM676522',
       'STATH', 'HSD3B2', 'NUDT12', 'PTX4', 'CCDC141', 'AW118163', 'AW293618',
       'ELAVL2'],
      dtype='object')
学習サンプルサイズ： (105, 20)


'ラベル比率：'

1    64
0    41
Name: OS_15years, dtype: int64

11it [00:00, 41.23it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Quadratic Discriminant Analysis,0.992188,0.583333,0.992126,0.705882
Decision Tree,0.851562,0.5,0.861314,0.666667
Polynomial SVM,0.992188,0.416667,0.992248,0.588235
Naive Bayes,0.953125,0.333333,0.953125,0.5
Random Forest,0.976562,0.333333,0.976378,0.5
AdaBoost,1.0,0.333333,1.0,0.428571
Nearest Neighbors,0.90625,0.333333,0.9,0.428571
Linear SVM,1.0,0.25,1.0,0.4
Logistic Regression,1.0,0.25,1.0,0.4
RBF SVM,1.0,0.25,1.0,0.4


=====15=====15=====15=====15=====15=====15=====15=====15=====15=====15
----------------------------------------------------------------------------------------------------
subtype:  claudin-low
予後年数：15年:
accuracyベースライン： ('0>1', 0.562)
使用特徴量： Index(['CGNL1', 'AW301012', 'COBL', 'TMEM101', 'ZGLP1', 'ALDH9A1', 'MYO5C',
       'DNASE1', 'ZNF750', 'RHOG', 'THRB', 'SHANK2', 'NAV2', 'BF447974',
       'IL10RB', 'BF511322', 'STX1B', 'DUT', 'GABRA1', 'TMEM231', 'BQ428953'],
      dtype='object')
学習サンプルサイズ： (89, 21)


'ラベル比率：'

0    50
1    39
Name: OS_15years, dtype: int64

11it [00:00, 40.72it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Decision Tree,0.83,0.6,0.852174,0.5
AdaBoost,1.0,0.6,1.0,0.333333
Sigmoid SVM,0.91,0.4,0.912621,0.25
Naive Bayes,0.96,0.3,0.960784,0.222222
Linear SVM,0.98,0.5,0.98,0.0
Logistic Regression,0.96,0.5,0.96,0.0
Nearest Neighbors,0.92,0.5,0.921569,0.0
Polynomial SVM,0.95,0.6,0.947368,0.0
Quadratic Discriminant Analysis,0.99,0.4,0.989899,0.0
RBF SVM,0.97,0.4,0.969697,0.0


----------------------------------------------------------------------------------------------------
subtype:  LumA
予後年数：15年:
accuracyベースライン： ('0>1', 0.629)
使用特徴量： Index(['ZIC2', 'RPL5', 'BM713638', 'OS9', 'S100P', 'PIGV', 'RACGAP1', 'GDF5',
       'QARS1', 'PDZK1IP1', 'CXCL14', 'OXSM', 'LSR', 'PEX11G', 'CPSF3',
       'RPUSD2', 'GNL1', 'FPGS', 'AA643892'],
      dtype='object')
学習サンプルサイズ： (275, 19)


'ラベル比率：'

0    173
1    102
Name: OS_15years, dtype: int64

11it [00:00, 30.95it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Random Forest,0.864162,0.774194,0.86217,0.631579
Naive Bayes,0.789017,0.709677,0.787172,0.608696
Sigmoid SVM,0.696532,0.709677,0.690265,0.571429
Polynomial SVM,0.947977,0.741935,0.947059,0.555556
Linear SVM,0.774566,0.709677,0.767857,0.526316
Logistic Regression,0.763006,0.709677,0.761628,0.526316
RBF SVM,0.921965,0.709677,0.923077,0.526316
AdaBoost,0.965318,0.612903,0.965714,0.5
Nearest Neighbors,0.843931,0.677419,0.851648,0.444444
Decision Tree,0.809249,0.612903,0.815642,0.4


----------------------------------------------------------------------------------------------------
subtype:  LumB
予後年数：15年:
accuracyベースライン： ('0>1', 0.648)
使用特徴量： Index(['TMEM106B', 'CR743466', 'WDR60', 'CELSR2', 'EYA2', 'FCRLB', 'RNF225',
       'ZNF273', 'BI481412', 'BX102609', 'PRKAR1A', 'ENC1', 'ICA1L'],
      dtype='object')
学習サンプルサイズ： (213, 13)


'ラベル比率：'

1    138
0     75
Name: OS_15years, dtype: int64

11it [00:00, 41.53it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Nearest Neighbors,0.826087,0.791667,0.815385,0.848485
Polynomial SVM,0.873188,0.625,0.885246,0.742857
Decision Tree,0.836957,0.625,0.835165,0.709677
Random Forest,0.891304,0.583333,0.889706,0.705882
AdaBoost,0.978261,0.5,0.978102,0.625
Linear SVM,0.797101,0.5,0.801418,0.625
Logistic Regression,0.797101,0.5,0.801418,0.625
RBF SVM,0.887681,0.5,0.888889,0.625
Naive Bayes,0.764493,0.458333,0.775087,0.580645
Quadratic Discriminant Analysis,0.84058,0.458333,0.846154,0.580645


----------------------------------------------------------------------------------------------------
subtype:  Her2
予後年数：15年:
accuracyベースライン： ('0>1', 0.68)
使用特徴量： Index(['CA773219', 'NBN', 'CCNJL', 'BU727048'], dtype='object')
学習サンプルサイズ： (122, 4)


'ラベル比率：'

1    83
0    39
Name: OS_15years, dtype: int64

11it [00:00, 48.93it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Linear SVM,0.783133,0.571429,0.783133,0.666667
Logistic Regression,0.777108,0.571429,0.778443,0.666667
Polynomial SVM,0.849398,0.5,0.86631,0.666667
Quadratic Discriminant Analysis,0.795181,0.571429,0.806818,0.666667
Random Forest,0.873494,0.571429,0.869565,0.666667
Sigmoid SVM,0.662651,0.571429,0.670588,0.666667
RBF SVM,0.885542,0.5,0.891429,0.631579
AdaBoost,0.987952,0.428571,0.987952,0.6
Decision Tree,0.825301,0.428571,0.84153,0.6
Nearest Neighbors,0.909639,0.428571,0.911243,0.6


----------------------------------------------------------------------------------------------------
subtype:  Normal
予後年数：15年:
accuracyベースライン： ('0>1', 0.569)
使用特徴量： Index(['DPY19L2', 'SESN1', 'AI912012', 'ENO4', 'C1orf144', 'ARSG', 'JMJD6',
       'SLC30A3', 'NT5M', 'KIF13B', 'AP2A2', 'LOC389033', 'FIGN'],
      dtype='object')
学習サンプルサイズ： (58, 13)


'ラベル比率：'

1    33
0    25
Name: OS_15years, dtype: int64

11it [00:00, 49.50it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AdaBoost,1.0,0.857143,1.0,0.888889
Linear SVM,1.0,0.857143,1.0,0.888889
Logistic Regression,1.0,0.857143,1.0,0.888889
Naive Bayes,0.924242,0.857143,0.923077,0.888889
Random Forest,0.954545,0.857143,0.953846,0.888889
Sigmoid SVM,0.939394,0.857143,0.9375,0.888889
Nearest Neighbors,0.939394,0.714286,0.939394,0.75
RBF SVM,1.0,0.714286,1.0,0.75
Polynomial SVM,0.954545,0.571429,0.956522,0.666667
Quadratic Discriminant Analysis,1.0,0.571429,1.0,0.666667


mrna_agilent_microarray
=====15=====15=====15=====15=====15=====15=====15=====15=====15=====15
----------------------------------------------------------------------------------------------------
subtype:  claudin-low
予後年数：15年:
accuracyベースライン： ('0>1', 0.562)
使用特徴量： Index(['COBL', 'CNOT1', 'HIST1H4E', 'EEF1A2', 'SPRR3', 'ACACA', 'SLC35C1',
       'MAGEA10', 'C1orf116', 'DA102929', 'SEPX1', 'SLC25A37_1', 'HADH',
       'PWWP2B', 'SERPINE1', 'SHANK2', 'AMH', 'NAV2', 'ADORA1', 'SLC27A5'],
      dtype='object')
学習サンプルサイズ： (89, 20)


'ラベル比率：'

0    50
1    39
Name: OS_15years, dtype: int64

11it [00:00, 43.91it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Linear SVM,0.97,0.6,0.970297,0.6
Logistic Regression,0.96,0.6,0.959184,0.6
Naive Bayes,0.95,0.6,0.948454,0.6
Sigmoid SVM,0.5,0.4,0.666667,0.571429
Decision Tree,0.78,0.4,0.810345,0.5
AdaBoost,1.0,0.5,1.0,0.444444
Polynomial SVM,1.0,0.5,1.0,0.444444
Quadratic Discriminant Analysis,0.99,0.5,0.989899,0.444444
RBF SVM,0.87,0.5,0.873786,0.444444
Random Forest,0.93,0.4,0.932039,0.4


----------------------------------------------------------------------------------------------------
subtype:  LumA
予後年数：15年:
accuracyベースライン： ('0>1', 0.629)
使用特徴量： Index(['ZIC2', 'GNA14', 'RPS17', 'ZMYND12', 'KRT18', 'ANGPT2', 'QARS1', 'LCT',
       'S100P', 'IL20RA', 'MANBA', 'PTPLAD1', 'LSR', 'MAGEA1', 'BCHE',
       'APOBEC3A', 'PAGE2B', 'DIRAS3', 'PCDHB17', 'BC042566'],
      dtype='object')
学習サンプルサイズ： (275, 20)


'ラベル比率：'

0    173
1    102
Name: OS_15years, dtype: int64

11it [00:00, 28.59it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Nearest Neighbors,0.803468,0.83871,0.815217,0.761905
AdaBoost,0.947977,0.774194,0.947977,0.666667
Decision Tree,0.820809,0.741935,0.824859,0.636364
Logistic Regression,0.803468,0.741935,0.798817,0.636364
Naive Bayes,0.771676,0.741935,0.745981,0.636364
Polynomial SVM,0.84104,0.741935,0.844193,0.636364
Random Forest,0.919075,0.741935,0.918605,0.636364
Linear SVM,0.812139,0.709677,0.80826,0.608696
RBF SVM,0.774566,0.709677,0.769231,0.608696
Sigmoid SVM,0.5,0.354839,0.666667,0.52381


----------------------------------------------------------------------------------------------------
subtype:  LumB
予後年数：15年:
accuracyベースライン： ('0>1', 0.648)
使用特徴量： Index(['ENC1', 'C19orf43', 'PTPN2', 'PPIG', 'ZNF787', 'LY6D', 'SLC7A2',
       'EFCAB11', 'CAMK2D', 'C9orf95', 'PHF19', 'CELSR2', 'PUS7', 'TP53BP2',
       'GPX4', 'IER3', 'CWH43', 'NPDC1', 'LRRC50', 'SLFN5'],
      dtype='object')
学習サンプルサイズ： (213, 20)


'ラベル比率：'

1    138
0     75
Name: OS_15years, dtype: int64

11it [00:00, 29.32it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Sigmoid SVM,0.5,0.666667,0.666667,0.8
Quadratic Discriminant Analysis,0.90942,0.708333,0.907063,0.774194
Naive Bayes,0.847826,0.666667,0.838462,0.692308
Logistic Regression,0.807971,0.625,0.805861,0.689655
Linear SVM,0.826087,0.583333,0.822222,0.666667
Random Forest,0.905797,0.583333,0.904412,0.666667
AdaBoost,0.98913,0.541667,0.989091,0.645161
Decision Tree,0.82971,0.541667,0.833922,0.645161
Nearest Neighbors,0.82971,0.583333,0.809717,0.642857
Polynomial SVM,0.90942,0.583333,0.907063,0.642857


----------------------------------------------------------------------------------------------------
subtype:  Her2
予後年数：15年:
accuracyベースライン： ('0>1', 0.68)
使用特徴量： Index(['MICAL1', 'PCSK1N', 'FBLN5', 'FAM3B', 'PITRM1', 'VAMP8', 'WARS2',
       'ANKRD36', 'CDC7', 'PITHD1', 'MAP6D1', 'SHMT1', 'SLC41A3', 'DIAPH1',
       'WIT1', 'PLIN4', 'TM4SF1', 'ABCA4', 'SERPINB7', 'CYC1'],
      dtype='object')
学習サンプルサイズ： (122, 20)


'ラベル比率：'

1    83
0    39
Name: OS_15years, dtype: int64

11it [00:00, 35.01it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.927711,0.714286,0.926829,0.75
Polynomial SVM,0.96988,0.714286,0.97006,0.75
Random Forest,0.945783,0.642857,0.945455,0.736842
Quadratic Discriminant Analysis,0.96988,0.642857,0.97006,0.705882
AdaBoost,1.0,0.571429,1.0,0.7
Linear SVM,0.939759,0.642857,0.939024,0.666667
Naive Bayes,0.915663,0.642857,0.916667,0.666667
RBF SVM,0.903614,0.642857,0.9,0.666667
Nearest Neighbors,0.879518,0.5,0.871795,0.533333
Decision Tree,0.843373,0.428571,0.843373,0.333333


----------------------------------------------------------------------------------------------------
subtype:  Normal
予後年数：15年:
accuracyベースライン： ('0>1', 0.569)
使用特徴量： Index(['KIF13B', 'RPS6KA2', 'NDUFS8', 'ARF5', 'SCG2', 'ARSG', 'LOXL4', 'RAB36',
       'LOC389033', 'DNAJB11', 'ZFAND2A', 'SPTAN1', 'IFT88', 'NFKBIA',
       'LCLAT1', 'CTDSPL', 'HOMER2', 'SLFN5', 'MAP2K4', 'SESN1'],
      dtype='object')
学習サンプルサイズ： (58, 20)


'ラベル比率：'

1    33
0    25
Name: OS_15years, dtype: int64

11it [00:00, 46.36it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Nearest Neighbors,0.984848,0.857143,0.985075,0.888889
Decision Tree,0.80303,0.714286,0.811594,0.8
Naive Bayes,0.954545,0.714286,0.952381,0.75
Quadratic Discriminant Analysis,1.0,0.714286,1.0,0.75
RBF SVM,0.939394,0.714286,0.939394,0.75
Sigmoid SVM,0.5,0.571429,0.666667,0.727273
Linear SVM,1.0,0.571429,1.0,0.666667
Logistic Regression,0.984848,0.571429,0.984615,0.666667
Random Forest,0.939394,0.571429,0.935484,0.666667
AdaBoost,1.0,0.428571,1.0,0.5


----------------------------------------------------------------------------------------------------
subtype:  Basal
予後年数：15年:
accuracyベースライン： ('0>1', 0.61)
使用特徴量： Index(['FBXO31', 'ALDH4A1', 'TNFSF13', 'PLAC1', 'CD688177', 'NOS3', 'GTSF1',
       'KLHL13', 'CAMK1D', 'KLHL36', 'DEFB1', 'CXCL13', 'GPR83', 'IFT27',
       'TRAPPC2L', 'CAPZA1', 'NFE2', 'PEBP4', 'KLK13', 'DEF8'],
      dtype='object')
学習サンプルサイズ： (105, 20)


'ラベル比率：'

1    64
0    41
Name: OS_15years, dtype: int64

11it [00:00, 38.25it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Sigmoid SVM,0.5,0.583333,0.666667,0.736842
Quadratic Discriminant Analysis,0.984375,0.666667,0.984615,0.714286
RBF SVM,0.820312,0.583333,0.821705,0.666667
Decision Tree,0.804688,0.5,0.814815,0.666667
Logistic Regression,0.875,0.583333,0.875,0.615385
Linear SVM,0.875,0.5,0.876923,0.571429
Nearest Neighbors,0.851562,0.5,0.840336,0.571429
AdaBoost,1.0,0.333333,1.0,0.5
Naive Bayes,0.898438,0.5,0.900763,0.5
Polynomial SVM,0.976562,0.5,0.976378,0.5


=====15=====15=====15=====15=====15=====15=====15=====15=====15=====15
----------------------------------------------------------------------------------------------------
subtype:  claudin-low
予後年数：15年:
accuracyベースライン： ('0>1', 0.562)
使用特徴量： Index(['CGNL1', 'COBL', 'DA102929', 'C1orf116', 'TMEM101', 'ACSL3', 'C7orf41',
       'ALDH9A1', 'MYO5C', 'DNASE1', 'DB451841', 'TMEM213', 'ZNF750', 'RHOG',
       'SHANK2', 'NAV2', 'MAGEA6', 'ANKRD39', 'DUT', 'RBBP8', 'ARPP19',
       'PTCHD1'],
      dtype='object')
学習サンプルサイズ： (89, 22)


'ラベル比率：'

0    50
1    39
Name: OS_15years, dtype: int64

11it [00:00, 39.29it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Linear SVM,0.92,0.6,0.921569,0.6
Naive Bayes,0.9,0.6,0.897959,0.6
Sigmoid SVM,0.5,0.4,0.666667,0.571429
Polynomial SVM,0.99,0.6,0.990099,0.5
Logistic Regression,0.94,0.5,0.941176,0.444444
Quadratic Discriminant Analysis,0.99,0.5,0.990099,0.444444
RBF SVM,0.87,0.4,0.873786,0.4
Decision Tree,0.82,0.3,0.839286,0.363636
Random Forest,0.95,0.4,0.950495,0.25
AdaBoost,1.0,0.3,1.0,0.222222


----------------------------------------------------------------------------------------------------
subtype:  LumA
予後年数：15年:
accuracyベースライン： ('0>1', 0.629)
使用特徴量： Index(['ZIC2', 'RPL5', 'MYBPC1', 'NUAK2', 'RPL26', 'ABHD14A', 'S100P', 'PIGV',
       'RACGAP1', 'RPS29', 'APOBEC3A', 'PDZK1IP1', 'FZD8', 'CXCL14', 'PCDHB17',
       'CHCHD4', 'MANBA', 'LSR', 'TMEM26', 'RPS6KA2', 'NAB1', 'DIRAS3',
       'CXCL17', 'NOC3L'],
      dtype='object')
学習サンプルサイズ： (275, 24)


'ラベル比率：'

0    173
1    102
Name: OS_15years, dtype: int64

11it [00:00, 24.62it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Linear SVM,0.763006,0.741935,0.758824,0.692308
Decision Tree,0.82948,0.741935,0.831909,0.636364
Random Forest,0.878613,0.741935,0.877193,0.636364
Logistic Regression,0.757225,0.677419,0.75,0.615385
RBF SVM,0.771676,0.645161,0.773639,0.592593
Nearest Neighbors,0.83237,0.677419,0.84492,0.583333
Naive Bayes,0.739884,0.709677,0.736842,0.571429
AdaBoost,0.976879,0.645161,0.976608,0.56
Polynomial SVM,0.913295,0.645161,0.912281,0.521739
Sigmoid SVM,0.50289,0.354839,0.657371,0.5


----------------------------------------------------------------------------------------------------
subtype:  LumB
予後年数：15年:
accuracyベースライン： ('0>1', 0.648)
使用特徴量： Index(['TMEM106B', 'NPDC1', 'C9orf95', 'WDR60', 'ITPRIP', 'CDK5', 'SNX10',
       'SLC7A2', 'CELSR2', 'PRKCD', 'EYA2', 'TRPV2', 'FCRLB', 'LRRC50',
       'ARPP19', 'ENC1'],
      dtype='object')
学習サンプルサイズ： (213, 16)


'ラベル比率：'

1    138
0     75
Name: OS_15years, dtype: int64

11it [00:00, 26.40it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Sigmoid SVM,0.5,0.666667,0.666667,0.8
Naive Bayes,0.76087,0.708333,0.772414,0.774194
Quadratic Discriminant Analysis,0.82971,0.625,0.831541,0.727273
Logistic Regression,0.753623,0.625,0.753623,0.709677
Random Forest,0.862319,0.625,0.859259,0.709677
Decision Tree,0.826087,0.5,0.845161,0.647059
AdaBoost,0.981884,0.541667,0.981685,0.62069
Linear SVM,0.757246,0.5,0.754579,0.6
RBF SVM,0.811594,0.5,0.80597,0.6
Polynomial SVM,0.949275,0.458333,0.94697,0.580645


----------------------------------------------------------------------------------------------------
subtype:  Her2
予後年数：15年:
accuracyベースライン： ('0>1', 0.68)
使用特徴量： Index(['EFHA1', 'ASAH1', 'PPIE', 'CERK', 'BOD1'], dtype='object')
学習サンプルサイズ： (122, 5)


'ラベル比率：'

1    83
0    39
Name: OS_15years, dtype: int64

11it [00:00, 25.68it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Sigmoid SVM,0.5,0.642857,0.666667,0.782609
Nearest Neighbors,0.855422,0.714286,0.85,0.777778
RBF SVM,0.746988,0.714286,0.761364,0.777778
Random Forest,0.795181,0.642857,0.797619,0.736842
Naive Bayes,0.753012,0.642857,0.763006,0.705882
AdaBoost,0.993976,0.5,0.994012,0.666667
Polynomial SVM,0.759036,0.571429,0.767442,0.625
Quadratic Discriminant Analysis,0.73494,0.5,0.738095,0.533333
Linear SVM,0.728916,0.428571,0.739884,0.428571
Logistic Regression,0.722892,0.428571,0.729412,0.428571


----------------------------------------------------------------------------------------------------
subtype:  Normal
予後年数：15年:
accuracyベースライン： ('0>1', 0.569)
使用特徴量： Index(['RERE', 'MAPKAP1', 'YPEL3', 'ZFAND2A', 'SESN1', 'SLFN5', 'KCNK12',
       'MAP2K4', 'WDR35', 'PHYHD1', 'GPR180', 'DNAJB11', 'ARSG', 'CCDC74A',
       'MST1', 'PEX13', 'KIF13B', 'C16orf80', 'IFT88', 'PSMD7', 'LOC389033',
       'SPTAN1', 'INTS3'],
      dtype='object')
学習サンプルサイズ： (58, 23)


'ラベル比率：'

1    33
0    25
Name: OS_15years, dtype: int64

11it [00:00, 42.22it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Nearest Neighbors,0.924242,0.857143,0.927536,0.888889
Decision Tree,0.818182,0.714286,0.823529,0.8
Logistic Regression,0.954545,0.714286,0.955224,0.8
Naive Bayes,0.939394,0.714286,0.9375,0.8
RBF SVM,0.909091,0.714286,0.909091,0.8
Linear SVM,1.0,0.714286,1.0,0.75
Polynomial SVM,1.0,0.714286,1.0,0.75
Sigmoid SVM,0.5,0.571429,0.666667,0.727273
AdaBoost,1.0,0.571429,1.0,0.666667
Random Forest,0.984848,0.571429,0.985075,0.666667


mrna_agilent_microarray_zscores_ref_diploid_samples
=====15=====15=====15=====15=====15=====15=====15=====15=====15=====15
----------------------------------------------------------------------------------------------------
subtype:  claudin-low
予後年数：15年:
accuracyベースライン： ('0>1', 0.562)
使用特徴量： Index(['COBL', 'SLC36A3', 'ID3', 'TIMM22', 'OR7C2', 'SF3A1', 'MAST2',
       'LINC00160', 'NPFFR1', 'NAV2', 'AP5S1', 'STX1B', 'SHANK2', 'CPT1A',
       'ZGLP1', 'SNORD15A', 'FFAR3', 'THRB', 'C1orf116', 'PDZK1'],
      dtype='object')
学習サンプルサイズ： (89, 20)


'ラベル比率：'

0    50
1    39
Name: OS_15years, dtype: int64

11it [00:00, 43.98it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Quadratic Discriminant Analysis,0.99,0.7,0.989899,0.571429
Linear SVM,0.99,0.5,0.989899,0.444444
Naive Bayes,0.96,0.5,0.96,0.444444
Polynomial SVM,0.98,0.6,0.979592,0.333333
AdaBoost,1.0,0.4,1.0,0.25
Logistic Regression,0.99,0.4,0.989899,0.25
RBF SVM,0.98,0.4,0.98,0.25
Random Forest,0.95,0.4,0.950495,0.25
Sigmoid SVM,0.94,0.4,0.941176,0.25
Decision Tree,0.81,0.3,0.825688,0.222222


----------------------------------------------------------------------------------------------------
subtype:  LumA
予後年数：15年:
accuracyベースライン： ('0>1', 0.629)
使用特徴量： Index(['OS9', 'ELF3', 'HCRTR1', 'BAG5', 'RRP8', 'MTNR1A', 'PGK1', 'ZIC2',
       'BEX1', 'FOXR1', 'FPGS', 'ANGPT2', 'KLK1', 'CDH10', 'S100P', 'DIRAS3',
       'KIR3DL3', 'NLRP11', 'TMEM220', 'RACGAP1'],
      dtype='object')
学習サンプルサイズ： (275, 20)


'ラベル比率：'

0    173
1    102
Name: OS_15years, dtype: int64

11it [00:00, 32.31it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Decision Tree,0.83237,0.741935,0.837079,0.636364
Nearest Neighbors,0.878613,0.741935,0.887097,0.636364
Quadratic Discriminant Analysis,0.910405,0.709677,0.911681,0.608696
Naive Bayes,0.820809,0.677419,0.821839,0.545455
Sigmoid SVM,0.797688,0.677419,0.795322,0.545455
RBF SVM,0.953757,0.645161,0.954286,0.521739
Polynomial SVM,0.968208,0.677419,0.967359,0.5
Linear SVM,0.852601,0.612903,0.857143,0.5
Logistic Regression,0.843931,0.612903,0.841176,0.5
AdaBoost,0.976879,0.645161,0.977143,0.47619


----------------------------------------------------------------------------------------------------
subtype:  LumB
予後年数：15年:
accuracyベースライン： ('0>1', 0.648)
使用特徴量： Index(['ENC1', 'TTC1', 'OR52M1', 'HIST1H1E', 'GHSR', 'GPR32', 'OR2B11',
       'CELSR2', 'GAPDHS', 'NMRK1', 'HBG2', 'LINC02693', 'SPPL2B', 'TTC28',
       'PHF19', 'SLC7A2', 'LPAR3', 'SGTA', 'BPIFB6', 'ZNF273'],
      dtype='object')
学習サンプルサイズ： (213, 20)


'ラベル比率：'

1    138
0     75
Name: OS_15years, dtype: int64

11it [00:00, 36.27it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Quadratic Discriminant Analysis,0.956522,0.666667,0.956835,0.777778
RBF SVM,0.967391,0.666667,0.966292,0.75
Random Forest,0.945652,0.666667,0.945848,0.75
Sigmoid SVM,0.858696,0.625,0.859206,0.709677
AdaBoost,1.0,0.583333,1.0,0.6875
Naive Bayes,0.884058,0.583333,0.883212,0.6875
Logistic Regression,0.916667,0.583333,0.915129,0.666667
Polynomial SVM,0.971014,0.541667,0.971631,0.645161
Nearest Neighbors,0.855072,0.541667,0.84252,0.62069
Decision Tree,0.82971,0.458333,0.819923,0.580645


----------------------------------------------------------------------------------------------------
subtype:  Her2
予後年数：15年:
accuracyベースライン： ('0>1', 0.68)
使用特徴量： Index(['MICAL1', 'MGAT4C', 'TSBP1', 'NR5A1', 'PCSK1N', 'DOK5', 'ACRV1',
       'SH2D4B', 'KIAA1109', 'WARS2', 'MAGEA4', 'KCNH2', 'ANKRD36', 'CROCCP2',
       'GRIP1', 'OR4A16', 'VAMP8', 'CDHR1', 'PPIE_1', 'ESYT1'],
      dtype='object')
学習サンプルサイズ： (122, 20)


'ラベル比率：'

1    83
0    39
Name: OS_15years, dtype: int64

11it [00:00, 40.46it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Quadratic Discriminant Analysis,0.96988,0.714286,0.97076,0.8
AdaBoost,1.0,0.642857,1.0,0.736842
Nearest Neighbors,0.951807,0.642857,0.951807,0.705882
Linear SVM,0.987952,0.571429,0.988095,0.7
Logistic Regression,0.987952,0.571429,0.988095,0.7
Polynomial SVM,0.951807,0.571429,0.954023,0.7
RBF SVM,0.993976,0.571429,0.994012,0.7
Decision Tree,0.89759,0.571429,0.905028,0.666667
Naive Bayes,0.933735,0.571429,0.937143,0.666667
Sigmoid SVM,0.915663,0.5,0.918605,0.588235


----------------------------------------------------------------------------------------------------
subtype:  Normal
予後年数：15年:
accuracyベースライン： ('0>1', 0.569)
使用特徴量： Index(['KIF13B', 'TMEM38A', 'ZNF75D', 'CELP', 'TERB1', 'OSBPL3', 'SZRD1',
       'RNU6ATAC', 'ENO4', 'LCE1A', 'TP53BP1', 'ZFAND2A', 'AP2A2', 'DPY19L2',
       'SPTAN1', 'LOXL4', 'GPR180', 'NT5M', 'JMJD6', 'JAKMIP2'],
      dtype='object')
学習サンプルサイズ： (58, 20)


'ラベル比率：'

1    33
0    25
Name: OS_15years, dtype: int64

11it [00:00, 47.37it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Decision Tree,0.833333,0.714286,0.825397,0.75
Nearest Neighbors,0.984848,0.714286,0.985075,0.75
Random Forest,0.969697,0.714286,0.96875,0.75
Linear SVM,1.0,0.571429,1.0,0.666667
Logistic Regression,1.0,0.571429,1.0,0.666667
Naive Bayes,0.954545,0.571429,0.953846,0.666667
Polynomial SVM,0.984848,0.571429,0.985075,0.666667
Quadratic Discriminant Analysis,1.0,0.571429,1.0,0.666667
RBF SVM,1.0,0.571429,1.0,0.666667
Sigmoid SVM,0.969697,0.571429,0.969697,0.666667


----------------------------------------------------------------------------------------------------
subtype:  Basal
予後年数：15年:
accuracyベースライン： ('0>1', 0.61)
使用特徴量： Index(['FBXO31', 'OR2T29', 'OR2AG2', 'ALAS2', 'KLHL4', 'CAMK1D', 'YWHAEP7',
       'PTX4', 'KRTAP5-5', 'OR52N4', 'STATH', 'NUDT12', 'CLTRN', 'CXCL13',
       'OR8G1', 'HSD3B2', 'MAP1LC3B', 'TPH1', 'ELMSAN1', 'CYMP'],
      dtype='object')
学習サンプルサイズ： (105, 20)


'ラベル比率：'

1    64
0    41
Name: OS_15years, dtype: int64

11it [00:00, 42.11it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Decision Tree,0.8125,0.666667,0.8,0.714286
Linear SVM,0.976562,0.583333,0.976378,0.705882
Quadratic Discriminant Analysis,0.96875,0.583333,0.96875,0.705882
Random Forest,0.96875,0.583333,0.967742,0.705882
Sigmoid SVM,0.921875,0.583333,0.920635,0.705882
Logistic Regression,0.945312,0.5,0.944882,0.666667
AdaBoost,1.0,0.5,1.0,0.625
Polynomial SVM,0.929688,0.416667,0.934307,0.588235
Nearest Neighbors,0.9375,0.416667,0.934426,0.533333
Naive Bayes,0.929688,0.333333,0.928,0.5


=====15=====15=====15=====15=====15=====15=====15=====15=====15=====15
----------------------------------------------------------------------------------------------------
subtype:  claudin-low
予後年数：15年:
accuracyベースライン： ('0>1', 0.562)
使用特徴量： Index(['SOX10', 'COBL', 'ALDH9A1', 'ZNF750', 'FAM81B', 'RHOG', 'THRB',
       'SHANK2', 'NPFFR1', 'DUT', 'RBBP8', 'GABRA1', 'TMEM231'],
      dtype='object')
学習サンプルサイズ： (89, 13)


'ラベル比率：'

0    50
1    39
Name: OS_15years, dtype: int64

11it [00:00, 48.61it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Nearest Neighbors,0.86,0.7,0.86,0.666667
Sigmoid SVM,0.77,0.6,0.757895,0.5
Naive Bayes,0.85,0.5,0.845361,0.444444
RBF SVM,0.93,0.5,0.932039,0.444444
Decision Tree,0.82,0.3,0.839286,0.363636
Logistic Regression,0.88,0.3,0.882353,0.363636
Random Forest,0.92,0.5,0.923077,0.285714
Quadratic Discriminant Analysis,0.92,0.4,0.921569,0.25
Linear SVM,0.88,0.2,0.884615,0.2
AdaBoost,1.0,0.3,1.0,0.0


----------------------------------------------------------------------------------------------------
subtype:  LumA
予後年数：15年:
accuracyベースライン： ('0>1', 0.629)
使用特徴量： Index(['ZIC2', 'OS9', 'S100P', 'RACGAP1', 'SPATA18', 'PDZK1IP1', 'HS3ST1',
       'PEX11G', 'TMEM26', 'CPSF3', 'RPUSD2', 'FPGS', 'RPS27'],
      dtype='object')
学習サンプルサイズ： (275, 13)


'ラベル比率：'

0    173
1    102
Name: OS_15years, dtype: int64

11it [00:00, 38.39it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AdaBoost,0.924855,0.741935,0.926136,0.636364
Linear SVM,0.800578,0.741935,0.805634,0.636364
Sigmoid SVM,0.716763,0.741935,0.72,0.636364
Decision Tree,0.812139,0.741935,0.816901,0.6
Naive Bayes,0.757225,0.741935,0.751479,0.6
RBF SVM,0.881503,0.709677,0.882521,0.571429
Quadratic Discriminant Analysis,0.815029,0.741935,0.811765,0.555556
Logistic Regression,0.777457,0.677419,0.77551,0.545455
Random Forest,0.861272,0.709677,0.863636,0.526316
Polynomial SVM,0.921965,0.677419,0.923944,0.5


----------------------------------------------------------------------------------------------------
subtype:  LumB
予後年数：15年:
accuracyベースライン： ('0>1', 0.648)
使用特徴量： Index(['KHK', 'TMEM106B', 'GATAD1', 'NMRK1', 'MYO5C', 'WDR60', 'ITPRIP',
       'CDK5', 'SLC7A2', 'CELSR2', 'OMP', 'SLTM', 'PRKCD', 'FCRLB', 'USP30',
       'DNAAF1', 'ZNF273', 'PRKAR1A', 'NMU', 'ENC1', 'ICA1L', 'SHISA2'],
      dtype='object')
学習サンプルサイズ： (213, 22)


'ラベル比率：'

1    138
0     75
Name: OS_15years, dtype: int64

11it [00:00, 35.28it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AdaBoost,0.996377,0.666667,0.996364,0.733333
Naive Bayes,0.735507,0.625,0.747405,0.727273
Polynomial SVM,0.92029,0.583333,0.925676,0.722222
Nearest Neighbors,0.873188,0.666667,0.86692,0.714286
Random Forest,0.891304,0.666667,0.891304,0.714286
Linear SVM,0.84058,0.625,0.835821,0.689655
Logistic Regression,0.804348,0.625,0.801471,0.689655
Decision Tree,0.811594,0.583333,0.796875,0.642857
Sigmoid SVM,0.735507,0.583333,0.726592,0.642857
Quadratic Discriminant Analysis,0.913043,0.5,0.912409,0.6


----------------------------------------------------------------------------------------------------
subtype:  Her2
予後年数：15年:
accuracyベースライン： ('0>1', 0.68)
使用特徴量： Index(['NBN', 'ASAH1', 'SELPLG', 'SPPL2C'], dtype='object')
学習サンプルサイズ： (122, 4)


'ラベル比率：'

1    83
0    39
Name: OS_15years, dtype: int64

11it [00:00, 49.75it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Quadratic Discriminant Analysis,0.759036,0.785714,0.772727,0.823529
Polynomial SVM,0.795181,0.714286,0.826531,0.8
RBF SVM,0.843373,0.714286,0.860215,0.777778
Decision Tree,0.795181,0.714286,0.795181,0.75
Linear SVM,0.710843,0.642857,0.717647,0.736842
Logistic Regression,0.710843,0.642857,0.714286,0.736842
AdaBoost,0.96988,0.642857,0.970414,0.705882
Naive Bayes,0.753012,0.642857,0.773481,0.705882
Nearest Neighbors,0.873494,0.642857,0.874251,0.705882
Random Forest,0.825301,0.642857,0.830409,0.705882


----------------------------------------------------------------------------------------------------
subtype:  Normal
予後年数：15年:
accuracyベースライン： ('0>1', 0.569)
使用特徴量： Index(['MAPKAP1', 'SESN1', 'GTF2H3', 'MYH10', 'KCNK12', 'ENO4', 'SZRD1',
       'GPR180', 'DNAJB11', 'ARSG', 'CCDC74A', 'TRPC1', 'JMJD6', 'DAZL',
       'PEX13', 'NT5M', 'KIF13B', 'IFT88', 'RIPK4', 'FIGN', 'SPTAN1'],
      dtype='object')
学習サンプルサイズ： (58, 21)


'ラベル比率：'

1    33
0    25
Name: OS_15years, dtype: int64

11it [00:00, 48.45it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Linear SVM,1.0,0.857143,1.0,0.888889
Logistic Regression,0.984848,0.857143,0.984615,0.888889
Naive Bayes,0.939394,0.857143,0.9375,0.888889
Nearest Neighbors,0.954545,0.857143,0.952381,0.888889
RBF SVM,0.984848,0.857143,0.984615,0.888889
AdaBoost,1.0,0.714286,1.0,0.8
Random Forest,0.969697,0.714286,0.96875,0.75
Sigmoid SVM,0.969697,0.714286,0.96875,0.75
Quadratic Discriminant Analysis,1.0,0.571429,1.0,0.727273
Polynomial SVM,0.954545,0.571429,0.956522,0.666667


