In [None]:
import os
import random
import itertools
import re

# 基本的なライブラリ
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats

# 描画ライブラリ
import matplotlib.pyplot as plt
import seaborn as sns
from seaborn_analyzer import CustomPairPlot
import graphviz
import pydotplus
from IPython.display import Image
from IPython.display import HTML
from six import StringIO
from ipywidgets import interact, FloatSlider

# データセット分割
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    ShuffleSplit,
    StratifiedShuffleSplit,
)

# 補完
from sklearn.experimental import (
    enable_iterative_imputer,
)  # IterativeImputerをimportするために必要
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

# エンコード
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder

# サンプリング
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# 特徴量選択
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import (
    GenericUnivariateSelect,
    f_classif,
    mutual_info_classif,
    chi2,
)
from boruta import BorutaPy
from libraries.mrmr import mrmr

# models
from lightgbm import LGBMClassifier
import xgboost as xgb


# 学習中
import optuna
from tqdm import tqdm
from sklearn.model_selection import learning_curve, cross_validate, cross_val_score

# 評価指標
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import warnings


# config python file
import config

# Global random seed, taken from the project config and reused by the cells below.
SEED = config.SEED
# Index into config.SET_NAME_MICROARRAY used by the LightGBM / XGBoost cells.
# The config value is overridden with a hard-coded index here:
# INDEX_MICROARRAY = config.INDEX_MICROARRAY
INDEX_MICROARRAY = 2
# NOTE(review): the star import hides where helper names come from. Names used
# later without an explicit import (fix_seed, pickle_load, make_dir,
# compare_bcms, show_scores, transform_std, ...) are presumably provided by
# this module — confirm and consider importing them explicitly.
from functions import *

# presumably seeds the random number generators in use — verify in functions
fix_seed(SEED)


# Limit the rich DataFrame display to at most 50 columns and 50 rows.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 50)

%matplotlib inline

# 目的
遺伝子学的分類に基づいた、予後の2値分類を実施する。  
分類はCLAUDIN_SUBTYPEに基づいて実施。  
予後は5年、10年、15年の3つの年次に分けている。Trueで死亡であることに注意すること。

# データ読み込み
読み込み元：
    config.INTERIM_PREPROCESSED_PROGNOSIS_CROSS_DIR 直下の "X_dict.pickle" と "y_dict.pickle"

サブタイプ毎のデータを使用 

In [6]:
# Load the nested feature / label dictionaries prepared by the preprocessing step.
# NOTE(review): pickle_load presumably unpickles the file — only load trusted files.
_prognosis_cross_dir = config.INTERIM_PREPROCESSED_PROGNOSIS_CROSS_DIR
X_dict = pickle_load(_prognosis_cross_dir + "/X_dict.pickle")
y_dict = pickle_load(_prognosis_cross_dir + "/y_dict.pickle")

# 単一グループでのモデルのトレーニング

## ベースモデルの学習結果

各特徴量選択されたdfを用いて基本的な2値分類モデルを学習する

In [3]:
def validate_models(mircroarray_type: str, method: str, sampling_type: str = None):
    """Print a majority-class baseline and compare the base classifiers.

    For each prognosis year (currently only 15) the train/validation data is
    read from the notebook-level ``X_dict`` / ``y_dict``, a majority-class
    accuracy baseline and basic data statistics are printed, and the score
    table returned by ``compare_bcms`` is displayed.

    Args:
        mircroarray_type: First-level key of ``X_dict`` / ``y_dict``
            (name of the microarray data set).
        method: Second-level key of ``X_dict`` / ``y_dict``
            (feature-selection method, e.g. ``"mrmr"``).
        sampling_type: ``"sm"`` passes a ``SMOTE`` instance to
            ``compare_bcms``; any other value (including ``None``) passes
            ``None``.

    Returns:
        None. Results are printed / displayed, and ``compare_bcms`` is given
        a CSV path under ``config.TABLES_MODELS_PROGNOSIS_CROSS_DIR``.
    """
    for year in range(15, 16, 5):  # loop over prognosis horizons (years)
        prognosis_Xlabel = "X{0:0=2}".format(year)
        prognosis_ylabel = "y{0:0=2}".format(year)

        # Original author's note: compare_bcms runs k-fold cross-validation
        # internally, so no train_test_split is needed here (not verifiable
        # from this cell — confirm in compare_bcms).
        X_train = X_dict[mircroarray_type][method]["train"][prognosis_Xlabel]
        y_train = y_dict[mircroarray_type][method]["train"][prognosis_ylabel]
        X_val = X_dict[mircroarray_type][method]["val"][prognosis_Xlabel]
        y_val = y_dict[mircroarray_type][method]["val"][prognosis_ylabel]

        # Majority-class baseline: accuracy of always predicting one label.
        print("----------" * 10)
        print("予後年数：{0:0=2}年:".format(year))
        acc_all_zeros = accuracy_score(y_train, np.zeros(len(y_train)))
        if acc_all_zeros >= 0.5:
            # Class 0 is at least as frequent as class 1.
            score = ("0>1", round(acc_all_zeros, 3))
        else:
            # Class 1 is the majority; the label previously said "0>1" here too.
            score = (
                "1>0",
                round(accuracy_score(y_train, np.ones(len(y_train))), 3),
            )
        print("accuracyベースライン：", score)
        print("使用特徴量：", X_train.columns)
        print("学習サンプルサイズ：", X_train.shape)
        display("ラベル比率：", y_train.value_counts())

        save_dir = (
            config.TABLES_MODELS_PROGNOSIS_CROSS_DIR
            + "/scores_base-model/{0}/{1}".format(mircroarray_type, method)
        )
        make_dir(save_dir)

        sampling = None
        if sampling_type == "sm":
            sampling = SMOTE(random_state=SEED)

        # NOTE(review): the CSV path does not depend on sampling_type, so a run
        # with "sm" writes to the same file as a run without sampling.
        display(
            compare_bcms(
                X_train,
                y_train,
                X_val,
                y_val,
                over_sampling_class=sampling,
                save_path=save_dir + "/{0:0=2}.csv".format(year),
            )
        )

In [4]:
# Compare the base models on every microarray data set using the mRMR features:
# first without resampling (None), then with SMOTE over-sampling ("sm").
for k in X_dict:
    print(k)
    for sampling_mode in (None, "sm"):
        validate_models(mircroarray_type=k, method="mrmr", sampling_type=sampling_mode)
    # Boruta-selected features are currently not evaluated:
    # validate_models(mircroarray_type=k, method="boruta")

mrna_agilent_microarray_zscores_ref_all_samples
----------------------------------------------------------------------------------------------------
予後年数：15年:
accuracyベースライン： ('0>1', 0.533)
使用特徴量： Index(['RACGAP1', 'PFN4', 'NACAP1', 'RAPGEF5', 'EMR3', 'CXorf57', 'DNAJB7',
       'ENC1', 'PCM1', 'KRTAP12-4', 'STAT5B', 'AI659947', 'TOR1B', 'BP432397',
       'RNU11', 'UHRF1', 'RBBP8', 'USP30', 'AK022229', 'FGF13', 'BE858513',
       'HS3ST1', 'DFNB59', 'GAL3ST4', 'AURKA', 'DB341438', 'C9orf95', 'S100P',
       'ANKAR', 'WDR67', 'GABRB1', 'AW572907', 'HIST1H2BF', 'APLN', 'GTPBP5',
       'SPATA4', 'BG218808', 'AW444974', 'FGD3', 'INTS10', 'CBX7', 'BC033399',
       'TSPYL6', 'TMEM26', 'DA697821', 'MST1', 'IGDCC4', 'UBE2C', 'RAB3B'],
      dtype='object')
学習サンプルサイズ： (865, 49)


'ラベル比率：'

1    461
0    404
Name: OS_15years, dtype: int64

11it [00:02,  5.18it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Random Forest,0.87052,0.649485,0.881356,0.679245
Sigmoid SVM,0.757225,0.628866,0.781705,0.678571
AdaBoost,0.854335,0.639175,0.865385,0.672897
RBF SVM,0.915607,0.597938,0.920044,0.648649
Logistic Regression,0.806936,0.608247,0.821772,0.648148
Linear SVM,0.810405,0.587629,0.825532,0.636364
Polynomial SVM,0.966474,0.57732,0.968306,0.630631
Naive Bayes,0.736416,0.56701,0.753247,0.596154
Nearest Neighbors,0.8,0.56701,0.814973,0.596154
Decision Tree,0.820809,0.56701,0.826038,0.588235


----------------------------------------------------------------------------------------------------
予後年数：15年:
accuracyベースライン： ('0>1', 0.533)
使用特徴量： Index(['RACGAP1', 'PFN4', 'NACAP1', 'RAPGEF5', 'EMR3', 'CXorf57', 'DNAJB7',
       'ENC1', 'PCM1', 'KRTAP12-4', 'STAT5B', 'AI659947', 'TOR1B', 'BP432397',
       'RNU11', 'UHRF1', 'RBBP8', 'USP30', 'AK022229', 'FGF13', 'BE858513',
       'HS3ST1', 'DFNB59', 'GAL3ST4', 'AURKA', 'DB341438', 'C9orf95', 'S100P',
       'ANKAR', 'WDR67', 'GABRB1', 'AW572907', 'HIST1H2BF', 'APLN', 'GTPBP5',
       'SPATA4', 'BG218808', 'AW444974', 'FGD3', 'INTS10', 'CBX7', 'BC033399',
       'TSPYL6', 'TMEM26', 'DA697821', 'MST1', 'IGDCC4', 'UBE2C', 'RAB3B'],
      dtype='object')
学習サンプルサイズ： (865, 49)


'ラベル比率：'

1    461
0    404
Name: OS_15years, dtype: int64

11it [00:01,  6.32it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Linear SVM,0.804772,0.618557,0.805195,0.666667
Decision Tree,0.82538,0.608247,0.825569,0.648148
Sigmoid SVM,0.752711,0.597938,0.76,0.642202
Logistic Regression,0.795011,0.597938,0.796117,0.628571
RBF SVM,0.918655,0.57732,0.917127,0.616822
Naive Bayes,0.738612,0.587629,0.740581,0.615385
Random Forest,0.867679,0.587629,0.868817,0.607843
Quadratic Discriminant Analysis,0.881779,0.57732,0.87794,0.601942
AdaBoost,0.838395,0.587629,0.837514,0.6
Nearest Neighbors,0.808026,0.608247,0.793946,0.586957


mrna_agilent_microarray
----------------------------------------------------------------------------------------------------
予後年数：15年:
accuracyベースライン： ('0>1', 0.533)
使用特徴量： Index(['RACGAP1', 'SNX24', 'TUB', 'ARRDC3', 'STAT5B', 'PTPLAD1', 'RBBP8',
       'ENC1', 'RNU11', 'UHRF1', 'PPIL3', 'S100P', 'MST1', 'WARS2', 'FGF13',
       'C9orf95', 'WDR67', 'CBX7', 'INTS10', 'SPATA18', 'HIST1H2BF', 'AURKA',
       'TFPT', 'LRRC50', 'PDK3', 'IGDCC4', 'FGD3', 'AK3', 'LRRC48', 'PSTPIP2',
       'PKMYT1', 'CATSPERB', 'TMEM26', 'STAT5A', 'CCT6B', 'C14orf139',
       'C7orf63', 'CD44', 'KIAA1967', 'ATAD2', 'N4BP2L1', 'PIGV', 'GPRC5A',
       'UBE2C', 'OGT', 'TRNP1', 'PREX1', 'GPR172A', 'NAB1'],
      dtype='object')
学習サンプルサイズ： (865, 49)


'ラベル比率：'

1    461
0    404
Name: OS_15years, dtype: int64

11it [00:08,  1.23it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Decision Tree,0.817341,0.670103,0.833333,0.709091
Sigmoid SVM,0.532948,0.536082,0.695324,0.697987
Polynomial SVM,0.921387,0.608247,0.927505,0.648148
Linear SVM,0.758382,0.628866,0.775991,0.647059
RBF SVM,0.758382,0.597938,0.778367,0.621359
Logistic Regression,0.749133,0.608247,0.76841,0.62
Random Forest,0.836994,0.57732,0.852356,0.616822
Quadratic Discriminant Analysis,0.852023,0.597938,0.85872,0.613861
Nearest Neighbors,0.771098,0.556701,0.790254,0.590476
AdaBoost,0.824277,0.57732,0.837607,0.57732


----------------------------------------------------------------------------------------------------
予後年数：15年:
accuracyベースライン： ('0>1', 0.533)
使用特徴量： Index(['RACGAP1', 'SNX24', 'TUB', 'ARRDC3', 'STAT5B', 'PTPLAD1', 'RBBP8',
       'ENC1', 'RNU11', 'UHRF1', 'PPIL3', 'S100P', 'MST1', 'WARS2', 'FGF13',
       'C9orf95', 'WDR67', 'CBX7', 'INTS10', 'SPATA18', 'HIST1H2BF', 'AURKA',
       'TFPT', 'LRRC50', 'PDK3', 'IGDCC4', 'FGD3', 'AK3', 'LRRC48', 'PSTPIP2',
       'PKMYT1', 'CATSPERB', 'TMEM26', 'STAT5A', 'CCT6B', 'C14orf139',
       'C7orf63', 'CD44', 'KIAA1967', 'ATAD2', 'N4BP2L1', 'PIGV', 'GPRC5A',
       'UBE2C', 'OGT', 'TRNP1', 'PREX1', 'GPR172A', 'NAB1'],
      dtype='object')
学習サンプルサイズ： (865, 49)


'ラベル比率：'

1    461
0    404
Name: OS_15years, dtype: int64

11it [00:04,  2.65it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Sigmoid SVM,0.5,0.536082,0.666667,0.697987
Decision Tree,0.798265,0.608247,0.798701,0.634615
Polynomial SVM,0.927332,0.608247,0.927568,0.634615
Linear SVM,0.761388,0.608247,0.763441,0.62
Logistic Regression,0.747289,0.618557,0.745911,0.618557
Quadratic Discriminant Analysis,0.85141,0.597938,0.848283,0.606061
Random Forest,0.845987,0.57732,0.848291,0.594059
AdaBoost,0.824295,0.597938,0.824295,0.589474
RBF SVM,0.75705,0.57732,0.760171,0.585859
Nearest Neighbors,0.784165,0.546392,0.77412,0.56


mrna_agilent_microarray_zscores_ref_diploid_samples
----------------------------------------------------------------------------------------------------
予後年数：15年:
accuracyベースライン： ('0>1', 0.533)
使用特徴量： Index(['RACGAP1', 'CYP2D6', 'PTX4', 'BIK', 'ENC1', 'DENND6A', 'STAT5B',
       'RNU11', 'DNAJB7', 'PPIL3', 'UHRF1', 'RBBP8', 'MIR137HG', 'GABRB1',
       'TOR1B', 'MST1', 'NMRK1', 'FGF13', 'HIST1H2BF', 'SPATA4', 'TBC1D31',
       'USP30', 'S100P', 'APLN', 'MTG2', 'GAL3ST4', 'FGD3', 'PJVK', 'INTS10',
       'CBX7', 'FCN2', 'RNASE9', 'SPATA18', 'ANKAR', 'SULT4A1', 'TSPYL6',
       'IGDCC4', 'TMEM26', 'PDK3', 'AURKA', 'PRR21', 'DNAAF1', 'GPR151',
       'GASK1A', 'NLRX1', 'RAB3B', 'HS3ST1', 'PKMYT1', 'CATSPERB'],
      dtype='object')
学習サンプルサイズ： (865, 49)


'ラベル比率：'

1    461
0    404
Name: OS_15years, dtype: int64

11it [00:01,  6.04it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.788439,0.649485,0.803437,0.673077
RBF SVM,0.893642,0.639175,0.900862,0.672897
Linear SVM,0.786127,0.618557,0.802982,0.654206
AdaBoost,0.856647,0.639175,0.865217,0.653465
Quadratic Discriminant Analysis,0.860116,0.639175,0.863585,0.653465
Sigmoid SVM,0.734104,0.618557,0.750542,0.647619
Random Forest,0.854335,0.597938,0.867925,0.635514
Nearest Neighbors,0.786127,0.597938,0.798694,0.628571
Naive Bayes,0.746821,0.597938,0.760131,0.613861
Decision Tree,0.80578,0.56701,0.826804,0.611111


----------------------------------------------------------------------------------------------------
予後年数：15年:
accuracyベースライン： ('0>1', 0.533)
使用特徴量： Index(['RACGAP1', 'CYP2D6', 'PTX4', 'BIK', 'ENC1', 'DENND6A', 'STAT5B',
       'RNU11', 'DNAJB7', 'PPIL3', 'UHRF1', 'RBBP8', 'MIR137HG', 'GABRB1',
       'TOR1B', 'MST1', 'NMRK1', 'FGF13', 'HIST1H2BF', 'SPATA4', 'TBC1D31',
       'USP30', 'S100P', 'APLN', 'MTG2', 'GAL3ST4', 'FGD3', 'PJVK', 'INTS10',
       'CBX7', 'FCN2', 'RNASE9', 'SPATA18', 'ANKAR', 'SULT4A1', 'TSPYL6',
       'IGDCC4', 'TMEM26', 'PDK3', 'AURKA', 'PRR21', 'DNAAF1', 'GPR151',
       'GASK1A', 'NLRX1', 'RAB3B', 'HS3ST1', 'PKMYT1', 'CATSPERB'],
      dtype='object')
学習サンプルサイズ： (865, 49)


'ラベル比率：'

1    461
0    404
Name: OS_15years, dtype: int64

11it [00:02,  4.99it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Quadratic Discriminant Analysis,0.869848,0.690722,0.864253,0.705882
Logistic Regression,0.780911,0.670103,0.781857,0.686275
Decision Tree,0.815618,0.639175,0.818376,0.672897
RBF SVM,0.903471,0.639175,0.901874,0.666667
Sigmoid SVM,0.732104,0.649485,0.730055,0.666667
Linear SVM,0.799349,0.628866,0.801715,0.660377
Random Forest,0.861171,0.628866,0.862069,0.64
AdaBoost,0.848156,0.618557,0.845815,0.626263
Naive Bayes,0.749458,0.597938,0.748092,0.613861
Nearest Neighbors,0.793926,0.618557,0.771635,0.602151


## lightGBM

In [13]:
# 15-year prognosis data (mRMR features) of the microarray set selected by
# INDEX_MICROARRAY.
microarray_name = config.SET_NAME_MICROARRAY[INDEX_MICROARRAY]
X_splits = X_dict[microarray_name]["mrmr"]
y_splits = y_dict[microarray_name]["mrmr"]
X_train, y_train = X_splits["train"]["X15"], y_splits["train"]["y15"]
X_val, y_val = X_splits["val"]["X15"], y_splits["val"]["y15"]

# Hand-picked LightGBM settings; the seed is fixed for reproducibility.
params = dict(
    max_bin=100,
    num_leaves=30,
    n_estimators=800,
    class_weight="balanced",
    random_state=SEED,
)

# Fit on the training split and score the hard predictions on the validation split.
gbm = LGBMClassifier(**params)
gbm.fit(X_train, y_train)
y_pred = gbm.predict(X_val)
show_scores(y_val, y_pred)

accuracy:  0.6391752577319587
precision:  0.6545454545454545
recall:  0.6923076923076923
f1 score:  0.6728971962616823


## xgboost

In [5]:
import xgboost as xgb

# 15-year prognosis data (mRMR features) of the microarray set selected by
# INDEX_MICROARRAY.
microarray_name = config.SET_NAME_MICROARRAY[INDEX_MICROARRAY]
X_splits = X_dict[microarray_name]["mrmr"]
y_splits = y_dict[microarray_name]["mrmr"]
X_train, y_train = X_splits["train"]["X15"], y_splits["train"]["y15"]
X_val, y_val = X_splits["val"]["X15"], y_splits["val"]["y15"]

# Wrap both splits in DMatrix objects, keeping the column names as feature names.
xgb_train = xgb.DMatrix(X_train, label=y_train, feature_names=X_train.columns)
xgb_val = xgb.DMatrix(X_val, label=y_val, feature_names=X_val.columns)

# Binary classification objective; every other setting is left at the library default.
param = {"objective": "binary:logistic"}
clf = xgb.train(param, xgb_train)
y_pred_proba = clf.predict(xgb_val)
# Threshold the predicted scores at 0.5 to obtain hard 0/1 labels.
y_pred = (y_pred_proba > 0.5).astype(int)

accuracy_score(y_val, y_pred), f1_score(y_val, y_pred)



(0.6494845360824743, 0.6730769230769231)

## モデル毎の比較（年数など）

### 予測モデル、特徴量数、年数毎の比較

In [5]:
def model2result_list(
    clf,
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    model_name: str,
    year: int,
    k: int,
) -> list:
    """Fit a classifier and return its test metrics as one result row.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted
            in place.
        X_train: Training features.
        y_train: Training labels (presumably a pandas Series of 0/1 — confirm
            against the caller).
        X_test: Features of the evaluation split.
        y_test: Labels of the evaluation split.
        model_name: Label stored in the row to identify the model.
        year: Prognosis horizon stored in the row.
        k: Feature-set size stored in the row.

    Returns:
        ``[year, model_name, accuracy, precision, recall, f1, k]``.
    """
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    acc = accuracy_score(y_test, pred)
    pre = precision_score(y_test, pred)
    rec = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    return [year, model_name, acc, pre, rec, f1, k]

In [6]:
# Cross-validated comparison of three classifiers for several mRMR feature-set
# sizes. One result row is collected per (year, k, fold, model).
row = []
fold = 5
microarray_name = "mrna_agilent_microarray_zscores_ref_diploid_samples"
for year in tqdm(range(15, 16, 5)):  # loop over prognosis horizons (years)
    prognosis_Xlabel = "X{0:0=2}".format(year)
    prognosis_ylabel = "y{0:0=2}".format(year)

    # The data only depends on the year, so it is looked up once per year
    # instead of once per feature-set size.
    # NOTE(review): X comes from the "mrmr" entry, i.e. it presumably already
    # holds pre-selected features; that would bound the usable K and may make
    # the CV scores optimistic — confirm.
    X = X_dict[microarray_name]["mrmr"]["train"][prognosis_Xlabel]
    y = y_dict[microarray_name]["mrmr"]["train"][prognosis_ylabel]

    for k in range(10, 51, 10):  # number of features requested from mRMR
        skf = StratifiedKFold(n_splits=fold, shuffle=True, random_state=SEED)
        for train_index, val_index in skf.split(X, y):
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y.iloc[train_index], y.iloc[val_index]

            assert X_train.shape[0] == y_train.shape[0], "train size is incorrect"
            assert X_val.shape[0] == y_val.shape[0], "test size is incorrect"

            # Feature selection (mRMR) on the training fold only.
            features = mrmr.mrmr_classif(
                X=X_train,
                y=y_train,
                K=k,
                show_progress=False,
            )

            # Keep only the selected features in both the train and validation fold.
            X_train = X_train[features]
            X_val = X_val[features]

            # Scaling via the project helper, given the train and validation fold.
            X_train, X_val = transform_std(X_train, X_val)
            # X_train, X_val = transform_norm(X_train, X_val)

            # NOTE(review): the under-sampled data is computed but never used —
            # every model below is trained on X_train / y_train (with
            # class_weight="balanced"). Left unchanged to keep the reported
            # scores; decide whether to train on X_resampled or drop this step.
            rus = RandomUnderSampler(sampling_strategy="all", random_state=SEED)
            # smote = SMOTE(sampling_strategy="all", random_state=SEED)
            X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

            # Random forest
            rf = RandomForestClassifier(
                n_estimators=500,
                criterion="entropy",
                max_depth=5,
                max_leaf_nodes=20,
                max_features="log2",
                class_weight="balanced",
                random_state=SEED,
            )
            # Logistic regression
            lr = LogisticRegression(
                penalty="l2",
                C=0.5,
                solver="saga",
                class_weight="balanced",
            )
            # Support vector machine
            svm = SVC(
                C=50,
                kernel="sigmoid",
                decision_function_shape="ovr",
                class_weight="balanced",
            )

            # Fit and score in the original order (rf, lr, svm), one row per model.
            for model_name, clf in (("rf", rf), ("lr", lr), ("svm", svm)):
                row.append(
                    model2result_list(
                        clf, X_train, y_train, X_val, y_val, model_name, year, k
                    )
                )

# Collect all rows into a single result frame, built once after the loops.
df_result = pd.DataFrame(
    row,
    columns=[
        "year",
        "model_name",
        "acc",
        "precision",
        "recall",
        "f1",
        "feature size",
    ],
)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [02:02<00:00, 122.37s/it]


In [7]:
# Mean cross-validation metrics of the random forest per (year, feature size),
# best accuracy first. numeric_only=True keeps the string column `model_name`
# out of the mean; pandas >= 2.0 raises a TypeError for it otherwise.
(
    df_result[df_result["model_name"] == "rf"]
    .groupby(["year", "feature size"])
    .mean(numeric_only=True)
    .sort_values("acc", ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,acc,precision,recall,f1
year,feature size,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
15,40,0.720231,0.734554,0.746166,0.739993
15,50,0.720231,0.734554,0.746166,0.739993
15,30,0.713295,0.725555,0.743922,0.734229
15,20,0.683237,0.694121,0.728822,0.710235
15,10,0.677457,0.695627,0.704979,0.699879


In [8]:
# Mean cross-validation metrics of the logistic regression per (year, feature
# size), best accuracy first. numeric_only=True keeps the string column
# `model_name` out of the mean; pandas >= 2.0 raises a TypeError for it otherwise.
(
    df_result[df_result["model_name"] == "lr"]
    .groupby(["year", "feature size"])
    .mean(numeric_only=True)
    .sort_values("acc", ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,acc,precision,recall,f1
year,feature size,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
15,40,0.732948,0.756673,0.737564,0.746544
15,50,0.732948,0.756673,0.737564,0.746544
15,30,0.715607,0.739345,0.722394,0.73034
15,20,0.695954,0.725919,0.689808,0.707205
15,10,0.690173,0.722748,0.681043,0.701143


In [9]:
# Mean cross-validation metrics of the SVM per (year, feature size), best
# accuracy first. numeric_only=True keeps the string column `model_name` out of
# the mean; pandas >= 2.0 raises a TypeError for it otherwise.
(
    df_result[df_result["model_name"] == "svm"]
    .groupby(["year", "feature size"])
    .mean(numeric_only=True)
    .sort_values("acc", ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,acc,precision,recall,f1
year,feature size,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
15,40,0.680925,0.712835,0.674638,0.692703
15,50,0.680925,0.712835,0.674638,0.692703
15,20,0.669364,0.697698,0.670266,0.683482
15,30,0.669364,0.700168,0.670196,0.683615
15,10,0.611561,0.653488,0.583567,0.615024


# subtype毎のベースライン・学習

In [3]:
def validate_models_subtype(
    mircroarray_type: str,
    method: str,
):
    """Print a baseline and compare the base classifiers for each subtype.

    For each prognosis year (currently only 15) and each subtype in a fixed
    list, the train/validation data is read from the notebook-level
    ``X_dict`` / ``y_dict`` under the ``"claudin_subtype"`` key. A
    majority-class accuracy baseline and basic data statistics are printed,
    and the score table returned by ``compare_bcms`` (called with a ``SMOTE``
    instance) is displayed. Subtypes whose training frame has no feature
    columns are skipped with a warning.

    Args:
        mircroarray_type: First-level key of ``X_dict`` / ``y_dict``
            (name of the microarray data set).
        method: Feature-selection key below ``"claudin_subtype"``
            (e.g. ``"mrmr"``).

    Returns:
        None. Results are printed / displayed.
    """
    subtypes = [
        "claudin-low",
        "LumA",
        "LumB",
        "Her2",
        "Normal",
        "Basal",
    ]

    for year in range(15, 16, 5):  # loop over prognosis horizons (years)
        print("====={0:0=2}".format(year) * 10)

        for subtype in subtypes:
            prognosis_Xlabel = "X{0:0=2}_{1}".format(year, subtype)
            prognosis_ylabel = "y{0:0=2}_{1}".format(year, subtype)
            X_splits = X_dict[mircroarray_type]["claudin_subtype"][method]
            y_splits = y_dict[mircroarray_type]["claudin_subtype"][method]
            X_train = X_splits["train"][prognosis_Xlabel]
            y_train = y_splits["train"][prognosis_ylabel]
            X_val = X_splits["val"][prognosis_Xlabel]
            y_val = y_splits["val"][prognosis_ylabel]
            assert X_train.shape[0] == y_train.shape[0], "train size is incorrect"
            assert X_val.shape[0] == y_val.shape[0], "val size is incorrect"
            if X_train.shape[1] == 0:
                # Nothing to train on when no feature column is present.
                print(
                    "**--WARNING: FEATURE NUM is 0! -> subtype {0} train is skipped!--**".format(
                        subtype
                    )
                )
                continue

            # Majority-class baseline: accuracy of always predicting one label.
            print("----------" * 10)
            print("subtype: ", subtype)
            print("予後年数：{0:0=2}年:".format(year))
            acc_all_zeros = accuracy_score(y_train, np.zeros(len(y_train)))
            if acc_all_zeros >= 0.5:
                # Class 0 is at least as frequent as class 1.
                score = ("0>1", round(acc_all_zeros, 3))
            else:
                # Class 1 is the majority; the label previously said "0>1" here too.
                score = (
                    "1>0",
                    round(accuracy_score(y_train, np.ones(len(y_train))), 3),
                )
            print("accuracyベースライン：", score)
            print("使用特徴量：", X_train.columns)
            print("学習サンプルサイズ：", X_train.shape)
            display("ラベル比率：", y_train.value_counts())

            sm = SMOTE(random_state=SEED)
            display(
                compare_bcms(
                    X_train,
                    y_train,
                    X_val,
                    y_val,
                    over_sampling_class=sm,
                )
            )

In [7]:
# Run the per-subtype baseline comparison (mRMR features) for every
# microarray data set stored in X_dict.
for k in X_dict:
    print(k)
    validate_models_subtype(method="mrmr", mircroarray_type=k)

mrna_agilent_microarray_zscores_ref_all_samples
=====15=====15=====15=====15=====15=====15=====15=====15=====15=====15
----------------------------------------------------------------------------------------------------
subtype:  claudin-low
予後年数：15年:
accuracyベースライン： ('0>1', 0.562)
使用特徴量： Index(['BF511322', 'BIRC7', 'ZFP90', 'CHRNB3', 'IL34', 'AA939346', 'AW301012',
       'DB341932', 'MRPL53', 'CHDH', 'DA102929', 'GTPBP5', 'BF447974', 'NAV2',
       'C20orf29', 'NCRNA00160', 'PRPSAP1', 'CD511953', 'STX1B', 'COBL'],
      dtype='object')
学習サンプルサイズ： (89, 20)


'ラベル比率：'

0    50
1    39
Name: OS_15years, dtype: int64

11it [00:00, 41.97it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Decision Tree,0.83,0.6,0.852174,0.6
Quadratic Discriminant Analysis,1.0,0.6,1.0,0.5
Linear SVM,1.0,0.5,1.0,0.285714
Logistic Regression,1.0,0.4,1.0,0.25
Polynomial SVM,1.0,0.4,1.0,0.25
RBF SVM,1.0,0.4,1.0,0.25
Sigmoid SVM,0.95,0.4,0.951456,0.25
Nearest Neighbors,0.95,0.3,0.952381,0.222222
AdaBoost,1.0,0.2,1.0,0.0
Naive Bayes,0.99,0.4,0.990099,0.0


----------------------------------------------------------------------------------------------------
subtype:  LumA
予後年数：15年:
accuracyベースライン： ('0>1', 0.629)
使用特徴量： Index(['OS9', 'BX102680', 'MMP15', 'BCHE', 'OR2M7', 'PSMB11', 'ZIC2',
       'CEACAM3', 'YY2', 'S100P', 'ANGPT2', 'AW243302', 'FPGS', 'FLJ41170',
       'KLK1', 'CDH10', 'IL20RA', 'N90609', 'APOBEC3A', 'C15orf26'],
      dtype='object')
学習サンプルサイズ： (275, 20)


'ラベル比率：'

0    173
1    102
Name: OS_15years, dtype: int64

11it [00:00, 34.07it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Quadratic Discriminant Analysis,0.907514,0.83871,0.903614,0.782609
RBF SVM,0.947977,0.806452,0.947977,0.727273
Sigmoid SVM,0.812139,0.774194,0.811594,0.695652
AdaBoost,0.985549,0.774194,0.985673,0.666667
Logistic Regression,0.890173,0.774194,0.888235,0.666667
Decision Tree,0.82948,0.741935,0.820669,0.636364
Polynomial SVM,0.971098,0.741935,0.970414,0.636364
Naive Bayes,0.84104,0.774194,0.831804,0.631579
Linear SVM,0.872832,0.709677,0.871345,0.571429
Random Forest,0.916185,0.645161,0.915452,0.47619


----------------------------------------------------------------------------------------------------
subtype:  LumB
予後年数：15年:
accuracyベースライン： ('0>1', 0.648)
使用特徴量： Index(['ENC1', 'AI093492', 'C4orf32', 'SUN1', 'C14orf49', 'CYP2C19',
       'LOC645177', 'TET3', 'CELSR2', 'CR743466', 'LPAR3', 'GPR32', 'TTTY8B',
       'TP53BP2', 'AW118757', 'WDR60', 'MBD3', 'TTC1', 'LRRC50', 'BI481412'],
      dtype='object')
学習サンプルサイズ： (213, 20)


'ラベル比率：'

1    138
0     75
Name: OS_15years, dtype: int64

11it [00:00, 35.86it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Polynomial SVM,0.967391,0.625,0.968421,0.727273
Quadratic Discriminant Analysis,0.967391,0.583333,0.967033,0.705882
Random Forest,0.949275,0.583333,0.948529,0.666667
Naive Bayes,0.90942,0.541667,0.90566,0.645161
RBF SVM,0.971014,0.541667,0.970149,0.645161
Decision Tree,0.82971,0.583333,0.815686,0.642857
AdaBoost,1.0,0.5,1.0,0.625
Logistic Regression,0.945652,0.458333,0.943396,0.580645
Sigmoid SVM,0.905797,0.5,0.9,0.571429
Linear SVM,0.942029,0.458333,0.938931,0.518519


----------------------------------------------------------------------------------------------------
subtype:  Her2
予後年数：15年:
accuracyベースライン： ('0>1', 0.68)
使用特徴量： Index(['MICAL1', 'PCSK1N', 'AI797128', 'MGAT4C', 'LINC00307', 'DDX25', 'ACRV1',
       'OR13H1', 'MTTP', 'AW593287', 'AA405052', 'CA449938', 'PPIE_1', 'SETD5',
       'INSC', 'KRTAP21-1', 'LOC441617', 'GRIP1', 'WARS2', 'ESYT1'],
      dtype='object')
学習サンプルサイズ： (122, 20)


'ラベル比率：'

1    83
0    39
Name: OS_15years, dtype: int64

11it [00:00, 41.17it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Polynomial SVM,0.957831,0.785714,0.959538,0.842105
Naive Bayes,0.963855,0.714286,0.964286,0.777778
Quadratic Discriminant Analysis,0.981928,0.714286,0.982249,0.777778
RBF SVM,0.993976,0.642857,0.994012,0.705882
Sigmoid SVM,0.927711,0.642857,0.925926,0.705882
AdaBoost,1.0,0.642857,1.0,0.666667
Logistic Regression,0.96988,0.571429,0.97006,0.625
Random Forest,0.957831,0.571429,0.958084,0.625
Nearest Neighbors,0.96988,0.5,0.969325,0.533333
Linear SVM,0.96988,0.428571,0.969697,0.5


----------------------------------------------------------------------------------------------------
subtype:  Normal
予後年数：15年:
accuracyベースライン： ('0>1', 0.569)
使用特徴量： Index(['KIF13B', 'ZNF833P', 'RNF170', 'T80781', 'SLC25A15', 'ENO4', 'HCN1',
       'Z38762', 'AI797584', 'LOC389033', 'AP2A2', 'ZFAND2A', 'AI745455',
       'TBC1D14', 'JMJD6', 'NT5M', 'RPGRIP1L', 'SPTAN1', 'LOXL4', 'ARSG'],
      dtype='object')
学習サンプルサイズ： (58, 20)


'ラベル比率：'

1    33
0    25
Name: OS_15years, dtype: int64

11it [00:00, 50.77it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Decision Tree,0.833333,0.714286,0.825397,0.75
Naive Bayes,0.969697,0.714286,0.969697,0.75
Nearest Neighbors,1.0,0.714286,1.0,0.75
Random Forest,0.954545,0.714286,0.953846,0.75
Quadratic Discriminant Analysis,1.0,0.571429,1.0,0.727273
AdaBoost,1.0,0.571429,1.0,0.666667
Logistic Regression,1.0,0.571429,1.0,0.666667
Polynomial SVM,0.984848,0.571429,0.985075,0.666667
RBF SVM,1.0,0.571429,1.0,0.666667
Sigmoid SVM,1.0,0.571429,1.0,0.666667


----------------------------------------------------------------------------------------------------
subtype:  Basal
予後年数：15年:
accuracyベースライン： ('0>1', 0.61)
使用特徴量： Index(['FBXO31', 'AW572907', 'AI939551', 'FAM24A', 'BX099468', 'TUBB3',
       'DB312513', 'OR51I2', 'PAPOLA', 'TRIM77P', 'CXCL13', 'BM676522',
       'STATH', 'HSD3B2', 'NUDT12', 'PTX4', 'CCDC141', 'AW118163', 'AW293618',
       'ELAVL2'],
      dtype='object')
学習サンプルサイズ： (105, 20)


'ラベル比率：'

1    64
0    41
Name: OS_15years, dtype: int64

11it [00:00, 45.35it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Quadratic Discriminant Analysis,0.992188,0.583333,0.992126,0.705882
Decision Tree,0.851562,0.5,0.861314,0.666667
Polynomial SVM,0.992188,0.416667,0.992248,0.588235
Naive Bayes,0.953125,0.333333,0.953125,0.5
Random Forest,0.976562,0.333333,0.976378,0.5
AdaBoost,1.0,0.333333,1.0,0.428571
Nearest Neighbors,0.90625,0.333333,0.9,0.428571
Linear SVM,1.0,0.25,1.0,0.4
Logistic Regression,1.0,0.25,1.0,0.4
RBF SVM,1.0,0.25,1.0,0.4


mrna_agilent_microarray
=====15=====15=====15=====15=====15=====15=====15=====15=====15=====15
----------------------------------------------------------------------------------------------------
subtype:  claudin-low
予後年数：15年:
accuracyベースライン： ('0>1', 0.562)
使用特徴量： Index(['COBL', 'CNOT1', 'HIST1H4E', 'EEF1A2', 'SPRR3', 'ACACA', 'SLC35C1',
       'MAGEA10', 'C1orf116', 'DA102929', 'SEPX1', 'SLC25A37_1', 'HADH',
       'PWWP2B', 'SERPINE1', 'SHANK2', 'AMH', 'NAV2', 'ADORA1', 'SLC27A5'],
      dtype='object')
学習サンプルサイズ： (89, 20)


'ラベル比率：'

0    50
1    39
Name: OS_15years, dtype: int64

11it [00:00, 43.05it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Linear SVM,0.97,0.6,0.970297,0.6
Logistic Regression,0.96,0.6,0.959184,0.6
Naive Bayes,0.95,0.6,0.948454,0.6
Sigmoid SVM,0.5,0.4,0.666667,0.571429
Decision Tree,0.78,0.4,0.810345,0.5
AdaBoost,1.0,0.5,1.0,0.444444
Polynomial SVM,1.0,0.5,1.0,0.444444
Quadratic Discriminant Analysis,0.99,0.5,0.989899,0.444444
RBF SVM,0.87,0.5,0.873786,0.444444
Random Forest,0.93,0.4,0.932039,0.4


----------------------------------------------------------------------------------------------------
subtype:  LumA
予後年数：15年:
accuracyベースライン： ('0>1', 0.629)
使用特徴量： Index(['ZIC2', 'GNA14', 'RPS17', 'ZMYND12', 'KRT18', 'ANGPT2', 'QARS1', 'LCT',
       'S100P', 'IL20RA', 'MANBA', 'PTPLAD1', 'LSR', 'MAGEA1', 'BCHE',
       'APOBEC3A', 'PAGE2B', 'DIRAS3', 'PCDHB17', 'BC042566'],
      dtype='object')
学習サンプルサイズ： (275, 20)


'ラベル比率：'

0    173
1    102
Name: OS_15years, dtype: int64

11it [00:00, 29.12it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Nearest Neighbors,0.803468,0.83871,0.815217,0.761905
AdaBoost,0.947977,0.774194,0.947977,0.666667
Decision Tree,0.820809,0.741935,0.824859,0.636364
Logistic Regression,0.803468,0.741935,0.798817,0.636364
Naive Bayes,0.771676,0.741935,0.745981,0.636364
Polynomial SVM,0.84104,0.741935,0.844193,0.636364
Random Forest,0.919075,0.741935,0.918605,0.636364
Linear SVM,0.812139,0.709677,0.80826,0.608696
RBF SVM,0.774566,0.709677,0.769231,0.608696
Sigmoid SVM,0.5,0.354839,0.666667,0.52381


----------------------------------------------------------------------------------------------------
subtype:  LumB
予後年数：15年:
accuracyベースライン： ('0>1', 0.648)
使用特徴量： Index(['ENC1', 'C19orf43', 'PTPN2', 'PPIG', 'ZNF787', 'LY6D', 'SLC7A2',
       'EFCAB11', 'CAMK2D', 'C9orf95', 'PHF19', 'CELSR2', 'PUS7', 'TP53BP2',
       'GPX4', 'IER3', 'CWH43', 'NPDC1', 'LRRC50', 'SLFN5'],
      dtype='object')
学習サンプルサイズ： (213, 20)


'ラベル比率：'

1    138
0     75
Name: OS_15years, dtype: int64

11it [00:00, 30.59it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Sigmoid SVM,0.5,0.666667,0.666667,0.8
Quadratic Discriminant Analysis,0.90942,0.708333,0.907063,0.774194
Naive Bayes,0.847826,0.666667,0.838462,0.692308
Logistic Regression,0.807971,0.625,0.805861,0.689655
Linear SVM,0.826087,0.583333,0.822222,0.666667
Random Forest,0.905797,0.583333,0.904412,0.666667
AdaBoost,0.98913,0.541667,0.989091,0.645161
Decision Tree,0.82971,0.541667,0.833922,0.645161
Nearest Neighbors,0.82971,0.583333,0.809717,0.642857
Polynomial SVM,0.90942,0.583333,0.907063,0.642857


----------------------------------------------------------------------------------------------------
subtype:  Her2
予後年数：15年:
accuracyベースライン： ('0>1', 0.68)
使用特徴量： Index(['MICAL1', 'PCSK1N', 'FBLN5', 'FAM3B', 'PITRM1', 'VAMP8', 'WARS2',
       'ANKRD36', 'CDC7', 'PITHD1', 'MAP6D1', 'SHMT1', 'SLC41A3', 'DIAPH1',
       'WIT1', 'PLIN4', 'TM4SF1', 'ABCA4', 'SERPINB7', 'CYC1'],
      dtype='object')
学習サンプルサイズ： (122, 20)


'ラベル比率：'

1    83
0    39
Name: OS_15years, dtype: int64

11it [00:00, 35.16it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.927711,0.714286,0.926829,0.75
Polynomial SVM,0.96988,0.714286,0.97006,0.75
Random Forest,0.945783,0.642857,0.945455,0.736842
Quadratic Discriminant Analysis,0.96988,0.642857,0.97006,0.705882
AdaBoost,1.0,0.571429,1.0,0.7
Linear SVM,0.939759,0.642857,0.939024,0.666667
Naive Bayes,0.915663,0.642857,0.916667,0.666667
RBF SVM,0.903614,0.642857,0.9,0.666667
Nearest Neighbors,0.879518,0.5,0.871795,0.533333
Decision Tree,0.843373,0.428571,0.843373,0.333333


----------------------------------------------------------------------------------------------------
subtype:  Normal
予後年数：15年:
accuracyベースライン： ('0>1', 0.569)
使用特徴量： Index(['KIF13B', 'RPS6KA2', 'NDUFS8', 'ARF5', 'SCG2', 'ARSG', 'LOXL4', 'RAB36',
       'LOC389033', 'DNAJB11', 'ZFAND2A', 'SPTAN1', 'IFT88', 'NFKBIA',
       'LCLAT1', 'CTDSPL', 'HOMER2', 'SLFN5', 'MAP2K4', 'SESN1'],
      dtype='object')
学習サンプルサイズ： (58, 20)


'ラベル比率：'

1    33
0    25
Name: OS_15years, dtype: int64

11it [00:00, 45.16it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Nearest Neighbors,0.984848,0.857143,0.985075,0.888889
Decision Tree,0.80303,0.714286,0.811594,0.8
Naive Bayes,0.954545,0.714286,0.952381,0.75
Quadratic Discriminant Analysis,1.0,0.714286,1.0,0.75
RBF SVM,0.939394,0.714286,0.939394,0.75
Sigmoid SVM,0.5,0.571429,0.666667,0.727273
Linear SVM,1.0,0.571429,1.0,0.666667
Logistic Regression,0.984848,0.571429,0.984615,0.666667
Random Forest,0.939394,0.571429,0.935484,0.666667
AdaBoost,1.0,0.428571,1.0,0.5


----------------------------------------------------------------------------------------------------
subtype:  Basal
予後年数：15年:
accuracyベースライン： ('0>1', 0.61)
使用特徴量： Index(['FBXO31', 'ALDH4A1', 'TNFSF13', 'PLAC1', 'CD688177', 'NOS3', 'GTSF1',
       'KLHL13', 'CAMK1D', 'KLHL36', 'DEFB1', 'CXCL13', 'GPR83', 'IFT27',
       'TRAPPC2L', 'CAPZA1', 'NFE2', 'PEBP4', 'KLK13', 'DEF8'],
      dtype='object')
学習サンプルサイズ： (105, 20)


'ラベル比率：'

1    64
0    41
Name: OS_15years, dtype: int64

11it [00:00, 39.87it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Sigmoid SVM,0.5,0.583333,0.666667,0.736842
Quadratic Discriminant Analysis,0.984375,0.666667,0.984615,0.714286
RBF SVM,0.820312,0.583333,0.821705,0.666667
Decision Tree,0.804688,0.5,0.814815,0.666667
Logistic Regression,0.875,0.583333,0.875,0.615385
Linear SVM,0.875,0.5,0.876923,0.571429
Nearest Neighbors,0.851562,0.5,0.840336,0.571429
AdaBoost,1.0,0.333333,1.0,0.5
Naive Bayes,0.898438,0.5,0.900763,0.5
Polynomial SVM,0.976562,0.5,0.976378,0.5


mrna_agilent_microarray_zscores_ref_diploid_samples
=====15=====15=====15=====15=====15=====15=====15=====15=====15=====15
----------------------------------------------------------------------------------------------------
subtype:  claudin-low
予後年数：15年:
accuracyベースライン： ('0>1', 0.562)
使用特徴量： Index(['COBL', 'SLC36A3', 'ID3', 'TIMM22', 'OR7C2', 'SF3A1', 'MAST2',
       'LINC00160', 'NPFFR1', 'NAV2'],
      dtype='object')
学習サンプルサイズ： (89, 10)


'ラベル比率：'

0    50
1    39
Name: OS_15years, dtype: int64

11it [00:00, 50.53it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Decision Tree,0.84,0.4,0.836735,0.5
Logistic Regression,0.94,0.6,0.94,0.5
Linear SVM,0.95,0.5,0.950495,0.444444
RBF SVM,0.96,0.5,0.960784,0.444444
Sigmoid SVM,0.92,0.5,0.92,0.444444
AdaBoost,1.0,0.6,1.0,0.333333
Nearest Neighbors,0.85,0.6,0.859813,0.333333
Polynomial SVM,0.95,0.6,0.947368,0.333333
Quadratic Discriminant Analysis,0.94,0.5,0.94,0.285714
Naive Bayes,0.9,0.4,0.9,0.25


----------------------------------------------------------------------------------------------------
subtype:  LumA
予後年数：15年:
accuracyベースライン： ('0>1', 0.629)
使用特徴量： Index(['OS9', 'ELF3', 'HCRTR1', 'BAG5', 'RRP8', 'MTNR1A', 'PGK1', 'ZIC2',
       'BEX1', 'FOXR1', 'FPGS', 'ANGPT2', 'KLK1', 'CDH10', 'S100P', 'DIRAS3'],
      dtype='object')
学習サンプルサイズ： (275, 16)


'ラベル比率：'

0    173
1    102
Name: OS_15years, dtype: int64

11it [00:00, 32.89it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Nearest Neighbors,0.875723,0.806452,0.883469,0.727273
Polynomial SVM,0.965318,0.677419,0.965517,0.545455
Random Forest,0.904624,0.645161,0.905983,0.521739
Linear SVM,0.817919,0.580645,0.819484,0.518519
Quadratic Discriminant Analysis,0.872832,0.677419,0.873563,0.5
Naive Bayes,0.803468,0.612903,0.804598,0.5
Logistic Regression,0.809249,0.580645,0.810345,0.48
Sigmoid SVM,0.765896,0.580645,0.769231,0.48
Decision Tree,0.817919,0.645161,0.825485,0.47619
RBF SVM,0.936416,0.645161,0.938202,0.47619


----------------------------------------------------------------------------------------------------
subtype:  LumB
予後年数：15年:
accuracyベースライン： ('0>1', 0.648)
使用特徴量： Index(['ENC1', 'TTC1', 'OR52M1', 'HIST1H1E', 'GHSR', 'GPR32', 'OR2B11',
       'CELSR2', 'GAPDHS', 'NMRK1', 'HBG2', 'LINC02693', 'SPPL2B', 'TTC28',
       'PHF19', 'SLC7A2', 'LPAR3', 'SGTA', 'BPIFB6', 'ZNF273'],
      dtype='object')
学習サンプルサイズ： (213, 20)


'ラベル比率：'

1    138
0     75
Name: OS_15years, dtype: int64

11it [00:00, 37.61it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Quadratic Discriminant Analysis,0.956522,0.666667,0.956835,0.777778
RBF SVM,0.967391,0.666667,0.966292,0.75
Random Forest,0.945652,0.666667,0.945848,0.75
Sigmoid SVM,0.858696,0.625,0.859206,0.709677
AdaBoost,1.0,0.583333,1.0,0.6875
Naive Bayes,0.884058,0.583333,0.883212,0.6875
Logistic Regression,0.916667,0.583333,0.915129,0.666667
Polynomial SVM,0.971014,0.541667,0.971631,0.645161
Nearest Neighbors,0.855072,0.541667,0.84252,0.62069
Decision Tree,0.82971,0.458333,0.819923,0.580645


----------------------------------------------------------------------------------------------------
subtype:  Her2
予後年数：15年:
accuracyベースライン： ('0>1', 0.68)
使用特徴量： Index(['MICAL1', 'MGAT4C', 'TSBP1', 'NR5A1', 'PCSK1N', 'DOK5', 'ACRV1',
       'SH2D4B', 'KIAA1109', 'WARS2', 'MAGEA4', 'KCNH2', 'ANKRD36', 'CROCCP2',
       'GRIP1', 'OR4A16', 'VAMP8', 'CDHR1', 'PPIE_1', 'ESYT1', 'ANKS6',
       'SHISA8', 'MPZL3', 'RFPL1S', 'KRTAP21-1', 'CDKN1A', 'NBN', 'SHE',
       'PITHD1', 'MTTP', 'PRNT', 'MAP6D1', 'OR51A4', 'CLDN25', 'DNAH9',
       'POU4F2'],
      dtype='object')
学習サンプルサイズ： (122, 36)


'ラベル比率：'

1    83
0    39
Name: OS_15years, dtype: int64

11it [00:00, 36.50it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Polynomial SVM,0.987952,0.714286,0.988095,0.8
Quadratic Discriminant Analysis,1.0,0.642857,1.0,0.782609
Decision Tree,0.855422,0.714286,0.865169,0.777778
Naive Bayes,0.957831,0.714286,0.959538,0.777778
RBF SVM,0.987952,0.642857,0.988095,0.736842
AdaBoost,1.0,0.642857,1.0,0.705882
Random Forest,0.957831,0.642857,0.959538,0.705882
Linear SVM,1.0,0.5,1.0,0.631579
Logistic Regression,1.0,0.5,1.0,0.588235
Nearest Neighbors,0.945783,0.428571,0.946108,0.5


----------------------------------------------------------------------------------------------------
subtype:  Normal
予後年数：15年:
accuracyベースライン： ('0>1', 0.569)
使用特徴量： Index(['KIF13B', 'TMEM38A', 'ZNF75D', 'CELP', 'TERB1', 'OSBPL3', 'SZRD1',
       'RNU6ATAC', 'ENO4', 'LCE1A', 'TP53BP1', 'ZFAND2A', 'AP2A2', 'DPY19L2',
       'SPTAN1', 'LOXL4'],
      dtype='object')
学習サンプルサイズ： (58, 16)


'ラベル比率：'

1    33
0    25
Name: OS_15years, dtype: int64

11it [00:00, 48.24it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Decision Tree,0.863636,0.714286,0.876712,0.8
Random Forest,0.969697,0.714286,0.969697,0.75
Linear SVM,1.0,0.571429,1.0,0.666667
Logistic Regression,1.0,0.571429,1.0,0.666667
Naive Bayes,0.939394,0.571429,0.9375,0.666667
Polynomial SVM,0.954545,0.571429,0.956522,0.666667
Quadratic Discriminant Analysis,1.0,0.571429,1.0,0.666667
RBF SVM,0.984848,0.571429,0.985075,0.666667
Sigmoid SVM,0.939394,0.571429,0.941176,0.666667
AdaBoost,1.0,0.428571,1.0,0.5


----------------------------------------------------------------------------------------------------
subtype:  Basal
予後年数：15年:
accuracyベースライン： ('0>1', 0.61)
使用特徴量： Index(['FBXO31', 'OR2T29', 'OR2AG2', 'ALAS2', 'KLHL4', 'CAMK1D', 'YWHAEP7',
       'PTX4', 'KRTAP5-5', 'OR52N4', 'STATH', 'NUDT12'],
      dtype='object')
学習サンプルサイズ： (105, 12)


'ラベル比率：'

1    64
0    41
Name: OS_15years, dtype: int64

11it [00:00, 49.39it/s]


Unnamed: 0_level_0,acc_train,acc_val,f1_train,f1_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Decision Tree,0.828125,0.75,0.822581,0.769231
Linear SVM,0.867188,0.666667,0.866142,0.75
Logistic Regression,0.867188,0.666667,0.868217,0.75
Sigmoid SVM,0.835938,0.583333,0.839695,0.705882
AdaBoost,1.0,0.583333,1.0,0.666667
Random Forest,0.945312,0.583333,0.944882,0.666667
RBF SVM,0.96875,0.5,0.968254,0.625
Naive Bayes,0.859375,0.416667,0.867647,0.588235
Polynomial SVM,0.953125,0.416667,0.95082,0.533333
Quadratic Discriminant Analysis,0.90625,0.416667,0.904762,0.533333
