In [1]:
import os
import random
import itertools
import re

# 基本的なライブラリ
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats

# 描画ライブラリ
import matplotlib.pyplot as plt
import seaborn as sns
from seaborn_analyzer import CustomPairPlot
import graphviz
import pydotplus
from IPython.display import Image
from IPython.display import HTML
from six import StringIO
from ipywidgets import interact, FloatSlider
from dtreeviz.trees import *

# データセット分割
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit

# 補完
from sklearn.experimental import (
    enable_iterative_imputer,
)  # IterativeImputerをimportするために必要
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

# エンコード
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder

# サンプリング
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# 特徴量選択
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import (
    GenericUnivariateSelect,
    f_classif,
    mutual_info_classif,
    chi2,
)
from boruta import BorutaPy
from libraries.mrmr import mrmr

# 学習中
import optuna
from tqdm import tqdm
from sklearn.model_selection import learning_curve, cross_validate, cross_val_score

# 評価指標
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# models
from lightgbm import LGBMClassifier

# XAI
import shap

import warnings


# config python file
import config

SEED = config.SEED

from functions import *

fix_seed(SEED)


# 最大表示列数の指定（ここでは50列を指定）N
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 50)

%matplotlib inline

# データ読み込み

In [4]:
X_dict = pickle_load(config.INTERIM_PREPROCESSED_PROGNOSIS_CROSS_DIR + "/X_dict.pickle")
y_dict = pickle_load(config.INTERIM_PREPROCESSED_PROGNOSIS_CROSS_DIR + "/y_dict.pickle")

print(X_dict.keys())

SET_NAME_MICROARRAY = (
    "df_mrna_agilent_microarray",
    "df_mrna_agilent_microarray_zscores_ref_all_samples",
    "df_mrna_agilent_microarray_zscores_ref_diploid_samples",
)
INDEX_MICROARRAY = 2

dict_keys(['mrna_agilent_microarray_zscores_ref_all_samples', 'mrna_agilent_microarray', 'mrna_agilent_microarray_zscores_ref_diploid_samples'])


In [5]:
X_train = X_dict[SET_NAME_MICROARRAY[INDEX_MICROARRAY][3:]]["mrmr"]["train"]["X15"]
y_train = y_dict[SET_NAME_MICROARRAY[INDEX_MICROARRAY][3:]]["mrmr"]["train"]["y15"]
X_val = X_dict[SET_NAME_MICROARRAY[INDEX_MICROARRAY][3:]]["mrmr"]["val"]["X15"]
y_val = y_dict[SET_NAME_MICROARRAY[INDEX_MICROARRAY][3:]]["mrmr"]["val"]["y15"]

assert X_train.shape[0] == y_train.shape[0], "train size is incorrect"
assert X_val.shape[0] == y_val.shape[0], "test size is incorrect"
assert X_train.shape[1] == X_val.shape[1], "feature size is incorrect"

# accuracyの表示
print("----------" * 10)
print("使用特徴量：", X_train.columns)
print("学習サンプルサイズ：", X_train.shape)
display("ラベル比率：", y_train.value_counts())

----------------------------------------------------------------------------------------------------
使用特徴量： Index(['RACGAP1', 'CYP2D6', 'PTX4', 'BIK', 'ENC1', 'DENND6A', 'STAT5B',
       'RNU11', 'DNAJB7', 'PPIL3', 'UHRF1', 'RBBP8', 'MIR137HG', 'GABRB1',
       'TOR1B', 'MST1', 'NMRK1', 'FGF13', 'HIST1H2BF', 'SPATA4', 'TBC1D31',
       'USP30', 'S100P', 'APLN', 'MTG2', 'GAL3ST4', 'FGD3', 'PJVK', 'INTS10',
       'CBX7', 'FCN2', 'RNASE9', 'SPATA18', 'ANKAR', 'SULT4A1'],
      dtype='object')
学習サンプルサイズ： (865, 35)


'ラベル比率：'

1    461
0    404
Name: OS_15years, dtype: int64

# 最適化(all data)

分類器を学習させ、パラメータのチューニングを行い、高い予測精度を目指す。

## Optuna

### Random Forest

In [28]:
def objective(trial):
    # ランダムフォレストのパラメータチューニング
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
        "max_depth": trial.suggest_int("max_depth", 2, 50, log=True),
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 2, 100),
        "class_weight": trial.suggest_categorical(
            "class_weight", ["balanced", "balanced_subsample"]
        ),
        "random_state": SEED,
    }

    clf = RandomForestClassifier(**params)
    # 10分割交差検証によるテストデータのaccuracyの出力
    score = cross_val_score(
        clf, X_train, y_train, n_jobs=-1, cv=10, scoring=make_scorer(f1_score)
    )
    return score.mean()


study = optuna.create_study(
    direction="maximize", sampler=optuna.samplers.RandomSampler(seed=SEED)
)
study.optimize(objective, n_trials=100, show_progress_bar=True)

[32m[I 2022-09-03 12:57:56,573][0m A new study created in memory with name: no-name-9e407ff8-edd5-4ed6-9470-9808fd0bd933[0m


  0%|          | 0/100 [00:00<?, ?it/s]

[32m[I 2022-09-03 12:57:58,914][0m Trial 0 finished with value: 0.730235781137756 and parameters: {'n_estimators': 548, 'max_depth': 4, 'max_leaf_nodes': 44, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.730235781137756.[0m
[32m[I 2022-09-03 12:57:59,941][0m Trial 1 finished with value: 0.7476018814483331 and parameters: {'n_estimators': 130, 'max_depth': 16, 'max_leaf_nodes': 83, 'class_weight': 'balanced_subsample'}. Best is trial 1 with value: 0.7476018814483331.[0m
[32m[I 2022-09-03 12:58:02,379][0m Trial 2 finished with value: 0.7232782409003276 and parameters: {'n_estimators': 893, 'max_depth': 3, 'max_leaf_nodes': 20, 'class_weight': 'balanced_subsample'}. Best is trial 1 with value: 0.7476018814483331.[0m
[32m[I 2022-09-03 12:58:05,114][0m Trial 3 finished with value: 0.7328254482774693 and parameters: {'n_estimators': 979, 'max_depth': 26, 'max_leaf_nodes': 19, 'class_weight': 'balanced'}. Best is trial 1 with value: 0.7476018814483331.[0m
[32m[I 2022

In [29]:
study.best_trial.params, study.best_trial.values

({'n_estimators': 791,
  'max_depth': 10,
  'max_leaf_nodes': 79,
  'class_weight': 'balanced'},
 [0.7515140834775214])

### LogisticRegression

In [30]:
def objective(trial):
    # ランダムフォレストのパラメータチューニング
    params = {
        "penalty": trial.suggest_categorical("penalty", ["l1", "l2"]),
        "C": trial.suggest_float("C", 1e-5, 1e5),
        "max_iter": 1000,
        "class_weight": "balanced",
        "random_state": SEED,
    }

    clf = LogisticRegression(**params)
    # 10分割交差検証によるテストデータのaccuracyの出力
    score = cross_val_score(
        clf, X_train, y_train, n_jobs=-1, cv=10, scoring=make_scorer(f1_score)
    )
    return score.mean()


study = optuna.create_study(
    direction="maximize", sampler=optuna.samplers.RandomSampler(seed=SEED)
)
study.optimize(objective, n_trials=10)

[32m[I 2022-09-03 13:00:29,370][0m A new study created in memory with name: no-name-127d9aa6-458d-48a7-a9a2-ffc61d0c788a[0m
[33m[W 2022-09-03 13:00:29,385][0m Trial 0 failed, because the objective function returned nan.[0m
[33m[W 2022-09-03 13:00:29,399][0m Trial 1 failed, because the objective function returned nan.[0m
[32m[I 2022-09-03 13:00:29,424][0m Trial 2 finished with value: 0.7412896305631556 and parameters: {'penalty': 'l2', 'C': 13670.65897712823}. Best is trial 2 with value: 0.7412896305631556.[0m
[32m[I 2022-09-03 13:00:29,451][0m Trial 3 finished with value: 0.7412896305631556 and parameters: {'penalty': 'l2', 'C': 20920.212219626937}. Best is trial 2 with value: 0.7412896305631556.[0m
[33m[W 2022-09-03 13:00:29,468][0m Trial 4 failed, because the objective function returned nan.[0m
[33m[W 2022-09-03 13:00:29,484][0m Trial 5 failed, because the objective function returned nan.[0m
[33m[W 2022-09-03 13:00:29,499][0m Trial 6 failed, because the objecti

In [31]:
study.best_trial.params, study.best_trial.values

({'penalty': 'l2', 'C': 13670.65897712823}, [0.7412896305631556])

### lightGBM

In [32]:
def objective(trial):
    params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
        "max_bin": trial.suggest_int("max_bin", 100, 300),
        "num_leaves": trial.suggest_int("num_leaves", 20, 50),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 1e-1),
        "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
        "class_weight": "balanced",
        "random_state": SEED,
    }
    clf = LGBMClassifier(boosting_type="gbdt", **params)
    # 10分割交差検証によるテストデータのaccuracyの出力
    score = cross_val_score(clf, X_train, y_train, n_jobs=-1, cv=10)
    return score.mean()


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, show_progress_bar=True)

[32m[I 2022-09-03 13:00:29,581][0m A new study created in memory with name: no-name-aea5672f-708a-4678-a429-7bbed6e5b832[0m


  0%|          | 0/100 [00:00<?, ?it/s]

[32m[I 2022-09-03 13:00:30,297][0m Trial 0 finished with value: 0.716840417000802 and parameters: {'lambda_l1': 1.4986058527220454, 'lambda_l2': 6.176627516796918e-05, 'max_bin': 224, 'num_leaves': 23, 'learning_rate': 0.08934504862935556, 'n_estimators': 924}. Best is trial 0 with value: 0.716840417000802.[0m
[32m[I 2022-09-03 13:00:32,349][0m Trial 1 finished with value: 0.7168136861801657 and parameters: {'lambda_l1': 1.1844492670267338e-06, 'lambda_l2': 0.001925957612498235, 'max_bin': 132, 'num_leaves': 22, 'learning_rate': 0.026777556988617345, 'n_estimators': 866}. Best is trial 0 with value: 0.716840417000802.[0m
[32m[I 2022-09-03 13:00:33,622][0m Trial 2 finished with value: 0.7133386794974605 and parameters: {'lambda_l1': 2.8896235573995474e-05, 'lambda_l2': 0.008460966001215394, 'max_bin': 110, 'num_leaves': 39, 'learning_rate': 0.0017625860845904935, 'n_estimators': 540}. Best is trial 0 with value: 0.716840417000802.[0m
[32m[I 2022-09-03 13:00:34,792][0m Trial 3

In [33]:
study.best_trial.params, study.best_trial.values

({'lambda_l1': 4.2187115947258607e-07,
  'lambda_l2': 0.01940432465412003,
  'max_bin': 182,
  'num_leaves': 48,
  'learning_rate': 0.05125399049654937,
  'n_estimators': 835},
 [0.7353381448810479])



# 最適化(subtypes)

分類器を学習させ、パラメータのチューニングを行い、高い予測精度を目指す。

In [8]:
dict_subtype_values = dict()
dict_subtype_params = dict()

## RandomForest

In [9]:
def objective(trial):
    warnings.simplefilter("ignore")
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
        "max_depth": trial.suggest_int("max_depth", 2, 50, log=True),
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 2, 100),
        "class_weight": trial.suggest_categorical(
            "class_weight", ["balanced", "balanced_subsample"]
        ),
        "random_state": SEED,
    }

    clf = RandomForestClassifier(**params)
    # 10分割交差検証によるテストデータのaccuracyの出力
    score = cross_val_score(clf, X_train_subtype, y_train_subtype, n_jobs=-1, cv=10)
    return score.mean()

In [10]:
optuna.logging.disable_default_handler()
warnings.simplefilter("ignore")
subtypes = [
    "claudin-low",
    "LumA",
    "LumB",
    "Her2",
    "Normal",
    "Basal",
]
dict_subtype_values["rf"] = dict()
dict_subtype_params["rf"] = dict()
for year in range(15, 16, 5):  # 予後年数毎のループ
    for subtype in subtypes:
        dict_subtype_values["rf"][subtype] = dict()
        dict_subtype_params["rf"][subtype] = dict()
        prognosis_Xlabel = "X{0:0=2}_{1}".format(year, subtype)
        prognosis_ylabel = "y{0:0=2}_{1}".format(year, subtype)
        X_train_subtype = X_dict[SET_NAME_MICROARRAY[INDEX_MICROARRAY][3:]][
            "claudin_subtype"
        ]["mrmr"]["train"][prognosis_Xlabel]
        y_train_subtype = y_dict[SET_NAME_MICROARRAY[INDEX_MICROARRAY][3:]][
            "claudin_subtype"
        ]["mrmr"]["train"][prognosis_ylabel]
        assert (
            X_train_subtype.shape[0] == y_train_subtype.shape[0]
        ), "train size is incorrect"

        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=100, show_progress_bar=True)
        dict_subtype_values["rf"][subtype][year] = study.best_trial.values
        dict_subtype_params["rf"][subtype][year] = study.best_trial.params

100%|██████████████████████████████████████████████████████| 6/6 [10:57<00:00, 109.62s/it]


## LogitsticRegression

In [17]:
def objective(trial):
    warnings.simplefilter("ignore")
    params = {
        "penalty": trial.suggest_categorical("penalty", ["l1", "l2"]),
        "C": trial.suggest_float("C", 1e-5, 1e5),
        "max_iter": 1000,
        "class_weight": "balanced",
        "random_state": SEED,
    }

    clf = LogisticRegression(**params)
    # 10分割交差検証によるテストデータのaccuracyの出力
    score = cross_val_score(clf, X_train_subtype, y_train_subtype, n_jobs=-1, cv=10)
    return score.mean()

In [18]:
optuna.logging.disable_default_handler()
warnings.simplefilter("ignore")
subtypes = [
    "claudin-low",
    "LumA",
    "LumB",
    "Her2",
    "Normal",
    "Basal",
]
dict_subtype_values["lr"] = dict()
dict_subtype_params["lr"] = dict()
for year in range(15, 16, 5):  # 予後年数毎のループ
    print("====={0:0=2}".format(year) * 10)
    for subtype in subtypes:
        print("----------" * 10)
        print(subtype)
        dict_subtype_values["lr"][subtype] = dict()
        dict_subtype_params["lr"][subtype] = dict()
        prognosis_Xlabel = "X{0:0=2}_{1}".format(year, subtype)
        prognosis_ylabel = "y{0:0=2}_{1}".format(year, subtype)
        X_train_subtype = X_dict[SET_NAME_MICROARRAY[INDEX_MICROARRAY][3:]][
            "claudin_subtype"
        ]["mrmr"]["train"][prognosis_Xlabel]
        y_train_subtype = y_dict[SET_NAME_MICROARRAY[INDEX_MICROARRAY][3:]][
            "claudin_subtype"
        ]["mrmr"]["train"][prognosis_ylabel]
        assert (
            X_train_subtype.shape[0] == y_train_subtype.shape[0]
        ), "train size is incorrect"

        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=100, show_progress_bar=True)
        dict_subtype_values["lr"][subtype][year] = study.best_trial.values
        dict_subtype_params["lr"][subtype][year] = study.best_trial.params

=====15=====15=====15=====15=====15=====15=====15=====15=====15=====15
----------------------------------------------------------------------------------------------------
claudin-low


  0%|          | 0/100 [00:00<?, ?it/s]

[33m[W 2022-09-02 04:32:05,555][0m Trial 0 failed, because the objective function returned nan.[0m
[32m[I 2022-09-02 04:32:05,580][0m Trial 1 finished with value: 0.875 and parameters: {'penalty': 'l2', 'C': 67264.13883788427}. Best is trial 1 with value: 0.875.[0m
[33m[W 2022-09-02 04:32:05,595][0m Trial 2 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:05,608][0m Trial 3 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:05,622][0m Trial 4 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:05,639][0m Trial 5 failed, because the objective function returned nan.[0m
[32m[I 2022-09-02 04:32:05,666][0m Trial 6 finished with value: 0.8861111111111111 and parameters: {'penalty': 'l2', 'C': 10849.90971444913}. Best is trial 6 with value: 0.8861111111111111.[0m
[32m[I 2022-09-02 04:32:05,695][0m Trial 7 finished with value: 0.898611111111111 and parameters: {'penalty': 'l2', 'C': 3

[32m[I 2022-09-02 04:32:07,235][0m A new study created in memory with name: no-name-c4d64a6c-89d8-48fc-bc5b-a9b5de80544c[0m


[33m[W 2022-09-02 04:32:07,232][0m Trial 99 failed, because the objective function returned nan.[0m
----------------------------------------------------------------------------------------------------
LumA


  0%|          | 0/100 [00:00<?, ?it/s]

[32m[I 2022-09-02 04:32:07,270][0m Trial 0 finished with value: 0.7997354497354497 and parameters: {'penalty': 'l2', 'C': 45008.96198964705}. Best is trial 0 with value: 0.7997354497354497.[0m
[33m[W 2022-09-02 04:32:07,282][0m Trial 1 failed, because the objective function returned nan.[0m
[32m[I 2022-09-02 04:32:07,302][0m Trial 2 finished with value: 0.7997354497354497 and parameters: {'penalty': 'l2', 'C': 41375.610999929995}. Best is trial 0 with value: 0.7997354497354497.[0m
[32m[I 2022-09-02 04:32:07,320][0m Trial 3 finished with value: 0.7997354497354497 and parameters: {'penalty': 'l2', 'C': 48151.98857328154}. Best is trial 0 with value: 0.7997354497354497.[0m
[32m[I 2022-09-02 04:32:07,340][0m Trial 4 finished with value: 0.7997354497354497 and parameters: {'penalty': 'l2', 'C': 24240.528756262454}. Best is trial 0 with value: 0.7997354497354497.[0m
[33m[W 2022-09-02 04:32:07,356][0m Trial 5 failed, because the objective function returned nan.[0m
[33m[W 20

[32m[I 2022-09-02 04:32:08,867][0m A new study created in memory with name: no-name-0ad8d0c1-f01b-45b5-9757-b0055753b1f3[0m


[33m[W 2022-09-02 04:32:08,735][0m Trial 91 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:08,751][0m Trial 92 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:08,766][0m Trial 93 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:08,783][0m Trial 94 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:08,798][0m Trial 95 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:08,818][0m Trial 96 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:08,832][0m Trial 97 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:08,848][0m Trial 98 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:08,864][0m Trial 99 failed, because the objective function returned nan.[0m
-------------------------------------------------------------------------

  0%|          | 0/100 [00:00<?, ?it/s]

[32m[I 2022-09-02 04:32:08,903][0m Trial 0 finished with value: 0.8547619047619047 and parameters: {'penalty': 'l2', 'C': 5518.273040198382}. Best is trial 0 with value: 0.8547619047619047.[0m
[32m[I 2022-09-02 04:32:08,923][0m Trial 1 finished with value: 0.8547619047619047 and parameters: {'penalty': 'l2', 'C': 94651.26191053625}. Best is trial 0 with value: 0.8547619047619047.[0m
[33m[W 2022-09-02 04:32:08,934][0m Trial 2 failed, because the objective function returned nan.[0m
[32m[I 2022-09-02 04:32:08,955][0m Trial 3 finished with value: 0.8547619047619047 and parameters: {'penalty': 'l2', 'C': 99602.28703414781}. Best is trial 0 with value: 0.8547619047619047.[0m
[32m[I 2022-09-02 04:32:08,974][0m Trial 4 finished with value: 0.8547619047619047 and parameters: {'penalty': 'l2', 'C': 8646.02068673071}. Best is trial 0 with value: 0.8547619047619047.[0m
[33m[W 2022-09-02 04:32:08,988][0m Trial 5 failed, because the objective function returned nan.[0m
[32m[I 2022-

[32m[I 2022-09-02 04:32:10,552][0m A new study created in memory with name: no-name-f916d4de-bc5f-49a2-984a-a42e1dee389d[0m


[33m[W 2022-09-02 04:32:10,423][0m Trial 92 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:10,445][0m Trial 93 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:10,461][0m Trial 94 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:10,482][0m Trial 95 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:10,496][0m Trial 96 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:10,515][0m Trial 97 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:10,532][0m Trial 98 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:10,549][0m Trial 99 failed, because the objective function returned nan.[0m
----------------------------------------------------------------------------------------------------
Her2


  0%|          | 0/100 [00:00<?, ?it/s]

[33m[W 2022-09-02 04:32:10,583][0m Trial 0 failed, because the objective function returned nan.[0m
[32m[I 2022-09-02 04:32:10,606][0m Trial 1 finished with value: 0.908974358974359 and parameters: {'penalty': 'l2', 'C': 73188.67598349586}. Best is trial 1 with value: 0.908974358974359.[0m
[32m[I 2022-09-02 04:32:10,628][0m Trial 2 finished with value: 0.9256410256410257 and parameters: {'penalty': 'l2', 'C': 25733.870118773557}. Best is trial 2 with value: 0.9256410256410257.[0m
[33m[W 2022-09-02 04:32:10,641][0m Trial 3 failed, because the objective function returned nan.[0m
[32m[I 2022-09-02 04:32:10,663][0m Trial 4 finished with value: 0.908974358974359 and parameters: {'penalty': 'l2', 'C': 85932.61340393519}. Best is trial 2 with value: 0.9256410256410257.[0m
[32m[I 2022-09-02 04:32:10,689][0m Trial 5 finished with value: 0.9173076923076924 and parameters: {'penalty': 'l2', 'C': 41357.50698459852}. Best is trial 2 with value: 0.9256410256410257.[0m
[33m[W 2022-0

[32m[I 2022-09-02 04:32:12,214][0m A new study created in memory with name: no-name-1e591dd1-0bad-471f-b9b9-62210b81372f[0m


[33m[W 2022-09-02 04:32:12,082][0m Trial 91 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:12,099][0m Trial 92 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:12,114][0m Trial 93 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:12,130][0m Trial 94 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:12,149][0m Trial 95 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:12,166][0m Trial 96 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:12,181][0m Trial 97 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:12,196][0m Trial 98 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:12,211][0m Trial 99 failed, because the objective function returned nan.[0m
-------------------------------------------------------------------------

  0%|          | 0/100 [00:00<?, ?it/s]

[33m[W 2022-09-02 04:32:12,243][0m Trial 0 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:12,255][0m Trial 1 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:12,269][0m Trial 2 failed, because the objective function returned nan.[0m
[32m[I 2022-09-02 04:32:12,288][0m Trial 3 finished with value: 0.9833333333333334 and parameters: {'penalty': 'l2', 'C': 54074.44833192041}. Best is trial 3 with value: 0.9833333333333334.[0m
[32m[I 2022-09-02 04:32:12,308][0m Trial 4 finished with value: 0.9666666666666668 and parameters: {'penalty': 'l2', 'C': 3233.9747982715094}. Best is trial 3 with value: 0.9833333333333334.[0m
[32m[I 2022-09-02 04:32:12,329][0m Trial 5 finished with value: 0.9833333333333334 and parameters: {'penalty': 'l2', 'C': 99917.34802571345}. Best is trial 3 with value: 0.9833333333333334.[0m
[33m[W 2022-09-02 04:32:12,345][0m Trial 6 failed, because the objective function returned nan.[0m
[3

[32m[I 2022-09-02 04:32:13,828][0m A new study created in memory with name: no-name-f3354810-2f41-4e29-91e9-46a53106ac7a[0m


[33m[W 2022-09-02 04:32:13,750][0m Trial 94 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:13,765][0m Trial 95 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:13,779][0m Trial 96 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:13,797][0m Trial 97 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:13,811][0m Trial 98 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:13,825][0m Trial 99 failed, because the objective function returned nan.[0m
----------------------------------------------------------------------------------------------------
Basal


  0%|          | 0/100 [00:00<?, ?it/s]

[32m[I 2022-09-02 04:32:13,874][0m Trial 0 finished with value: 0.8763636363636363 and parameters: {'penalty': 'l2', 'C': 48305.366579906215}. Best is trial 0 with value: 0.8763636363636363.[0m
[33m[W 2022-09-02 04:32:13,886][0m Trial 1 failed, because the objective function returned nan.[0m
[33m[W 2022-09-02 04:32:13,900][0m Trial 2 failed, because the objective function returned nan.[0m
[32m[I 2022-09-02 04:32:13,931][0m Trial 3 finished with value: 0.8763636363636363 and parameters: {'penalty': 'l2', 'C': 31518.269841882877}. Best is trial 0 with value: 0.8763636363636363.[0m
[33m[W 2022-09-02 04:32:13,948][0m Trial 4 failed, because the objective function returned nan.[0m
[32m[I 2022-09-02 04:32:13,976][0m Trial 5 finished with value: 0.8763636363636363 and parameters: {'penalty': 'l2', 'C': 32899.86845158977}. Best is trial 0 with value: 0.8763636363636363.[0m
[33m[W 2022-09-02 04:32:13,989][0m Trial 6 failed, because the objective function returned nan.[0m
[

### lightGBM

In [20]:
def objective(trial):
    warnings.simplefilter("ignore")
    params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
        "max_bin": trial.suggest_int("max_bin", 100, 300),
        "num_leaves": trial.suggest_int("num_leaves", 20, 50),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 1e-1),
        "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
        "class_weight": "balanced",
        "random_state": SEED,
    }
    clf = LGBMClassifier(boosting_type="gbdt", **params)
    # 10分割交差検証によるテストデータのaccuracyの出力
    score = cross_val_score(clf, X_train_subtype, y_train_subtype, n_jobs=-1, cv=10)
    return score.mean()

In [14]:
optuna.logging.disable_default_handler()
warnings.simplefilter("ignore")
subtypes = [
    "claudin-low",
    "LumA",
    "LumB",
    "Her2",
    "Normal",
    "Basal",
]
dict_subtype_values["lgb"] = dict()
dict_subtype_params["lgb"] = dict()
for year in range(15, 16, 5):  # 予後年数毎のループ
    print("====={0:0=2}".format(year) * 10)
    for subtype in tqdm(subtypes):
        print("----------" * 10)
        print(subtype)
        dict_subtype_values["lgb"][subtype] = dict()
        dict_subtype_params["lgb"][subtype] = dict()
        prognosis_Xlabel = "X{0:0=2}_{1}".format(year, subtype)
        prognosis_ylabel = "y{0:0=2}_{1}".format(year, subtype)
        X_train_subtype = X_dict[SET_NAME_MICROARRAY[INDEX_MICROARRAY][3:]][
            "claudin_subtype"
        ]["mrmr"]["train"][prognosis_Xlabel]
        y_train_subtype = y_dict[SET_NAME_MICROARRAY[INDEX_MICROARRAY][3:]][
            "claudin_subtype"
        ]["mrmr"]["train"][prognosis_ylabel]
        assert (
            X_train_subtype.shape[0] == y_train_subtype.shape[0]
        ), "train size is incorrect"

        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=100, show_progress_bar=True)
        dict_subtype_values["lgb"][subtype][year] = study.best_trial.values
        dict_subtype_params["lgb"][subtype][year] = study.best_trial.params

=====15=====15=====15=====15=====15=====15=====15=====15=====15=====15


  0%|                                                               | 0/6 [00:00<?, ?it/s]

----------------------------------------------------------------------------------------------------
claudin-low


 17%|█████████▏                                             | 1/6 [00:14<01:14, 14.86s/it]

----------------------------------------------------------------------------------------------------
LumA


 33%|██████████████████▎                                    | 2/6 [00:49<01:45, 26.39s/it]

----------------------------------------------------------------------------------------------------
LumB


 50%|███████████████████████████▌                           | 3/6 [01:18<01:23, 27.79s/it]

----------------------------------------------------------------------------------------------------
Her2


 67%|████████████████████████████████████▋                  | 4/6 [01:36<00:47, 23.98s/it]

----------------------------------------------------------------------------------------------------
Normal


 83%|█████████████████████████████████████████████▊         | 5/6 [01:48<00:19, 19.35s/it]

----------------------------------------------------------------------------------------------------
Basal


100%|███████████████████████████████████████████████████████| 6/6 [02:04<00:00, 20.81s/it]






In [19]:
dict_subtype_values

{'rf': {'claudin-low': {15: [0.898611111111111]},
  'LumA': {15: [0.8007936507936508]},
  'LumB': {15: [0.8311688311688311]},
  'Her2': {15: [0.8775641025641026]},
  'Normal': {15: [0.9166666666666667]},
  'Basal': {15: [0.82]}},
 'lr': {'claudin-low': {15: [0.898611111111111]},
  'LumA': {15: [0.7997354497354497]},
  'LumB': {15: [0.8547619047619047]},
  'Her2': {15: [0.9256410256410257]},
  'Normal': {15: [0.9833333333333334]},
  'Basal': {15: [0.8772727272727272]}},
 'lgb': {'claudin-low': {15: [0.8652777777777777]},
  'LumA': {15: [0.7887566137566138]},
  'LumB': {15: [0.8170995670995671]},
  'Her2': {15: [0.8602564102564102]},
  'Normal': {15: [0.9100000000000001]},
  'Basal': {15: [0.8590909090909091]}}}

In [16]:
dict_subtype_params

{'rf': {'claudin-low': {15: {'n_estimators': 287,
    'max_depth': 42,
    'max_leaf_nodes': 11,
    'class_weight': 'balanced'}},
  'LumA': {15: {'n_estimators': 236,
    'max_depth': 3,
    'max_leaf_nodes': 25,
    'class_weight': 'balanced'}},
  'LumB': {15: {'n_estimators': 116,
    'max_depth': 21,
    'max_leaf_nodes': 4,
    'class_weight': 'balanced_subsample'}},
  'Her2': {15: {'n_estimators': 546,
    'max_depth': 4,
    'max_leaf_nodes': 72,
    'class_weight': 'balanced'}},
  'Normal': {15: {'n_estimators': 416,
    'max_depth': 2,
    'max_leaf_nodes': 5,
    'class_weight': 'balanced_subsample'}},
  'Basal': {15: {'n_estimators': 931,
    'max_depth': 10,
    'max_leaf_nodes': 9,
    'class_weight': 'balanced_subsample'}}},
 'lr': {'claudin-low': {15: {'penalty': 'l2', 'C': 15053.998177472511}},
  'LumA': {15: {'penalty': 'l2', 'C': 5810.414600205819}},
  'LumB': {15: {'penalty': 'l2', 'C': 19119.85093227589}},
  'Her2': {15: {'penalty': 'l2', 'C': 5287.626822967708}},
 

# save param info

In [21]:
make_dir(config.INTERIM_TUNING_PROGNOSIS_CROSS_DIR)
pickle_dump(
    dict_subtype_values,
    config.INTERIM_TUNING_PROGNOSIS_CROSS_DIR + "/subtype_values.pickle",
)
pickle_dump(
    dict_subtype_params,
    config.INTERIM_TUNING_PROGNOSIS_CROSS_DIR + "/subtype_params.pickle",
)


