In [1]:
import os
import random
import itertools
import re

# 基本的なライブラリ
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats

# 描画ライブラリ
import matplotlib.pyplot as plt
import seaborn as sns
from seaborn_analyzer import CustomPairPlot
import graphviz
import pydotplus
from IPython.display import Image
from IPython.display import HTML
from six import StringIO
from ipywidgets import interact, FloatSlider
from dtreeviz.trees import *

# データセット分割
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit

# 補完
from sklearn.experimental import (
    enable_iterative_imputer,
)  # IterativeImputerをimportするために必要
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

# エンコード
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder

# サンプリング
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# 特徴量選択
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import (
    GenericUnivariateSelect,
    f_classif,
    mutual_info_classif,
    chi2,
)
from boruta import BorutaPy
from libraries.mrmr import mrmr

# 学習中
import optuna
from tqdm import tqdm
from sklearn.model_selection import learning_curve, cross_validate, cross_val_score

# 評価指標
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# models
from lightgbm import LGBMClassifier

# XAI
import shap

import warnings


# config python file
import config

SEED = config.SEED

from functions import *

fix_seed(SEED)


# 最大表示列数の指定（ここでは50列を指定）N
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 50)

%matplotlib inline

# データ読み込み

In [2]:
X_dict = pickle_load(config.INTERIM_PREPROCESSED_PROGNOSIS_CROSS_DIR + "/X_dict.pickle")
y_dict = pickle_load(config.INTERIM_PREPROCESSED_PROGNOSIS_CROSS_DIR + "/y_dict.pickle")

In [3]:
X_dict["microarray_z_all"].keys()

dict_keys(['train', 'val', 'test', 'claudin_subtype', 'mrmr', 'boruta'])

In [4]:
X_train = X_dict["microarray_z_all"]["mrmr"]["train"]["X15"]
y_train = y_dict["microarray_z_all"]["mrmr"]["train"]["y15"]
X_val = X_dict["microarray_z_all"]["mrmr"]["val"]["X15"]
y_val = y_dict["microarray_z_all"]["mrmr"]["val"]["y15"]

assert X_train.shape[0] == y_train.shape[0], "train size is incorrect"
assert X_val.shape[0] == y_val.shape[0], "test size is incorrect"
assert X_train.shape[1] == X_val.shape[1], "feature size is incorrect"

# accuracyの表示
print("----------" * 10)
print("使用特徴量：", X_train.columns)
print("学習サンプルサイズ：", X_train.shape)
display("ラベル比率：", y_train.value_counts())

----------------------------------------------------------------------------------------------------
使用特徴量： Index(['RACGAP1', 'WRN', 'PTX4', 'RNASE9', 'AI912012', 'GOLGA6L1', 'AGRP',
       'RNU11', 'STAT5B', 'OSCP1', 'AI985115', 'C9orf95', 'TOR1B', 'FGF13',
       'PPIL3', 'HCRTR1', 'RBBP8', 'USP30', 'EMR3', 'DFNB59', 'AA897398',
       'GSTM2', 'PROL1', 'COX7B2', 'SERPINE1', 'S100P', 'AK056943', 'BG218808',
       'GTPBP5', 'HIST1H2BF', 'SPATA4', 'BM714043', 'VANGL1', 'SPATA18',
       'ENC1'],
      dtype='object')
学習サンプルサイズ： (858, 35)


'ラベル比率：'

1    460
0    398
Name: OS_15years, dtype: int64

# 最適化(all data)

分類器を学習させ、パラメータのチューニングを行い、高い予測精度を目指す。

## Optuna

### Random Forest

In [5]:
def objective(trial):
    # ランダムフォレストのパラメータチューニング
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
        "max_depth": trial.suggest_int("max_depth", 2, 50, log=True),
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 2, 100),
        "class_weight": trial.suggest_categorical(
            "class_weight", ["balanced", "balanced_subsample"]
        ),
        "random_state": SEED,
    }

    clf = RandomForestClassifier(**params)
    # 10分割交差検証によるテストデータのaccuracyの出力
    score = cross_val_score(
        clf, X_train, y_train, n_jobs=-1, cv=10, scoring=make_scorer(f1_score)
    )
    return score.mean()


study = optuna.create_study(
    direction="maximize", sampler=optuna.samplers.RandomSampler(seed=SEED)
)
study.optimize(objective, n_trials=100, show_progress_bar=True)

[32m[I 2022-09-01 08:12:07,277][0m A new study created in memory with name: no-name-da883e70-e34f-4638-a0cb-ca777d4049e7[0m
[32m[I 2022-09-01 08:12:09,595][0m Trial 0 finished with value: 0.7359705795531155 and parameters: {'n_estimators': 548, 'max_depth': 4, 'max_leaf_nodes': 44, 'class_wight': 'balanced'}. Best is trial 0 with value: 0.7359705795531155.[0m
[32m[I 2022-09-01 08:12:10,578][0m Trial 1 finished with value: 0.7632914807588467 and parameters: {'n_estimators': 130, 'max_depth': 16, 'max_leaf_nodes': 83, 'class_wight': 'balanced_subsample'}. Best is trial 1 with value: 0.7632914807588467.[0m
[32m[I 2022-09-01 08:12:12,506][0m Trial 2 finished with value: 0.7256714089508822 and parameters: {'n_estimators': 893, 'max_depth': 3, 'max_leaf_nodes': 20, 'class_wight': 'balanced_subsample'}. Best is trial 1 with value: 0.7632914807588467.[0m
[32m[I 2022-09-01 08:12:14,754][0m Trial 3 finished with value: 0.746201487628882 and parameters: {'n_estimators': 979, 'max_de

In [6]:
study.best_trial.params, study.best_trial.values

{'n_estimators': 130,
 'max_depth': 16,
 'max_leaf_nodes': 83,
 'class_wight': 'balanced_subsample'}

### LogisticRegression

In [7]:
def objective(trial):
    # ランダムフォレストのパラメータチューニング
    params = {
        "penalty": trial.suggest_categorical("penalty", ["l1", "l2"]),
        "C": trial.suggest_float("C", 1e-5, 1e5),
        "max_iter": 1000,
        "class_weight": "balanced",
        "random_state": SEED,
    }

    clf = LogisticRegression(**params)
    # 10分割交差検証によるテストデータのaccuracyの出力
    score = cross_val_score(
        clf, X_train, y_train, n_jobs=-1, cv=10, scoring=make_scorer(f1_score)
    )
    return score.mean()


study = optuna.create_study(
    direction="maximize", sampler=optuna.samplers.RandomSampler(seed=SEED)
)
study.optimize(objective, n_trials=10)

[32m[I 2022-09-01 08:14:35,277][0m A new study created in memory with name: no-name-0faffe65-2b8b-4e68-87eb-75c146b6f2e2[0m
[33m[W 2022-09-01 08:14:35,295][0m Trial 0 failed, because the objective function returned nan.[0m
[33m[W 2022-09-01 08:14:35,311][0m Trial 1 failed, because the objective function returned nan.[0m
[32m[I 2022-09-01 08:14:35,336][0m Trial 2 finished with value: 0.7480370274529636 and parameters: {'penalty': 'l2', 'C': 13670.65897712823}. Best is trial 2 with value: 0.7480370274529636.[0m
[32m[I 2022-09-01 08:14:35,363][0m Trial 3 finished with value: 0.7480370274529636 and parameters: {'penalty': 'l2', 'C': 20920.212219626937}. Best is trial 2 with value: 0.7480370274529636.[0m
[33m[W 2022-09-01 08:14:35,381][0m Trial 4 failed, because the objective function returned nan.[0m
[33m[W 2022-09-01 08:14:35,397][0m Trial 5 failed, because the objective function returned nan.[0m
[33m[W 2022-09-01 08:14:35,415][0m Trial 6 failed, because the objecti

In [8]:
study.best_trial.params, study.best_trial.values

({'penalty': 'l2', 'C': 13670.65897712823}, [0.7480370274529636])

### SVC

In [None]:
def objective(trial):
    # ランダムフォレストのパラメータチューニング
    params = {
        "C": trial.suggest_float("C", 1e-5, 1e5),
        "kernel": trial.suggest_categorical(
            "kernel",
            [
                "linear",
                "poly",
                "rbf",
                "sigmoid",
            ],
        ),
        "gamma": trial.suggest_categorical("gamma", ["scale", "auto"]),
        "shrinking": trial.suggest_categorical("shrinking", [True, False]),
        "decision_function_shape": trial.suggest_categorical(
            "decision_function_shape", ["ovo", "ovr"]
        ),
        "class_weight": "balanced",
        "random_state": SEED,
    }

    clf = SVC(**params)
    # 10分割交差検証によるテストデータのaccuracyの出力
    score = cross_val_score(
        clf, X_train, y_train, n_jobs=-1, cv=10, scoring=make_scorer(f1_score)
    )
    return score.mean()


study = optuna.create_study(
    direction="maximize", sampler=optuna.samplers.RandomSampler(seed=SEED)
)
study.optimize(objective, n_trials=100, show_progress_bar=True)

[32m[I 2022-09-01 09:26:35,329][0m A new study created in memory with name: no-name-d5f0cffa-3f8d-487b-ba95-41fb0c03f0ec[0m


  0%|          | 0/100 [00:00<?, ?it/s]

[32m[I 2022-09-01 09:26:36,486][0m Trial 0 finished with value: 0.7260850622116812 and parameters: {'C': 54340.49418366249, 'kernel': 'rbf', 'gamma': 'auto', 'shrinking': True, 'decision_function_shape': 'ovr'}. Best is trial 0 with value: 0.7260850622116812.[0m
[32m[I 2022-09-01 09:26:37,037][0m Trial 1 finished with value: 0.722323906808479 and parameters: {'C': 20920.212219626937, 'kernel': 'sigmoid', 'gamma': 'scale', 'shrinking': True, 'decision_function_shape': 'ovr'}. Best is trial 0 with value: 0.7260850622116812.[0m
[32m[I 2022-09-01 09:26:37,098][0m Trial 2 finished with value: 0.7260850622116812 and parameters: {'C': 81764.93787949624, 'kernel': 'rbf', 'gamma': 'auto', 'shrinking': False, 'decision_function_shape': 'ovo'}. Best is trial 0 with value: 0.7260850622116812.[0m
[32m[I 2022-09-01 09:26:37,156][0m Trial 3 finished with value: 0.7260850622116812 and parameters: {'C': 38194.34450049157, 'kernel': 'rbf', 'gamma': 'scale', 'shrinking': True, 'decision_functi

In [None]:
study.best_trial.params, study.best_trial.values

### lightGBM

In [None]:
def objective(trial):
    params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
        "max_bin": trial.suggest_int("max_bin", 100, 300),
        "num_leaves": trial.suggest_int("num_leaves", 20, 50),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 1e-1),
        "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
        "class_weight": "balanced",
        "random_state": SEED,
    }
    clf = LGBMClassifier(boosting_type="gbdt", **params)
    # 10分割交差検証によるテストデータのaccuracyの出力
    score = cross_val_score(clf, X_train, y_train, n_jobs=-1, cv=10)
    return score.mean()


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

In [None]:
study.best_trial.params, study.best_trial.values

# 最適化(subtypes)

分類器を学習させ、パラメータのチューニングを行い、高い予測精度を目指す。

In [None]:
dict_subtype_values = dict()
dict_subtype_params = dict()

## RandomForest

In [None]:
def objective(trial):
    warnings.simplefilter("ignore")
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
        "max_depth": trial.suggest_int("max_depth", 2, 50, log=True),
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 2, 100),
        "class_weight": trial.suggest_categorical(
            "class_weight", ["balanced", "balanced_subsample"]
        ),
        "random_state": SEED,
    }

    clf = RandomForestClassifier(**params)
    # 10分割交差検証によるテストデータのaccuracyの出力
    score = cross_val_score(clf, X_train_subtype, y_train_subtype, n_jobs=-1, cv=10)
    return score.mean()

In [None]:
optuna.logging.disable_default_handler()
warnings.simplefilter("ignore")
subtypes = [
    "claudin-low",
    "LumA",
    "LumB",
    "Her2",
    "Normal",
    "Basal",
]
dict_subtype_values["rf"] = dict()
dict_subtype_params["rf"] = dict()
for year in range(15, 16, 5):  # 予後年数毎のループ
    for subtype in tqdm(subtypes):
        dict_subtype_values["rf"][subtype] = dict()
        dict_subtype_params["rf"][subtype] = dict()
        prognosis_Xlabel = "X{0:0=2}_{1}".format(year, subtype)
        prognosis_ylabel = "y{0:0=2}_{1}".format(year, subtype)
        X_train_subtype = X_dict["microarray_z_all"]["claudin_subtype"]["mrmr"][
            "train"
        ][prognosis_Xlabel]
        y_train_subtype = y_dict["microarray_z_all"]["claudin_subtype"]["mrmr"][
            "train"
        ][prognosis_ylabel]
        assert (
            X_train_subtype.shape[0] == y_train_subtype.shape[0]
        ), "train size is incorrect"

        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=100)
        dict_subtype_values["rf"][subtype][year] = study.best_trial.values
        dict_subtype_params["rf"][subtype][year] = study.best_trial.params

## LogitsticRegression

In [77]:
def objective(trial):
    warnings.simplefilter("ignore")
    params = {
        "penalty": trial.suggest_categorical("penalty", ["l1", "l2"]),
        "C": trial.suggest_float("C", 1e-5, 1e5),
        "max_iter": 1000,
        "class_weight": "balanced",
        "random_state": SEED,
    }

    clf = LogisticRegression(**params)
    # 10分割交差検証によるテストデータのaccuracyの出力
    score = cross_val_score(clf, X_train_subtype, y_train_subtype, n_jobs=-1, cv=10)
    return score.mean()

In [None]:
optuna.logging.disable_default_handler()
warnings.simplefilter("ignore")
subtypes = [
    "claudin-low",
    "LumA",
    "LumB",
    "Her2",
    "Normal",
    "Basal",
]
dict_subtype_values["lr"] = dict()
dict_subtype_params["lr"] = dict()
for year in range(15, 16, 5):  # 予後年数毎のループ
    print("====={0:0=2}".format(year) * 10)
    for subtype in subtypes:
        print("----------" * 10)
        print(subtype)
        dict_subtype_values["lr"][subtype] = dict()
        dict_subtype_params["lr"][subtype] = dict()
        prognosis_Xlabel = "X{0:0=2}_{1}".format(year, subtype)
        prognosis_ylabel = "y{0:0=2}_{1}".format(year, subtype)
        X_train_subtype = X_dict["microarray_z_all"]["claudin_subtype"]["mrmr"][
            "train"
        ][prognosis_Xlabel]
        y_train_subtype = y_dict["microarray_z_all"]["claudin_subtype"]["mrmr"][
            "train"
        ][prognosis_ylabel]
        assert (
            X_train_subtype.shape[0] == y_train_subtype.shape[0]
        ), "train size is incorrect"

        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=100)
        dict_subtype_values["lr"][subtype][year] = study.best_trial.values
        dict_subtype_params["lr"][subtype][year] = study.best_trial.params

### lightGBM

In [None]:
def objective(trial):
    warnings.simplefilter("ignore")
    params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
        "max_bin": trial.suggest_int("max_bin", 100, 300),
        "num_leaves": trial.suggest_int("num_leaves", 20, 50),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 1e-1),
        "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
        "class_weight": "balanced",
        "random_state": SEED,
    }
    clf = LGBMClassifier(boosting_type="gbdt", **params)
    # 10分割交差検証によるテストデータのaccuracyの出力
    score = cross_val_score(clf, X_train_subtype, y_train_subtype, n_jobs=-1, cv=10)
    return score.mean()

In [None]:
optuna.logging.disable_default_handler()
warnings.simplefilter("ignore")
subtypes = [
    "claudin-low",
    "LumA",
    "LumB",
    "Her2",
    "Normal",
    "Basal",
]
dict_subtype_values["lgb"] = dict()
dict_subtype_params["lgb"] = dict()
for year in range(15, 16, 5):  # 予後年数毎のループ
    print("====={0:0=2}".format(year) * 10)
    for subtype in tqdm(subtypes):
        print("----------" * 10)
        print(subtype)
        dict_subtype_values["lgb"][subtype] = dict()
        dict_subtype_params["lgb"][subtype] = dict()
        prognosis_Xlabel = "X{0:0=2}_{1}".format(year, subtype)
        prognosis_ylabel = "y{0:0=2}_{1}".format(year, subtype)
        X_train_subtype = X_dict["microarray_z_all"]["claudin_subtype"]["mrmr"][
            "train"
        ][prognosis_Xlabel]
        y_train_subtype = y_dict["microarray_z_all"]["claudin_subtype"]["mrmr"][
            "train"
        ][prognosis_ylabel]
        assert (
            X_train_subtype.shape[0] == y_train_subtype.shape[0]
        ), "train size is incorrect"

        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=100)
        dict_subtype_values["lgb"][subtype][year] = study.best_trial.values
        dict_subtype_params["lgb"][subtype][year] = study.best_trial.params

In [86]:
make_dir(config.INTERIM_TUNING_PROGNOSIS_CROSS_DIR)
pickle_dump(
    dict_subtype_values,
    config.INTERIM_TUNING_PROGNOSIS_CROSS_DIR + "/subtype_values.pickle",
)
pickle_dump(
    dict_subtype_params,
    config.INTERIM_TUNING_PROGNOSIS_CROSS_DIR + "/subtype_params.pickle",
)

In [87]:
dict_subtype_values

{'rf': {'claudin-low': {15: [0.8861111111111111]},
  'LumA': {15: [0.7667989417989418]},
  'LumB': {15: [0.7809523809523808]},
  'Her2': {15: [0.808974358974359]},
  'Normal': {15: [0.9133333333333333]},
  'Basal': {15: [0.8300000000000001]}},
 'lr': {'claudin-low': {15: [0.8763888888888889]},
  'LumA': {15: [0.748015873015873]},
  'LumB': {15: [0.780952380952381]},
  'Her2': {15: [0.7769230769230769]},
  'Normal': {15: [0.9466666666666667]},
  'Basal': {15: [0.8727272727272727]}},
 'lgb': {'claudin-low': {15: [0.8319444444444445]},
  'LumA': {15: [0.8064814814814815]},
  'LumB': {15: [0.780952380952381]},
  'Her2': {15: [0.7756410256410257]},
  'Normal': {15: [0.9066666666666668]},
  'Basal': {15: [0.8036363636363637]}}}

In [88]:
dict_subtype_params

{'rf': {'claudin-low': {15: {'n_estimators': 478,
    'max_depth': 19,
    'max_leaf_nodes': 9,
    'class_weight': 'balanced_subsample'}},
  'LumA': {15: {'n_estimators': 882,
    'max_depth': 20,
    'max_leaf_nodes': 35,
    'class_weight': 'balanced'}},
  'LumB': {15: {'n_estimators': 97,
    'max_depth': 12,
    'max_leaf_nodes': 81,
    'class_weight': 'balanced_subsample'}},
  'Her2': {15: {'n_estimators': 608,
    'max_depth': 3,
    'max_leaf_nodes': 89,
    'class_weight': 'balanced'}},
  'Normal': {15: {'n_estimators': 91,
    'max_depth': 7,
    'max_leaf_nodes': 23,
    'class_weight': 'balanced'}},
  'Basal': {15: {'n_estimators': 517,
    'max_depth': 4,
    'max_leaf_nodes': 43,
    'class_weight': 'balanced'}}},
 'lr': {'claudin-low': {15: {'penalty': 'l2', 'C': 36156.402308163364}},
  'LumA': {15: {'penalty': 'l2', 'C': 9654.664188336264}},
  'LumB': {15: {'penalty': 'l2', 'C': 51528.43835221508}},
  'Her2': {15: {'penalty': 'l2', 'C': 5076.262956771448}},
  'Normal':