# 目的
遺伝子学的分類に基づいた、再発の2値分類を実施する。  

In [10]:
# 基本的なライブラリ
import sys
from tqdm import tqdm

import numpy as np
import pandas as pd

pd.set_option("display.max_columns", 50)

import matplotlib.pyplot as plt

%matplotlib inline

# パイプライン（imblearn）
from imblearn.pipeline import Pipeline
from imblearn import FunctionSampler

# モデル
from sksurv.ensemble import (
    ComponentwiseGradientBoostingSurvivalAnalysis,
    GradientBoostingSurvivalAnalysis,
    RandomSurvivalForest,
    ExtraSurvivalTrees,
)
from sksurv.svm import FastKernelSurvivalSVM
from sksurv.linear_model import IPCRidge


# 評価指標
from sklearn.metrics import mean_squared_error

import config
import functions

# Names of the two target fields used by the survival labels in this notebook.
TARGET_NAME = ["RFS_STATUS", "RFS_MONTHS"]

# Take the project-wide seed from config and hand it to the project helper.
SEED = config.SEED
functions.fix_seed(SEED)


# Load the pickled dataframe `df_recurrenced4numeric.pkl` from the interim
# preprocessed-recurrence directory configured in `config`.
df_recurrenced = pd.read_pickle(
    f"{config.INTERIM_PREPROCESSED_RECURRENCE_DIR}/df_recurrenced4numeric.pkl"
)

# データ全体での予測モデル検証

## ベースモデルの学習結果

各特徴量選択されたdfを用いて、基本的な生存時間分析モデル（scikit-survival）を学習する

In [11]:
def smote4survival(X, y):
    """Oversample a survival dataset with SMOTE, keeping event and time paired.

    The event indicator ``RFS_STATUS`` is used as the class label to balance.
    The time field ``RFS_MONTHS`` is appended to the feature matrix for the
    resampling and split off again afterwards, so every synthetic row gets
    features, a status and a time that belong together.

    The previous implementation ran SMOTE twice (once per target field) and
    merged the results by index. The second run used the continuous time
    column as a class label and overwrote the resampled ``X``, so features and
    labels could not line up. ``SMOTE`` was also never imported.

    Parameters
    ----------
    X : pandas.DataFrame or array-like of shape (n_samples, n_features)
        Numeric feature matrix.
    y : structured array / records with fields ``RFS_STATUS`` and ``RFS_MONTHS``
        Survival labels.

    Returns
    -------
    X_resample : pandas.DataFrame or numpy.ndarray
        Resampled features. A DataFrame with the original column names when
        ``X`` was a DataFrame (values come back as a single numeric dtype),
        otherwise an ndarray.
    y_resample : numpy.recarray
        Resampled labels with fields ``RFS_STATUS`` and ``RFS_MONTHS``, in
        that order.
    """
    # Imported locally: the notebook's import cell does not bring SMOTE into scope.
    from imblearn.over_sampling import SMOTE

    _y = pd.DataFrame.from_records(y)
    status = _y["RFS_STATUS"].to_numpy()
    months = _y["RFS_MONTHS"].to_numpy()

    # A single SMOTE pass over [features | time] keeps each row's time attached
    # to its features; the time column is the last one.
    augmented = np.column_stack([np.asarray(X), months])
    augmented_resample, status_resample = SMOTE(
        random_state=config.SEED
    ).fit_resample(augmented, status)

    X_resample = augmented_resample[:, :-1]
    if isinstance(X, pd.DataFrame):
        # Restore the feature names that were lost by going through numpy.
        X_resample = pd.DataFrame(X_resample, columns=X.columns)

    y_resample = pd.DataFrame(
        {
            "RFS_STATUS": status_resample,
            "RFS_MONTHS": augmented_resample[:, -1],
        }
    ).to_records(index=False)

    return X_resample, y_resample

In [12]:
def save_model_pipeline(
    pipeline: callable,
    val_step: str,
    save_file_path: str = ".",
):
    """Pickle ``pipeline`` into both the train and test folders of a validation step.

    The same object is written to
    ``<save_file_path>/<val_step>/train/model_pipeline.pkl`` and
    ``<save_file_path>/<val_step>/test/model_pipeline.pkl``. Each target
    directory is passed to ``functions.make_dir`` before the dump.

    Parameters
    ----------
    pipeline
        The object to persist with ``functions.pickle_dump``.
    val_step
        Name of the validation-step sub-directory.
    save_file_path
        Root directory under which the validation-step folder lives.
    """
    step_root = f"{save_file_path}/{val_step}"
    for split in ("train", "test"):
        split_dir = f"{step_root}/{split}"
        functions.make_dir(split_dir)
        functions.pickle_dump(pipeline, f"{split_dir}/model_pipeline.pkl")

In [13]:
def _load_split(input_file_path, val_step, split):
    """Read the pickled ``X`` and ``y`` of one split (``"train"`` or ``"test"``).

    ``y`` is converted with ``to_records(index=False)`` so it can be passed to
    the estimators as a record array.
    """
    split_dir = "{0}/{1}/{2}".format(input_file_path, val_step, split)
    X = pd.read_pickle("{0}/X.pkl".format(split_dir))
    y = pd.read_pickle("{0}/y.pkl".format(split_dir)).to_records(index=False)
    return X, y


def validate_model(
    model_order: list = None,
    val_step="val_set",
    input_file_path: str = None,
    output_file_path: str = None,
    init_model_pipeline: bool = True,
):
    """Fit (or load) a model pipeline and print its RMSE on the test split.

    Parameters
    ----------
    model_order
        List of ``(name, estimator)`` steps handed to the imblearn
        ``Pipeline``. Required when ``init_model_pipeline`` is True.
    val_step
        Name of the validation-step sub-directory holding ``train/`` and
        ``test/``.
    input_file_path
        Root directory of the preprocessed ``X.pkl`` / ``y.pkl`` files.
    output_file_path
        Root directory for the pickled pipeline. When given, the pipeline is
        saved there. Required when ``init_model_pipeline`` is False, because
        the pipeline is then loaded from it.
    init_model_pipeline
        True: build and fit a new pipeline from ``model_order``.
        False: load ``<output_file_path>/<val_step>/train/model_pipeline.pkl``.

    Returns
    -------
    None
        The RMSE is printed. If the input pickles are missing, a message is
        printed and the function returns early.

    Raises
    ------
    ValueError
        If ``model_order`` is missing while a new pipeline must be built, or
        ``output_file_path`` is missing while a saved pipeline must be loaded.
    """
    try:
        _X_train, _y_train = _load_split(input_file_path, val_step, "train")
        _X_test, _y_test = _load_split(input_file_path, val_step, "test")
    except FileNotFoundError:
        # Only a missing file means "nothing to validate for this subgroup";
        # any other failure (e.g. a corrupt pickle) is a real error and propagates.
        print("preprocess pkl file dose not exist")
        return

    if init_model_pipeline:
        if model_order is None:
            raise ValueError(
                "model_order must be a list of (name, estimator) steps "
                "when init_model_pipeline is True"
            )
        # Build the training pipeline. This is imblearn's Pipeline (not
        # sklearn's), so a resampling step can be part of `model_order`;
        # the resampled data itself is not saved.
        _model_pipeline = Pipeline(steps=model_order)
        _model_pipeline.fit(_X_train, _y_train)
    else:
        if output_file_path is None:
            raise ValueError(
                "output_file_path is required when init_model_pipeline is False"
            )
        # Reuse a previously fitted pipeline.
        _model_pipeline = functions.pickle_load(
            "{0}/{1}/train/model_pipeline.pkl".format(output_file_path, val_step)
        )

    _y_test_pred = _model_pipeline.predict(_X_test)
    # RMSE via sqrt(MSE): the `squared=False` keyword is deprecated in recent
    # scikit-learn releases.
    # NOTE(review): scikit-survival estimators' predict() appears to return risk
    # scores rather than times -- confirm that RMSE against RFS_MONTHS is the
    # intended metric.
    rmse = np.sqrt(mean_squared_error(_y_test.RFS_MONTHS, _y_test_pred))
    print(float(rmse))

    if output_file_path:
        save_model_pipeline(
            pipeline=_model_pipeline, val_step=val_step, save_file_path=output_file_path
        )

In [17]:
# Definition of the training runs to perform.
# Every run reads the same preprocessed input and uses a single-step pipeline
# whose step name equals the model name.
# NOTE: no SMOTE step (FunctionSampler(func=smote4survival)) is added, because
# the target is continuous.
_PREPROCESS_NAME = "preprocess4numeric_1"

# Model name -> configured estimator, in the order the runs are executed.
_survival_estimators = {
    "ComponentwiseGradientBoostingSurvivalAnalysis": (
        ComponentwiseGradientBoostingSurvivalAnalysis(
            random_state=config.SEED,
        )
    ),
    "GradientBoostingSurvivalAnalysis": GradientBoostingSurvivalAnalysis(
        random_state=config.SEED,
    ),
    "RandomSurvivalForest": RandomSurvivalForest(
        n_estimators=1000,
        min_samples_split=10,
        min_samples_leaf=15,
        n_jobs=-1,
        random_state=config.SEED,
    ),
    "ExtraSurvivalTrees": ExtraSurvivalTrees(
        random_state=config.SEED,
    ),
    "FastKernelSurvivalSVM": FastKernelSurvivalSVM(
        kernel="rbf",
        random_state=config.SEED,
    ),
}

model_order_dicts = [
    {
        "model_name": model_name,
        "preprocess_name": _PREPROCESS_NAME,
        "model_order": [(model_name, estimator)],
    }
    for model_name, estimator in _survival_estimators.items()
]

In [18]:
# Features used to stratify the data into subgroups.
subgroup_columns = [
    "CLAUDIN_SUBTYPE",
    # "THREEGENE",
]
# Features other than gene-expression values, to be dropped.
# (Not referenced by the loop below.)
drop_columns = [
    "CLAUDIN_SUBTYPE",
    "THREEGENE",
]


def _subgroup_path(base_dir, column, subgroup, leaf):
    """Build ``./<base_dir>/<column>/<subgroup>/<leaf>``."""
    return f"./{base_dir}/{column}/{subgroup}/{leaf}"


# Run every configured model on every value of every stratification column.
for subgroup_column in subgroup_columns:
    print("##########" * 10)
    print("subgroup_column: ", subgroup_column)
    for subgroup in df_recurrenced[subgroup_column].unique():
        print("==========" * 10)
        print("subgroup: ", subgroup)
        for model_order_dict in model_order_dicts:
            print(".........." * 10)
            print("model_order_dict name: ", model_order_dict["model_name"])
            # Preprocessed inputs are keyed by the preprocessing name,
            # saved models by the model name.
            input_file_path = _subgroup_path(
                config.INTERIM_PREPROCESSED_RECURRENCE_DIR,
                subgroup_column,
                subgroup,
                model_order_dict["preprocess_name"],
            )
            output_file_path = _subgroup_path(
                config.INTERIM_MODELS_RECURRENCE_DIR,
                subgroup_column,
                subgroup,
                model_order_dict["model_name"],
            )
            validate_model(
                model_order=model_order_dict["model_order"],
                input_file_path=input_file_path,
                output_file_path=output_file_path,
            )

####################################################################################################
subgroup_column:  CLAUDIN_SUBTYPE
subgroup:  claudin-low
....................................................................................................
model_order_dict name:  ComponentwiseGradientBoostingSurvivalAnalysis
152.17106216770816
....................................................................................................
model_order_dict name:  GradientBoostingSurvivalAnalysis
152.65572777678707
....................................................................................................
model_order_dict name:  RandomSurvivalForest
145.6370641642193
....................................................................................................
model_order_dict name:  ExtraSurvivalTrees
145.60015797458803
....................................................................................................
model_order_dict name:  FastKernelSurvivalSVM
1