# 目的
遺伝子学的分類に基づいた、再発の2値分類を実施する。  

In [1]:
# 基本的なライブラリ
import os
from tqdm import tqdm

import numpy as np
import pandas as pd

pd.set_option("display.max_columns", 50)

import matplotlib.pyplot as plt

%matplotlib inline


# パイプライン
from imblearn.pipeline import Pipeline

# サンプリング
from imblearn.over_sampling import SMOTE


# モデル
from sklearn.linear_model import LogisticRegression  # ロジスティック回帰
from sklearn.neighbors import KNeighborsClassifier  # K近傍法
from sklearn.svm import SVC  # サポートベクターマシン
from sklearn.tree import DecisionTreeClassifier, export_graphviz  # 決定木
from sklearn.ensemble import RandomForestClassifier  # ランダムフォレスト
from sklearn.ensemble import AdaBoostClassifier  # AdaBoost
from sklearn.naive_bayes import GaussianNB  # ナイーブ・ベイズ
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA  # 二次判別分析
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier


# 評価指標
from sklearn.metrics import roc_auc_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report

import config
import functions

# Seed shared by every estimator/sampler in this notebook; taken from config.
SEED = config.SEED
functions.fix_seed(SEED)

# NOTE(review): presumably the recurrence-free-survival target columns — confirm.
TARGET_NAME = ["RFS_STATUS", "RFS_MONTHS"]

# Pickled DataFrame used below to enumerate the subgroup values.
df_recurrenced = pd.read_pickle(
    f"{config.INTERIM_PREPROCESSED_RECURRENCE_DIR}/df_recurrenced4numeric.pkl"
)

  y_train: pd.Series(),
  y_val: pd.Series(),


# データ全体での予測モデル検証

## ベースモデルの学習結果

特徴量選択を行った各dfを用いて、基本的な2値分類モデルを学習する。

In [2]:
def save_model_pipeline(
    pipeline: callable,
    val_step: str,
    save_file_path: str = ".",
):
    """Pickle a pipeline to ``<save_file_path>/<val_step>/model_pipeline.pkl``.

    Args:
        pipeline: The (fitted) pipeline object to serialize.
        val_step: Validation-step name; used as the sub-directory name.
        save_file_path: Base directory under which the sub-directory lives.
    """
    target_dir = f"{save_file_path}/{val_step}"
    target_file = f"{target_dir}/model_pipeline.pkl"

    # make_dir runs before the dump — presumably it creates the target
    # directory if it is missing (project helper; confirm).
    functions.make_dir(target_dir)
    functions.pickle_dump(pipeline, target_file)

In [3]:
def exists_pipeline(path: str) -> bool:
    """Return True when ``path`` exists on the filesystem, otherwise False."""
    path_found = os.path.exists(path)
    return path_found

In [18]:
def validate_model(
    model_order: list = None,
    val_step="val_set",
    input_data_path: str = None,
    output_data_path: str = None,
    is_fitted: bool = False,
):
    """Fit (or reload) a pipeline for one validation step and report its scores.

    Reads ``train/X.pkl``, ``train/y.pkl``, ``test/X.pkl`` and ``test/y.pkl``
    from ``<input_data_path>/<val_step>/``. If any of them is missing, a
    message is printed and the function returns without doing anything else.

    Args:
        model_order: List of ``(name, step)`` tuples passed to ``Pipeline`` as
            ``steps``. Required whenever a pipeline has to be fitted.
        val_step: Name of the validation step (sub-directory name).
        input_data_path: Base directory of the preprocessed pickles.
        output_data_path: Base directory for the saved pipeline. When falsy,
            nothing is loaded from or saved to disk.
        is_fitted: When True and a saved ``model_pipeline.pkl`` exists under
            ``<output_data_path>/<val_step>/``, reuse it instead of refitting.

    Returns:
        None. Scores are reported through ``functions.show_scores``.

    Raises:
        ValueError: If a pipeline must be fitted but ``model_order`` is None.
    """
    data_dir = "{0}/{1}".format(input_data_path, val_step)
    try:
        _X_train = pd.read_pickle("{0}/train/X.pkl".format(data_dir))
        _y_train = pd.read_pickle("{0}/train/y.pkl".format(data_dir))
        _X_test = pd.read_pickle("{0}/test/X.pkl".format(data_dir))
        _y_test = pd.read_pickle("{0}/test/y.pkl".format(data_dir))
    except FileNotFoundError:
        # Only a missing file is an expected "skip this subgroup" condition;
        # any other error (corrupt pickle, interrupt, ...) should surface.
        print("preprocess pkl file dose not exist")
        return

    # Check for the pickle file itself, not just its directory, so that an
    # empty or partially written output directory falls back to refitting.
    pipeline_file = None
    if output_data_path:
        pipeline_file = "{0}/{1}/model_pipeline.pkl".format(
            output_data_path, val_step
        )

    if is_fitted and pipeline_file and exists_pipeline(pipeline_file):
        # Reuse the already-fitted pipeline.
        _model_pipeline = functions.pickle_load(pipeline_file)
    else:
        # Fit a new pipeline.
        if model_order is None:
            raise ValueError(
                "model_order must be a list of (name, step) tuples "
                "when a pipeline has to be fitted"
            )
        _model_pipeline = Pipeline(steps=model_order)
        _model_pipeline.fit(_X_train, _y_train)

    _y_train_pred = _model_pipeline.predict(_X_train)
    _y_test_pred = _model_pipeline.predict(_X_test)

    # Train scores first, then test scores.
    functions.show_scores(_y_train, _y_train_pred)
    functions.show_scores(_y_test, _y_test_pred)

    if output_data_path:
        save_model_pipeline(
            pipeline=_model_pipeline, val_step=val_step, save_file_path=output_data_path
        )
    return

In [21]:
# 実施したい学習の定義
model_order_dicts = [
    {
        "model_name": LGBMClassifier().__class__.__name__,
        "preprocess_name": "preprocess4classification_1",
        "model_order": [
            (SMOTE().__class__.__name__, SMOTE(random_state=SEED)),
            (
                LGBMClassifier().__class__.__name__,
                LGBMClassifier(class_weight="balanced", random_state=SEED),
            ),
        ],
    },
    {
        "model_name": SVC().__class__.__name__,
        "preprocess_name": "preprocess4classification_1",
        "model_order": [
            (SMOTE().__class__.__name__, SMOTE(random_state=SEED)),
            (
                SVC().__class__.__name__,
                SVC(
                    kernel="rbf",
                    random_state=SEED,
                    class_weight="balanced",
                ),
            ),
        ],
    },
]

In [22]:
# 層別化する特徴量
subgroup_columns = [
    "CLAUDIN_SUBTYPE",
    # "NPI_CAT",
    # "TUMOR_CAT",
    # "LYMPH_CAT",
]
# 遺伝子発現量以外の特徴量を削除
drop_columns = [
    "CLAUDIN_SUBTYPE",
    "NPI_CAT",
    "TUMOR_CAT",
    "LYMPH_CAT",
]

val_steps = ["val_set"]


for subgroup_column in subgroup_columns:  # 各サブグループへの適用
    print("##########" * 10)
    print("subgroup_column: ", subgroup_column)
    for subgroup in df_recurrenced[subgroup_column].unique():  # サブグループの要素毎への適用
        print("==========" * 10)
        print("subgroup: ", subgroup)
        for model_order_dict in model_order_dicts:
            print(".........." * 10)
            print("model_order_dict name: ", model_order_dict["model_name"])
            input_file_path = "./{0}/{1}/{2}/{3}".format(
                config.INTERIM_PREPROCESSED_RECURRENCE_DIR,
                subgroup_column,
                subgroup,
                model_order_dict["preprocess_name"],
            )
            input_data_path = "./{0}/{1}/{2}/{3}".format(
                config.INTERIM_PREPROCESSED_RECURRENCE_DIR,
                subgroup_column,
                subgroup,
                model_order_dict["preprocess_name"],
            )
            output_data_path = "./{0}/{1}/{2}/{3}/{4}".format(
                config.INTERIM_MODELS_RECURRENCE_DIR,
                subgroup_column,
                subgroup,
                model_order_dict["preprocess_name"],
                model_order_dict["model_name"],
            )
            # plt.subplot(1, len(df_recurrenced[subgroup_column].unique()), col + 1)

            for val_step in val_steps:
                validate_model(
                    model_order=model_order_dict["model_order"],
                    val_step=val_step,
                    input_data_path=input_data_path,
                    output_data_path=output_data_path,
                )

####################################################################################################
subgroup_column:  CLAUDIN_SUBTYPE
subgroup:  claudin-low
....................................................................................................
model_order_dict name:  LGBMClassifier
preprocess pkl file dose not exist
....................................................................................................
model_order_dict name:  SVC
preprocess pkl file dose not exist
subgroup:  LumA
....................................................................................................
model_order_dict name:  LGBMClassifier


accuracy             1.000000e+00
log_loss             9.992007e-16
roc_auc_score        1.000000e+00
matthews_corrcoef    1.000000e+00
dtype: float64

accuracy              0.600000
log_loss             13.815511
roc_auc_score         0.636364
matthews_corrcoef     0.379980
dtype: float64

....................................................................................................
model_order_dict name:  SVC


accuracy             0.994382
log_loss             0.194038
roc_auc_score        0.994792
matthews_corrcoef    0.988767
dtype: float64

accuracy              0.550000
log_loss             15.542529
roc_auc_score         0.570707
matthews_corrcoef     0.153522
dtype: float64

subgroup:  LumB
....................................................................................................
model_order_dict name:  LGBMClassifier


accuracy             1.000000e+00
log_loss             9.992007e-16
roc_auc_score        1.000000e+00
matthews_corrcoef    1.000000e+00
dtype: float64

accuracy              0.400000
log_loss             20.723666
roc_auc_score         0.458333
matthews_corrcoef    -0.102062
dtype: float64

....................................................................................................
model_order_dict name:  SVC


accuracy             0.994382
log_loss             0.194038
roc_auc_score        0.992857
matthews_corrcoef    0.988267
dtype: float64

accuracy              0.500000
log_loss             17.269668
roc_auc_score         0.520833
matthews_corrcoef     0.041667
dtype: float64

subgroup:  Her2
....................................................................................................
model_order_dict name:  LGBMClassifier


accuracy             1.000000e+00
log_loss             9.992007e-16
roc_auc_score        1.000000e+00
matthews_corrcoef    1.000000e+00
dtype: float64

accuracy             0.727273
log_loss             9.419812
roc_auc_score        0.708333
matthews_corrcoef    0.385758
dtype: float64

....................................................................................................
model_order_dict name:  SVC


accuracy             1.000000e+00
log_loss             9.992007e-16
roc_auc_score        1.000000e+00
matthews_corrcoef    1.000000e+00
dtype: float64

accuracy              0.363636
log_loss             21.979585
roc_auc_score         0.354167
matthews_corrcoef    -0.260875
dtype: float64

subgroup:  Normal
....................................................................................................
model_order_dict name:  LGBMClassifier
preprocess pkl file dose not exist
....................................................................................................
model_order_dict name:  SVC
preprocess pkl file dose not exist
subgroup:  Basal
....................................................................................................
model_order_dict name:  LGBMClassifier
preprocess pkl file dose not exist
....................................................................................................
model_order_dict name:  SVC
preprocess pkl file dose not exist
subgroup:  NC
....................................................................................................
model_order_dict name:  LGBMClassifier
preprocess pkl file dose not exist
..................................................................................................