# 目的
遺伝子学的分類に基づいた、再発の2値分類を実施する。  

In [1]:
# 基本的なライブラリ
import itertools
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# パイプライン
from imblearn.pipeline import Pipeline

# サンプリング
from imblearn.over_sampling import SMOTE


# モデル
from sklearn.linear_model import LogisticRegression  # ロジスティック回帰
from sklearn.neighbors import KNeighborsClassifier  # K近傍法
from sklearn.svm import SVC  # サポートベクターマシン
from sklearn.tree import DecisionTreeClassifier, export_graphviz  # 決定木
from sklearn.ensemble import RandomForestClassifier  # ランダムフォレスト
from sklearn.ensemble import AdaBoostClassifier  # AdaBoost
from sklearn.naive_bayes import GaussianNB  # ナイーブ・ベイズ
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA  # 二次判別分析
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
import xgboost as xgb


# 評価指標
from sklearn.metrics import roc_auc_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report

# local libraries
import config
import functions

# Seed value taken from the project config and handed to the project's
# fix_seed helper (presumably seeds the RNGs in use — see functions.fix_seed).
SEED = config.SEED
functions.fix_seed(SEED)


# Show at most 50 columns and 50 rows when displaying DataFrames.
pd.options.display.max_columns = 50
pd.options.display.max_rows = 50

%matplotlib inline

  y_train: pd.Series(),
  y_val: pd.Series(),


In [2]:
# Load the preprocessed recurrence dataset used for classification.
df_recurrenced = pd.read_pickle(
    config.INTERIM_PREPROCESSED_RECURRENCE_DIR + "/df_recurrenced4classification.pkl"
)

# Baseline classifiers, listed in a fixed order.
classifiers = [
    LogisticRegression(max_iter=2000, random_state=SEED),
    KNeighborsClassifier(),
    # One SVM per kernel, each with balanced class weights.
    *(
        SVC(kernel=kernel, random_state=SEED, class_weight="balanced")
        for kernel in ("linear", "poly", "rbf", "sigmoid")
    ),
    # Decision tree and random forest share the same split/leaf size limits.
    *(
        tree_cls(
            min_samples_split=20,
            min_samples_leaf=15,
            random_state=SEED,
            class_weight="balanced",
        )
        for tree_cls in (DecisionTreeClassifier, RandomForestClassifier)
    ),
    AdaBoostClassifier(random_state=SEED),
    GaussianNB(),
    GradientBoostingClassifier(random_state=SEED),
    SGDClassifier(random_state=SEED, class_weight="balanced"),
    QDA(),
    LGBMClassifier(class_weight="balanced", random_state=SEED),
    ExtraTreesClassifier(class_weight="balanced", random_state=SEED),
]

# データ全体での予測モデル検証

## ベースモデルの学習結果

特徴量選択を行った各dfを用いて、基本的な2値分類モデルを学習する。

In [3]:
def validate_models(
    model_order: list = None,
    input_file_path: str = None,
    output_file_path: str = None,
):
    """Fit a pipeline on the stored train split and show train/validation scores.

    Loads the ``X``/``y`` pickles from ``<input_file_path>/val_set/train`` and
    ``<input_file_path>/val_set/val``, builds an imblearn ``Pipeline`` from
    ``model_order``, fits it on the train split, and passes the true and
    predicted labels of both splits to ``functions.show_scores``.

    Args:
        model_order: Pipeline steps as a list of ``(name, estimator)`` tuples,
            e.g. a sampler followed by a classifier.
        input_file_path: Directory that contains the ``val_set`` folder.
        output_file_path: Currently unused; kept so existing callers keep
            working.

    Returns:
        None. The function returns early (after printing a short notice) when
        ``input_file_path`` is not given or one of the pickle files is missing.

    Raises:
        ValueError: If the data was loaded but ``model_order`` was not given.
    """
    if input_file_path is None:
        print("validate_models: input_file_path is not set; skipping")
        return

    data_dir = f"{input_file_path}/val_set"
    try:
        X_train = pd.read_pickle(data_dir + "/train/X.pkl")
        y_train = pd.read_pickle(data_dir + "/train/y.pkl")
        X_val = pd.read_pickle(data_dir + "/val/X.pkl")
        y_val = pd.read_pickle(data_dir + "/val/y.pkl")
    except FileNotFoundError as err:
        # Only a missing pkl file is an expected "skip" condition; any other
        # failure (e.g. a corrupt pickle) is a real error and must surface.
        print("validate_models: pkl file does not exist; skipping:", err.filename)
        return

    if model_order is None:
        raise ValueError(
            "model_order must be a list of (name, estimator) pipeline steps"
        )

    display(X_train.shape, y_train.shape)

    # Build the training pipeline.
    # Oversampling is performed here, inside the pipeline, because of how the
    # pipeline methods work (with imblearn it is hard to save the oversampled
    # data). Note that this is imblearn's Pipeline.
    # NOTE(review): the recorded run of this notebook stopped with
    # "ValueError: Unknown label type: 'continuous'" — presumably raised while
    # fitting because y holds non-integer float values; confirm that y.pkl
    # contains the binary class labels.
    pipeline = Pipeline(steps=model_order)
    pipeline.fit(X_train, y_train)
    y_train_pred = pipeline.predict(X_train)
    y_val_pred = pipeline.predict(X_val)

    functions.show_scores(y_train, y_train_pred)
    functions.show_scores(y_val, y_val_pred)

In [4]:
# Features used to stratify the data into subgroups.
subgroup_columns = [
    "CLAUDIN_SUBTYPE",
    # "NPI_CAT",
    # "TUMOR_CAT",
    # "LYMPH_CAT",
]
# Features other than gene expression levels, to be dropped.
drop_columns = ["CLAUDIN_SUBTYPE", "NPI_CAT", "TUMOR_CAT", "LYMPH_CAT"]

# Preprocessing names.
preprocess_order_names = ["preprocess1", "preprocess2"]

# Training runs to perform: every run applies SMOTE and then one classifier,
# and reads the data stored under the same preprocessing name.
model_order_dicts = [
    {
        "model_name": model_name,
        "preprocess_name": "preprocess4numeric_1",
        "model_order": [
            ("smote", SMOTE(random_state=SEED)),
            (model_name, estimator),
        ],
    }
    for model_name, estimator in (
        ("lr", LogisticRegression(max_iter=2000, random_state=SEED)),
        ("lgbm", LGBMClassifier(class_weight="balanced", random_state=SEED)),
        ("svm", SVC(kernel="rbf", random_state=SEED, class_weight="balanced")),
    )
]

# Apply every model to every value of every stratification feature.
for subgroup_column in subgroup_columns:
    print("##########" * 10)
    print("subgroup_column: ", subgroup_column)
    for subgroup in df_recurrenced[subgroup_column].unique():
        print("==========" * 10)
        print("subgroup: ", subgroup)
        for model_order_dict in model_order_dicts:
            print(".........." * 10)
            print("model_order_dict name: ", model_order_dict["model_name"])
            input_file_path = "./{0}/{1}/{2}/{3}".format(
                config.INTERIM_PREPROCESSED_RECURRENCE_DIR,
                subgroup_column,
                subgroup,
                model_order_dict["preprocess_name"],
            )
            # An output path is not passed yet. The disabled draft built it as
            #   "./{0}/{1}/{2}/{3}".format(
            #       config.INTERIM_MODELS_RECURRENCE_DIR,
            #       subgroup_column,
            #       subgroup,
            #       model_order_dict["model_name"],
            #   )
            validate_models(
                model_order=model_order_dict["model_order"],
                input_file_path=input_file_path,
            )

####################################################################################################
subgroup_column:  CLAUDIN_SUBTYPE
subgroup:  LumB
....................................................................................................
model_order_dict name:  lr


(178, 159)

(178,)

ValueError: Unknown label type: 'continuous'