# 目的
遺伝子学的分類に基づいた、再発の2値分類を実施する。  

In [1]:
# 基本的なライブラリ
import itertools
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# models
from sklearn.linear_model import LogisticRegression  # ロジスティック回帰
from sklearn.neighbors import KNeighborsClassifier  # K近傍法
from sklearn.svm import SVC  # サポートベクターマシン
from sklearn.tree import DecisionTreeClassifier, export_graphviz  # 決定木
from sklearn.ensemble import RandomForestClassifier  # ランダムフォレスト
from sklearn.ensemble import AdaBoostClassifier  # AdaBoost
from sklearn.naive_bayes import GaussianNB  # ナイーブ・ベイズ
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA  # 二次判別分析
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
import xgboost as xgb


# 評価指標
from sklearn.metrics import roc_auc_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


# local libraries
import config
import functions

SEED = config.SEED
functions.fix_seed(SEED)


# 最大表示列数の指定（ここでは50列を指定）N
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 50)

%matplotlib inline

  y_train: pd.Series(),
  y_val: pd.Series(),


In [2]:
df_recurrenced = pd.read_pickle(
    config.INTERIM_PREPROCESSED_RECURRENCE_DIR + "/df_recurrenced.pkl"
)

classifiers = [
    LogisticRegression(max_iter=2000, random_state=SEED),
    KNeighborsClassifier(),
    SVC(
        kernel="linear",
        random_state=SEED,
        class_weight="balanced",
    ),
    SVC(kernel="poly", random_state=SEED, class_weight="balanced"),
    SVC(kernel="rbf", random_state=SEED, class_weight="balanced"),
    SVC(kernel="sigmoid", random_state=SEED, class_weight="balanced"),
    DecisionTreeClassifier(
        min_samples_split=20,
        min_samples_leaf=15,
        random_state=SEED,
        class_weight="balanced",
    ),
    RandomForestClassifier(
        min_samples_split=20,
        min_samples_leaf=15,
        random_state=SEED,
        class_weight="balanced",
    ),
    AdaBoostClassifier(random_state=SEED),
    GaussianNB(),
    GradientBoostingClassifier(random_state=SEED),
    SGDClassifier(random_state=SEED, class_weight="balanced"),
    QDA(),
    LGBMClassifier(class_weight="balanced", random_state=SEED),
    ExtraTreesClassifier(class_weight="balanced", random_state=SEED),
]

# データ全体での予測モデル検証

## ベースモデルの学習結果

各特徴量選択されたdfを用いて基本的な2値分類モデルを学習する

In [5]:
def validate_models(
    input_file_path: str = None,
    input_file_name: str = None,
    classifiers: list = config.classifiers,
    plot: bool = False,
):
    try:
        list_train, list_val, _, _ = functions.read_preprocessed_df(
            input_file_path, input_file_name
        )
    except:
        return
    X_train, y_train = list_train[0], list_train[1]
    X_val, y_val = list_val[0], list_val[1]
    print(X_train.shape, X_val.shape)
    display(
        functions.compare_bcms(
            X_train,
            y_train,
            X_val,
            y_val,
            classifiers=classifiers,
            plot=plot,
        )
    )


# サブグループ削除（一時的に）
subgroup_columns = [
    "CLAUDIN_SUBTYPE",
    # "NPI_CAT",
    # "TUMOR_CAT",
    # "LYMPH_CAT",
]

feature_selection_methods = [
    # "vt",
    # "mrmr"
    "none"
]

scaling_methods = [
    "std"
    # "norm"
]

dimensionality_reduction_methods = [
    "pca"
    # none
]

sampling_methods = [
    # "none",
    "smote"
]


for (
    feature_selection_method,
    scaling_method,
    dimensionality_reduction_method,
    sampling_method,
) in tqdm(
    itertools.product(
        feature_selection_methods,
        scaling_methods,
        dimensionality_reduction_methods,
        sampling_methods,
    )
):
    preprocess_order = "{0}_{1}_{2}_{3}".format(
        feature_selection_method,
        scaling_method,
        dimensionality_reduction_method,
        sampling_method,
    )
    print("=====" * 10)
    for subgroup_column in subgroup_columns:  # 各サブグループへの適用
        print("-----" * 10)
        print("subgroup column: ", subgroup_column)

        for subgroup in df_recurrenced[subgroup_column].unique():  # サブグループ毎への適用
            print("....." * 10)
            print("subgroup: ", subgroup)
            input_file_path = "./{0}/{1}/{2}".format(
                config.INTERIM_PREPROCESSED_RECURRENCE_DIR,
                subgroup_column,
                preprocess_order,
                plot=True,
            )
            print(input_file_path)
            validate_models(
                input_file_path=input_file_path, input_file_name=subgroup, plot=False
            )

0it [00:00, ?it/s]

--------------------------------------------------
subgroup column:  CLAUDIN_SUBTYPE
..................................................
subgroup:  LumB
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/none_std_pca_smote
(216, 158) (20, 158)


Unnamed: 0_level_0,accuracy_score_train,accuracy_score_val,f1_score_train,f1_score_val,matthews_corrcoef_train,matthews_corrcoef_val,cohen_kappa_score_train,cohen_kappa_score_val,roc_auc_score_train,roc_auc_score_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AdaBoostClassifier,1.0,0.45,1.0,0.56,1.0,0.057166,1.0,0.035088,1.0,0.520833
DecisionTreeClassifier,0.810185,0.6,0.817778,0.636364,0.622536,0.311805,0.62037,0.259259,0.810185,0.645833
ExtraTreesClassifier,1.0,0.4,1.0,0.571429,1.0,0.0,1.0,0.0,1.0,0.5
GaussianNB,0.810185,0.4,0.79803,0.571429,0.624914,0.0,0.62037,0.0,0.810185,0.5
GradientBoostingClassifier,1.0,0.45,1.0,0.592593,1.0,0.187317,1.0,0.067797,1.0,0.541667
KNeighborsClassifier,0.541667,0.4,0.685714,0.571429,0.208514,0.0,0.083333,0.0,0.541667,0.5
LGBMClassifier,1.0,0.4,1.0,0.5,1.0,-0.102062,1.0,-0.071429,1.0,0.458333
LogisticRegression,1.0,0.55,1.0,0.307692,1.0,0.0,1.0,0.0,1.0,0.5
QuadraticDiscriminantAnalysis,1.0,0.6,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.5
RandomForestClassifier,0.958333,0.4,0.956522,0.571429,0.919866,0.0,0.916667,0.0,0.958333,0.5


..................................................
subgroup:  Her2
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/none_std_pca_smote
(134, 80) (11, 80)


Unnamed: 0_level_0,accuracy_score_train,accuracy_score_val,f1_score_train,f1_score_val,matthews_corrcoef_train,matthews_corrcoef_val,cohen_kappa_score_train,cohen_kappa_score_val,roc_auc_score_train,roc_auc_score_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AdaBoostClassifier,1.0,0.454545,1.0,0.5,1.0,0.288675,1.0,0.153846,1.0,0.625
DecisionTreeClassifier,0.858209,0.454545,0.859259,0.5,0.716498,0.288675,0.716418,0.153846,0.858209,0.625
ExtraTreesClassifier,1.0,0.272727,1.0,0.2,1.0,-0.385758,1.0,-0.294118,1.0,0.291667
GaussianNB,0.902985,0.272727,0.897638,0.428571,0.810405,0.0,0.80597,0.0,0.902985,0.5
GradientBoostingClassifier,1.0,0.727273,1.0,0.571429,1.0,0.385758,1.0,0.377358,1.0,0.708333
KNeighborsClassifier,0.567164,0.272727,0.694737,0.428571,0.244677,0.0,0.134328,0.0,0.567164,0.5
LGBMClassifier,1.0,0.727273,1.0,0.571429,1.0,0.385758,1.0,0.377358,1.0,0.708333
LogisticRegression,1.0,0.818182,1.0,0.5,1.0,0.516398,1.0,0.421053,1.0,0.666667
QuadraticDiscriminantAnalysis,1.0,0.727273,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.5
RandomForestClassifier,0.992537,0.727273,0.992481,0.571429,0.985184,0.385758,0.985075,0.377358,0.992537,0.708333


..................................................
subgroup:  LumA
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/none_std_pca_smote
(192, 158) (20, 158)


Unnamed: 0_level_0,accuracy_score_train,accuracy_score_val,f1_score_train,f1_score_val,matthews_corrcoef_train,matthews_corrcoef_val,cohen_kappa_score_train,cohen_kappa_score_val,roc_auc_score_train,roc_auc_score_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AdaBoostClassifier,1.0,0.55,1.0,0.307692,1.0,0.301511,1.0,0.166667,1.0,0.590909
DecisionTreeClassifier,0.796875,0.5,0.797927,0.166667,0.593782,0.207514,0.59375,0.082569,0.796875,0.545455
ExtraTreesClassifier,1.0,0.45,1.0,0.153846,1.0,-0.033501,1.0,-0.018519,1.0,0.489899
GaussianNB,0.901042,0.5,0.904523,0.166667,0.804224,0.207514,0.802083,0.082569,0.901042,0.545455
GradientBoostingClassifier,1.0,0.5,1.0,0.166667,1.0,0.207514,1.0,0.082569,1.0,0.545455
KNeighborsClassifier,0.536458,0.5,0.264463,0.166667,0.108335,0.207514,0.072917,0.082569,0.536458,0.545455
LGBMClassifier,1.0,0.6,1.0,0.428571,1.0,0.37998,1.0,0.252336,1.0,0.636364
LogisticRegression,1.0,0.6,1.0,0.692308,1.0,0.174078,1.0,0.157895,1.0,0.575758
QuadraticDiscriminantAnalysis,1.0,0.55,1.0,0.709677,1.0,0.0,1.0,0.0,1.0,0.5
RandomForestClassifier,0.989583,0.65,0.989583,0.588235,0.979167,0.372839,0.979167,0.326923,0.989583,0.671717


1it [00:09,  9.18s/it]

..................................................
subgroup:  claudin-low
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/none_std_pca_smote
pkl file does not exist
..................................................
subgroup:  Basal
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/none_std_pca_smote
pkl file does not exist
..................................................
subgroup:  Normal
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/none_std_pca_smote
pkl file does not exist
..................................................
subgroup:  NC
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/none_std_pca_smote
pkl file does not exist





## xgboost

In [4]:
def validate_models(
    input_file_path: str = None,
    input_file_name: str = None,
    classifiers: list = config.classifiers,
    plot: bool = False,
):
    try:
        list_train, list_val, _, _ = functions.read_preprocessed_df(
            input_file_path, input_file_name
        )
    except:
        return
    X_train, y_train = list_train[0], list_train[1]
    X_val, y_val = list_val[0], list_val[1]
    print(X_train.shape, X_val.shape)
    display(
        functions.compare_bcms(
            X_train,
            y_train,
            X_val,
            y_val,
            classifiers=classifiers,
            plot=plot,
        )
    )


for feature_selection_method, scaling_method, sampling_method in tqdm(
    itertools.product(feature_selection_methods, scaling_methods, sampling_methods)
):
    preprocess_order = "{0}_{1}_{2}".format(
        feature_selection_method, scaling_method, sampling_method
    )
    print("=====" * 10)
    for subgroup_column in subgroup_columns:  # 各サブグループへの適用
        print("-----" * 10)
        print("subgroup column: ", subgroup_column)

        for subgroup in df_recurrenced[subgroup_column].unique():  # サブグループ毎への適用
            print("....." * 10)
            print("subgroup: ", subgroup)
            input_file_path = "./{0}/{1}/{2}".format(
                config.INTERIM_PREPROCESSED_RECURRENCE_DIR,
                subgroup_column,
                preprocess_order,
                plot=True,
            )
            print(input_file_path)

            try:
                list_train, list_val, _, _ = functions.read_preprocessed_df(
                    input_file_path, subgroup
                )
            except:
                pass

            X_train, y_train = list_train[0], list_train[1]
            X_val, y_val = list_val[0], list_val[1]
            print(X_train.shape, X_val.shape)

            xgb_train = xgb.DMatrix(
                X_train, label=y_train, feature_names=X_train.columns
            )
            xgb_val = xgb.DMatrix(X_val, label=y_val, feature_names=X_val.columns)

            param = {
                # 二値分類問題
                "objective": "binary:logistic",
            }
            clf = xgb.train(param, xgb_train)
            y_val_pred_proba = clf.predict(xgb_val)
            y_val_pred = np.where(y_val_pred_proba > 0.5, 1, 0)
            functions.show_scores(y_val, y_val_pred)

0it [00:00, ?it/s]


--------------------------------------------------
subgroup column:  CLAUDIN_SUBTYPE
..................................................
subgroup:  LumB
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/none_std_smote
pkl file does not exist


NameError: name 'list_train' is not defined