# 目的
遺伝子学的分類に基づいた、再発の2値分類を実施する。  

In [1]:
# 基本的なライブラリ
import itertools
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# models
from sklearn.linear_model import LogisticRegression  # ロジスティック回帰
from sklearn.neighbors import KNeighborsClassifier  # K近傍法
from sklearn.svm import SVC  # サポートベクターマシン
from sklearn.tree import DecisionTreeClassifier, export_graphviz  # 決定木
from sklearn.ensemble import RandomForestClassifier  # ランダムフォレスト
from sklearn.ensemble import AdaBoostClassifier  # AdaBoost
from sklearn.naive_bayes import GaussianNB  # ナイーブ・ベイズ
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA  # 二次判別分析
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
import xgboost as xgb


# 評価指標
from sklearn.metrics import roc_auc_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


# local libraries
import config
import functions

SEED = config.SEED
functions.fix_seed(SEED)


# 最大表示列数の指定（ここでは50列を指定）N
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 50)

%matplotlib inline

  y_train: pd.Series(),
  y_val: pd.Series(),


In [2]:
df_recurrenced = pd.read_pickle(
    config.INTERIM_PREPROCESSED_RECURRENCE_DIR + "/df_recurrenced.pkl"
)

classifiers = [
    LogisticRegression(max_iter=2000, random_state=SEED),
    KNeighborsClassifier(),
    SVC(
        kernel="linear",
        random_state=SEED,
        class_weight="balanced",
    ),
    SVC(kernel="poly", random_state=SEED, class_weight="balanced"),
    SVC(kernel="rbf", random_state=SEED, class_weight="balanced"),
    SVC(kernel="sigmoid", random_state=SEED, class_weight="balanced"),
    DecisionTreeClassifier(
        min_samples_split=20,
        min_samples_leaf=15,
        random_state=SEED,
        class_weight="balanced",
    ),
    RandomForestClassifier(
        min_samples_split=20,
        min_samples_leaf=15,
        random_state=SEED,
        class_weight="balanced",
    ),
    AdaBoostClassifier(random_state=SEED),
    GaussianNB(),
    GradientBoostingClassifier(random_state=SEED),
    SGDClassifier(random_state=SEED, class_weight="balanced"),
    QDA(),
    LGBMClassifier(class_weight="balanced", random_state=SEED),
    ExtraTreesClassifier(class_weight="balanced", random_state=SEED),
]

# データ全体での予測モデル検証

## ベースモデルの学習結果

各特徴量選択されたdfを用いて基本的な2値分類モデルを学習する

In [3]:
def validate_models(
    input_file_path: str = None,
    input_file_name: str = None,
    classifiers: list = config.classifiers,
    plot: bool = False,
):
    try:
        list_train, list_val, _, _ = functions.read_preprocessed_df(
            input_file_path, input_file_name
        )
    except:
        return
    X_train, y_train = list_train[0], list_train[1]
    X_val, y_val = list_val[0], list_val[1]
    print(X_train.shape, X_val.shape)
    display(
        functions.compare_bcms(
            X_train,
            y_train,
            X_val,
            y_val,
            classifiers=classifiers,
            plot=plot,
        )
    )


# サブグループ削除（一時的に）
subgroup_columns = [
    "CLAUDIN_SUBTYPE",
    # "NPI_CAT",
    # "TUMOR_CAT",
    # "LYMPH_CAT",
]

feature_selection_methods = [
    # "vt",
    # "mrmr"
    "none"
]

scaling_methods = [
    "std"
    # "norm"
]

dimensionality_reduction_methods = [
    "pca"
    # none
]

sampling_methods = [
     "none",
    #"smote"
]


for (
    feature_selection_method,
    scaling_method,
    dimensionality_reduction_method,
    sampling_method,
) in tqdm(
    itertools.product(
        feature_selection_methods,
        scaling_methods,
        dimensionality_reduction_methods,
        sampling_methods,
    )
):
    preprocess_order = "{0}_{1}_{2}_{3}".format(
        feature_selection_method,
        scaling_method,
        dimensionality_reduction_method,
        sampling_method,
    )
    print("=====" * 10)
    for subgroup_column in subgroup_columns:  # 各サブグループへの適用
        print("-----" * 10)
        print("subgroup column: ", subgroup_column)

        for subgroup in df_recurrenced[subgroup_column].unique():  # サブグループ毎への適用
            print("....." * 10)
            print("subgroup: ", subgroup)
            input_file_path = "./{0}/{1}/{2}".format(
                config.INTERIM_PREPROCESSED_RECURRENCE_DIR,
                subgroup_column,
                preprocess_order,
                plot=True,
            )
            print(input_file_path)
            validate_models(
                input_file_path=input_file_path, input_file_name=subgroup, plot=False
            )

0it [00:00, ?it/s]

--------------------------------------------------
subgroup column:  CLAUDIN_SUBTYPE
..................................................
subgroup:  LumB
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/none_std_pca_none
(178, 158) (20, 158)


Unnamed: 0_level_0,accuracy_score_train,accuracy_score_val,f1_score_train,f1_score_val,matthews_corrcoef_train,matthews_corrcoef_val,cohen_kappa_score_train,cohen_kappa_score_val,roc_auc_score_train,roc_auc_score_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AdaBoostClassifier,1.0,0.35,1.0,0.434783,1.0,-0.235702,1.0,-0.181818,1.0,0.395833
DecisionTreeClassifier,0.808989,0.5,0.721311,0.375,0.595626,-0.041667,0.58077,-0.041667,0.777249,0.479167
ExtraTreesClassifier,1.0,0.7,1.0,0.571429,1.0,0.356348,1.0,0.347826,1.0,0.666667
GaussianNB,0.904494,0.4,0.882759,0.571429,0.803711,0.0,0.802351,0.0,0.906217,0.5
GradientBoostingClassifier,1.0,0.6,1.0,0.2,1.0,0.068041,1.0,0.047619,1.0,0.520833
KNeighborsClassifier,0.764045,0.65,0.671875,0.461538,0.495465,0.235702,0.49018,0.222222,0.737698,0.604167
LGBMClassifier,1.0,0.65,1.0,0.533333,1.0,0.256776,1.0,0.255319,1.0,0.625
LogisticRegression,1.0,0.55,1.0,0.307692,1.0,0.0,1.0,0.0,1.0,0.5
QuadraticDiscriminantAnalysis,1.0,0.5,1.0,0.5,1.0,0.041667,1.0,0.038462,1.0,0.520833
RandomForestClassifier,0.977528,0.5,0.971014,0.583333,0.952939,0.153093,0.952672,0.107143,0.973942,0.5625


..................................................
subgroup:  Her2
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/none_std_pca_none
(90, 80) (11, 80)


Unnamed: 0_level_0,accuracy_score_train,accuracy_score_val,f1_score_train,f1_score_val,matthews_corrcoef_train,matthews_corrcoef_val,cohen_kappa_score_train,cohen_kappa_score_val,roc_auc_score_train,roc_auc_score_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AdaBoostClassifier,1.0,0.454545,1.0,0.25,1.0,-0.149071,1.0,-0.137931,1.0,0.416667
DecisionTreeClassifier,0.766667,0.727273,0.655738,0.666667,0.53066,0.559017,0.494923,0.47619,0.800454,0.8125
ExtraTreesClassifier,1.0,0.727273,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.5
GaussianNB,0.988889,0.454545,0.977778,0.4,0.970801,0.038576,0.970375,0.029412,0.978261,0.520833
GradientBoostingClassifier,1.0,0.636364,1.0,0.0,1.0,-0.193649,1.0,-0.157895,1.0,0.4375
KNeighborsClassifier,0.788889,0.909091,0.536585,0.8,0.407585,0.7698,0.402516,0.744186,0.686892,0.833333
LGBMClassifier,1.0,0.636364,1.0,0.333333,1.0,0.083333,1.0,0.083333,1.0,0.541667
LogisticRegression,1.0,0.818182,1.0,0.5,1.0,0.516398,1.0,0.421053,1.0,0.666667
QuadraticDiscriminantAnalysis,1.0,0.636364,1.0,0.333333,1.0,0.083333,1.0,0.083333,1.0,0.541667
RandomForestClassifier,0.966667,0.727273,0.933333,0.4,0.911526,0.240563,0.911126,0.232558,0.949059,0.604167


..................................................
subgroup:  LumA
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/none_std_pca_none
(178, 158) (20, 158)


Unnamed: 0_level_0,accuracy_score_train,accuracy_score_val,f1_score_train,f1_score_val,matthews_corrcoef_train,matthews_corrcoef_val,cohen_kappa_score_train,cohen_kappa_score_val,roc_auc_score_train,roc_auc_score_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AdaBoostClassifier,1.0,0.75,1.0,0.761905,1.0,0.502519,1.0,0.5,1.0,0.752525
DecisionTreeClassifier,0.747191,0.65,0.736842,0.666667,0.514737,0.301511,0.500561,0.3,0.754954,0.651515
ExtraTreesClassifier,1.0,0.7,1.0,0.75,1.0,0.389819,1.0,0.381443,1.0,0.686869
GaussianNB,0.882022,0.55,0.894472,0.689655,0.763514,0.033501,0.761089,0.021739,0.878176,0.510101
GradientBoostingClassifier,1.0,0.6,1.0,0.636364,1.0,0.191919,1.0,0.191919,1.0,0.59596
KNeighborsClassifier,0.657303,0.55,0.666667,0.4,0.317437,0.201008,0.315816,0.150943,0.659172,0.580808
LGBMClassifier,1.0,0.75,1.0,0.736842,1.0,0.533396,1.0,0.509804,1.0,0.762626
LogisticRegression,1.0,0.6,1.0,0.692308,1.0,0.174078,1.0,0.157895,1.0,0.575758
QuadraticDiscriminantAnalysis,1.0,0.75,1.0,0.8,1.0,0.50443,1.0,0.479167,1.0,0.732323
RandomForestClassifier,0.988764,0.7,0.989691,0.75,0.977599,0.389819,0.977348,0.381443,0.987805,0.686869


1it [00:08,  8.74s/it]

..................................................
subgroup:  claudin-low
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/none_std_pca_none
pkl file does not exist
..................................................
subgroup:  Basal
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/none_std_pca_none
pkl file does not exist
..................................................
subgroup:  Normal
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/none_std_pca_none
pkl file does not exist
..................................................
subgroup:  NC
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/none_std_pca_none
pkl file does not exist





## xgboost

In [4]:
def validate_models(
    input_file_path: str = None,
    input_file_name: str = None,
    classifiers: list = config.classifiers,
    plot: bool = False,
):
    try:
        list_train, list_val, _, _ = functions.read_preprocessed_df(
            input_file_path, input_file_name
        )
    except:
        return
    X_train, y_train = list_train[0], list_train[1]
    X_val, y_val = list_val[0], list_val[1]
    print(X_train.shape, X_val.shape)
    display(
        functions.compare_bcms(
            X_train,
            y_train,
            X_val,
            y_val,
            classifiers=classifiers,
            plot=plot,
        )
    )


for feature_selection_method, scaling_method, sampling_method in tqdm(
    itertools.product(feature_selection_methods, scaling_methods, sampling_methods)
):
    preprocess_order = "{0}_{1}_{2}".format(
        feature_selection_method, scaling_method, sampling_method
    )
    print("=====" * 10)
    for subgroup_column in subgroup_columns:  # 各サブグループへの適用
        print("-----" * 10)
        print("subgroup column: ", subgroup_column)

        for subgroup in df_recurrenced[subgroup_column].unique():  # サブグループ毎への適用
            print("....." * 10)
            print("subgroup: ", subgroup)
            input_file_path = "./{0}/{1}/{2}".format(
                config.INTERIM_PREPROCESSED_RECURRENCE_DIR,
                subgroup_column,
                preprocess_order,
                plot=True,
            )
            print(input_file_path)

            try:
                list_train, list_val, _, _ = functions.read_preprocessed_df(
                    input_file_path, subgroup
                )
            except:
                pass

            X_train, y_train = list_train[0], list_train[1]
            X_val, y_val = list_val[0], list_val[1]
            print(X_train.shape, X_val.shape)

            xgb_train = xgb.DMatrix(
                X_train, label=y_train, feature_names=X_train.columns
            )
            xgb_val = xgb.DMatrix(X_val, label=y_val, feature_names=X_val.columns)

            param = {
                # 二値分類問題
                "objective": "binary:logistic",
            }
            clf = xgb.train(param, xgb_train)
            y_val_pred_proba = clf.predict(xgb_val)
            y_val_pred = np.where(y_val_pred_proba > 0.5, 1, 0)
            functions.show_scores(y_val, y_val_pred)

0it [00:00, ?it/s]

--------------------------------------------------
subgroup column:  CLAUDIN_SUBTYPE
..................................................
subgroup:  LumB
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/none_std_none
pkl file does not exist





NameError: name 'list_train' is not defined