# 目的
遺伝子学的分類に基づいた、再発の2値分類を実施する。  

In [1]:
# 基本的なライブラリ
import itertools
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# models
from sklearn.linear_model import LogisticRegression  # ロジスティック回帰
from sklearn.neighbors import KNeighborsClassifier  # K近傍法
from sklearn.svm import SVC  # サポートベクターマシン
from sklearn.tree import DecisionTreeClassifier, export_graphviz  # 決定木
from sklearn.ensemble import RandomForestClassifier  # ランダムフォレスト
from sklearn.ensemble import AdaBoostClassifier  # AdaBoost
from sklearn.naive_bayes import GaussianNB  # ナイーブ・ベイズ
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA  # 二次判別分析
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
import xgboost as xgb


# 評価指標
from sklearn.metrics import roc_auc_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


# local libraries
import config
import functions

SEED = config.SEED
functions.fix_seed(SEED)


# 最大表示列数の指定（ここでは50列を指定）N
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 50)

%matplotlib inline

  from pandas import MultiIndex, Int64Index
  y_train: pd.Series(),
  y_val: pd.Series(),


In [2]:
df_recurrenced = pd.read_pickle(
    config.INTERIM_PREPROCESSED_RECURRENCE_DIR + "/df_recurrenced.pkl"
)

classifiers = [
    LogisticRegression(max_iter=2000, random_state=SEED),
    KNeighborsClassifier(),
    SVC(
        kernel="linear",
        random_state=SEED,
        class_weight="balanced",
    ),
    SVC(kernel="poly", random_state=SEED, class_weight="balanced"),
    SVC(kernel="rbf", random_state=SEED, class_weight="balanced"),
    SVC(kernel="sigmoid", random_state=SEED, class_weight="balanced"),
    DecisionTreeClassifier(
        min_samples_split=20,
        min_samples_leaf=15,
        random_state=SEED,
        class_weight="balanced",
    ),
    RandomForestClassifier(
        min_samples_split=20,
        min_samples_leaf=15,
        random_state=SEED,
        class_weight="balanced",
    ),
    AdaBoostClassifier(random_state=SEED),
    GaussianNB(),
    GradientBoostingClassifier(random_state=SEED),
    SGDClassifier(random_state=SEED, class_weight="balanced"),
    QDA(),
    LGBMClassifier(class_weight="balanced", random_state=SEED),
    ExtraTreesClassifier(class_weight="balanced", random_state=SEED),
]

# データ全体での予測モデル検証

## ベースモデルの学習結果

各特徴量選択されたdfを用いて基本的な2値分類モデルを学習する

In [6]:
def validate_models(
    input_file_path: str = None,
    input_file_name: str = None,
    classifiers: list = config.classifiers,
    plot: bool = False,
):
    try:
        list_train, list_val, _, _ = functions.read_preprocessed_df(
            input_file_path, input_file_name
        )
    except:
        return
    X_train, y_train = list_train[0], list_train[1]
    X_val, y_val = list_val[0], list_val[1]
    print(X_train.shape, X_val.shape)
    display(
        functions.compare_bcms(
            X_train,
            y_train,
            X_val,
            y_val,
            classifiers=classifiers,
            plot=plot,
        )
    )


# サブグループ削除（一時的に）
subgroup_columns = [
    "CLAUDIN_SUBTYPE",
    # "NPI_CAT",
    # "TUMOR_CAT",
    # "LYMPH_CAT",
]

feature_selection_methods = [
    "vt",
    # "mrmr"
]

scaling_methods = [
    "std",
    # "norm"
]

sampling_methods = [
    # "none",
    "smote"
]

for feature_selection_method, scaling_method, sampling_method in tqdm(
    itertools.product(feature_selection_methods, scaling_methods, sampling_methods)
):
    preprocess_order = "{0}_{1}_{2}".format(
        feature_selection_method, scaling_method, sampling_method
    )
    print("=====" * 10)
    for subgroup_column in subgroup_columns:  # 各サブグループへの適用
        print("-----" * 10)
        print("subgroup column: ", subgroup_column)

        for subgroup in df_recurrenced[subgroup_column].unique():  # サブグループ毎への適用
            print("....." * 10)
            print("subgroup: ", subgroup)
            input_file_path = "./{0}/{1}/{2}".format(
                config.INTERIM_PREPROCESSED_RECURRENCE_DIR,
                subgroup_column,
                preprocess_order,
                plot=True,
            )
            print(input_file_path)
            validate_models(
                input_file_path=input_file_path, input_file_name=subgroup, plot=False
            )

0it [00:00, ?it/s]

--------------------------------------------------
subgroup column:  CLAUDIN_SUBTYPE
..................................................
subgroup:  LumB
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/vt_std_smote
(216, 17) (20, 17)


Unnamed: 0_level_0,accuracy_score_train,accuracy_score_val,f1_score_train,f1_score_val,matthews_corrcoef_train,matthews_corrcoef_val,cohen_kappa_score_train,cohen_kappa_score_val,roc_auc_score_train,roc_auc_score_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AdaBoostClassifier,0.967593,0.5,0.967742,0.444444,0.935225,0.0,0.935185,0.0,0.967593,0.5
DecisionTreeClassifier,0.768519,0.5,0.776786,0.444444,0.538516,0.0,0.537037,0.0,0.768519,0.5
ExtraTreesClassifier,1.0,0.45,1.0,0.153846,1.0,-0.235702,1.0,-0.222222,1.0,0.395833
GaussianNB,0.625,0.3,0.61244,0.125,0.250527,-0.458333,0.25,-0.458333,0.625,0.270833
GradientBoostingClassifier,1.0,0.65,1.0,0.533333,1.0,0.256776,1.0,0.255319,1.0,0.625
KNeighborsClassifier,0.773148,0.45,0.780269,0.352941,0.547447,-0.123091,0.546296,-0.122449,0.773148,0.4375
LGBMClassifier,1.0,0.5,1.0,0.375,1.0,-0.041667,1.0,-0.041667,1.0,0.479167
LogisticRegression,0.703704,0.5,0.711712,0.444444,0.408038,0.0,0.407407,0.0,0.703704,0.5
QuadraticDiscriminantAnalysis,0.865741,0.7,0.865116,0.625,0.731513,0.375,0.731481,0.375,0.865741,0.6875
RandomForestClassifier,0.847222,0.4,0.850679,0.25,0.69519,-0.25,0.694444,-0.25,0.847222,0.375


..................................................
subgroup:  Her2
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/vt_std_smote
(134, 9) (11, 9)


Unnamed: 0_level_0,accuracy_score_train,accuracy_score_val,f1_score_train,f1_score_val,matthews_corrcoef_train,matthews_corrcoef_val,cohen_kappa_score_train,cohen_kappa_score_val,roc_auc_score_train,roc_auc_score_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AdaBoostClassifier,1.0,0.545455,1.0,0.285714,1.0,-0.038576,1.0,-0.037736,1.0,0.479167
DecisionTreeClassifier,0.80597,0.636364,0.80303,0.333333,0.612213,0.083333,0.61194,0.083333,0.80597,0.541667
ExtraTreesClassifier,1.0,0.636364,1.0,0.333333,1.0,0.083333,1.0,0.083333,1.0,0.541667
GaussianNB,0.708955,0.454545,0.706767,0.4,0.417957,0.038576,0.41791,0.029412,0.708955,0.520833
GradientBoostingClassifier,1.0,0.636364,1.0,0.333333,1.0,0.083333,1.0,0.083333,1.0,0.541667
KNeighborsClassifier,0.813433,0.545455,0.83871,0.444444,0.660129,0.149071,0.626866,0.126984,0.813433,0.583333
LGBMClassifier,1.0,0.727273,1.0,0.571429,1.0,0.385758,1.0,0.377358,1.0,0.708333
LogisticRegression,0.664179,0.636364,0.661654,0.5,0.328395,0.260875,0.328358,0.241379,0.664179,0.645833
QuadraticDiscriminantAnalysis,0.895522,0.818182,0.9,0.666667,0.794236,0.541667,0.791045,0.541667,0.895522,0.770833
RandomForestClassifier,0.843284,0.727273,0.839695,0.571429,0.687256,0.385758,0.686567,0.377358,0.843284,0.708333


..................................................
subgroup:  LumA
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/vt_std_smote
(192, 17) (20, 17)


Unnamed: 0_level_0,accuracy_score_train,accuracy_score_val,f1_score_train,f1_score_val,matthews_corrcoef_train,matthews_corrcoef_val,cohen_kappa_score_train,cohen_kappa_score_val,roc_auc_score_train,roc_auc_score_val
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AdaBoostClassifier,0.942708,0.55,0.942408,0.571429,0.885465,0.100504,0.885417,0.1,0.942708,0.550505
DecisionTreeClassifier,0.776042,0.4,0.790244,0.454545,0.557216,-0.212121,0.552083,-0.212121,0.776042,0.393939
ExtraTreesClassifier,1.0,0.65,1.0,0.740741,1.0,0.301511,1.0,0.255319,1.0,0.621212
GaussianNB,0.645833,0.6,0.649485,0.666667,0.29173,0.179106,0.291667,0.175258,0.645833,0.585859
GradientBoostingClassifier,1.0,0.45,1.0,0.521739,1.0,-0.123091,1.0,-0.122449,1.0,0.439394
KNeighborsClassifier,0.671875,0.55,0.651934,0.64,0.346029,0.065795,0.34375,0.0625,0.671875,0.530303
LGBMClassifier,1.0,0.55,1.0,0.608696,1.0,0.082061,1.0,0.081633,1.0,0.540404
LogisticRegression,0.65625,0.5,0.652632,0.583333,0.312568,-0.031607,0.3125,-0.030928,0.65625,0.484848
QuadraticDiscriminantAnalysis,0.828125,0.6,0.823529,0.666667,0.657142,0.179106,0.65625,0.175258,0.828125,0.585859
RandomForestClassifier,0.828125,0.65,0.827225,0.72,0.656286,0.285112,0.65625,0.270833,0.828125,0.631313


1it [00:10, 10.89s/it]

..................................................
subgroup:  claudin-low
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/vt_std_smote
pkl file does not exist
..................................................
subgroup:  Basal
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/vt_std_smote
pkl file does not exist
..................................................
subgroup:  Normal
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/vt_std_smote
pkl file does not exist
..................................................
subgroup:  NC
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/vt_std_smote
pkl file does not exist





## xgboost

In [11]:
def validate_models(
    input_file_path: str = None,
    input_file_name: str = None,
    classifiers: list = config.classifiers,
    plot: bool = False,
):
    try:
        list_train, list_val, _, _ = functions.read_preprocessed_df(
            input_file_path, input_file_name
        )
    except:
        return
    X_train, y_train = list_train[0], list_train[1]
    X_val, y_val = list_val[0], list_val[1]
    print(X_train.shape, X_val.shape)
    display(
        functions.compare_bcms(
            X_train,
            y_train,
            X_val,
            y_val,
            classifiers=classifiers,
            plot=plot,
        )
    )


for feature_selection_method, scaling_method, sampling_method in tqdm(
    itertools.product(feature_selection_methods, scaling_methods, sampling_methods)
):
    preprocess_order = "{0}_{1}_{2}".format(
        feature_selection_method, scaling_method, sampling_method
    )
    print("=====" * 10)
    for subgroup_column in subgroup_columns:  # 各サブグループへの適用
        print("-----" * 10)
        print("subgroup column: ", subgroup_column)

        for subgroup in df_recurrenced[subgroup_column].unique():  # サブグループ毎への適用
            print("....." * 10)
            print("subgroup: ", subgroup)
            input_file_path = "./{0}/{1}/{2}".format(
                config.INTERIM_PREPROCESSED_RECURRENCE_DIR,
                subgroup_column,
                preprocess_order,
                plot=True,
            )
            print(input_file_path)
            
            try:
                list_train, list_val, _, _ = functions.read_preprocessed_df(
                    input_file_path, subgroup
                )
            except:
                pass

            X_train, y_train = list_train[0], list_train[1]
            X_val, y_val = list_val[0], list_val[1]
            print(X_train.shape, X_val.shape)
            
            xgb_train = xgb.DMatrix(
                X_train, label=y_train, feature_names=X_train.columns
            )
            xgb_val = xgb.DMatrix(X_val, label=y_val, feature_names=X_val.columns)

            param = {
                # 二値分類問題
                "objective": "binary:logistic",
            }
            clf = xgb.train(param, xgb_train)
            y_val_pred_proba = clf.predict(xgb_val)
            y_val_pred = np.where(y_val_pred_proba > 0.5, 1, 0)
            functions.show_scores(y_val, y_val_pred)

0it [00:00, ?it/s]

--------------------------------------------------
subgroup column:  CLAUDIN_SUBTYPE
..................................................
subgroup:  LumB
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/vt_std_smote
(216, 17) (20, 17)


accuracy     0.600000
precision    0.500000
recall       0.375000
f1 score     0.428571
dtype: float64

..................................................
subgroup:  Her2
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/vt_std_smote
(134, 9) (11, 9)


accuracy     0.545455
precision    0.250000
recall       0.333333
f1 score     0.285714
dtype: float64

..................................................
subgroup:  LumA
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/vt_std_smote
(192, 17) (20, 17)


accuracy     0.450000
precision    0.500000
recall       0.545455
f1 score     0.521739
dtype: float64

..................................................
subgroup:  claudin-low
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/vt_std_smote
pkl file does not exist
(192, 17) (20, 17)


accuracy     0.450000
precision    0.500000
recall       0.545455
f1 score     0.521739
dtype: float64

..................................................
subgroup:  Basal
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/vt_std_smote
pkl file does not exist
(192, 17) (20, 17)


accuracy     0.450000
precision    0.500000
recall       0.545455
f1 score     0.521739
dtype: float64

..................................................
subgroup:  Normal
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/vt_std_smote
pkl file does not exist
(192, 17) (20, 17)


accuracy     0.450000
precision    0.500000
recall       0.545455
f1 score     0.521739
dtype: float64

..................................................
subgroup:  NC
./../data/interim/PREPROCESSED/RECURRENCE/CLAUDIN_SUBTYPE/vt_std_smote
pkl file does not exist
(192, 17) (20, 17)


accuracy     0.450000
precision    0.500000
recall       0.545455
f1 score     0.521739
dtype: float64

1it [00:01,  1.23s/it]
