# 論文再現

参考文献の精度が高すぎるため、リークなどがあったのではないか確認のために再現実装を行う

参考文献で述べられている特徴量選択手法は有料であり、かつpythonでの利用が難しそうなため、参考文献内で提示された遺伝子を特徴量として精度を確認する

In [1]:
import os
import random
import itertools
import re

# 基本的なライブラリ
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats

# 描画ライブラリ
import matplotlib.pyplot as plt
import seaborn as sns
from seaborn_analyzer import CustomPairPlot
import graphviz
import pydotplus
from IPython.display import Image
from IPython.display import HTML
from six import StringIO
from ipywidgets import interact, FloatSlider

# 前処理
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import VarianceThreshold

# 補完
from sklearn.experimental import (
    enable_iterative_imputer,
)  # IterativeImputerをimportするために必要
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

# エンコード
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder

# データセット分割
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    ShuffleSplit,
    StratifiedShuffleSplit,
    KFold,
    cross_validate,
)

# 特徴量選択
from sklearn.feature_selection import (
    GenericUnivariateSelect,
    f_classif,
    mutual_info_classif,
    chi2,
)
from boruta import BorutaPy

# https://github.com/smazzanti/mrmr
# pipでinstallはできたが、そのままimportできなかったので、
# ライブラリのソースコードをそのまま環境に設置
from libraries.mrmr import mrmr

# 学習中
from tqdm import tqdm
from sklearn.model_selection import learning_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import warnings

import networkx as nx


# config python file
import config

SEED = config.SEED
INDEX_MICROARRAY = config.INDEX_MICROARRAY

from functions import *

fix_seed(SEED)


# 最大表示列数の指定（ここでは50列を指定）N
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 50)

%matplotlib inline

# データ読み込み


## 生データの読み込み

In [2]:
df_patient = pd.read_table(
    config.RAW_BRCA_METABRIC_DIR + "/data_clinical_patient.txt", header=4
)
df_sample = pd.read_table(
    config.RAW_BRCA_METABRIC_DIR + "/data_clinical_sample.txt", header=4
)

In [3]:
df_clinical = pd.merge(df_patient, df_sample, on="PATIENT_ID")

### 臨床データ特徴量の順序変更（読みやすさのため）

In [4]:
def align_columns(df: pd.DataFrame, regex: str):
    # まとめたいcolumnの正規表現を一時退避
    df_copy = df.copy()
    df_tmp = df_copy.filter(regex=regex)
    # 元のdfから落とす
    df_copy.drop(df_tmp.columns, axis=1, inplace=True)
    # 元のdfに結合
    return pd.merge(df_copy, df_tmp, right_index=True, left_index=True)

In [5]:
# 癌の種類
df_clinical = align_columns(df_clinical, "^CANCER_")
# 重要そう（直感）な特徴量
df_clinical = align_columns(df_clinical, "^ER_|^HER2_|^TUMOR_")
# 治療の種類
df_clinical = align_columns(df_clinical, ".*THERAPY$|^BREAST_SURGERY")
# target系の種類（OS, RFS, VITAL）
df_clinical = align_columns(df_clinical, "^OS_.*|^RFS_.*|^VITAL_.*")

In [6]:
# データを大きく2つに分割できるので、ここで分割
df_MB = df_clinical[df_clinical["PATIENT_ID"].str.contains("MB")]
df_MTST = df_clinical[df_clinical["PATIENT_ID"].str.contains("MTS-T")]
df_MB.shape, df_MTST.shape

# save
make_dir(config.INTERIM_PREPROCESSED_PROGNOSIS_CROSS_DIR)
df_MB.to_pickle(config.INTERIM_PREPROCESSED_PROGNOSIS_CROSS_DIR + "/df_MB.pkl")

## 遺伝子データの読み込み

In [7]:
# 遺伝子発現データ
# 生の遺伝子発現データ
df_mrna_agilent_microarray = pd.read_table(
    config.RAW_BRCA_METABRIC_DIR + "/data_mrna_agilent_microarray.txt", index_col=0
).T
df_mrna_agilent_microarray = df_mrna_agilent_microarray.drop(
    "Entrez_Gene_Id"
).sort_index()
# zスコア化済み
df_mrna_agilent_microarray_zscores_ref_all_samples = pd.read_table(
    config.RAW_BRCA_METABRIC_DIR
    + "/data_mrna_agilent_microarray_zscores_ref_all_samples.txt",
    index_col=0,
).T
df_mrna_agilent_microarray_zscores_ref_all_samples = (
    df_mrna_agilent_microarray_zscores_ref_all_samples.drop("Entrez_Gene_Id")
).sort_index()
# zスコア化（2倍体基準）済み
df_mrna_agilent_microarray_zscores_ref_diploid_samples = pd.read_table(
    config.RAW_BRCA_METABRIC_DIR
    + "/data_mrna_agilent_microarray_zscores_ref_diploid_samples.txt",
    index_col=0,
).T
df_mrna_agilent_microarray_zscores_ref_diploid_samples = (
    df_mrna_agilent_microarray_zscores_ref_diploid_samples.drop("Entrez_Gene_Id")
).sort_index()


# important to decide X_dict name
SET_DF_MICROARRAY = (
    df_mrna_agilent_microarray,
    df_mrna_agilent_microarray_zscores_ref_all_samples,
    df_mrna_agilent_microarray_zscores_ref_diploid_samples,
)

# dfの整理

In [8]:
df_MB = df_MB.set_index("PATIENT_ID")
# VITAL_STATUS、OS_MONTHSが欠損しているデータの削除
df_MB = df_MB[df_MB["VITAL_STATUS"].notna()]
df_MB = df_MB[df_MB["OS_STATUS"].notna()]

# 乳がん以外で死亡した患者の削除
df_MB = df_MB[df_MB["VITAL_STATUS"] != "Died of Other Causes"]


# 5年以下で生存している、予後が未知の患者の削除
df_MB = df_MB[~((df_MB["OS_MONTHS"] <= 12 * 5) & (df_MB["VITAL_STATUS"] != "Living"))]

# 複数の治療を受けている患者の削除
df_MB = df_MB[
    ~(
        (df_MB["CHEMOTHERAPY"] == "YES")
        & (df_MB["HORMONE_THERAPY"] == "YES")
        & (df_MB["RADIO_THERAPY"] == "YES")
    )
]
df_MB = df_MB[~((df_MB["CHEMOTHERAPY"] == "YES") & (df_MB["HORMONE_THERAPY"] == "YES"))]
df_MB = df_MB[
    ~((df_MB["RADIO_THERAPY"] == "YES") & (df_MB["HORMONE_THERAPY"] == "YES"))
]
df_MB = df_MB[~((df_MB["CHEMOTHERAPY"] == "YES") & (df_MB["RADIO_THERAPY"] == "YES"))]

# 目的変数の生成

D or L x 各治療法のonehot特徴量の作成（計6つ）

In [9]:
therapies = ["CHEMOTHERAPY", "HORMONE_THERAPY", "RADIO_THERAPY"]
statuses = ["Living", "Died of Disease"]
for therapy in therapies:
    for status in statuses:
        conditions = [(df_MB["VITAL_STATUS"] == status) & (df_MB[therapy] == "YES")]
        choices = [1]
        df_MB[status[0] + therapy[0]] = np.select(conditions, choices, default=0)

# データフレームの結合

臨床データと遺伝子データを結合する  

In [10]:
# 結合する臨床データ
df_MB_columns = ["LC", "DC", "LH", "DH", "LR", "DR"]
df_merged = pd.merge(
    df_MB[df_MB_columns],
    df_mrna_agilent_microarray_zscores_ref_all_samples,
    right_index=True,
    left_index=True,
)
df_merged = df_merged.dropna()

# save
make_dir(config.INTERIM_PREPROCESSED_PROGNOSIS_CROSS_DIR)
df_merged.to_pickle(
    config.INTERIM_PREPROCESSED_PROGNOSIS_CROSS_DIR + "/df_merged_reproduction.pkl"
)

In [11]:
df_merged.shape

(530, 24374)

# 学習

## 学習パイプライン

### 不均衡データへの対処
1. SMOTE
2. コスト敏感型モデル（class_weight='balanced'と同じ？）
3. over sampling

In [13]:
# 第1ノードの特徴量
X = df_merged[
    [
        "AKIP1",
        "FGF16",
        "AA884297",
        "CDC42BPG",
        "UPF3B",
        "FAM114A1",
        "OR2G6",
        "ANKLE1",
        "MGA",
        "C14orf145",
    ]
]
y = df_merged["DH"]

params = {
    "class_weight": "balanced",
    "random_state": SEED,
}
clf = RandomForestClassifier(**params)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=SEED, shuffle=True
)


scire_funcs = ["accuracy", "precision", "recall", "f1"]
skf = StratifiedKFold(n_splits=10, random_state=SEED, shuffle=True)


acc_list = []
f1_list = []

for train_index, val_index in skf.split(X_train, y_train):
    X_t, y_t = X_train.iloc[train_index], y_train.iloc[train_index]
    X_v, y_v = X_train.iloc[val_index], y_train.iloc[val_index]

    sm = SMOTE(random_state=SEED)
    X_t, y_t = sm.fit_resample(X_t, y_t)

    clf.fit(X_t, y_t)
    y_p = clf.predict(X_v)
    acc_list.append(accuracy_score(y_v, y_p))
    f1_list.append(f1_score(y_v, y_p))
print(
    "acc score: ",
    sum(acc_list) / len(acc_list),
    " f1 score: ",
    sum(f1_list) / len(f1_list),
)

acc score:  0.7960897435897435  f1 score:  0.07
