In [1]:
import os
import random
import itertools

# 基本的なライブラリ
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats

# 描画ライブラリ
import matplotlib.pyplot as plt
import seaborn as sns
from seaborn_analyzer import CustomPairPlot
import graphviz
import pydotplus
from IPython.display import Image
from IPython.display import HTML
from six import StringIO
from ipywidgets import interact, FloatSlider

# データセット分割
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.experimental import (
    enable_iterative_imputer,
)  # IterativeImputerをimportするために必要
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

# 前処理
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE

# 特徴量選択
from sklearn.feature_selection import (
    SequentialFeatureSelector,
    VarianceThreshold,
    RFE,
    RFECV,
)


# 学習中
from tqdm import tqdm
from sklearn.model_selection import learning_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import warnings


# config python file
import config

SEED = config.SEED


from functions import *

fix_seed(SEED)


# 最大表示列数の指定（ここでは50列を指定）N
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 50)

%matplotlib inline

  y: pd.Series(),


# basic process

基本的なプロセス  
2つのデータを読み込み、PATIENT_IDカラムをキーとして結合する  
その後、MB\~とMTX-T\~でPATIENT_IDが別れているので、分割し、各データを保存する

In [2]:
df_patient = pd.read_table(
    config.RAW_BRCA_METABRIC_DIR + "/data_clinical_patient.txt", header=4
)
df_sample = pd.read_table(
    config.RAW_BRCA_METABRIC_DIR + "/data_clinical_sample.txt", header=4
)
print(df_patient.shape, df_sample.shape)

df_merged = pd.merge(df_patient, df_sample, on="PATIENT_ID")
check(df_merged)

(2509, 24) (2509, 13)


Unnamed: 0,feature,dtypes,nan,count,max,min,num_unique,unique,unique_counts
0,PATIENT_ID,object,0,2509,,,2509,,
1,LYMPH_NODES_EXAMINED_POSITIVE,float64,266,2243,45.0,0.0,32,,
2,NPI,float64,222,2287,7.2,1.0,436,,
3,CELLULARITY,object,592,1917,,,3,[nan 'High' 'Moderate' 'Low'],"{'High': 965, 'Moderate': 737, 'Low': 215}"
4,CHEMOTHERAPY,object,529,1980,,,2,['NO' 'YES' nan],"{'NO': 1568, 'YES': 412}"
5,COHORT,float64,11,2498,9.0,1.0,9,[ 1. 2. 3. 5. 4. 9. 7. 6. nan 8.],"{1.0: 809, 3.0: 763, 2.0: 288, 4.0: 238, 5.0: 170, 7.0: 105, 8.0: 82, 9.0: 40, 6.0: 3}"
6,ER_IHC,object,83,2426,,,2,['Positve' 'Negative' nan],"{'Positve': 1817, 'Negative': 609}"
7,HER2_SNP6,object,529,1980,,,4,['NEUTRAL' 'LOSS' nan 'GAIN' 'UNDEF'],"{'NEUTRAL': 1436, 'GAIN': 438, 'LOSS': 101, 'UNDEF': 5}"
8,HORMONE_THERAPY,object,529,1980,,,2,['YES' 'NO' nan],"{'YES': 1216, 'NO': 764}"
9,INFERRED_MENOPAUSAL_STATE,object,529,1980,,,2,['Post' 'Pre' nan],"{'Post': 1556, 'Pre': 424}"


## カラムの順序変更（読みやすさのため）

In [3]:
def align_columns(df: pd.DataFrame, regex: str):
    # まとめたいcolumnの正規表現を一時退避
    df_copy = df.copy()
    df_tmp = df_copy.filter(regex=regex)
    # 元のdfから落とす
    df_copy.drop(df_tmp.columns, axis=1, inplace=True)
    # 元のdfに結合
    return pd.merge(df_copy, df_tmp, right_index=True, left_index=True)

In [34]:
# ID
df_merged = align_columns(df_merged, ".*_ID")
# 癌の種類（さほど重要ではない、らしい）
df_merged = align_columns(df_merged, "^CANCER_")
df_merged = align_columns(df_merged, "^TUMOR_")
# 重要な特徴量
df_merged = align_columns(df_merged, "CLAUDIN_SUBTYPE|PR_STATUS|ER_STATUS|HER2_STATUS")
# 治療の種類
df_merged = align_columns(df_merged, ".*THERAPY$|^BREAST_SURGERY")
# target系の種類（OS, RFS, VITAL）
df_merged = align_columns(df_merged, "^OS_.*|^RFS_.*|^VITAL_.*")

check(df_merged)

Unnamed: 0,feature,dtypes,nan,count,max,min,num_unique,unique,unique_counts
0,LYMPH_NODES_EXAMINED_POSITIVE,float64,266,2243,45.0,0.0,32,,
1,NPI,float64,222,2287,7.2,1.0,436,,
2,CELLULARITY,object,592,1917,,,3,[nan 'High' 'Moderate' 'Low'],"{'High': 965, 'Moderate': 737, 'Low': 215}"
3,COHORT,float64,11,2498,9.0,1.0,9,[ 1. 2. 3. 5. 4. 9. 7. 6. nan 8.],"{1.0: 809, 3.0: 763, 2.0: 288, 4.0: 238, 5.0: 170, 7.0: 105, 8.0: 82, 9.0: 40, 6.0: 3}"
4,INFERRED_MENOPAUSAL_STATE,object,529,1980,,,2,['Post' 'Pre' nan],"{'Post': 1556, 'Pre': 424}"
5,SEX,object,0,2509,,,1,['Female'],{'Female': 2509}
6,INTCLUST,object,529,1980,,,11,['4ER+' '3' '9' '7' '4ER-' nan '5' '8' '10' '1' '2' '6'],"{'8': 299, '3': 290, '4ER+': 260, '10': 226, '7': 190, '5': 190, '9': 146, '1': 139, '6': 85, '4ER-': 83, '2': 72}"
7,AGE_AT_DIAGNOSIS,float64,11,2498,96.29,21.93,1843,,
8,THREEGENE,object,745,1764,,,4,['ER-/HER2-' 'ER+/HER2- High Prolif' nan 'ER+/HER2- Low Prolif' 'HER2+'],"{'ER+/HER2- Low Prolif': 640, 'ER+/HER2- High Prolif': 617, 'ER-/HER2-': 309, 'HER2+': 198}"
9,LATERALITY,object,639,1870,,,2,['Right' 'Left' nan],"{'Left': 973, 'Right': 897}"


In [5]:
# データの型の定義（適用する前処理が大きく異なるため）

int_columns = [
    # patient
    "LYMPH_NODES_EXAMINED_POSITIVE",
    "OS_MONTHS",
    "RFS_MONTHS",
]

float_columns = [
    # patient
    "NPI",
    "AGE_AT_DIAGNOSIS",
    # sample
    "TUMOR_SIZE",
    "TMB_NONSYNONYMOUS",
]

# 質的変数（順序尺度）
num_cat_columns = [
    # patient
    "CELLULARITY",
    "ER_IHC",
    "HER2_SNP6",
    "INFERRED_MENOPAUSAL_STATE",
    # sample
    "ER_STATUS",
    "HER2_STATUS",
    "GRADE",
    "PR_STATUS",
    "TUMOR_STAGE",
]
# 質的変数（名義尺度）
str_cat_columns = [
    # patient
    "COHORT",
    "INTCLUST",
    "CLAUDIN_SUBTYPE",
    "THREEGENE",
    "HISTOLOGICAL_SUBTYPE",
    "BREAST_SURGERY",
    "LATERALITY",
    "VITAL_STATUS",
    # sample
    "CANCER_TYPE",
    "CANCER_TYPE_DETAILED",
    "ONCOTREE_CODE",
]

bool_columns = [
    # patient
    "CHEMOTHERAPY",
    "HORMONE_THERAPY",
    "RADIO_THERAPY",
    "OS_STATUS",
    "RFS_STATUS",
]

meanless_columns = [
    # patient
    "PATIENT_ID",
    "SEX",
    # sample
    #'PATIENT_ID',
    "SAMPLE_ID",
    "SAMPLE_TYPE",
]

list_columns = [
    int_columns,
    float_columns,
    num_cat_columns,
    str_cat_columns,
    bool_columns,
    meanless_columns,
]
print(
    df_merged.shape[1],
    len(int_columns)
    + len(float_columns)
    + len(num_cat_columns)
    + len(str_cat_columns)
    + len(bool_columns)
    + len(meanless_columns),
)
assert df_merged.shape[1] == len(int_columns) + len(float_columns) + len(
    num_cat_columns
) + len(str_cat_columns) + len(bool_columns) + len(
    meanless_columns
), "lack or too much columns"

36 36


In [6]:
# データを大きく2つに分割できるので、ここで分割
df_MB = df_merged[df_merged["PATIENT_ID"].str.contains("MB")]
df_MTST = df_merged[df_merged["PATIENT_ID"].str.contains("MTS-T")]
df_MB.shape, df_MTST.shape

((1985, 36), (524, 36))

In [7]:
# save
make_dir(config.INTERIM_PICKLE_PREPROCESSED_DIR)
df_merged.to_pickle(config.INTERIM_PICKLE_PREPROCESSED_DIR + "/df_merged.pkl")
df_MB.to_pickle(config.INTERIM_PICKLE_PREPROCESSED_DIR + "/df_MB.pkl")
df_MTST.to_pickle(config.INTERIM_PICKLE_PREPROCESSED_DIR + "/df_MTST.pkl")

## 無意味な特徴量の削除

In [8]:
df_MB = pd.read_pickle(config.INTERIM_PICKLE_PREPROCESSED_DIR + "/df_MB.pkl")
df_MB.drop(meanless_columns, axis=1, inplace=True)

assert df_MB.shape[1] == len(int_columns) + len(float_columns) + len(
    num_cat_columns
) + len(str_cat_columns) + len(bool_columns), "mistake drop columns"

In [9]:
# 分散が小さい（thresholdで制御）ものを表示する関数
def variance_threshold(X: pd.DataFrame(), columns: str, threshold: float):
    selector = VarianceThreshold(threshold=threshold)
    tmp_columns = columns.copy()
    selector.fit(X[tmp_columns])
    print("元の特徴量数：", len(tmp_columns), ", 選択後の特徴量数", sum(selector.get_support()))

In [10]:
# とりあえず単純に数値を持つ特徴量の分散を確認
# 後で他のカテゴリ特徴量などにも適用し、確認したい
variance_threshold(df_MB, int_columns, 0.1)
variance_threshold(df_MB, float_columns, 0.1)

元の特徴量数： 3 , 選択後の特徴量数 3
元の特徴量数： 4 , 選択後の特徴量数 4


## 不要なデータへの対処
https://www.codexa.net/missing_value_python/

null値への対処  
最初はnull値がそもそも少ない（例．データ量に対し、5%以下の量）のデータはそもそも取り除く（リストワイズ法）  

In [11]:
# データの総量5%以下のnull値は補完せずに削除する


def dropna_nper(df_original: pd.DataFrame(), percent: int = 0.05):
    df = df_original.copy()
    # nullを含むデータのインデックスを確認
    num = int(df.shape[0] * percent)
    print("除外データ数：", str(num))
    # df_MB[df_MB.isnull().any(axis=1)]
    # nullを含むデータ数がnum個以下のcolumnsを抽出（indexとなっているのはisnull().sum()で取り出した際、indexにcolumns名が来るため）
    columns_u5null = df.isnull().sum()[df.isnull().sum() < num].index
    df_u5null = df[columns_u5null]
    # nullを含むデータ数がnum個以下のデータの全てのindex
    index_u5null = df_u5null[df_u5null.isnull().any(axis=1)].index
    print("nullを含むデータ数がnum個以下のcolumnsを保有するデータのインデックス：", index_u5null)

    # nullを含むデータ数がnum個以下のcolumnsがなくなったか確認
    df.drop(index_u5null, inplace=True)
    return df

**リストワイズ法適用下で欠損値を保有する特徴量**
- THREEGENE
- LATERALITY
- TUMOR_STAGE

**リストワイズ法を適用する場合の変化（目視）**
- HISTOLOGICAL_SUBTYPEが8種類から7種類になった　。少数派の'Metaplastic'に関しては、1939データ中2件しかなかったため、削除して問題なさそう。
- CANCER_TYPEが2種類から1種類になった（要カラム削除）。少数派の'Breast Sarcoma'に関しては、1985データ中3件しかなかったため、削除して問題なさそう。
- CANCER_TYPE_DETAILが8種類から5種類に削減された。削除された3クラスの内、2クラス（）は元々2データしか存在しなかったため削除しても問題ないと考えられるが、'Invasive Breast Carcinoma'クラスに関しては元々44データ存在していたものが全て削除されている。したがって、何らかの共通した特徴量がnullになっていると予想される。
- CANCER_TYPE_DETAILの'Breast'クラスが17データから12データへと減っている。減ったデータ数自体は5件と少ないが、割合としては3割ほどとそこそこ大きく注意が必要。
- TUMOR_STAGEについて、0.0クラスの個数が著しく低下している。おそらく腫瘍がちいさいため、本格的な治療が実施されていないゆえにデータが集まっていないものと思われる。削除するかは考えたほうがよさそう。


In [12]:
# TUMOR_STAGEが0のクラスを観察
df_MB[df_MB["TUMOR_STAGE"] == 0.0]

Unnamed: 0,LYMPH_NODES_EXAMINED_POSITIVE,NPI,CELLULARITY,COHORT,INFERRED_MENOPAUSAL_STATE,INTCLUST,AGE_AT_DIAGNOSIS,CLAUDIN_SUBTYPE,THREEGENE,LATERALITY,HISTOLOGICAL_SUBTYPE,GRADE,ONCOTREE_CODE,PR_STATUS,TMB_NONSYNONYMOUS,CANCER_TYPE,CANCER_TYPE_DETAILED,ER_IHC,HER2_SNP6,ER_STATUS,HER2_STATUS,TUMOR_SIZE,TUMOR_STAGE,CHEMOTHERAPY,HORMONE_THERAPY,RADIO_THERAPY,BREAST_SURGERY,OS_MONTHS,OS_STATUS,VITAL_STATUS,RFS_STATUS,RFS_MONTHS
42,,2.14,Low,1.0,Pre,3,45.73,claudin-low,,Left,,,BRCA,Negative,0.0,Breast Cancer,Invasive Breast Carcinoma,,NEUTRAL,Negative,Negative,70.0,0.0,NO,NO,NO,BREAST CONSERVING,157.5,0:LIVING,Living,1:Recurred,12.17
84,,2.004,Low,1.0,Post,4ER+,60.85,claudin-low,,Left,,1.0,BRCA,Negative,0.0,Breast Cancer,Invasive Breast Carcinoma,,NEUTRAL,Positive,Negative,2.0,0.0,NO,NO,NO,BREAST CONSERVING,2.866667,0:LIVING,Living,0:Not Recurred,2.83
87,,2.046,Low,1.0,Post,5,51.04,Basal,HER2+,Left,,,BRCA,Negative,0.0,Breast Cancer,Invasive Breast Carcinoma,,GAIN,Negative,Positive,23.0,0.0,NO,NO,NO,MASTECTOMY,75.3,0:LIVING,Living,0:Not Recurred,74.31
145,0.0,1.07,,1.0,Pre,5,41.98,Her2,HER2+,Left,Ductal/NST,,IDC,Negative,2.615035,Breast Cancer,Breast Invasive Ductal Carcinoma,,GAIN,Negative,Positive,35.0,0.0,NO,NO,NO,MASTECTOMY,200.333333,0:LIVING,Living,0:Not Recurred,197.7
183,,1.0,,1.0,Post,4ER+,52.79,claudin-low,,Left,,,PBS,Negative,0.0,Breast Sarcoma,Breast Angiosarcoma,,NEUTRAL,Positive,Negative,,0.0,NO,NO,NO,MASTECTOMY,72.8,0:LIVING,Living,0:Not Recurred,71.84
198,,1.05,High,1.0,Post,10,68.83,Basal,ER-/HER2-,Left,,,BRCA,Negative,0.0,Breast Cancer,Invasive Breast Carcinoma,,NEUTRAL,Negative,Negative,25.0,0.0,NO,NO,YES,BREAST CONSERVING,188.133333,0:LIVING,Living,0:Not Recurred,185.66
346,0.0,1.022,High,1.0,Post,7,59.34,LumB,ER+/HER2- High Prolif,Left,Lobular,,ILC,Positive,2.615035,Breast Cancer,Breast Invasive Lobular Carcinoma,Positve,NEUTRAL,Positive,Negative,11.0,0.0,NO,NO,YES,BREAST CONSERVING,27.4,0:LIVING,Living,0:Not Recurred,27.04
389,0.0,2.0,Low,1.0,Post,5,54.08,Her2,,,,,BRCA,Negative,9.152624,Breast Cancer,Invasive Breast Carcinoma,Negative,GAIN,Negative,Positive,,0.0,NO,NO,NO,,2.5,0:LIVING,Living,0:Not Recurred,2.47
403,,1.04,Low,1.0,Post,5,64.21,Basal,HER2+,Right,,,BRCA,Negative,0.0,Breast Cancer,Invasive Breast Carcinoma,,GAIN,Negative,Positive,20.0,0.0,NO,NO,NO,MASTECTOMY,85.5,1:DECEASED,Died of Other Causes,0:Not Recurred,84.38
425,0.0,3.13,Moderate,1.0,Post,7,76.22,claudin-low,ER+/HER2- Low Prolif,,Ductal/NST,2.0,IDC,Positive,0.0,Breast Cancer,Breast Invasive Ductal Carcinoma,Positve,LOSS,Positive,Negative,65.0,0.0,NO,YES,YES,BREAST CONSERVING,163.2,1:DECEASED,Died of Other Causes,0:Not Recurred,161.05


**TUMOR_STAGEについて、欠損値が多かった特徴量(目視)**
※欠損値を取り除かない状況下で、該当するデータ数は12件
- LYMPH_NODES_EXAMINED_POSITIVE（8件）
- HISTOLOGICAL_SUBTYPE（9件）
- GRADE（10件）
- ER_IHC（8件）

何れもリストワイズ法適用下で欠損値を保有する特徴量ではない。  
▷今後作成予定の目的変数（5年後の予後）は半数以上のデータが目的変数を作成することができることからも、これらの特徴量に関しては、欠損値を補い固有のクラスターとしたほうが良い？

### 少数カテゴリのデータの取り扱い
※少数カテゴリの閾値をどうするかも課題
- 予測に良い影響を与えなさそう（過学習になるかも）
- 一方、削除してしまうと、以後そのカテゴリに属する人たちのデータを扱えなくなってしまう・・・  
「その他カテゴリ」のような集約カテゴリを設ける？

**少数カテゴリを保有する特徴量一覧（調査対象｜リストワイズ法を適用したdf_MB）**  
今回は直感的に少数だと感じることを選択基準とした
- CLAUDIN_SUBTYPE（'NC': 5）
- HISTOLOGICAL_SUBTYPE（'Medullary': 24, 'Tubular/ cribriform': 20, 'Mucinous': 18, 'Other': 12'）
- ONCOTREE_CODE（'IMMC': 18, 'BREAST': 12）
- HER2_SNP6（'UNDEF': 3）
- CANCER_TYPE_DETAILED（'Breast Invasive Mixed Mucinous Carcinoma': 18, 'Breast': 12）
- TUMOR_STAGE（4.0: 8, 0.0: 1）


## 特徴量選択手法でそもそも上記の特徴量が必要そうかを確認してみる？

sklearn　特徴量選択手法｜https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection


とりあえずリストワイズ法を適用

In [13]:
df_MB = dropna_nper(df_MB, percent=0.05)

除外データ数： 99
nullを含むデータ数がnum個以下のcolumnsを保有するデータのインデックス： Int64Index([   0,    7,    9,   21,   27,   34,   41,   42,   58,   81,
            ...
            1870, 1871, 1876, 1899, 1906, 1918, 1923, 1933, 1938, 1944],
           dtype='int64', length=271)


# 目的変数の生成

元のdfにはない目的変数カラム（5年後の生存の有無）を生成する。

## 目的変数｜n年後の予後の2値分類
n年後の予後を2値分類する。  
そのためにVITAL_STATUSとOS_MONTHを利用する。  
以下のフローチャートで生成する。  

In [14]:
HTML(
    '<div class="mxgraph" style="max-width:100%;border:1px solid transparent;" data-mxgraph="{&quot;highlight&quot;:&quot;#0000ff&quot;,&quot;nav&quot;:true,&quot;resize&quot;:true,&quot;toolbar&quot;:&quot;zoom layers tags lightbox&quot;,&quot;edit&quot;:&quot;_blank&quot;,&quot;xml&quot;:&quot;&lt;mxfile host=\&quot;Electron\&quot; modified=\&quot;2022-05-01T07:33:42.0.405Z\&quot; agent=\&quot;5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) draw.io/17.2.0.2 Chrome/100.0.4896.60 Electron/18.0.1 Safari/537.36\&quot; etag=\&quot;CUa6MuG2A-EYqdbBh0ys\&quot; version=\&quot;17.2.0.2\&quot; type=\&quot;device\&quot;&gt;&lt;diagram id=\&quot;C5RBs43oDa-KdzZeNtuy\&quot; name=\&quot;Page-1\&quot;&gt;7VhZc5swEP41TJ7a4Yix/RjbcZI2SduBJNOnjAIyqBasK4SP/vpKQTIQXB9tncMTP3jYZVlJ+317gOH0k/kZQ5P4CkJMDdsM54YzMGy7ZZniXyoWhcLtOoUiYiQsVFap8MgvrJTquSgnIc5qhhyAcjKpKwNIUxzwmg4xBrO62QhofdUJinBD4QWINrV3JORxoe3Y7VJ/jkkU65Utt1vcSZA2VifJYhTCrKJyTg2nzwB4cZXM+5jK2Om43F0s7ujl2D379C37iW56n/3r2w+Fs+EujyyPwHDK/69ru3A9RTRX8VJn5QsdQAZ5GmLpxDScXswTKi4tcfkDc75QgKOcg1AB4zFEkCJ6CTBRdiNIuTKzpIzT8EQCK+QHCsG4UA0JpWoNISn7jpAyzmC8xE46WAIhjSl6wLSHgnH0uNE+UGDiVgoplq5CQQZ1lnJzp6W2t2VsFQYZ5CzAa+xUcnDEIrzOn1vYyf1VeKqQO8OQYM4WwoBhijiZ1smMVE5ES7sSd3GhoN+BBk6DBuHIaPVuL/yTy3vPP/FvPMMWrs0v3v3Vl2v/3DNagzVMkSjNYsKxN0GP4ZqJ2lJnT5UV4uC9iKIsU5hugHw3yKaYcTxfG2R911WZrkpdR4mzsm5YuhjElZpxbO4JluMGLN9FKX1P0H9JUHfLBNX9a2OGKrJoYmydsMrTVyDiXKUJjEaZ2NhT6iwX/Hs2tRpsuoYGmTbj8abp9kJkau/GJevVc8ltcKnsDDITXJTIek+5jLm8b+raWi1cMSQPeba5V9QAlvQZooRQGaxzTKeYkwCt6CiIkigVQiCwxmw1c8SSJI2E5JaS/8hUUXz32Gna9U6zlKutxlzRajr7ajXt1ROAYrZs9maB4xDRDB9c63fsJ4B0X7r3d7ap1m+6Fj9769dldXPvt7as14o95kdH/GoEev3jQHer4fJ9HtgPw+xDmwj0Dit8qr8+qv5xNCCCLLYJI/E3IBlGGT56Hw3+OBrYqzrRs44GuhpWZwMmAnFoM8DT1/9j56VnAF0lNk9lPssPfyhzrP0BIsTyS25RE8vP4c7pbw==&lt;/diagram&gt;&lt;/mxfile&gt;&quot;}"></div><script type="text/javascript" src="https://viewer.diagrams.net/js/viewer-static.min.js"></script>'
)

In [15]:
def make_target(df: pd.DataFrame(), year: int = 5) -> pd.DataFrame():
    df = df.copy()  # 引数dfの更新を防ぐ（pythonの関数は参照渡し）
    target_name = "OS_" + str(year) + "years"
    df[target_name] = np.nan
    df[target_name] = df[target_name].mask(df["OS_MONTHS"] > year * 12, False)
    df[target_name] = df[target_name].mask(
        (df["OS_MONTHS"] <= year * 12) & (df["VITAL_STATUS"] == "Died of Disease"),
        True,
    )
    # targetを定義できなかったデータは除く
    df = df[df[target_name].notnull()]
    return df

In [16]:
def len_list_columns() -> None:
    size = 0
    for lis in list_columns:
        size += len(lis)
    print("columns size: ", size)

In [17]:
def drop_columns_in_dict(
    dict_df: pd.DataFrame(), drop_columns: list()
) -> pd.DataFrame():
    print("before drop")
    len_list_columns()

    for c in drop_columns:
        for k in dict_df:
            if c in dict_df[k]:
                dict_df[k] = dict_df[k].drop(c, axis=1)
        for lis in list_columns:
            if c in lis:
                lis.remove(c)
    size = 0
    print("after drop")
    len_list_columns()
    return  # pythonの関数の引数は参照渡しなのでdfオブジェクトを生成してreturnしなくて良い（引数オブジェクトが変わる）

## 5,10,15年後の予後についての目的変数の生成

In [18]:
df_os_5 = make_target(df_MB, year=5)
df_os_10 = make_target(df_MB, year=10)
df_os_15 = make_target(df_MB, year=15)

dict_X_os = {
    "os_5": df_os_5.drop("OS_5years", axis=1),
    "os_10": df_os_10.drop("OS_10years", axis=1),
    "os_15": df_os_15.drop("OS_15years", axis=1),
}

dict_y_os = {
    "os_5": df_os_5["OS_5years"].astype(bool),
    "os_10": df_os_10["OS_10years"].astype(bool),
    "os_15": df_os_15["OS_15years"].astype(bool),
}
drop_columns_in_dict(dict_X_os, ["OS_MONTHS", "OS_STATUS", "VITAL_STATUS"])

for k in dict_X_os:
    print(k)
    print(dict_X_os[k].shape)
    print(dict_y_os[k].shape)

before drop
columns size:  36
after drop
columns size:  33
os_5
(1583, 29)
(1583,)
os_10
(1282, 29)
(1282,)
os_15
(983, 29)
(983,)


# 特徴量の削除

nullの多い特徴量
- THREEGENE
- LATERALITY
- TUMOR_STAGE

今回はcolumnごと削除する

In [19]:
drop_columns_in_dict(dict_X_os, ["THREEGENE", "LATERALITY", "TUMOR_STAGE"])
for k in dict_X_os:
    print(k)
    print(dict_X_os[k].shape)
    print(dict_y_os[k].shape)

before drop
columns size:  33
after drop
columns size:  30
os_5
(1583, 26)
(1583,)
os_10
(1282, 26)
(1282,)
os_15
(983, 26)
(983,)


# データ前処理

## null値の補完

null値をsklearnのImputeクラスを中心に補完する（https://scikit-learn.org/stable/modules/classes.html#module-sklearn.impute）  
Imputerクラスを使用するのは、学習前にtrainとtestで分割するので、そのとき平均値などでリークを起こさないようにするため  
Imputerクラスは数値情報にしか対応していないので、クラスラベルは置き換え必須

In [20]:
"""
def impute(df_train: pd.DataFrame, df_test: pd.DataFrame, how: str = "mean"):
    if how == "mean":
        imp_mean = SimpleImputer(strategy="mean")
        imp_mean.fit(df_train)
        df_train = pd.DataFrame(
            imp_mean.transform(df_train), columns=df_train.columns, index=df_train.index
        )
        df_test = pd.DataFrame(
            imp_mean.transform(df_test), columns=df_test.columns, index=df_test.index
        )
    elif how == "iter":
        imp_iter = IterativeImputer()
        imp_iter.fit(df_train)
        df_train = pd.DataFrame(
            imp_iter.transform(df_train), columns=df_train.columns, index=df_train.index
        )
        df_test = pd.DataFrame(
            imp_iter.transform(df_test), columns=df_test.columns, index=df_test.index
        )
    elif how == "knn":
        imp_knn = KNNImputer(n_neighbors=5)
        imp_knn.fit(df_train)
        df_train = pd.DataFrame(
            imp_knn.transform(df_train), columns=df_train.columns, index=df_train.index
        )
        df_test = pd.DataFrame(
            imp_knn.transform(df_test), columns=df_test.columns, index=df_test.index
        )
    else:
        print("how is not defined!")
    return df_train, df_test
"""
pass

impute()の動作検証

In [21]:
"""
imputed_column = "THREEGENE"
impute(df_MB_train[imputed_column], df_MB_test[imputed_column], how="knn")
"""
pass

## 特徴量のエンコーディング
順序尺度特徴量とboolean特徴量について、データを扱いやすい形に変換する。

### 質的変数（順序尺度）・boolのエンコーディング
num_cat_columnsは、順序のあるstrの変数で構成されている。  
そこで大小関係に対応するようstrをintに変換する。  
また、boolに類する特徴量に関しても、bool型ではないので、手動で変換する。

In [22]:
def encode_dtypes(df: pd.DataFrame()):
    df = df.copy()
    # cat
    df["CELLULARITY"] = (
        df["CELLULARITY"].replace({"High": 3, "Moderate": 2, "Low": 1}).astype(int)
    )
    df["ER_IHC"] = df["ER_IHC"].replace({"Positve": 1, "Negative": 0}).astype(int)
    df["HER2_SNP6"] = (
        df["HER2_SNP6"]
        .replace({"GAIN": 3, "NEUTRAL": 2, "LOSS": 1, "UNDEF": 0})
        .astype(int)
    )
    df["INFERRED_MENOPAUSAL_STATE"] = (
        df["INFERRED_MENOPAUSAL_STATE"].replace({"Post": 1, "Pre": 0}).astype(int)
    )
    df["ER_STATUS"] = (
        df["ER_STATUS"].replace({"Positive": 1, "Negative": 0}).astype(int)
    )
    df["HER2_STATUS"] = (
        df["HER2_STATUS"].replace({"Positive": 1, "Negative": 0}).astype(int)
    )
    df["GRADE"] = df["GRADE"].astype(int)
    df["PR_STATUS"] = (
        df["PR_STATUS"].replace({"Positive": 1, "Negative": 0}).astype(int)
    )
    # bool
    df["CHEMOTHERAPY"] = df["CHEMOTHERAPY"].replace({"YES": 1, "NO": 0}).astype(int)
    df["HORMONE_THERAPY"] = (
        df["HORMONE_THERAPY"].replace({"YES": 1, "NO": 0}).astype(int)
    )
    df["RADIO_THERAPY"] = df["RADIO_THERAPY"].replace({"YES": 1, "NO": 0}).astype(int)
    df["RFS_STATUS"] = (
        df["RFS_STATUS"].replace({"1:Recurred": 1, "0:Not Recurred": 0}).astype(int)
    )
    return df

In [23]:
for k in dict_X_os:
    dict_X_os[k] = encode_dtypes(dict_X_os[k])

# 癌に影響する要因のグループ分け

乳がんに影響する要素
- CLAUDIN_SUBTYPE

ここから再開

In [38]:
dict_X_os[k].CLAUDIN_SUBTYPE.unique()

array(['LumB', 'Her2', 'LumA', 'Basal', 'claudin-low', 'Normal', 'NC'],
      dtype=object)

## Onehot encoding
上記で作成した一部特徴量を修正したdfを元に、onehot encodingを行う。
onehot encoding自体は名義尺度の質的変数カラムであるstr_cat_columnsのみに対してのみ実施するが、最終的に特徴量の型の修正はモデルへ入力する際に必要になるので、df_MB_dtype_encodedを元にonehot encodingを実施する。    

In [24]:
# 多重共線性回避のために、drop='first'
def onehot_encode(df: pd.DataFrame(), encoding_columns: list()) -> pd.DataFrame():
    onehot_encoder = OneHotEncoder()
    onehot_encoder.fit(df[encoding_columns])
    df_onehot = pd.DataFrame(
        onehot_encoder.transform(df[encoding_columns]).toarray(),
        columns=onehot_encoder.get_feature_names_out(encoding_columns),
    )
    df_onehot = df_onehot.astype(int)
    return df_onehot

In [31]:
for k in dict_X_os:
    dict_X_os[k] = encode_dtypes(dict_X_os[k])
    dict_X_os[k] = pd.merge(
        # インデックスを参照してmergeするが、サンプルの削除によってインデックスがずれている
        # そのため、インデックスをここでリセットする
        dict_X_os[k].reset_index(drop=True).drop(str_cat_columns, axis=True),
        # Ordinal Encoderによって返るdf
        onehot_encode(dict_X_os[k], str_cat_columns),
        right_index=True,
        left_index=True,
    )

y_reindex = y.reset_index(drop=True)

KeyError: "['COHORT', 'INTCLUST', 'CLAUDIN_SUBTYPE', 'HISTOLOGICAL_SUBTYPE', 'BREAST_SURGERY', 'CANCER_TYPE', 'CANCER_TYPE_DETAILED', 'ONCOTREE_CODE'] not found in axis"

In [None]:
encoded_columns = str_cat_columns

X_merged = pd.merge(
    X.reset_index(drop=True).drop(
        encoded_columns, axis=True
    ),  # 序盤のnullのdropでindexがずれているのでリセット
    X_onehot,
    right_index=True,
    left_index=True,
)

y_reindex = y.reset_index(drop=True)

# check(X_merged)

In [None]:
X.tail()

Unnamed: 0,LYMPH_NODES_EXAMINED_POSITIVE,NPI,CELLULARITY,COHORT,INFERRED_MENOPAUSAL_STATE,INTCLUST,AGE_AT_DIAGNOSIS,CLAUDIN_SUBTYPE,HISTOLOGICAL_SUBTYPE,GRADE,ONCOTREE_CODE,PR_STATUS,TMB_NONSYNONYMOUS,CANCER_TYPE,CANCER_TYPE_DETAILED,ER_IHC,HER2_SNP6,ER_STATUS,HER2_STATUS,TUMOR_SIZE,CHEMOTHERAPY,HORMONE_THERAPY,RADIO_THERAPY,BREAST_SURGERY,RFS_STATUS,RFS_MONTHS
1980,1.0,5.05,3,4.0,0,3,43.1,LumA,Lobular,3,ILC,1,5.230071,Breast Cancer,Breast Invasive Lobular Carcinoma,1,2,1,0,25.0,0,1,1,BREAST CONSERVING,0,194.28
1981,1.0,5.04,3,4.0,0,5,42.88,LumB,Ductal/NST,3,IDC,0,7.845106,Breast Cancer,Breast Invasive Ductal Carcinoma,1,3,1,1,20.0,0,0,1,MASTECTOMY,1,16.09
1982,45.0,6.05,3,4.0,1,1,62.9,LumB,Ductal/NST,3,IDC,1,5.230071,Breast Cancer,Breast Invasive Ductal Carcinoma,1,2,1,0,25.0,0,1,1,MASTECTOMY,1,121.18
1983,12.0,5.05,2,4.0,1,1,61.16,LumB,Ductal/NST,2,IDC,1,19.612766,Breast Cancer,Breast Invasive Ductal Carcinoma,1,2,1,0,25.0,0,1,0,MASTECTOMY,0,85.1
1984,1.0,5.04,3,4.0,1,10,60.02,LumB,Ductal/NST,3,IDC,0,3.922553,Breast Cancer,Breast Invasive Ductal Carcinoma,1,2,1,0,20.0,0,1,1,BREAST CONSERVING,0,199.24


In [None]:
X_merged.tail()

Unnamed: 0,LYMPH_NODES_EXAMINED_POSITIVE,NPI,CELLULARITY,INFERRED_MENOPAUSAL_STATE,AGE_AT_DIAGNOSIS,GRADE,PR_STATUS,TMB_NONSYNONYMOUS,ER_IHC,HER2_SNP6,ER_STATUS,HER2_STATUS,TUMOR_SIZE,CHEMOTHERAPY,HORMONE_THERAPY,RADIO_THERAPY,RFS_STATUS,RFS_MONTHS,COHORT_2.0,COHORT_3.0,COHORT_4.0,COHORT_5.0,INTCLUST_10,INTCLUST_2,INTCLUST_3,...,INTCLUST_6,INTCLUST_7,INTCLUST_8,INTCLUST_9,CLAUDIN_SUBTYPE_Her2,CLAUDIN_SUBTYPE_LumA,CLAUDIN_SUBTYPE_LumB,CLAUDIN_SUBTYPE_NC,CLAUDIN_SUBTYPE_Normal,CLAUDIN_SUBTYPE_claudin-low,HISTOLOGICAL_SUBTYPE_Lobular,HISTOLOGICAL_SUBTYPE_Medullary,HISTOLOGICAL_SUBTYPE_Mixed,HISTOLOGICAL_SUBTYPE_Mucinous,HISTOLOGICAL_SUBTYPE_Other,HISTOLOGICAL_SUBTYPE_Tubular/ cribriform,BREAST_SURGERY_MASTECTOMY,CANCER_TYPE_DETAILED_Breast Invasive Ductal Carcinoma,CANCER_TYPE_DETAILED_Breast Invasive Lobular Carcinoma,CANCER_TYPE_DETAILED_Breast Invasive Mixed Mucinous Carcinoma,CANCER_TYPE_DETAILED_Breast Mixed Ductal and Lobular Carcinoma,ONCOTREE_CODE_IDC,ONCOTREE_CODE_ILC,ONCOTREE_CODE_IMMC,ONCOTREE_CODE_MDLC
1578,1.0,5.05,3,0,43.1,3,1,5.230071,1,2,1,0,25.0,0,1,1,0,194.28,0,0,1,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0
1579,1.0,5.04,3,0,42.88,3,0,7.845106,1,3,1,1,20.0,0,0,1,1,16.09,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0
1580,45.0,6.05,3,1,62.9,3,1,5.230071,1,2,1,0,25.0,0,1,1,1,121.18,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0
1581,12.0,5.05,2,1,61.16,2,1,19.612766,1,2,1,0,25.0,0,1,0,0,85.1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0
1582,1.0,5.04,3,1,60.02,3,0,3.922553,1,2,1,0,20.0,0,1,1,0,199.24,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0


In [38]:
y.tail(), y_reindex.tail()

(1980    False
 1981     True
 1982    False
 1983    False
 1984    False
 Name: target_OS_5years, dtype: bool,
 1578    False
 1579     True
 1580    False
 1581    False
 1582    False
 Name: target_OS_5years, dtype: bool)

In [39]:
# スケーリング処理が必要なデータ候補
X[int_columns + float_columns].head()

Unnamed: 0,LYMPH_NODES_EXAMINED_POSITIVE,RFS_MONTHS,NPI,AGE_AT_DIAGNOSIS,TUMOR_SIZE,TMB_NONSYNONYMOUS
1,0.0,83.52,4.02,43.19,10.0,2.615035
2,1.0,151.28,4.03,48.87,15.0,2.615035
3,3.0,162.76,4.05,47.68,25.0,1.307518
4,8.0,18.55,6.08,76.97,40.0,2.615035
5,0.0,2.89,4.062,78.77,31.0,5.230071


In [40]:
X_train, X_test, y_train, y_test = train_test_split(
    X_merged, y_reindex, random_state=SEED
)
pass

In [41]:
X_train.tail()

Unnamed: 0,LYMPH_NODES_EXAMINED_POSITIVE,NPI,CELLULARITY,INFERRED_MENOPAUSAL_STATE,AGE_AT_DIAGNOSIS,GRADE,PR_STATUS,TMB_NONSYNONYMOUS,ER_IHC,HER2_SNP6,ER_STATUS,HER2_STATUS,TUMOR_SIZE,CHEMOTHERAPY,HORMONE_THERAPY,RADIO_THERAPY,RFS_STATUS,RFS_MONTHS,COHORT_2.0,COHORT_3.0,COHORT_4.0,COHORT_5.0,INTCLUST_10,INTCLUST_2,INTCLUST_3,...,INTCLUST_6,INTCLUST_7,INTCLUST_8,INTCLUST_9,CLAUDIN_SUBTYPE_Her2,CLAUDIN_SUBTYPE_LumA,CLAUDIN_SUBTYPE_LumB,CLAUDIN_SUBTYPE_NC,CLAUDIN_SUBTYPE_Normal,CLAUDIN_SUBTYPE_claudin-low,HISTOLOGICAL_SUBTYPE_Lobular,HISTOLOGICAL_SUBTYPE_Medullary,HISTOLOGICAL_SUBTYPE_Mixed,HISTOLOGICAL_SUBTYPE_Mucinous,HISTOLOGICAL_SUBTYPE_Other,HISTOLOGICAL_SUBTYPE_Tubular/ cribriform,BREAST_SURGERY_MASTECTOMY,CANCER_TYPE_DETAILED_Breast Invasive Ductal Carcinoma,CANCER_TYPE_DETAILED_Breast Invasive Lobular Carcinoma,CANCER_TYPE_DETAILED_Breast Invasive Mixed Mucinous Carcinoma,CANCER_TYPE_DETAILED_Breast Mixed Ductal and Lobular Carcinoma,ONCOTREE_CODE_IDC,ONCOTREE_CODE_ILC,ONCOTREE_CODE_IMMC,ONCOTREE_CODE_MDLC
53,6.0,6.052,3,1,79.73,3,1,6.537589,1,2,1,0,26.0,0,1,1,1,116.38,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
350,1.0,5.042,2,1,85.12,3,1,3.922553,1,3,1,1,21.0,0,1,0,1,95.36,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0
79,6.0,5.042,3,1,56.45,2,1,6.537589,1,2,1,0,21.0,1,1,1,0,90.39,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0
792,1.0,5.074,2,1,50.08,3,0,19.612766,0,3,0,1,37.0,1,0,1,1,29.67,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
1544,1.0,3.024,2,1,54.81,1,1,13.075177,1,2,1,0,12.0,0,0,1,0,181.74,0,0,1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0


In [42]:
X_test.tail()

Unnamed: 0,LYMPH_NODES_EXAMINED_POSITIVE,NPI,CELLULARITY,INFERRED_MENOPAUSAL_STATE,AGE_AT_DIAGNOSIS,GRADE,PR_STATUS,TMB_NONSYNONYMOUS,ER_IHC,HER2_SNP6,ER_STATUS,HER2_STATUS,TUMOR_SIZE,CHEMOTHERAPY,HORMONE_THERAPY,RADIO_THERAPY,RFS_STATUS,RFS_MONTHS,COHORT_2.0,COHORT_3.0,COHORT_4.0,COHORT_5.0,INTCLUST_10,INTCLUST_2,INTCLUST_3,...,INTCLUST_6,INTCLUST_7,INTCLUST_8,INTCLUST_9,CLAUDIN_SUBTYPE_Her2,CLAUDIN_SUBTYPE_LumA,CLAUDIN_SUBTYPE_LumB,CLAUDIN_SUBTYPE_NC,CLAUDIN_SUBTYPE_Normal,CLAUDIN_SUBTYPE_claudin-low,HISTOLOGICAL_SUBTYPE_Lobular,HISTOLOGICAL_SUBTYPE_Medullary,HISTOLOGICAL_SUBTYPE_Mixed,HISTOLOGICAL_SUBTYPE_Mucinous,HISTOLOGICAL_SUBTYPE_Other,HISTOLOGICAL_SUBTYPE_Tubular/ cribriform,BREAST_SURGERY_MASTECTOMY,CANCER_TYPE_DETAILED_Breast Invasive Ductal Carcinoma,CANCER_TYPE_DETAILED_Breast Invasive Lobular Carcinoma,CANCER_TYPE_DETAILED_Breast Invasive Mixed Mucinous Carcinoma,CANCER_TYPE_DETAILED_Breast Mixed Ductal and Lobular Carcinoma,ONCOTREE_CODE_IDC,ONCOTREE_CODE_ILC,ONCOTREE_CODE_IMMC,ONCOTREE_CODE_MDLC
585,6.0,5.07,3,0,21.93,2,0,6.537589,0,3,0,1,35.0,1,0,1,1,33.06,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1
979,0.0,4.026,1,0,48.76,3,1,11.767659,1,3,1,1,13.0,0,0,1,0,79.44,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
110,0.0,2.022,1,1,67.83,1,1,1.307518,1,2,1,0,11.0,0,1,0,0,147.76,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
198,1.0,5.03,3,1,60.71,3,1,3.922553,1,1,1,0,15.0,0,1,1,0,62.66,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
67,1.0,3.052,1,1,83.35,1,0,3.922553,1,2,1,0,26.0,0,1,1,0,92.14,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,1,0,0,0


In [43]:
y_train.tail(), y_test.tail()

(53      False
 350     False
 79      False
 792      True
 1544    False
 Name: target_OS_5years, dtype: bool,
 585     True
 979    False
 110    False
 198    False
 67     False
 Name: target_OS_5years, dtype: bool)

# 前処理後のdfの保存

In [46]:
# save
make_dir(config.INTERIM_PICKLE_PREPROCESSED_PROGNOSIS_CLINICAL_DIR)
X_merged.to_pickle(config.INTERIM_PICKLE_PREPROCESSED_PROGNOSIS_CLINICAL_DIR + "/X.pkl")
y.to_pickle(config.INTERIM_PICKLE_PREPROCESSED_PROGNOSIS_CLINICAL_DIR + "/y.pkl")


X_train.to_pickle(
    config.INTERIM_PICKLE_PREPROCESSED_PROGNOSIS_CLINICAL_DIR + "/X_train.pkl"
)
X_test.to_pickle(
    config.INTERIM_PICKLE_PREPROCESSED_PROGNOSIS_CLINICAL_DIR + "/X_test.pkl"
)

y_train.to_pickle(
    config.INTERIM_PICKLE_PREPROCESSED_PROGNOSIS_CLINICAL_DIR + "/y_train.pkl"
)
y_test.to_pickle(
    config.INTERIM_PICKLE_PREPROCESSED_PROGNOSIS_CLINICAL_DIR + "/y_test.pkl"
)