In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.experimental import (
    enable_iterative_imputer,
)  # IterativeImputerをimportするために必要
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder


from seaborn_analyzer import CustomPairPlot
import seaborn as sns

from tqdm import tqdm
from IPython.display import HTML


import config

# 最大表示列数の指定（ここでは50列を指定）
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 50)
pd.set_option("display.width", 2000)

In [2]:
def make_dir(dir_name: str):
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)


# 特徴量チェック
def check(df):
    col_list = df.columns.values  # 列名を取得
    row = []
    for col in col_list:
        unique = ""
        value_counts = ""
        if df[col].nunique() < 12:
            unique = df[col].unique()
            value_counts = df[col].value_counts().to_dict()
        tmp = (
            col,  # 列名
            df[col].dtypes,  # データタイプ
            df[col].isnull().sum(),  # null数
            df[col].count(),  # データ数 (欠損値除く)
            df[col].nunique(),  # ユニーク値の数 (欠損値除く)
            unique,  # ユニーク値
            value_counts,  # ユニーク値のそれぞれの個数
        )
        row.append(tmp)  # tmpを順次rowに保存
    df = pd.DataFrame(row)  # rowをデータフレームの形式に変換
    df.columns = [
        "feature",
        "dtypes",
        "nan",
        "count",
        "num_unique",
        "unique",
        "unique_counts",
    ]  # データフレームの列名指定
    # unique_countsの中身確認のために横幅拡張
    d = dict(selector=".col8", props=[("min-width", "200px")])  # name
    # display(df.style.set_table_styles([d]))
    # display(df)
    return df.style.set_table_styles([d])

# basic process

基本的なプロセス  
2つのデータを読み込み、PATIENT_IDカラムをキーとして結合する  
その後、MB\~とMTX-T\~でPATIENT_IDが別れているので、分割し、各データを保存する

In [3]:
df_patient = pd.read_table(
    config.RAW_BRCA_METABRIC_DIR + "/data_clinical_patient.txt", header=4
)
df_sample = pd.read_table(
    config.RAW_BRCA_METABRIC_DIR + "/data_clinical_sample.txt", header=4
)

In [4]:
df_patient.shape, df_sample.shape

((2509, 24), (2509, 13))

In [5]:
df_merged = pd.merge(df_patient, df_sample, on="PATIENT_ID")
check(df_merged)

Unnamed: 0,feature,dtypes,nan,count,num_unique,unique,unique_counts
0,PATIENT_ID,object,0,2509,2509,,
1,LYMPH_NODES_EXAMINED_POSITIVE,float64,266,2243,32,,
2,NPI,float64,222,2287,436,,
3,CELLULARITY,object,592,1917,3,[nan 'High' 'Moderate' 'Low'],"{'High': 965, 'Moderate': 737, 'Low': 215}"
4,CHEMOTHERAPY,object,529,1980,2,['NO' 'YES' nan],"{'NO': 1568, 'YES': 412}"
5,COHORT,float64,11,2498,9,[ 1. 2. 3. 5. 4. 9. 7. 6. nan 8.],"{1.0: 809, 3.0: 763, 2.0: 288, 4.0: 238, 5.0: 170, 7.0: 105, 8.0: 82, 9.0: 40, 6.0: 3}"
6,ER_IHC,object,83,2426,2,['Positve' 'Negative' nan],"{'Positve': 1817, 'Negative': 609}"
7,HER2_SNP6,object,529,1980,4,['NEUTRAL' 'LOSS' nan 'GAIN' 'UNDEF'],"{'NEUTRAL': 1436, 'GAIN': 438, 'LOSS': 101, 'UNDEF': 5}"
8,HORMONE_THERAPY,object,529,1980,2,['YES' 'NO' nan],"{'YES': 1216, 'NO': 764}"
9,INFERRED_MENOPAUSAL_STATE,object,529,1980,2,['Post' 'Pre' nan],"{'Post': 1556, 'Pre': 424}"


## カラムの順序変更（読みやすさのため）

In [6]:
def align_columns(df: pd.DataFrame, regex: str):
    # まとめたいcolumnの正規表現を一時退避
    df_copy = df.copy()
    df_tmp = df_copy.filter(regex=regex)
    # 元のdfから落とす
    df_copy.drop(df_tmp.columns, axis=1, inplace=True)
    # 元のdfに結合
    return pd.merge(df_copy, df_tmp, right_index=True, left_index=True)

In [7]:
check(align_columns(df_merged, ".*_THERAPY|BREAST_SURGERY"))

Unnamed: 0,feature,dtypes,nan,count,num_unique,unique,unique_counts
0,PATIENT_ID,object,0,2509,2509,,
1,LYMPH_NODES_EXAMINED_POSITIVE,float64,266,2243,32,,
2,NPI,float64,222,2287,436,,
3,CELLULARITY,object,592,1917,3,[nan 'High' 'Moderate' 'Low'],"{'High': 965, 'Moderate': 737, 'Low': 215}"
4,CHEMOTHERAPY,object,529,1980,2,['NO' 'YES' nan],"{'NO': 1568, 'YES': 412}"
5,COHORT,float64,11,2498,9,[ 1. 2. 3. 5. 4. 9. 7. 6. nan 8.],"{1.0: 809, 3.0: 763, 2.0: 288, 4.0: 238, 5.0: 170, 7.0: 105, 8.0: 82, 9.0: 40, 6.0: 3}"
6,ER_IHC,object,83,2426,2,['Positve' 'Negative' nan],"{'Positve': 1817, 'Negative': 609}"
7,HER2_SNP6,object,529,1980,4,['NEUTRAL' 'LOSS' nan 'GAIN' 'UNDEF'],"{'NEUTRAL': 1436, 'GAIN': 438, 'LOSS': 101, 'UNDEF': 5}"
8,INFERRED_MENOPAUSAL_STATE,object,529,1980,2,['Post' 'Pre' nan],"{'Post': 1556, 'Pre': 424}"
9,SEX,object,0,2509,1,['Female'],{'Female': 2509}


In [8]:
df_merged.filter(regex="^ER|^HER2")

Unnamed: 0,ER_IHC,HER2_SNP6,ER_STATUS,HER2_STATUS
0,Positve,NEUTRAL,Positive,Negative
1,Positve,NEUTRAL,Positive,Negative
2,Positve,NEUTRAL,Positive,Negative
3,Positve,NEUTRAL,Positive,Negative
4,Positve,NEUTRAL,Positive,Negative
...,...,...,...,...
2504,Positve,,Positive,
2505,Positve,,Positive,
2506,,,,
2507,,,,


In [9]:
# 癌の種類
df_merged = align_columns(df_merged, "^CANCER_")
# 重要そう（直感）な特徴量
df_merged = align_columns(df_merged, "^ER_|^HER2_|^TUMOR_")
# 治療の種類
df_merged = align_columns(df_merged, ".*_THERAPY$|^BREAST_SURGERY")
# target系の種類（OS, RFS, VITAL）
df_merged = align_columns(df_merged, "^OS_.*|^RFS_.*|^VITAL_.*")

check(df_merged)

Unnamed: 0,feature,dtypes,nan,count,num_unique,unique,unique_counts
0,PATIENT_ID,object,0,2509,2509,,
1,LYMPH_NODES_EXAMINED_POSITIVE,float64,266,2243,32,,
2,NPI,float64,222,2287,436,,
3,CELLULARITY,object,592,1917,3,[nan 'High' 'Moderate' 'Low'],"{'High': 965, 'Moderate': 737, 'Low': 215}"
4,CHEMOTHERAPY,object,529,1980,2,['NO' 'YES' nan],"{'NO': 1568, 'YES': 412}"
5,COHORT,float64,11,2498,9,[ 1. 2. 3. 5. 4. 9. 7. 6. nan 8.],"{1.0: 809, 3.0: 763, 2.0: 288, 4.0: 238, 5.0: 170, 7.0: 105, 8.0: 82, 9.0: 40, 6.0: 3}"
6,INFERRED_MENOPAUSAL_STATE,object,529,1980,2,['Post' 'Pre' nan],"{'Post': 1556, 'Pre': 424}"
7,SEX,object,0,2509,1,['Female'],{'Female': 2509}
8,INTCLUST,object,529,1980,11,['4ER+' '3' '9' '7' '4ER-' nan '5' '8' '10' '1' '2' '6'],"{'8': 299, '3': 290, '4ER+': 260, '10': 226, '7': 190, '5': 190, '9': 146, '1': 139, '6': 85, '4ER-': 83, '2': 72}"
9,AGE_AT_DIAGNOSIS,float64,11,2498,1843,,


In [10]:
# 重複カラム確認
# df_merged.T.duplicated()
# SAMPLE_IDのみ（PATIENT_IDと重複している）

In [11]:
# データの型の定義（適用する前処理が大きく異なるため）

int_columns = [
    # patient
    "LYMPH_NODES_EXAMINED_POSITIVE",
    "OS_MONTHS",
    "RFS_MONTHS",
]

float_columns = [
    # patient
    "NPI",
    "AGE_AT_DIAGNOSIS",
    # sample
    "TUMOR_SIZE",
    "TMB_NONSYNONYMOUS",
]

str_columns = []

# 質的変数（順序尺度）
num_cat_columns = [
    # patient
    "CELLULARITY",
    "ER_IHC",
    "HER2_SNP6",
    "INFERRED_MENOPAUSAL_STATE",
    # sample
    "ER_STATUS",
    "HER2_STATUS",
    "GRADE",
    "PR_STATUS",
    "TUMOR_STAGE",
]
# 質的変数（名義尺度）
str_cat_columns = [
    # patient
    "COHORT",
    "INTCLUST",
    "CLAUDIN_SUBTYPE",
    "THREEGENE",
    "HISTOLOGICAL_SUBTYPE",
    "BREAST_SURGERY",
    "LATERALITY",
    "VITAL_STATUS",
    # sample
    "CANCER_TYPE",
    "CANCER_TYPE_DETAILED",
    "ONCOTREE_CODE",
]

bool_columns = [
    # patient
    "CHEMOTHERAPY",
    "HORMONE_THERAPY",
    "RADIO_THERAPY",
    "OS_STATUS",
    "RFS_STATUS",
]

meanless_columns = [
    # patient
    "PATIENT_ID",
    "SEX",
    # sample
    #'PATIENT_ID',
    "SAMPLE_ID",
    "SAMPLE_TYPE",
]

print(
    df_merged.shape[1],
    len(int_columns)
    + len(float_columns)
    + len(ｓｔｒ_columns)
    + len(num_cat_columns)
    + len(str_cat_columns)
    + len(bool_columns)
    + len(meanless_columns),
)
assert df_merged.shape[1] == len(int_columns) + len(float_columns) + len(
    ｓｔｒ_columns
) + len(num_cat_columns) + len(str_cat_columns) + len(bool_columns) + len(
    meanless_columns
), "lack or too much columns"

36 36


In [12]:
# データを大きく2つに分割できるので、ここで分割
df_MB = df_merged[df_merged["PATIENT_ID"].str.contains("MB")]
df_MTST = df_merged[df_merged["PATIENT_ID"].str.contains("MTS-T")]
df_MB.shape, df_MTST.shape

((1985, 36), (524, 36))

In [13]:
# save
make_dir(config.INTERIM_PICKLE_PREPROCESSED_DIR)
df_merged.to_pickle(config.INTERIM_PICKLE_PREPROCESSED_DIR + "/df_merged.pkl")
df_MB.to_pickle(config.INTERIM_PICKLE_PREPROCESSED_DIR + "/df_MB.pkl")
df_MTST.to_pickle(config.INTERIM_PICKLE_PREPROCESSED_DIR + "/df_MTST.pkl")

## 無意味な特徴量の削除

In [14]:
df_MB = pd.read_pickle(config.INTERIM_PICKLE_PREPROCESSED_DIR + "/df_MB.pkl")
df_MB.drop(meanless_columns, axis=1, inplace=True)

check(df_MB)
assert df_MB.shape[1] == len(int_columns) + len(float_columns) + len(ｓｔｒ_columns) + len(
    num_cat_columns
) + len(str_cat_columns) + len(bool_columns), "mistake drop columns"

In [15]:
def variance_threshold(X: pd.DataFrame(), columns: str, threshold: float):
    selector = VarianceThreshold(threshold=threshold)
    tmp_columns = columns.copy()
    selector.fit(X[tmp_columns])
    print("元の特徴量数：", len(tmp_columns), ", 選択後の特徴量数", sum(selector.get_support()))

In [16]:
# とりあえず単純に数値を持つ特徴量の分散を確認
# 後で他のカテゴリ特徴量などにも適用し、確認したい
df_MB_train, df_MB_test = train_test_split(df_MB, random_state=config.SEED)
variance_threshold(df_MB_train, int_columns, 0.1)
variance_threshold(df_MB_train, float_columns, 0.1)

元の特徴量数： 3 , 選択後の特徴量数 3
元の特徴量数： 4 , 選択後の特徴量数 4


## 不要なデータへの対処
https://www.codexa.net/missing_value_python/

null値への対処  
最初はnull値がそもそも少ない（例．データ量に対し、5%以下の量）のデータはそもそも取り除く（リストワイズ法）  

In [17]:
# データの総量5%以下のnull値は補完せずに削除する


def dropna_u5(df_original: pd.DataFrame()):
    df = df_original.copy()
    # nullを含むデータのインデックスを確認
    num = int(df.shape[0] * 0.05)
    print("除外データ数：", str(num))
    # df_MB[df_MB.isnull().any(axis=1)]
    # nullを含むデータ数がnum個以下のcolumnsを抽出（indexとなっているのはisnull().sum()で取り出した際、indexにcolumns名が来るため）
    columns_u5null = df.isnull().sum()[df.isnull().sum() < num].index
    df_u5null = df[columns_u5null]
    # nullを含むデータ数がnum個以下のデータの全てのindex
    index_u5null = df_u5null[df_u5null.isnull().any(axis=1)].index
    print("nullを含むデータ数がnum個以下のcolumnsを保有するデータのインデックス：", index_u5null)

    # nullを含むデータ数がnum個以下のcolumnsがなくなったか確認
    df.drop(index_u5null, inplace=True)
    return df

In [18]:
check(df_MB)

Unnamed: 0,feature,dtypes,nan,count,num_unique,unique,unique_counts
0,LYMPH_NODES_EXAMINED_POSITIVE,float64,76,1909,31,,
1,NPI,float64,1,1984,323,,
2,CELLULARITY,object,68,1917,3,[nan 'High' 'Moderate' 'Low'],"{'High': 965, 'Moderate': 737, 'Low': 215}"
3,CHEMOTHERAPY,object,5,1980,2,['NO' 'YES' nan],"{'NO': 1568, 'YES': 412}"
4,COHORT,float64,0,1985,5,[1. 2. 3. 5. 4.],"{3.0: 763, 1.0: 526, 2.0: 288, 4.0: 238, 5.0: 170}"
5,INFERRED_MENOPAUSAL_STATE,object,5,1980,2,['Post' 'Pre' nan],"{'Post': 1556, 'Pre': 424}"
6,INTCLUST,object,5,1980,11,['4ER+' '3' '9' '7' '4ER-' nan '5' '8' '10' '1' '2' '6'],"{'8': 299, '3': 290, '4ER+': 260, '10': 226, '7': 190, '5': 190, '9': 146, '1': 139, '6': 85, '4ER-': 83, '2': 72}"
7,AGE_AT_DIAGNOSIS,float64,0,1985,1624,,
8,CLAUDIN_SUBTYPE,object,5,1980,7,['claudin-low' 'LumA' 'LumB' 'Normal' nan 'Her2' 'Basal' 'NC'],"{'LumA': 700, 'LumB': 475, 'Her2': 224, 'claudin-low': 218, 'Basal': 209, 'Normal': 148, 'NC': 6}"
9,THREEGENE,object,221,1764,4,['ER-/HER2-' 'ER+/HER2- High Prolif' nan 'ER+/HER2- Low Prolif' 'HER2+'],"{'ER+/HER2- Low Prolif': 640, 'ER+/HER2- High Prolif': 617, 'ER-/HER2-': 309, 'HER2+': 198}"


In [19]:
check(dropna_u5(df_MB))

除外データ数： 99
nullを含むデータ数がnum個以下のcolumnsを保有するデータのインデックス： Int64Index([   0,    7,    9,   21,   27,   34,   41,   42,   58,   81,
            ...
            1870, 1871, 1876, 1899, 1906, 1918, 1923, 1933, 1938, 1944], dtype='int64', length=271)


Unnamed: 0,feature,dtypes,nan,count,num_unique,unique,unique_counts
0,LYMPH_NODES_EXAMINED_POSITIVE,float64,0,1714,31,,
1,NPI,float64,0,1714,276,,
2,CELLULARITY,object,0,1714,3,['High' 'Moderate' 'Low'],"{'High': 887, 'Moderate': 645, 'Low': 182}"
3,CHEMOTHERAPY,object,0,1714,2,['NO' 'YES'],"{'NO': 1347, 'YES': 367}"
4,COHORT,float64,0,1714,5,[1. 2. 3. 5. 4.],"{3.0: 661, 1.0: 424, 2.0: 260, 4.0: 214, 5.0: 155}"
5,INFERRED_MENOPAUSAL_STATE,object,0,1714,2,['Pre' 'Post'],"{'Post': 1339, 'Pre': 375}"
6,INTCLUST,object,0,1714,11,['4ER+' '3' '9' '7' '4ER-' '5' '8' '10' '1' '2' '6'],"{'8': 257, '3': 256, '4ER+': 209, '10': 207, '7': 167, '5': 166, '1': 126, '9': 123, '6': 75, '2': 67, '4ER-': 61}"
7,AGE_AT_DIAGNOSIS,float64,0,1714,1432,,
8,CLAUDIN_SUBTYPE,object,0,1714,7,['LumA' 'LumB' 'claudin-low' 'Her2' 'Normal' 'Basal' 'NC'],"{'LumA': 624, 'LumB': 423, 'Her2': 193, 'Basal': 184, 'claudin-low': 170, 'Normal': 115, 'NC': 5}"
9,THREEGENE,object,183,1531,4,['ER+/HER2- High Prolif' nan 'ER+/HER2- Low Prolif' 'ER-/HER2-' 'HER2+'],"{'ER+/HER2- High Prolif': 549, 'ER+/HER2- Low Prolif': 546, 'ER-/HER2-': 264, 'HER2+': 172}"


**リストワイズ法適用下で欠損値を保有する特徴量**
- THREEGENE
- LATERALITY
- TUMOR_STAGE

**リストワイズ法を適用する場合の変化（目視）**
- HISTOLOGICAL_SUBTYPEが8種類から7種類になった　。少数派の'Metaplastic'に関しては、1939データ中2件しかなかったため、削除して問題なさそう。
- CANCER_TYPEが2種類から1種類になった（要カラム削除）。少数派の'Breast Sarcoma'に関しては、1985データ中3件しかなかったため、削除して問題なさそう。
- CANCER_TYPE_DETAILが8種類から5種類に削減された。削除された3クラスの内、2クラス（）は元々2データしか存在しなかったため削除しても問題ないと考えられるが、'Invasive Breast Carcinoma'クラスに関しては元々44データ存在していたものが全て削除されている。したがって、何らかの共通した特徴量がnullになっていると予想される。
- CANCER_TYPE_DETAILの'Breast'クラスが17データから12データへと減っている。減ったデータ数自体は5件と少ないが、割合としては3割ほどとそこそこ大きく注意が必要。
- TUMOR_STAGEについて、0.0クラスの個数が著しく低下している。おそらく腫瘍がちいさいため、本格的な治療が実施されていないゆえにデータが集まっていないものと思われる。削除するかは考えたほうがよさそう。


In [20]:
# TUMOR_STAGEが0のクラスを観察
df_MB[df_MB["TUMOR_STAGE"] == 0.0]

Unnamed: 0,LYMPH_NODES_EXAMINED_POSITIVE,NPI,CELLULARITY,CHEMOTHERAPY,COHORT,INFERRED_MENOPAUSAL_STATE,INTCLUST,AGE_AT_DIAGNOSIS,CLAUDIN_SUBTYPE,THREEGENE,LATERALITY,HISTOLOGICAL_SUBTYPE,GRADE,ONCOTREE_CODE,PR_STATUS,TMB_NONSYNONYMOUS,CANCER_TYPE,CANCER_TYPE_DETAILED,ER_IHC,HER2_SNP6,ER_STATUS,HER2_STATUS,TUMOR_SIZE,TUMOR_STAGE,HORMONE_THERAPY,RADIO_THERAPY,BREAST_SURGERY,OS_MONTHS,OS_STATUS,VITAL_STATUS,RFS_STATUS,RFS_MONTHS
42,,2.14,Low,NO,1.0,Pre,3,45.73,claudin-low,,Left,,,BRCA,Negative,0.0,Breast Cancer,Invasive Breast Carcinoma,,NEUTRAL,Negative,Negative,70.0,0.0,NO,NO,BREAST CONSERVING,157.5,0:LIVING,Living,1:Recurred,12.17
84,,2.004,Low,NO,1.0,Post,4ER+,60.85,claudin-low,,Left,,1.0,BRCA,Negative,0.0,Breast Cancer,Invasive Breast Carcinoma,,NEUTRAL,Positive,Negative,2.0,0.0,NO,NO,BREAST CONSERVING,2.866667,0:LIVING,Living,0:Not Recurred,2.83
87,,2.046,Low,NO,1.0,Post,5,51.04,Basal,HER2+,Left,,,BRCA,Negative,0.0,Breast Cancer,Invasive Breast Carcinoma,,GAIN,Negative,Positive,23.0,0.0,NO,NO,MASTECTOMY,75.3,0:LIVING,Living,0:Not Recurred,74.31
145,0.0,1.07,,NO,1.0,Pre,5,41.98,Her2,HER2+,Left,Ductal/NST,,IDC,Negative,2.615035,Breast Cancer,Breast Invasive Ductal Carcinoma,,GAIN,Negative,Positive,35.0,0.0,NO,NO,MASTECTOMY,200.333333,0:LIVING,Living,0:Not Recurred,197.7
183,,1.0,,NO,1.0,Post,4ER+,52.79,claudin-low,,Left,,,PBS,Negative,0.0,Breast Sarcoma,Breast Angiosarcoma,,NEUTRAL,Positive,Negative,,0.0,NO,NO,MASTECTOMY,72.8,0:LIVING,Living,0:Not Recurred,71.84
198,,1.05,High,NO,1.0,Post,10,68.83,Basal,ER-/HER2-,Left,,,BRCA,Negative,0.0,Breast Cancer,Invasive Breast Carcinoma,,NEUTRAL,Negative,Negative,25.0,0.0,NO,YES,BREAST CONSERVING,188.133333,0:LIVING,Living,0:Not Recurred,185.66
346,0.0,1.022,High,NO,1.0,Post,7,59.34,LumB,ER+/HER2- High Prolif,Left,Lobular,,ILC,Positive,2.615035,Breast Cancer,Breast Invasive Lobular Carcinoma,Positve,NEUTRAL,Positive,Negative,11.0,0.0,NO,YES,BREAST CONSERVING,27.4,0:LIVING,Living,0:Not Recurred,27.04
389,0.0,2.0,Low,NO,1.0,Post,5,54.08,Her2,,,,,BRCA,Negative,9.152624,Breast Cancer,Invasive Breast Carcinoma,Negative,GAIN,Negative,Positive,,0.0,NO,NO,,2.5,0:LIVING,Living,0:Not Recurred,2.47
403,,1.04,Low,NO,1.0,Post,5,64.21,Basal,HER2+,Right,,,BRCA,Negative,0.0,Breast Cancer,Invasive Breast Carcinoma,,GAIN,Negative,Positive,20.0,0.0,NO,NO,MASTECTOMY,85.5,1:DECEASED,Died of Other Causes,0:Not Recurred,84.38
425,0.0,3.13,Moderate,NO,1.0,Post,7,76.22,claudin-low,ER+/HER2- Low Prolif,,Ductal/NST,2.0,IDC,Positive,0.0,Breast Cancer,Breast Invasive Ductal Carcinoma,Positve,LOSS,Positive,Negative,65.0,0.0,YES,YES,BREAST CONSERVING,163.2,1:DECEASED,Died of Other Causes,0:Not Recurred,161.05


**TUMOR_STAGEについて、欠損値が多かった特徴量(目視)**
※欠損値を取り除かない状況下で、該当するデータ数は12件
- LYMPH_NODES_EXAMINED_POSITIVE（8件）
- HISTOLOGICAL_SUBTYPE（9件）
- GRADE（10件）
- ER_IHC（8件）

何れもリストワイズ法適用下で欠損値を保有する特徴量ではない。  
▷今後作成予定の目的変数（5年後の予後）は半数以上のデータが目的変数を作成することができることからも、これらの特徴量に関しては、欠損値を補い固有のクラスターとしたほうが良い？

### 少数カテゴリのデータの取り扱い
※少数カテゴリの閾値をどうするかも課題
- 予測に良い影響を与えなさそう（過学習になるかも）
- 一方、削除してしまうと、以後そのカテゴリに属する人たちのデータを扱えなくなってしまう・・・  
「その他カテゴリ」のような集約カテゴリを設ける？

**少数カテゴリを保有する特徴量一覧（調査対象｜リストワイズ法を適用したdf_MB）**  
今回は直感的に少数だと感じることを選択基準とした
- CLAUDIN_SUBTYPE（'NC': 5）
- HISTOLOGICAL_SUBTYPE（'Medullary': 24, 'Tubular/ cribriform': 20, 'Mucinous': 18, 'Other': 12'）
- ONCOTREE_CODE（'IMMC': 18, 'BREAST': 12）
- HER2_SNP6（'UNDEF': 3）
- CANCER_TYPE_DETAILED（'Breast Invasive Mixed Mucinous Carcinoma': 18, 'Breast': 12）
- TUMOR_STAGE（4.0: 8, 0.0: 1）


## 特徴量選択手法でそもそも上記の特徴量が必要そうかを確認してみる？

sklearn　特徴量選択手法｜https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection


とりあえずリストワイズ法を適用

In [21]:
df_MB = dropna_u5(df_MB)

除外データ数： 99
nullを含むデータ数がnum個以下のcolumnsを保有するデータのインデックス： Int64Index([   0,    7,    9,   21,   27,   34,   41,   42,   58,   81,
            ...
            1870, 1871, 1876, 1899, 1906, 1918, 1923, 1933, 1938, 1944], dtype='int64', length=271)


# 目的変数の生成

元のdfにはない目的変数カラム（5年後の生存の有無）を生成する。

## 目的変数｜5年後の予後の2値分類
5年後の予後を2値分類する。  
そのためにVITAL_STATUSとOS_MONTHを利用する。  
以下のフローチャートで生成する。  

In [22]:
HTML(
    '<div class="mxgraph" style="max-width:100%;border:1px solid transparent;" data-mxgraph="{&quot;highlight&quot;:&quot;#0000ff&quot;,&quot;nav&quot;:true,&quot;resize&quot;:true,&quot;toolbar&quot;:&quot;zoom layers tags lightbox&quot;,&quot;edit&quot;:&quot;_blank&quot;,&quot;xml&quot;:&quot;&lt;mxfile host=\&quot;Electron\&quot; modified=\&quot;2022-05-01T07:33:42.0.405Z\&quot; agent=\&quot;5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) draw.io/17.2.0.2 Chrome/100.0.4896.60 Electron/18.0.1 Safari/537.36\&quot; etag=\&quot;CUa6MuG2A-EYqdbBh0ys\&quot; version=\&quot;17.2.0.2\&quot; type=\&quot;device\&quot;&gt;&lt;diagram id=\&quot;C5RBs43oDa-KdzZeNtuy\&quot; name=\&quot;Page-1\&quot;&gt;7VhZc5swEP41TJ7a4Yix/RjbcZI2SduBJNOnjAIyqBasK4SP/vpKQTIQXB9tncMTP3jYZVlJ+317gOH0k/kZQ5P4CkJMDdsM54YzMGy7ZZniXyoWhcLtOoUiYiQsVFap8MgvrJTquSgnIc5qhhyAcjKpKwNIUxzwmg4xBrO62QhofdUJinBD4QWINrV3JORxoe3Y7VJ/jkkU65Utt1vcSZA2VifJYhTCrKJyTg2nzwB4cZXM+5jK2Om43F0s7ujl2D379C37iW56n/3r2w+Fs+EujyyPwHDK/69ru3A9RTRX8VJn5QsdQAZ5GmLpxDScXswTKi4tcfkDc75QgKOcg1AB4zFEkCJ6CTBRdiNIuTKzpIzT8EQCK+QHCsG4UA0JpWoNISn7jpAyzmC8xE46WAIhjSl6wLSHgnH0uNE+UGDiVgoplq5CQQZ1lnJzp6W2t2VsFQYZ5CzAa+xUcnDEIrzOn1vYyf1VeKqQO8OQYM4WwoBhijiZ1smMVE5ES7sSd3GhoN+BBk6DBuHIaPVuL/yTy3vPP/FvPMMWrs0v3v3Vl2v/3DNagzVMkSjNYsKxN0GP4ZqJ2lJnT5UV4uC9iKIsU5hugHw3yKaYcTxfG2R911WZrkpdR4mzsm5YuhjElZpxbO4JluMGLN9FKX1P0H9JUHfLBNX9a2OGKrJoYmydsMrTVyDiXKUJjEaZ2NhT6iwX/Hs2tRpsuoYGmTbj8abp9kJkau/GJevVc8ltcKnsDDITXJTIek+5jLm8b+raWi1cMSQPeba5V9QAlvQZooRQGaxzTKeYkwCt6CiIkigVQiCwxmw1c8SSJI2E5JaS/8hUUXz32Gna9U6zlKutxlzRajr7ajXt1ROAYrZs9maB4xDRDB9c63fsJ4B0X7r3d7ap1m+6Fj9769dldXPvt7as14o95kdH/GoEev3jQHer4fJ9HtgPw+xDmwj0Dit8qr8+qv5xNCCCLLYJI/E3IBlGGT56Hw3+OBrYqzrRs44GuhpWZwMmAnFoM8DT1/9j56VnAF0lNk9lPssPfyhzrP0BIsTyS25RE8vP4c7pbw==&lt;/diagram&gt;&lt;/mxfile&gt;&quot;}"></div><script type="text/javascript" src="https://viewer.diagrams.net/js/viewer-static.min.js"></script>'
)

In [23]:
df_MB["target_OS_5years"] = np.nan
df_MB["target_OS_5years"] = df_MB["target_OS_5years"].mask(
    df_MB["OS_MONTHS"] > 60, False
)
df_MB["target_OS_5years"] = df_MB["target_OS_5years"].mask(
    (df_MB["OS_MONTHS"] <= 60) & (df_MB["VITAL_STATUS"] == "Died of Disease"),
    True,
)
# targetを定義できなかったデータは除く
df_MB = df_MB[df_MB["target_OS_5years"].notnull()]
check(df_MB)

Unnamed: 0,feature,dtypes,nan,count,num_unique,unique,unique_counts
0,LYMPH_NODES_EXAMINED_POSITIVE,float64,0,1583,30,,
1,NPI,float64,0,1583,265,,
2,CELLULARITY,object,0,1583,3,['High' 'Moderate' 'Low'],"{'High': 821, 'Moderate': 599, 'Low': 163}"
3,CHEMOTHERAPY,object,0,1583,2,['NO' 'YES'],"{'NO': 1230, 'YES': 353}"
4,COHORT,float64,0,1583,5,[1. 2. 3. 5. 4.],"{3.0: 619, 1.0: 372, 2.0: 254, 4.0: 200, 5.0: 138}"
5,INFERRED_MENOPAUSAL_STATE,object,0,1583,2,['Pre' 'Post'],"{'Post': 1221, 'Pre': 362}"
6,INTCLUST,object,0,1583,11,['4ER+' '3' '9' '7' '4ER-' '8' '10' '1' '2' '6' '5'],"{'3': 240, '8': 240, '10': 198, '4ER+': 186, '5': 153, '7': 150, '1': 117, '9': 113, '6': 68, '2': 63, '4ER-': 55}"
7,AGE_AT_DIAGNOSIS,float64,0,1583,1337,,
8,CLAUDIN_SUBTYPE,object,0,1583,7,['LumA' 'LumB' 'claudin-low' 'Her2' 'Normal' 'Basal' 'NC'],"{'LumA': 568, 'LumB': 388, 'Her2': 181, 'Basal': 175, 'claudin-low': 158, 'Normal': 108, 'NC': 5}"
9,THREEGENE,object,169,1414,4,['ER+/HER2- High Prolif' nan 'ER+/HER2- Low Prolif' 'ER-/HER2-' 'HER2+'],"{'ER+/HER2- High Prolif': 511, 'ER+/HER2- Low Prolif': 495, 'ER-/HER2-': 247, 'HER2+': 161}"


In [24]:
int_columns, str_cat_columns, bool_columns

(['LYMPH_NODES_EXAMINED_POSITIVE', 'OS_MONTHS', 'RFS_MONTHS'],
 ['COHORT',
  'INTCLUST',
  'CLAUDIN_SUBTYPE',
  'THREEGENE',
  'HISTOLOGICAL_SUBTYPE',
  'BREAST_SURGERY',
  'LATERALITY',
  'VITAL_STATUS',
  'CANCER_TYPE',
  'CANCER_TYPE_DETAILED',
  'ONCOTREE_CODE'],
 ['CHEMOTHERAPY',
  'HORMONE_THERAPY',
  'RADIO_THERAPY',
  'OS_STATUS',
  'RFS_STATUS'])

In [25]:
# 目的変数生成で使用した特徴量を削除
df_MB.drop(["OS_MONTHS", "OS_STATUS", "VITAL_STATUS"], axis=1, inplace=True)

# リストから特徴量名を削除
int_columns.remove("OS_MONTHS")
str_cat_columns.remove("VITAL_STATUS")
bool_columns.remove("OS_STATUS")

In [26]:
df_MB.head()

Unnamed: 0,LYMPH_NODES_EXAMINED_POSITIVE,NPI,CELLULARITY,CHEMOTHERAPY,COHORT,INFERRED_MENOPAUSAL_STATE,INTCLUST,AGE_AT_DIAGNOSIS,CLAUDIN_SUBTYPE,THREEGENE,LATERALITY,HISTOLOGICAL_SUBTYPE,GRADE,ONCOTREE_CODE,PR_STATUS,TMB_NONSYNONYMOUS,CANCER_TYPE,CANCER_TYPE_DETAILED,ER_IHC,HER2_SNP6,ER_STATUS,HER2_STATUS,TUMOR_SIZE,TUMOR_STAGE,HORMONE_THERAPY,RADIO_THERAPY,BREAST_SURGERY,RFS_STATUS,RFS_MONTHS,target_OS_5years
1,0.0,4.02,High,NO,1.0,Pre,4ER+,43.19,LumA,ER+/HER2- High Prolif,Right,Ductal/NST,3.0,IDC,Positive,2.615035,Breast Cancer,Breast Invasive Ductal Carcinoma,Positve,NEUTRAL,Positive,Negative,10.0,1.0,YES,YES,BREAST CONSERVING,0:Not Recurred,83.52,False
2,1.0,4.03,High,YES,1.0,Pre,3,48.87,LumB,,Right,Ductal/NST,2.0,IDC,Positive,2.615035,Breast Cancer,Breast Invasive Ductal Carcinoma,Positve,NEUTRAL,Positive,Negative,15.0,2.0,YES,NO,MASTECTOMY,1:Recurred,151.28,False
3,3.0,4.05,Moderate,YES,1.0,Pre,9,47.68,LumB,,Right,Mixed,2.0,MDLC,Positive,1.307518,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,Positve,NEUTRAL,Positive,Negative,25.0,2.0,YES,YES,MASTECTOMY,0:Not Recurred,162.76,False
4,8.0,6.08,High,YES,1.0,Post,9,76.97,LumB,ER+/HER2- High Prolif,Right,Mixed,3.0,MDLC,Positive,2.615035,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,Positve,NEUTRAL,Positive,Negative,40.0,2.0,YES,YES,MASTECTOMY,1:Recurred,18.55,True
5,0.0,4.062,Moderate,NO,1.0,Post,7,78.77,LumB,ER+/HER2- High Prolif,Left,Ductal/NST,3.0,IDC,Positive,5.230071,Breast Cancer,Breast Invasive Ductal Carcinoma,Positve,NEUTRAL,Positive,Negative,31.0,4.0,YES,YES,MASTECTOMY,1:Recurred,2.89,True


# 特徴量の削除

nullの多い特徴量
- THREEGENE
- LATERALITY
- TUMOR_STAGE

今回はcolumnごと削除する

In [27]:
check(df_MB[["THREEGENE", "LATERALITY", "TUMOR_STAGE"]])

Unnamed: 0,feature,dtypes,nan,count,num_unique,unique,unique_counts
0,THREEGENE,object,169,1414,4,['ER+/HER2- High Prolif' nan 'ER+/HER2- Low Prolif' 'ER-/HER2-' 'HER2+'],"{'ER+/HER2- High Prolif': 511, 'ER+/HER2- Low Prolif': 495, 'ER-/HER2-': 247, 'HER2+': 161}"
1,LATERALITY,object,81,1502,2,['Right' 'Left' nan],"{'Left': 789, 'Right': 713}"
2,TUMOR_STAGE,float64,403,1180,5,[ 1. 2. 4. 3. nan 0.],"{2.0: 671, 1.0: 402, 3.0: 98, 4.0: 8, 0.0: 1}"


In [28]:
str_cat_columns, num_cat_columns

(['COHORT',
  'INTCLUST',
  'CLAUDIN_SUBTYPE',
  'THREEGENE',
  'HISTOLOGICAL_SUBTYPE',
  'BREAST_SURGERY',
  'LATERALITY',
  'CANCER_TYPE',
  'CANCER_TYPE_DETAILED',
  'ONCOTREE_CODE'],
 ['CELLULARITY',
  'ER_IHC',
  'HER2_SNP6',
  'INFERRED_MENOPAUSAL_STATE',
  'ER_STATUS',
  'HER2_STATUS',
  'GRADE',
  'PR_STATUS',
  'TUMOR_STAGE'])

In [29]:
df_MB_dopped = df_MB.drop(["THREEGENE", "LATERALITY", "TUMOR_STAGE"], axis=1)

# columnsのlistから上記の名前を削除
str_cat_columns.remove("THREEGENE")
str_cat_columns.remove("LATERALITY")
num_cat_columns.remove("TUMOR_STAGE")

check(df_MB_dopped)

Unnamed: 0,feature,dtypes,nan,count,num_unique,unique,unique_counts
0,LYMPH_NODES_EXAMINED_POSITIVE,float64,0,1583,30,,
1,NPI,float64,0,1583,265,,
2,CELLULARITY,object,0,1583,3,['High' 'Moderate' 'Low'],"{'High': 821, 'Moderate': 599, 'Low': 163}"
3,CHEMOTHERAPY,object,0,1583,2,['NO' 'YES'],"{'NO': 1230, 'YES': 353}"
4,COHORT,float64,0,1583,5,[1. 2. 3. 5. 4.],"{3.0: 619, 1.0: 372, 2.0: 254, 4.0: 200, 5.0: 138}"
5,INFERRED_MENOPAUSAL_STATE,object,0,1583,2,['Pre' 'Post'],"{'Post': 1221, 'Pre': 362}"
6,INTCLUST,object,0,1583,11,['4ER+' '3' '9' '7' '4ER-' '8' '10' '1' '2' '6' '5'],"{'3': 240, '8': 240, '10': 198, '4ER+': 186, '5': 153, '7': 150, '1': 117, '9': 113, '6': 68, '2': 63, '4ER-': 55}"
7,AGE_AT_DIAGNOSIS,float64,0,1583,1337,,
8,CLAUDIN_SUBTYPE,object,0,1583,7,['LumA' 'LumB' 'claudin-low' 'Her2' 'Normal' 'Basal' 'NC'],"{'LumA': 568, 'LumB': 388, 'Her2': 181, 'Basal': 175, 'claudin-low': 158, 'Normal': 108, 'NC': 5}"
9,HISTOLOGICAL_SUBTYPE,object,0,1583,7,['Ductal/NST' 'Mixed' 'Lobular' 'Tubular/ cribriform' 'Mucinous'  'Medullary' 'Other'],"{'Ductal/NST': 1228, 'Mixed': 172, 'Lobular': 115, 'Medullary': 24, 'Tubular/ cribriform': 18, 'Mucinous': 16, 'Other': 10}"


# 以下、データ前処理

trainとtestに分割して前処理を実施（trainに適用した前処理をtestにも実施することで、リークをなくす）  
random_stateでtrain_splitの再現性をもたせる

In [30]:
df_MB_dropped_train, df_MB_dropped_test = train_test_split(
    df_MB_dopped, random_state=config.SEED
)
df_MB_dropped_train.shape, df_MB_dropped_test.shape

((1187, 27), (396, 27))

## null値の補完

null値をsklearnのImputeクラスを中心に補完する（https://scikit-learn.org/stable/modules/classes.html#module-sklearn.impute）  
Imputerクラスを使用するのは、学習前にtrainとtestで分割するので、そのとき平均値などでリークを起こさないようにするため  
Imputerクラスは数値情報にしか対応していないので、クラスラベルは置き換え必須

def impute(df_train: pd.DataFrame, df_test: pd.DataFrame, how: str = "mean"):
    if how == "mean":
        imp_mean = SimpleImputer(strategy="mean")
        imp_mean.fit(df_train)
        df_train = pd.DataFrame(
            imp_mean.transform(df_train), columns=df_train.columns, index=df_train.index
        )
        df_test = pd.DataFrame(
            imp_mean.transform(df_test), columns=df_test.columns, index=df_test.index
        )
    elif how == "iter":
        imp_iter = IterativeImputer()
        imp_iter.fit(df_train)
        df_train = pd.DataFrame(
            imp_iter.transform(df_train), columns=df_train.columns, index=df_train.index
        )
        df_test = pd.DataFrame(
            imp_iter.transform(df_test), columns=df_test.columns, index=df_test.index
        )
    elif how == "knn":
        imp_knn = KNNImputer(n_neighbors=5)
        imp_knn.fit(df_train)
        df_train = pd.DataFrame(
            imp_knn.transform(df_train), columns=df_train.columns, index=df_train.index
        )
        df_test = pd.DataFrame(
            imp_knn.transform(df_test), columns=df_test.columns, index=df_test.index
        )
    else:
        print("how is not defined!")
    return df_train, df_test

impute()の動作検証

imputed_column = "THREEGENE"
impute(df_MB_train[imputed_column], df_MB_test[imputed_column], how="knn")

## 型のエンコーディング
順序尺度特徴量とboolean特徴量について、データを扱いやすい形に変換する。

### 質的変数（順序尺度）のエンコーディング
num_cat_columnsは、順序のあるstrの変数で構成されている。
そこで大小関係に対応するようstrをintに変換する。

In [31]:
check(df_MB_dopped[num_cat_columns])

Unnamed: 0,feature,dtypes,nan,count,num_unique,unique,unique_counts
0,CELLULARITY,object,0,1583,3,['High' 'Moderate' 'Low'],"{'High': 821, 'Moderate': 599, 'Low': 163}"
1,ER_IHC,object,0,1583,2,['Positve' 'Negative'],"{'Positve': 1214, 'Negative': 369}"
2,HER2_SNP6,object,0,1583,4,['NEUTRAL' 'LOSS' 'GAIN' 'UNDEF'],"{'NEUTRAL': 1152, 'GAIN': 344, 'LOSS': 84, 'UNDEF': 3}"
3,INFERRED_MENOPAUSAL_STATE,object,0,1583,2,['Pre' 'Post'],"{'Post': 1221, 'Pre': 362}"
4,ER_STATUS,object,0,1583,2,['Positive' 'Negative'],"{'Positive': 1205, 'Negative': 378}"
5,HER2_STATUS,object,0,1583,2,['Negative' 'Positive'],"{'Negative': 1387, 'Positive': 196}"
6,GRADE,float64,0,1583,3,[3. 2. 1.],"{3.0: 809, 2.0: 640, 1.0: 134}"
7,PR_STATUS,object,0,1583,2,['Positive' 'Negative'],"{'Positive': 835, 'Negative': 748}"


In [32]:
df_MB_dopped["CELLULARITY"].replace({"High": 3, "Moderate": 2, "Low": 1}, inplace=True)
df_MB_dopped["ER_IHC"].replace({"Positve": 1, "Negative": 0}, inplace=True)
df_MB_dopped["HER2_SNP6"].replace(
    {"GAIN": 3, "NEUTRAL": 2, "LOSS": 1, "UNDEF": 0}, inplace=True
)
df_MB_dopped["INFERRED_MENOPAUSAL_STATE"].replace({"Post": 1, "Pre": 0}, inplace=True)
df_MB_dopped["ER_STATUS"].replace({"Positive": 1, "Negative": 0}, inplace=True)
df_MB_dopped["HER2_STATUS"].replace({"Positive": 1, "Negative": 0}, inplace=True)
df_MB_dopped["PR_STATUS"].replace({"Positive": 1, "Negative": 0}, inplace=True)

check(df_MB_dopped[num_cat_columns])

Unnamed: 0,feature,dtypes,nan,count,num_unique,unique,unique_counts
0,CELLULARITY,int64,0,1583,3,[3 2 1],"{3: 821, 2: 599, 1: 163}"
1,ER_IHC,int64,0,1583,2,[1 0],"{1: 1214, 0: 369}"
2,HER2_SNP6,int64,0,1583,4,[2 1 3 0],"{2: 1152, 3: 344, 1: 84, 0: 3}"
3,INFERRED_MENOPAUSAL_STATE,int64,0,1583,2,[0 1],"{1: 1221, 0: 362}"
4,ER_STATUS,int64,0,1583,2,[1 0],"{1: 1205, 0: 378}"
5,HER2_STATUS,int64,0,1583,2,[0 1],"{0: 1387, 1: 196}"
6,GRADE,float64,0,1583,3,[3. 2. 1.],"{3.0: 809, 2.0: 640, 1.0: 134}"
7,PR_STATUS,int64,0,1583,2,[1 0],"{1: 835, 0: 748}"


### boolのエンコーディング
bool_columnsは対象的な2項目から構成されているため、これらの項目をboolean型にする。
Yes、Noのみを対象とする（2項目のみから構成される特徴量でも、Yes、No以外の場合は3項目を考慮してカテゴリ特徴量として考える）。

In [33]:
check(df_MB_dopped[bool_columns])

Unnamed: 0,feature,dtypes,nan,count,num_unique,unique,unique_counts
0,CHEMOTHERAPY,object,0,1583,2,['NO' 'YES'],"{'NO': 1230, 'YES': 353}"
1,HORMONE_THERAPY,object,0,1583,2,['YES' 'NO'],"{'YES': 979, 'NO': 604}"
2,RADIO_THERAPY,object,0,1583,2,['YES' 'NO'],"{'YES': 986, 'NO': 597}"
3,RFS_STATUS,object,0,1583,2,['0:Not Recurred' '1:Recurred'],"{'0:Not Recurred': 891, '1:Recurred': 692}"


In [35]:
df_MB_dopped["CHEMOTHERAPY"].replace({"YES": 1, "NO": 0}, inplace=True)
df_MB_dopped["HORMONE_THERAPY"].replace({"YES": 1, "NO": 0}, inplace=True)
df_MB_dopped["RADIO_THERAPY"].replace({"YES": 1, "NO": 0}, inplace=True)
df_MB_dopped["RFS_STATUS"].replace({"1:Recurred": 1, "0:Not Recurred": 0}, inplace=True)

In [36]:
check(df_MB_dopped[bool_columns])

Unnamed: 0,feature,dtypes,nan,count,num_unique,unique,unique_counts
0,CHEMOTHERAPY,int64,0,1583,2,[0 1],"{0: 1230, 1: 353}"
1,HORMONE_THERAPY,int64,0,1583,2,[1 0],"{1: 979, 0: 604}"
2,RADIO_THERAPY,int64,0,1583,2,[1 0],"{1: 986, 0: 597}"
3,RFS_STATUS,int64,0,1583,2,[0 1],"{0: 891, 1: 692}"


## Onehot encoding
上記で作成した一部特徴量を修正したdfを元に、onehot encodingを行う。
onehot encoding自体は名義尺度の質的変数カラムであるstr_cat_columnsのみに対してのみ実施するが、最終的に特徴量の型の修正はモデルへ入力する際に必要になるので、df_MB_dtype_encodedを元にonehot encodingを実施する。    

In [37]:
str_cat_columns

['COHORT',
 'INTCLUST',
 'CLAUDIN_SUBTYPE',
 'HISTOLOGICAL_SUBTYPE',
 'BREAST_SURGERY',
 'CANCER_TYPE',
 'CANCER_TYPE_DETAILED',
 'ONCOTREE_CODE']

# VITAL_STATUSは目的変数の生成に必要&リークになりうるので、onehot encodingからは除外する
str_cat_columns.remove(
    "VITAL_STATUS"
) if "VITAL_STATUS" in str_cat_columns else str_cat_columns
str_cat_columns

In [38]:
# 多重共線性回避のために、drop='first'
onehot_encoder = OneHotEncoder(drop="first")
onehot_encoding_columns = str_cat_columns
onehot_encoder.fit(df_MB_dopped[onehot_encoding_columns])
df_onehot = pd.DataFrame(
    onehot_encoder.transform(df_MB_dopped[onehot_encoding_columns]).toarray(),
    columns=onehot_encoder.get_feature_names_out(onehot_encoding_columns),
)  # .astype(int)
check(df_onehot)

Unnamed: 0,feature,dtypes,nan,count,num_unique,unique,unique_counts
0,COHORT_2.0,float64,0,1583,2,[0. 1.],"{0.0: 1329, 1.0: 254}"
1,COHORT_3.0,float64,0,1583,2,[0. 1.],"{0.0: 964, 1.0: 619}"
2,COHORT_4.0,float64,0,1583,2,[0. 1.],"{0.0: 1383, 1.0: 200}"
3,COHORT_5.0,float64,0,1583,2,[0. 1.],"{0.0: 1445, 1.0: 138}"
4,INTCLUST_10,float64,0,1583,2,[0. 1.],"{0.0: 1385, 1.0: 198}"
5,INTCLUST_2,float64,0,1583,2,[0. 1.],"{0.0: 1520, 1.0: 63}"
6,INTCLUST_3,float64,0,1583,2,[0. 1.],"{0.0: 1343, 1.0: 240}"
7,INTCLUST_4ER+,float64,0,1583,2,[1. 0.],"{0.0: 1397, 1.0: 186}"
8,INTCLUST_4ER-,float64,0,1583,2,[0. 1.],"{0.0: 1528, 1.0: 55}"
9,INTCLUST_5,float64,0,1583,2,[0. 1.],"{0.0: 1430, 1.0: 153}"


# 元のdfとonehotしたdfの結合

In [39]:
df_MB_dopped.shape, df_onehot.shape

((1583, 27), (1583, 35))

In [40]:
encoded_columns = str_cat_columns

df_MB_encoded = pd.merge(
    df_MB_dopped.drop(encoded_columns, axis=True),
    df_onehot,
    right_index=True,
    left_index=True,
)

check(df_MB_encoded)

Unnamed: 0,feature,dtypes,nan,count,num_unique,unique,unique_counts
0,LYMPH_NODES_EXAMINED_POSITIVE,float64,0,1251,27,,
1,NPI,float64,0,1251,208,,
2,CELLULARITY,int64,0,1251,3,[3 2 1],"{3: 615, 2: 500, 1: 136}"
3,CHEMOTHERAPY,int64,0,1251,2,[0 1],"{0: 972, 1: 279}"
4,INFERRED_MENOPAUSAL_STATE,int64,0,1251,2,[0 1],"{1: 952, 0: 299}"
5,AGE_AT_DIAGNOSIS,float64,0,1251,1089,,
6,GRADE,float64,0,1251,3,[3. 2. 1.],"{3.0: 665, 2.0: 492, 1.0: 94}"
7,PR_STATUS,int64,0,1251,2,[1 0],"{1: 659, 0: 592}"
8,TMB_NONSYNONYMOUS,float64,0,1251,29,,
9,ER_IHC,int64,0,1251,2,[1 0],"{1: 969, 0: 282}"


In [41]:
# save
make_dir(config.INTERIM_PICKLE_PREPROCESSED_OS5YEARS_DIR)
df_MB_encoded.to_pickle(
    config.INTERIM_PICKLE_PREPROCESSED_OS5YEARS_DIR + "/df_MB_encoded.pkl"
)

In [None]:
cp = CustomPairPlot()
# cp.pairanalyzer(df_MB_encoded, hue="target_OS_5years")

In [None]:
cp = CustomPairPlot()
# cp.pairanalyzer(df_MB_encoded)