In [3]:
import os

import numpy as np
import pandas as pd

import config

# Cap how much of a DataFrame pandas renders: at most 50 columns and 50 rows.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, 50)


def make_dir(dir_name: str) -> None:
    """Create the directory ``dir_name``, including missing parent directories.

    Calling this for a directory that already exists is a no-op.

    Args:
        dir_name: Path of the directory to create.

    Raises:
        FileExistsError: If ``dir_name`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate exists() check, so there is no window
    # in which another process can create the directory between check and creation.
    os.makedirs(dir_name, exist_ok=True)

In [4]:
# Load the data frame that contains only the records whose PATIENT_ID includes "MB".
df_MB_pickle_path = config.INTERIM_PICKLE_PREPROCESSED_DIR + "/3.0-df_MB.pkl"
df_MB = pd.read_pickle(df_MB_pickle_path)
df_MB.shape

(1985, 36)

In [5]:
# Count the missing values in each column of df_MB.
df_MB.isna().sum()

PATIENT_ID                         0
LYMPH_NODES_EXAMINED_POSITIVE     76
NPI                                1
CELLULARITY                       68
CHEMOTHERAPY                       5
COHORT                             0
ER_IHC                            43
HER2_SNP6                          5
HORMONE_THERAPY                    5
INFERRED_MENOPAUSAL_STATE          5
SEX                                0
INTCLUST                           5
AGE_AT_DIAGNOSIS                   0
OS_MONTHS                          4
OS_STATUS                          4
CLAUDIN_SUBTYPE                    5
THREEGENE                        221
VITAL_STATUS                       5
LATERALITY                       115
RADIO_THERAPY                      5
HISTOLOGICAL_SUBTYPE              46
BREAST_SURGERY                    30
RFS_STATUS                         1
RFS_MONTHS                         0
SAMPLE_ID                          0
CANCER_TYPE                        0
CANCER_TYPE_DETAILED               0
E

# Case: drop all rows containing null values

### df｜df_MB_dropped  
null値を含む行を削除することで、null値に対応する。  

In [6]:
# Complete-case data set: discard every row that has at least one missing value.
df_MB_dropped = df_MB.dropna(axis=0, how="any")
n_remaining_nulls = df_MB_dropped.isnull().sum().sum()
display(df_MB_dropped.shape, n_remaining_nulls)

# Persist the result; make_dir ensures the output directory is present first.
make_dir(config.INTERIM_PICKLE_PREPROCESSED_DIR)
dropped_pickle_path = (
    config.INTERIM_PICKLE_PREPROCESSED_DIR + "/3.1-df_MB_dropped.pkl"
)
df_MB_dropped.to_pickle(dropped_pickle_path)

(1124, 36)

0

# Case: fill nulls with the mean or median (numeric columns only)

### df｜df_MB_filled_by_mean, df_MB_filled_by_median  
null値を平均値（df_MB_filled_by_mean）または中央値（df_MB_filled_by_median）で埋める。  
ただし、カテゴリ変数など数値以外のデータは平均値を取ることができないので、nullのまま（文字列に変換）にする。

注意｜OS_STATUSとOS_MONTHS、RFS_STATUSとRFS_MONTHS、VITAL_STATUSのnull値を埋めてしまうと、誤回答のデータを作成してしまうことになるので、あらかじめ除いておく。  
outcomeのnullを除いたdf｜df_MB_dropped_outcome  
除く対象｜場合分けして保存するのが億劫なので、今回は全部削除する。

In [7]:
# Outcome columns must never be imputed: a filled-in status or follow-up time
# would be fabricated ground truth, so rows missing any of them are removed.
outcome_columns = [
    "OS_STATUS",
    "OS_MONTHS",
    "RFS_STATUS",
    "RFS_MONTHS",
    "VITAL_STATUS",
]
# Row-wise flag, computed once: True when at least one outcome column is null.
outcome_missing = df_MB[outcome_columns].isnull().any(axis=1)

# Show the affected rows, restricted to the outcome columns of interest.
display(df_MB.loc[outcome_missing, outcome_columns])

# Build the filtered frame (rows with complete outcome information only).
df_MB_dropped_outcome = df_MB[~outcome_missing]

# Compare the frame size before and after removing those rows.
display(df_MB.shape, df_MB_dropped_outcome.shape)

Unnamed: 0,OS_STATUS,OS_MONTHS,RFS_STATUS,RFS_MONTHS,VITAL_STATUS
9,,,1:Recurred,126.32,
223,,,0:Not Recurred,191.25,
226,,,0:Not Recurred,25.63,
411,,,0:Not Recurred,185.46,
1219,1:DECEASED,255.0,,251.64,


(1985, 36)

(1980, 36)

In [8]:
# Impute with per-column means. Only numeric columns have a mean, so nulls in
# the remaining (non-numeric) columns are left untouched.
numeric_column_means = df_MB_dropped_outcome.mean(numeric_only=True)
df_MB_filled_by_mean = df_MB_dropped_outcome.fillna(numeric_column_means)
display(df_MB_filled_by_mean.shape, df_MB_filled_by_mean.isnull().sum())

# Persist the mean-imputed frame.
make_dir(config.INTERIM_PICKLE_PREPROCESSED_DIR)
mean_pickle_path = (
    config.INTERIM_PICKLE_PREPROCESSED_DIR + "/3.1-df_MB_filled_by_mean.pkl"
)
df_MB_filled_by_mean.to_pickle(mean_pickle_path)

(1980, 36)

PATIENT_ID                         0
LYMPH_NODES_EXAMINED_POSITIVE      0
NPI                                0
CELLULARITY                       64
CHEMOTHERAPY                       1
COHORT                             0
ER_IHC                            43
HER2_SNP6                          1
HORMONE_THERAPY                    1
INFERRED_MENOPAUSAL_STATE          1
SEX                                0
INTCLUST                           1
AGE_AT_DIAGNOSIS                   0
OS_MONTHS                          0
OS_STATUS                          0
CLAUDIN_SUBTYPE                    1
THREEGENE                        217
VITAL_STATUS                       0
LATERALITY                       111
RADIO_THERAPY                      1
HISTOLOGICAL_SUBTYPE              44
BREAST_SURGERY                    26
RFS_STATUS                         0
RFS_MONTHS                         0
SAMPLE_ID                          0
CANCER_TYPE                        0
CANCER_TYPE_DETAILED               0
E

In [9]:
# Impute with per-column medians. Only numeric columns have a median, so nulls
# in the remaining (non-numeric) columns are left untouched.
numeric_column_medians = df_MB_dropped_outcome.median(numeric_only=True)
df_MB_filled_by_median = df_MB_dropped_outcome.fillna(numeric_column_medians)
display(df_MB_filled_by_median.shape, df_MB_filled_by_median.isnull().sum())

# Persist the median-imputed frame.
make_dir(config.INTERIM_PICKLE_PREPROCESSED_DIR)
median_pickle_path = (
    config.INTERIM_PICKLE_PREPROCESSED_DIR + "/3.1-df_MB_filled_by_median.pkl"
)
df_MB_filled_by_median.to_pickle(median_pickle_path)

(1980, 36)

PATIENT_ID                         0
LYMPH_NODES_EXAMINED_POSITIVE      0
NPI                                0
CELLULARITY                       64
CHEMOTHERAPY                       1
COHORT                             0
ER_IHC                            43
HER2_SNP6                          1
HORMONE_THERAPY                    1
INFERRED_MENOPAUSAL_STATE          1
SEX                                0
INTCLUST                           1
AGE_AT_DIAGNOSIS                   0
OS_MONTHS                          0
OS_STATUS                          0
CLAUDIN_SUBTYPE                    1
THREEGENE                        217
VITAL_STATUS                       0
LATERALITY                       111
RADIO_THERAPY                      1
HISTOLOGICAL_SUBTYPE              44
BREAST_SURGERY                    26
RFS_STATUS                         0
RFS_MONTHS                         0
SAMPLE_ID                          0
CANCER_TYPE                        0
CANCER_TYPE_DETAILED               0
E