In [12]:
import os
import numpy as np
import pandas as pd

from IPython.display import HTML
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

import config

# Cap how much of a DataFrame is rendered: at most 50 columns and 50 rows.
pd.options.display.max_columns = 50
pd.options.display.max_rows = 50


def make_dir(dir_name: str) -> None:
    """Create ``dir_name`` (including missing parent directories) if needed.

    Calling this on a directory that already exists is a no-op.

    Args:
        dir_name: Path of the directory to create.

    Raises:
        FileExistsError: If ``dir_name`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of
    # `if not os.path.exists(...): os.makedirs(...)`.
    os.makedirs(dir_name, exist_ok=True)

In [13]:
# Load the one-hot encoded frame written by the previous preprocessing step
# and print its column/dtype summary.
onehot_concated_pickle_path = (
    config.INTERIM_PICKLE_PREPROCESSED_DIR + "/4.2-df_MB_onehot_concated.pkl"
)
df_MB_onehot_concated = pd.read_pickle(onehot_concated_pickle_path)
df_MB_onehot_concated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1124 entries, 0 to 1123
Data columns (total 64 columns):
 #   Column                                                          Non-Null Count  Dtype  
---  ------                                                          --------------  -----  
 0   PATIENT_ID                                                      1124 non-null   object 
 1   LYMPH_NODES_EXAMINED_POSITIVE                                   1124 non-null   float64
 2   NPI                                                             1124 non-null   float64
 3   CELLULARITY                                                     1124 non-null   int64  
 4   CHEMOTHERAPY                                                    1124 non-null   bool   
 5   ER_IHC                                                          1124 non-null   int64  
 6   HER2_SNP6                                                       1124 non-null   int64  
 7   HORMONE_THERAPY                                    

In [14]:
# Check the column names of the loaded frame
df_MB_onehot_concated.columns

Index(['PATIENT_ID', 'LYMPH_NODES_EXAMINED_POSITIVE', 'NPI', 'CELLULARITY',
       'CHEMOTHERAPY', 'ER_IHC', 'HER2_SNP6', 'HORMONE_THERAPY',
       'INFERRED_MENOPAUSAL_STATE', 'SEX', 'AGE_AT_DIAGNOSIS', 'OS_MONTHS',
       'OS_STATUS', 'VITAL_STATUS', 'RADIO_THERAPY', 'RFS_STATUS',
       'RFS_MONTHS', 'SAMPLE_ID', 'ER_STATUS', 'HER2_STATUS', 'GRADE',
       'PR_STATUS', 'SAMPLE_TYPE', 'TUMOR_SIZE', 'TUMOR_STAGE',
       'TMB_NONSYNONYMOUS', 'COHORT_2.0', 'COHORT_3.0', 'COHORT_5.0',
       'INTCLUST_10', 'INTCLUST_2', 'INTCLUST_3', 'INTCLUST_4ER+',
       'INTCLUST_4ER-', 'INTCLUST_5', 'INTCLUST_6', 'INTCLUST_7', 'INTCLUST_8',
       'INTCLUST_9', 'CLAUDIN_SUBTYPE_Her2', 'CLAUDIN_SUBTYPE_LumA',
       'CLAUDIN_SUBTYPE_LumB', 'CLAUDIN_SUBTYPE_NC', 'CLAUDIN_SUBTYPE_Normal',
       'CLAUDIN_SUBTYPE_claudin-low', 'THREEGENE_ER+/HER2- Low Prolif',
       'THREEGENE_ER-/HER2-', 'THREEGENE_HER2+',
       'HISTOLOGICAL_SUBTYPE_Lobular', 'HISTOLOGICAL_SUBTYPE_Medullary',
       'HISTOLOGICAL_S

# 目的変数の生成

元のdfにはない目的変数カラムを生成する。  

## 目的変数｜5年後の予後の2値分類

### df| df_MB_OS_status_in_5years
5年後の予後を2値分類する。  
そのためにVITAL_STATUSとOS_MONTHSを利用する。  
以下のフローチャートで生成する。  

In [15]:
# Embed the draw.io (diagrams.net) flowchart describing how the target is built.
# The markup references the viewer script hosted at viewer.diagrams.net.
# Raw string: the payload contains `\&quot;` sequences, and `\&` is an invalid
# escape in a normal string literal (SyntaxWarning on recent Python versions).
# The resulting string value is unchanged.
HTML(
    r'<div class="mxgraph" style="max-width:100%;border:1px solid transparent;" data-mxgraph="{&quot;highlight&quot;:&quot;#0000ff&quot;,&quot;nav&quot;:true,&quot;resize&quot;:true,&quot;toolbar&quot;:&quot;zoom layers tags lightbox&quot;,&quot;edit&quot;:&quot;_blank&quot;,&quot;xml&quot;:&quot;&lt;mxfile host=\&quot;Electron\&quot; modified=\&quot;2022-05-01T07:33:44.405Z\&quot; agent=\&quot;5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) draw.io/17.4.2 Chrome/100.0.4896.60 Electron/18.0.1 Safari/537.36\&quot; etag=\&quot;CUa6MuG2A-EYqdbBh0ys\&quot; version=\&quot;17.4.2\&quot; type=\&quot;device\&quot;&gt;&lt;diagram id=\&quot;C5RBs43oDa-KdzZeNtuy\&quot; name=\&quot;Page-1\&quot;&gt;7VhZc5swEP41TJ7a4Yix/RjbcZI2SduBJNOnjAIyqBasK4SP/vpKQTIQXB9tncMTP3jYZVlJ+317gOH0k/kZQ5P4CkJMDdsM54YzMGy7ZZniXyoWhcLtOoUiYiQsVFap8MgvrJTquSgnIc5qhhyAcjKpKwNIUxzwmg4xBrO62QhofdUJinBD4QWINrV3JORxoe3Y7VJ/jkkU65Utt1vcSZA2VifJYhTCrKJyTg2nzwB4cZXM+5jK2Om43F0s7ujl2D379C37iW56n/3r2w+Fs+EujyyPwHDK/69ru3A9RTRX8VJn5QsdQAZ5GmLpxDScXswTKi4tcfkDc75QgKOcg1AB4zFEkCJ6CTBRdiNIuTKzpIzT8EQCK+QHCsG4UA0JpWoNISn7jpAyzmC8xE46WAIhjSl6wLSHgnH0uNE+UGDiVgoplq5CQQZ1lnJzp6W2t2VsFQYZ5CzAa+xUcnDEIrzOn1vYyf1VeKqQO8OQYM4WwoBhijiZ1smMVE5ES7sSd3GhoN+BBk6DBuHIaPVuL/yTy3vPP/FvPMMWrs0v3v3Vl2v/3DNagzVMkSjNYsKxN0GP4ZqJ2lJnT5UV4uC9iKIsU5hugHw3yKaYcTxfG2R911WZrkpdR4mzsm5YuhjElZpxbO4JluMGLN9FKX1P0H9JUHfLBNX9a2OGKrJoYmydsMrTVyDiXKUJjEaZ2NhT6iwX/Hs2tRpsuoYGmTbj8abp9kJkau/GJevVc8ltcKnsDDITXJTIek+5jLm8b+raWi1cMSQPeba5V9QAlvQZooRQGaxzTKeYkwCt6CiIkigVQiCwxmw1c8SSJI2E5JaS/8hUUXz32Gna9U6zlKutxlzRajr7ajXt1ROAYrZs9maB4xDRDB9c63fsJ4B0X7r3d7ap1m+6Fj9769dldXPvt7as14o95kdH/GoEev3jQHer4fJ9HtgPw+xDmwj0Dit8qr8+qv5xNCCCLLYJI/E3IBlGGT56Hw3+OBrYqzrRs44GuhpWZwMmAnFoM8DT1/9j56VnAF0lNk9lPssPfyhzrP0BIsTyS25RE8vP4c7pbw==&lt;/diagram&gt;&lt;/mxfile&gt;&quot;}"></div><script type="text/javascript" src="https://viewer.diagrams.net/js/viewer-static.min.js"></script>'
)

In [16]:
# Work on a copy so the loaded frame itself is left untouched
df_MB_OS_status_in_5years = df_MB_onehot_concated.copy()

In [17]:
# Shapes of the two groups that receive a label:
#   1) overall survival longer than 60 months
#   2) died of disease at or before 60 months
(
    df_MB_OS_status_in_5years.loc[
        df_MB_OS_status_in_5years["OS_MONTHS"].gt(60)
    ].shape,
    df_MB_OS_status_in_5years.loc[
        df_MB_OS_status_in_5years["OS_MONTHS"].le(60)
        & df_MB_OS_status_in_5years["VITAL_STATUS"].eq("Died of Disease")
    ].shape,
)

((844, 64), (198, 64))

In [18]:
# Binary target: did the patient die of the disease within 5 years (60 months)?
#   OS_MONTHS > 60                                         -> False
#   OS_MONTHS <= 60 and VITAL_STATUS == "Died of Disease"  -> True
#   neither rule applies                                   -> no label, row dropped
survived_over_5years = df_MB_onehot_concated["OS_MONTHS"] > 60
died_of_disease_in_5years = (df_MB_onehot_concated["OS_MONTHS"] <= 60) & (
    df_MB_onehot_concated["VITAL_STATUS"] == "Died of Disease"
)
has_label = survived_over_5years | died_of_disease_in_5years

# Keep only labelled rows and append `target` as the last column.
# Only `target` is boolean; every feature column keeps its own dtype and values
# (casting the whole frame with `.astype(bool)` would turn all features into
# True/False). Rows are dropped solely because they have no label, not because
# of missing values in other columns.
df_MB_OS_status_in_5years = df_MB_onehot_concated.loc[has_label].assign(
    target=died_of_disease_in_5years[has_label]
)
assert df_MB_OS_status_in_5years["target"].dtype == bool

display(
    df_MB_OS_status_in_5years["target"].value_counts(),
    df_MB_OS_status_in_5years["target"].isnull().sum(),
)

False    844
True     198
Name: target, dtype: int64

0

In [19]:
# Row/column count of the labelled frame
df_MB_OS_status_in_5years.shape

(1042, 65)

In [20]:
# Persist the labelled frame as a pickle, creating the output directory first.
os_status_pickle_dir = config.INTERIM_PICKLE_PREPROCESSED_DIR
make_dir(os_status_pickle_dir)
df_MB_OS_status_in_5years.to_pickle(
    os_status_pickle_dir + "/4.3-df_MB_OS_status_in_5years.pkl"
)