In [6]:
import os
import numpy as np
import pandas as pd

from IPython.display import HTML
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from seaborn_analyzer import CustomPairPlot
import seaborn as sns
import config

# Raise pandas' display limits: show up to 50 columns and up to 50 rows
# before the rendered output is truncated.
pd.options.display.max_columns = 50
pd.options.display.max_rows = 50


def make_dir(dir_name: str) -> None:
    """Create ``dir_name`` (including missing parent directories) if needed.

    Calling this for a directory that already exists is a no-op.

    Args:
        dir_name: Path of the directory to create.

    Raises:
        FileExistsError: If ``dir_name`` exists but is not a directory.
        OSError: If the directory cannot be created (e.g. empty path,
            missing permissions).
    """
    # exist_ok=True replaces the separate "exists?" check, so there is no
    # window between checking and creating in which another process could
    # create the directory and make makedirs() fail.
    os.makedirs(dir_name, exist_ok=True)

In [7]:
# Read the interim pickle "2.0.3-df_MB_OS_status_in_5years.pkl" from the
# preprocessed-pickle directory configured in `config`, then print the
# frame's column / dtype / non-null summary.
os_5y_pickle_path = (
    config.INTERIM_PICKLE_PREPROCESSED_DIR + "/2.0.3-df_MB_OS_status_in_5years.pkl"
)
df_MB_OS_status_in_5years = pd.read_pickle(os_5y_pickle_path)
df_MB_OS_status_in_5years.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1730 entries, 1 to 1979
Data columns (total 80 columns):
 #   Column                                                          Non-Null Count  Dtype  
---  ------                                                          --------------  -----  
 0   PATIENT_ID                                                      1730 non-null   object 
 1   LYMPH_NODES_EXAMINED_POSITIVE                                   1730 non-null   float64
 2   NPI                                                             1730 non-null   float64
 3   CELLULARITY                                                     1730 non-null   float64
 4   CHEMOTHERAPY                                                    1730 non-null   object 
 5   ER_IHC                                                          1730 non-null   float64
 6   HER2_SNP6                                                       1730 non-null   float64
 7   HORMONE_THERAPY                                    

In [8]:
# Outcome-related columns that are removed from the feature frame (they are
# concatenated into `drop_columns` in a later cell). The notebook's own note
# warns that keeping OS_MONTHS / OS_STATUS as features would let a model train
# while already knowing whether the patient lived or died.
target_columns = [
    # patient
    "OS_MONTHS",
    "RFS_MONTHS",
    "OS_STATUS",
    "RFS_STATUS",
    "VITAL_STATUS",
]

# Columns removed together with `target_columns` because they are not wanted
# as features (presumably identifiers / constant metadata — TODO confirm).
# NOTE(review): "meanless" presumably means "meaningless"; the name is kept
# as-is because other cells reference it.
meanless_columns = [
    # patient
    "PATIENT_ID",
    "SEX",
    # sample
    "SAMPLE_ID",
    "SAMPLE_TYPE",
]

# 不要な列（特徴量）をdfから削除する

### df| df_MB_OS_status_in_5years_dropped
5年後の予後の2値分類のために、不要な特徴量を削除する。  
削除するcolumnsのリスト
- target_columns
- meanless_columns

In [9]:
# Name of the label column. It is not part of the drop list below, so the
# drop leaves it untouched (presumably the 5-year OS label — TODO confirm).
target = "target"

# Remove the outcome columns and the unwanted identifier/metadata columns,
# producing a new frame; the source frame is left unchanged.
drop_columns = [*target_columns, *meanless_columns]
df_MB_OS_status_in_5years_dropped = df_MB_OS_status_in_5years.drop(columns=drop_columns)

# Display (n_rows, n_columns) of the reduced frame.
df_MB_OS_status_in_5years_dropped.shape

(1730, 71)

In [10]:
# Print the column / dtype / non-null summary of the reduced frame to check
# which columns remain after the drop.
df_MB_OS_status_in_5years_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1730 entries, 1 to 1979
Data columns (total 71 columns):
 #   Column                                                          Non-Null Count  Dtype  
---  ------                                                          --------------  -----  
 0   LYMPH_NODES_EXAMINED_POSITIVE                                   1730 non-null   float64
 1   NPI                                                             1730 non-null   float64
 2   CELLULARITY                                                     1730 non-null   float64
 3   CHEMOTHERAPY                                                    1730 non-null   object 
 4   ER_IHC                                                          1730 non-null   float64
 5   HER2_SNP6                                                       1730 non-null   float64
 6   HORMONE_THERAPY                                                 1730 non-null   object 
 7   INFERRED_MENOPAUSAL_STATE                          

In [11]:
# Save: make sure the interim output directory exists, then pickle the
# reduced frame into it as "2.0.4-df_MB_OS_status_in_5years_dropped.pkl".
os_5y_output_dir = config.INTERIM_PICKLE_PREPROCESSED_DIR
make_dir(os_5y_output_dir)
df_MB_OS_status_in_5years_dropped.to_pickle(
    os_5y_output_dir + "/2.0.4-df_MB_OS_status_in_5years_dropped.pkl"
)

plot（処理が非常に重いため、以下のセルはコメントアウトしている）

In [None]:
#cp = CustomPairPlot()
#cp.pairanalyzer(df_MB_OS_status_in_5years_dropped, hue="target")

In [None]:
#cp = CustomPairPlot()
#cp.pairanalyzer(df_MB_OS_status_in_5years_dropped)

### 再発の分類で OS_MONTHS と OS_STATUS を特徴量に含めると、生死が分かった状態で学習してしまう（死亡した患者がいることを前提にした学習になる）。これは望ましくないため、これらの列は削除した方がよさそう。

In [9]:
import os
import numpy as np
import pandas as pd

from IPython.display import HTML
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from seaborn_analyzer import CustomPairPlot
import seaborn as sns
import config

# Raise pandas' display limits: show up to 50 columns and up to 50 rows
# before the rendered output is truncated.
pd.options.display.max_columns = 50
pd.options.display.max_rows = 50

def make_dir(dir_name: str) -> None:
    """Create ``dir_name`` (including missing parent directories) if needed.

    Calling this for a directory that already exists is a no-op.

    Args:
        dir_name: Path of the directory to create.

    Raises:
        FileExistsError: If ``dir_name`` exists but is not a directory.
        OSError: If the directory cannot be created (e.g. empty path,
            missing permissions).
    """
    # exist_ok=True replaces the separate "exists?" check, so there is no
    # window between checking and creating in which another process could
    # create the directory and make makedirs() fail.
    os.makedirs(dir_name, exist_ok=True)

In [10]:
# Read the interim pickle "2.0.2-df_MB_onehot_concated.pkl" from the
# preprocessed-pickle directory configured in `config`, then print the
# frame's column / dtype / non-null summary.
rfs_pickle_path = (
    config.INTERIM_PICKLE_PREPROCESSED_DIR +  "/2.0.2-df_MB_onehot_concated.pkl"
)
df_MB_RFS_STATUS = pd.read_pickle(rfs_pickle_path)
df_MB_RFS_STATUS.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1980 entries, 0 to 1979
Data columns (total 79 columns):
 #   Column                                                          Non-Null Count  Dtype  
---  ------                                                          --------------  -----  
 0   PATIENT_ID                                                      1980 non-null   object 
 1   LYMPH_NODES_EXAMINED_POSITIVE                                   1980 non-null   float64
 2   NPI                                                             1980 non-null   float64
 3   CELLULARITY                                                     1916 non-null   float64
 4   CHEMOTHERAPY                                                    1979 non-null   object 
 5   ER_IHC                                                          1937 non-null   float64
 6   HER2_SNP6                                                       1979 non-null   float64
 7   HORMONE_THERAPY                                    

In [11]:
# Outcome-related columns that are removed from the feature frame for the
# RFS_STATUS task (they are concatenated into `drop_columns` in a later cell).
target_columns = [
    # patient
    "OS_MONTHS",
    "RFS_MONTHS",
    "OS_STATUS",
    # "RFS_STATUS" is deliberately NOT dropped here: a later cell sets
    # target = "RFS_STATUS", so the column has to stay in the frame.
    #"RFS_STATUS",
    "VITAL_STATUS",
]

# Columns removed together with `target_columns` because they are not wanted
# as features (presumably identifiers / constant metadata — TODO confirm).
# NOTE(review): "meanless" presumably means "meaningless"; the name is kept
# as-is because other cells reference it.
meanless_columns = [
    # patient
    "PATIENT_ID",
    "SEX",
    # sample
    "SAMPLE_ID",
    "SAMPLE_TYPE",
]

In [19]:
# Name of the label column for this task. "RFS_STATUS" is not in the drop
# list (it is commented out of target_columns), so it stays in the frame.
target = "RFS_STATUS"

# Remove the other outcome columns and the unwanted identifier/metadata
# columns, producing a new frame; the source frame is left unchanged.
drop_columns = [*target_columns, *meanless_columns]
df_MB_RFS_status_dropped = df_MB_RFS_STATUS.drop(columns=drop_columns)

# Display (n_rows, n_columns) of the reduced frame.
df_MB_RFS_status_dropped.shape

(1980, 71)

In [23]:
# Keep only complete rows: any row with a missing value in at least one
# column is discarded. Re-running this cell is harmless (nothing left to drop).
df_MB_RFS_status_dropped = df_MB_RFS_status_dropped.dropna(axis=0, how="any")

In [24]:
# Count the missing values per column; after the dropna() above every count
# is expected to be 0.
df_MB_RFS_status_dropped.isnull().sum()

LYMPH_NODES_EXAMINED_POSITIVE    0
NPI                              0
CELLULARITY                      0
CHEMOTHERAPY                     0
ER_IHC                           0
                                ..
ONCOTREE_CODE_ILC                0
ONCOTREE_CODE_IMMC               0
ONCOTREE_CODE_MBC                0
ONCOTREE_CODE_MDLC               0
ONCOTREE_CODE_PBS                0
Length: 71, dtype: int64

In [25]:
# Save: make sure the interim output directory exists, then pickle the
# cleaned frame into it as "2.0.4-df_MB_RFS_status_dropped.pkl".
rfs_output_dir = config.INTERIM_PICKLE_PREPROCESSED_DIR
make_dir(rfs_output_dir)
df_MB_RFS_status_dropped.to_pickle(
    rfs_output_dir + "/2.0.4-df_MB_RFS_status_dropped.pkl"
)