In [2]:
import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

import config

# 最大表示列数の指定（ここでは50列を指定）
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 50)


def make_dir(dir_name: str):
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)

# 型のエンコーディング
## df｜df_MB_dtype_encoded
順序尺度特徴量とboolean特徴量について、データを扱いやすい形に変換する。

In [3]:
'''df_MB_dtype_encoded = pd.read_pickle(
    config.INTERIM_PICKLE_PREPROCESSED_DIR + "/2.0.1-df_MB_dropped.pkl"
)
'''
df_MB_dtype_encoded = pd.read_pickle(
    config.INTERIM_PICKLE_PREPROCESSED_DIR + "/2.0.1-df_MB_filled_by_median.pkl"
)

df_MB_dtype_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1980 entries, 0 to 1984
Data columns (total 36 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   PATIENT_ID                     1980 non-null   object 
 1   LYMPH_NODES_EXAMINED_POSITIVE  1980 non-null   float64
 2   NPI                            1980 non-null   float64
 3   CELLULARITY                    1916 non-null   object 
 4   CHEMOTHERAPY                   1979 non-null   object 
 5   COHORT                         1980 non-null   float64
 6   ER_IHC                         1937 non-null   object 
 7   HER2_SNP6                      1979 non-null   object 
 8   HORMONE_THERAPY                1979 non-null   object 
 9   INFERRED_MENOPAUSAL_STATE      1979 non-null   object 
 10  SEX                            1980 non-null   object 
 11  INTCLUST                       1979 non-null   object 
 12  AGE_AT_DIAGNOSIS               1980 non-null   f

In [5]:
"""[
    str(column) + str(df_MB_dtype_encoded[column].unique())
    for column in df_MB_dtype_encoded.columns
]"""

'[\n    str(column) + str(df_MB_dtype_encoded[column].unique())\n    for column in df_MB_dtype_encoded.columns\n]'

In [6]:
target_columns = [
    # patient
    "OS_MONTHS",
    "RFS_MONTHS",
    "OS_STATUS",
    "RFS_STATUS",
    "VITAL_STATUS",
]

int_columns = [
    # patient
    "LYMPH_NODES_EXAMINED_POSITIVE",
    "OS_MONTHS",
    "RFS_MONTHS",
]

float_columns = [
    # patient
    "NPI",
    "AGE_AT_DIAGNOSIS",
    # sample
    "TUMOR_SIZE",
    "TMB_NONSYNONYMOUS",
]

str_columns = []

# 質的変数（順序尺度）
qualitative_ordinal_columns = [
    # patient
    "CELLULARITY",
    "ER_IHC",
    "HER2_SNP6",
    "INFERRED_MENOPAUSAL_STATE",
    # sample
    "ER_STATUS",
    "HER2_STATUS",
    "GRADE",
    "PR_STATUS",
    "TUMOR_STAGE",
]
# 質的変数（名義尺度）
qualitative_name_columns = [
    # patient
    "COHORT",
    "INTCLUST",
    "CLAUDIN_SUBTYPE",
    "THREEGENE",
    "HISTOLOGICAL_SUBTYPE",
    "BREAST_SURGERY",
    "LATERALITY",
    "VITAL_STATUS",
    # sample
    "CANCER_TYPE",
    "CANCER_TYPE_DETAILED",
    "ONCOTREE_CODE",
]

bool_columns = [
    # patient
    "CHEMOTHERAPY",
    "HORMONE_THERAPY",
    "RADIO_THERAPY",
    "OS_STATUS",
    "RFS_STATUS",
]

meanless_columns = [
    # patient
    "PATIENT_ID",
    "SEX",
    # sample
    #'PATIENT_ID',
    "SAMPLE_ID",
    "SAMPLE_TYPE",
]

print(
    df_MB_dtype_encoded.shape[1],
    len(int_columns)
    + len(float_columns)
    + len(ｓｔｒ_columns)
    + len(qualitative_ordinal_columns)
    + len(qualitative_name_columns)
    + len(bool_columns)
    + len(meanless_columns),
)
assert df_MB_dtype_encoded.shape[1] == len(int_columns) + len(float_columns) + len(
    ｓｔｒ_columns
) + len(qualitative_ordinal_columns) + len(qualitative_name_columns) + len(
    bool_columns
) + len(
    meanless_columns
), "lack or too much columns"

36 36


# 質的変数（順序尺度）のエンコーディング
qualitative_ordinal_columnsは、順序のあるstrの変数で構成されている。
そこで大小関係に対応するようstrをintに変換する。

In [7]:
df_MB_dtype_encoded[qualitative_ordinal_columns].head()

Unnamed: 0,CELLULARITY,ER_IHC,HER2_SNP6,INFERRED_MENOPAUSAL_STATE,ER_STATUS,HER2_STATUS,GRADE,PR_STATUS,TUMOR_STAGE
0,,Positve,NEUTRAL,Post,Positive,Negative,3.0,Negative,2.0
1,High,Positve,NEUTRAL,Pre,Positive,Negative,3.0,Positive,1.0
2,High,Positve,NEUTRAL,Pre,Positive,Negative,2.0,Positive,2.0
3,Moderate,Positve,NEUTRAL,Pre,Positive,Negative,2.0,Positive,2.0
4,High,Positve,NEUTRAL,Post,Positive,Negative,3.0,Positive,2.0


In [8]:
for column in qualitative_ordinal_columns:
    print(column)
    display(df_MB_dtype_encoded[column].value_counts())
    print("-----" * 10)

CELLULARITY


High        965
Moderate    737
Low         214
Name: CELLULARITY, dtype: int64

--------------------------------------------------
ER_IHC


Positve     1498
Negative     439
Name: ER_IHC, dtype: int64

--------------------------------------------------
HER2_SNP6


NEUTRAL    1436
GAIN        438
LOSS        100
UNDEF         5
Name: HER2_SNP6, dtype: int64

--------------------------------------------------
INFERRED_MENOPAUSAL_STATE


Post    1555
Pre      424
Name: INFERRED_MENOPAUSAL_STATE, dtype: int64

--------------------------------------------------
ER_STATUS


Positive    1506
Negative     474
Name: ER_STATUS, dtype: int64

--------------------------------------------------
HER2_STATUS


Negative    1732
Positive     247
Name: HER2_STATUS, dtype: int64

--------------------------------------------------
GRADE


3.0    1040
2.0     771
1.0     169
Name: GRADE, dtype: int64

--------------------------------------------------
PR_STATUS


Positive    1039
Negative     940
Name: PR_STATUS, dtype: int64

--------------------------------------------------
TUMOR_STAGE


2.0    1340
1.0     500
3.0     118
0.0      12
4.0      10
Name: TUMOR_STAGE, dtype: int64

--------------------------------------------------


In [9]:
df_MB_dtype_encoded["CELLULARITY"].replace(
    {"High": 3, "Moderate": 2, "Low": 1}, inplace=True
)
df_MB_dtype_encoded["ER_IHC"].replace({"Positve": 1, "Negative": -1}, inplace=True)
df_MB_dtype_encoded["HER2_SNP6"].replace(
    {"GAIN": 3, "NEUTRAL": 2, "LOSS": 1, "UNDEF": 0}, inplace=True
)
df_MB_dtype_encoded["INFERRED_MENOPAUSAL_STATE"].replace(
    {"Post": 1, "Pre": -1}, inplace=True
)
df_MB_dtype_encoded["ER_STATUS"].replace({"Positive": 1, "Negative": -1}, inplace=True)
df_MB_dtype_encoded["HER2_STATUS"].replace(
    {"Positive": 1, "Negative": -1}, inplace=True
)
# df_MB_dtype_encoded['GRADEGRADE']
df_MB_dtype_encoded["PR_STATUS"].replace({"Positive": 1, "Negative": -1}, inplace=True)
# df_MB_dtype_encoded['TUMOR_STAGE']

for column in qualitative_ordinal_columns:
    print(column)
    display(df_MB_dtype_encoded[column].value_counts())
    print("-----" * 10)

CELLULARITY


3.0    965
2.0    737
1.0    214
Name: CELLULARITY, dtype: int64

--------------------------------------------------
ER_IHC


 1.0    1498
-1.0     439
Name: ER_IHC, dtype: int64

--------------------------------------------------
HER2_SNP6


2.0    1436
3.0     438
1.0     100
0.0       5
Name: HER2_SNP6, dtype: int64

--------------------------------------------------
INFERRED_MENOPAUSAL_STATE


 1.0    1555
-1.0     424
Name: INFERRED_MENOPAUSAL_STATE, dtype: int64

--------------------------------------------------
ER_STATUS


 1    1506
-1     474
Name: ER_STATUS, dtype: int64

--------------------------------------------------
HER2_STATUS


-1.0    1732
 1.0     247
Name: HER2_STATUS, dtype: int64

--------------------------------------------------
GRADE


3.0    1040
2.0     771
1.0     169
Name: GRADE, dtype: int64

--------------------------------------------------
PR_STATUS


 1.0    1039
-1.0     940
Name: PR_STATUS, dtype: int64

--------------------------------------------------
TUMOR_STAGE


2.0    1340
1.0     500
3.0     118
0.0      12
4.0      10
Name: TUMOR_STAGE, dtype: int64

--------------------------------------------------


# boolのエンコーディング
bool_columnsは対象的な2項目から構成されているため、これらの項目をboolean型にする。
Yes、Noのみを対象とする（2項目のみから構成される特徴量でも、Yes、No以外の場合は3項目を考慮してカテゴリ特徴量として考える）。

In [10]:
df_MB_dtype_encoded[bool_columns].astype(bool).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1980 entries, 0 to 1984
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   CHEMOTHERAPY     1980 non-null   bool 
 1   HORMONE_THERAPY  1980 non-null   bool 
 2   RADIO_THERAPY    1980 non-null   bool 
 3   OS_STATUS        1980 non-null   bool 
 4   RFS_STATUS       1980 non-null   bool 
dtypes: bool(5)
memory usage: 25.1 KB


In [11]:
for column in bool_columns:
    print(column)
    display(df_MB_dtype_encoded[column].value_counts())
    print("-----" * 10)

CHEMOTHERAPY


NO     1567
YES     412
Name: CHEMOTHERAPY, dtype: int64

--------------------------------------------------
HORMONE_THERAPY


YES    1216
NO      763
Name: HORMONE_THERAPY, dtype: int64

--------------------------------------------------
RADIO_THERAPY


YES    1172
NO      807
Name: RADIO_THERAPY, dtype: int64

--------------------------------------------------
OS_STATUS


1:DECEASED    1143
0:LIVING       837
Name: OS_STATUS, dtype: int64

--------------------------------------------------
RFS_STATUS


0:Not Recurred    1177
1:Recurred         803
Name: RFS_STATUS, dtype: int64

--------------------------------------------------


In [12]:
df_MB_dtype_encoded["CHEMOTHERAPY"].replace({"YES": True, "NO": False}, inplace=True)
df_MB_dtype_encoded["HORMONE_THERAPY"].replace({"YES": True, "NO": False}, inplace=True)
df_MB_dtype_encoded["RADIO_THERAPY"].replace({"YES": True, "NO": False}, inplace=True)
df_MB_dtype_encoded["OS_STATUS"].replace(
    {"1:DECEASED": True, "0:LIVING": False}, inplace=True
)
df_MB_dtype_encoded["RFS_STATUS"].replace(
    {"1:Recurred": True, "0:Not Recurred": False}, inplace=True
)

for column in bool_columns:
    print(column)
    display(df_MB_dtype_encoded[column].value_counts())
    print("-----" * 10)

CHEMOTHERAPY


False    1567
True      412
Name: CHEMOTHERAPY, dtype: int64

--------------------------------------------------
HORMONE_THERAPY


True     1216
False     763
Name: HORMONE_THERAPY, dtype: int64

--------------------------------------------------
RADIO_THERAPY


True     1172
False     807
Name: RADIO_THERAPY, dtype: int64

--------------------------------------------------
OS_STATUS


True     1143
False     837
Name: OS_STATUS, dtype: int64

--------------------------------------------------
RFS_STATUS


False    1177
True      803
Name: RFS_STATUS, dtype: int64

--------------------------------------------------


In [13]:
df_MB_dtype_encoded[bool_columns].dtypes

CHEMOTHERAPY       object
HORMONE_THERAPY    object
RADIO_THERAPY      object
OS_STATUS            bool
RFS_STATUS           bool
dtype: object

In [14]:
# save
make_dir(config.INTERIM_PICKLE_PREPROCESSED_DIR)
df_MB_dtype_encoded.to_pickle(
    config.INTERIM_PICKLE_PREPROCESSED_DIR + "/2.0.2-df_MB_dtype_encoded.pkl"
)

# Onehot encoding
## df｜df_MB_onehot_encoded(onehot encodingしたdf), df_MB_onehot_concated(onehot encodingしたdfと元のdfの結合)
上記で作成した一部特徴量を修正したdfを元に、onehot encodingを行う。
onehot encoding自体は名義尺度の質的変数カラムであるqualitative_name_columnsのみに対してのみ実施するが、最終的に特徴量の型の修正はモデルへ入力する際に必要になるので、df_MB_dtype_encodedを元にonehot encodingを実施する。    

In [15]:
import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

import config

# 最大表示列数の指定（ここでは50列を指定）
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 50)


def make_dir(dir_name: str):
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)

In [16]:
df_MB_dtype_encoded = pd.read_pickle(
    config.INTERIM_PICKLE_PREPROCESSED_DIR + "/2.0.2-df_MB_dtype_encoded.pkl"
)
df_MB_dtype_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1980 entries, 0 to 1984
Data columns (total 36 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   PATIENT_ID                     1980 non-null   object 
 1   LYMPH_NODES_EXAMINED_POSITIVE  1980 non-null   float64
 2   NPI                            1980 non-null   float64
 3   CELLULARITY                    1916 non-null   float64
 4   CHEMOTHERAPY                   1979 non-null   object 
 5   COHORT                         1980 non-null   float64
 6   ER_IHC                         1937 non-null   float64
 7   HER2_SNP6                      1979 non-null   float64
 8   HORMONE_THERAPY                1979 non-null   object 
 9   INFERRED_MENOPAUSAL_STATE      1979 non-null   float64
 10  SEX                            1980 non-null   object 
 11  INTCLUST                       1979 non-null   object 
 12  AGE_AT_DIAGNOSIS               1980 non-null   f

In [17]:
# VITAL_STATUSは目的変数の生成に必要&リークになりうるので、onehot encodingからは除外する
qualitative_name_columns.remove(
    "VITAL_STATUS"
) if "VITAL_STATUS" in qualitative_name_columns else qualitative_name_columns
qualitative_name_columns

['COHORT',
 'INTCLUST',
 'CLAUDIN_SUBTYPE',
 'THREEGENE',
 'HISTOLOGICAL_SUBTYPE',
 'BREAST_SURGERY',
 'LATERALITY',
 'CANCER_TYPE',
 'CANCER_TYPE_DETAILED',
 'ONCOTREE_CODE']

In [18]:
# 多重共線性回避のために、drop='first'
onehot_encoder = OneHotEncoder(drop="first")
onehot_encoding_columns = qualitative_name_columns
onehot_encoder.fit(df_MB_dtype_encoded[onehot_encoding_columns])
display(onehot_encoder.categories_)
display(onehot_encoder.get_feature_names_out(onehot_encoding_columns))
# enc.transform(df_MB_onehot_encoded[encoding_columns]).toarray()
df_MB_onehot_encoded = pd.DataFrame(
    onehot_encoder.transform(df_MB_dtype_encoded[onehot_encoding_columns]).toarray(),
    columns=onehot_encoder.get_feature_names_out(onehot_encoding_columns),
).astype(int)
df_MB_onehot_encoded.shape

[array([1., 2., 3., 4., 5.]),
 array(['1', '10', '2', '3', '4ER+', '4ER-', '5', '6', '7', '8', '9', nan],
       dtype=object),
 array(['Basal', 'Her2', 'LumA', 'LumB', 'NC', 'Normal', 'claudin-low',
        nan], dtype=object),
 array(['ER+/HER2- High Prolif', 'ER+/HER2- Low Prolif', 'ER-/HER2-',
        'HER2+', nan], dtype=object),
 array(['Ductal/NST', 'Lobular', 'Medullary', 'Metaplastic', 'Mixed',
        'Mucinous', 'Other', 'Tubular/ cribriform', nan], dtype=object),
 array(['BREAST CONSERVING', 'MASTECTOMY', nan], dtype=object),
 array(['Left', 'Right', nan], dtype=object),
 array(['Breast Cancer', 'Breast Sarcoma'], dtype=object),
 array(['Breast', 'Breast Angiosarcoma',
        'Breast Invasive Ductal Carcinoma',
        'Breast Invasive Lobular Carcinoma',
        'Breast Invasive Mixed Mucinous Carcinoma',
        'Breast Mixed Ductal and Lobular Carcinoma',
        'Invasive Breast Carcinoma', 'Metaplastic Breast Cancer'],
       dtype=object),
 array(['BRCA', 'BREAST', '

array(['COHORT_2.0', 'COHORT_3.0', 'COHORT_4.0', 'COHORT_5.0',
       'INTCLUST_10', 'INTCLUST_2', 'INTCLUST_3', 'INTCLUST_4ER+',
       'INTCLUST_4ER-', 'INTCLUST_5', 'INTCLUST_6', 'INTCLUST_7',
       'INTCLUST_8', 'INTCLUST_9', 'INTCLUST_nan', 'CLAUDIN_SUBTYPE_Her2',
       'CLAUDIN_SUBTYPE_LumA', 'CLAUDIN_SUBTYPE_LumB',
       'CLAUDIN_SUBTYPE_NC', 'CLAUDIN_SUBTYPE_Normal',
       'CLAUDIN_SUBTYPE_claudin-low', 'CLAUDIN_SUBTYPE_nan',
       'THREEGENE_ER+/HER2- Low Prolif', 'THREEGENE_ER-/HER2-',
       'THREEGENE_HER2+', 'THREEGENE_nan', 'HISTOLOGICAL_SUBTYPE_Lobular',
       'HISTOLOGICAL_SUBTYPE_Medullary',
       'HISTOLOGICAL_SUBTYPE_Metaplastic', 'HISTOLOGICAL_SUBTYPE_Mixed',
       'HISTOLOGICAL_SUBTYPE_Mucinous', 'HISTOLOGICAL_SUBTYPE_Other',
       'HISTOLOGICAL_SUBTYPE_Tubular/ cribriform',
       'HISTOLOGICAL_SUBTYPE_nan', 'BREAST_SURGERY_MASTECTOMY',
       'BREAST_SURGERY_nan', 'LATERALITY_Right', 'LATERALITY_nan',
       'CANCER_TYPE_Breast Sarcoma',
       'CANCER_T

(1980, 53)

In [19]:
df_MB_onehot_encoded.head()

Unnamed: 0,COHORT_2.0,COHORT_3.0,COHORT_4.0,COHORT_5.0,INTCLUST_10,INTCLUST_2,INTCLUST_3,INTCLUST_4ER+,INTCLUST_4ER-,INTCLUST_5,INTCLUST_6,INTCLUST_7,INTCLUST_8,INTCLUST_9,INTCLUST_nan,CLAUDIN_SUBTYPE_Her2,CLAUDIN_SUBTYPE_LumA,CLAUDIN_SUBTYPE_LumB,CLAUDIN_SUBTYPE_NC,CLAUDIN_SUBTYPE_Normal,CLAUDIN_SUBTYPE_claudin-low,CLAUDIN_SUBTYPE_nan,THREEGENE_ER+/HER2- Low Prolif,THREEGENE_ER-/HER2-,THREEGENE_HER2+,...,HISTOLOGICAL_SUBTYPE_Metaplastic,HISTOLOGICAL_SUBTYPE_Mixed,HISTOLOGICAL_SUBTYPE_Mucinous,HISTOLOGICAL_SUBTYPE_Other,HISTOLOGICAL_SUBTYPE_Tubular/ cribriform,HISTOLOGICAL_SUBTYPE_nan,BREAST_SURGERY_MASTECTOMY,BREAST_SURGERY_nan,LATERALITY_Right,LATERALITY_nan,CANCER_TYPE_Breast Sarcoma,CANCER_TYPE_DETAILED_Breast Angiosarcoma,CANCER_TYPE_DETAILED_Breast Invasive Ductal Carcinoma,CANCER_TYPE_DETAILED_Breast Invasive Lobular Carcinoma,CANCER_TYPE_DETAILED_Breast Invasive Mixed Mucinous Carcinoma,CANCER_TYPE_DETAILED_Breast Mixed Ductal and Lobular Carcinoma,CANCER_TYPE_DETAILED_Invasive Breast Carcinoma,CANCER_TYPE_DETAILED_Metaplastic Breast Cancer,ONCOTREE_CODE_BREAST,ONCOTREE_CODE_IDC,ONCOTREE_CODE_ILC,ONCOTREE_CODE_IMMC,ONCOTREE_CODE_MBC,ONCOTREE_CODE_MDLC,ONCOTREE_CODE_PBS
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0


# 元のdfとonehotしたdfの結合
dfの形などを調査

2つのdfのshapeを確認

In [20]:
df_MB_dtype_encoded.shape, df_MB_onehot_encoded.shape

((1980, 36), (1980, 53))

In [21]:
# 単に結合しただけでは形が異なる
pd.concat([df_MB_dtype_encoded, df_MB_onehot_encoded], axis=1).shape

(1985, 89)

In [22]:
# concatがうまく行かない理由はindexの順番。
# 元のdfのnullをdropしたため。

# なのでindexをresetしてからconcatする

# onehotで生成した元のカラムの削除
df_MB_onehot_concated = pd.concat(
    [
        df_MB_dtype_encoded.reset_index(drop=True),
        df_MB_onehot_encoded.reset_index(drop=True),
    ],
    axis=1,
).drop(onehot_encoding_columns, axis=1)
display(df_MB_onehot_concated.shape)

display(df_MB_onehot_concated.info())

(1980, 79)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1980 entries, 0 to 1979
Data columns (total 79 columns):
 #   Column                                                          Non-Null Count  Dtype  
---  ------                                                          --------------  -----  
 0   PATIENT_ID                                                      1980 non-null   object 
 1   LYMPH_NODES_EXAMINED_POSITIVE                                   1980 non-null   float64
 2   NPI                                                             1980 non-null   float64
 3   CELLULARITY                                                     1916 non-null   float64
 4   CHEMOTHERAPY                                                    1979 non-null   object 
 5   ER_IHC                                                          1937 non-null   float64
 6   HER2_SNP6                                                       1979 non-null   float64
 7   HORMONE_THERAPY                                    

None

In [23]:
# save
make_dir(config.INTERIM_PICKLE_PREPROCESSED_DIR)
df_MB_onehot_concated.to_pickle(
    config.INTERIM_PICKLE_PREPROCESSED_DIR + "/2.0.2-df_MB_onehot_concated.pkl"
)