In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sweetviz as sv
import dtale

import config

# 最大表示列数の指定（ここでは50列を指定）
pd.set_option("display.max_columns", 50)


def make_dir(dir_name: str):
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)

In [2]:
df_patient = pd.read_table(
    config.RAW_BRCA_METABRIC_DIR + "/data_clinical_patient.txt", header=4
)
df_patient.head()

Unnamed: 0,PATIENT_ID,LYMPH_NODES_EXAMINED_POSITIVE,NPI,CELLULARITY,CHEMOTHERAPY,COHORT,ER_IHC,HER2_SNP6,HORMONE_THERAPY,INFERRED_MENOPAUSAL_STATE,SEX,INTCLUST,AGE_AT_DIAGNOSIS,OS_MONTHS,OS_STATUS,CLAUDIN_SUBTYPE,THREEGENE,VITAL_STATUS,LATERALITY,RADIO_THERAPY,HISTOLOGICAL_SUBTYPE,BREAST_SURGERY,RFS_STATUS,RFS_MONTHS
0,MB-0000,10.0,6.044,,NO,1.0,Positve,NEUTRAL,YES,Post,Female,4ER+,75.65,140.5,0:LIVING,claudin-low,ER-/HER2-,Living,Right,YES,Ductal/NST,MASTECTOMY,0:Not Recurred,138.65
1,MB-0002,0.0,4.02,High,NO,1.0,Positve,NEUTRAL,YES,Pre,Female,4ER+,43.19,84.633333,0:LIVING,LumA,ER+/HER2- High Prolif,Living,Right,YES,Ductal/NST,BREAST CONSERVING,0:Not Recurred,83.52
2,MB-0005,1.0,4.03,High,YES,1.0,Positve,NEUTRAL,YES,Pre,Female,3,48.87,163.7,1:DECEASED,LumB,,Died of Disease,Right,NO,Ductal/NST,MASTECTOMY,1:Recurred,151.28
3,MB-0006,3.0,4.05,Moderate,YES,1.0,Positve,NEUTRAL,YES,Pre,Female,9,47.68,164.933333,0:LIVING,LumB,,Living,Right,YES,Mixed,MASTECTOMY,0:Not Recurred,162.76
4,MB-0008,8.0,6.08,High,YES,1.0,Positve,NEUTRAL,YES,Post,Female,9,76.97,41.366667,1:DECEASED,LumB,ER+/HER2- High Prolif,Died of Disease,Right,YES,Mixed,MASTECTOMY,1:Recurred,18.55


In [3]:
df_patient.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2509 entries, 0 to 2508
Data columns (total 24 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   PATIENT_ID                     2509 non-null   object 
 1   LYMPH_NODES_EXAMINED_POSITIVE  2243 non-null   float64
 2   NPI                            2287 non-null   float64
 3   CELLULARITY                    1917 non-null   object 
 4   CHEMOTHERAPY                   1980 non-null   object 
 5   COHORT                         2498 non-null   float64
 6   ER_IHC                         2426 non-null   object 
 7   HER2_SNP6                      1980 non-null   object 
 8   HORMONE_THERAPY                1980 non-null   object 
 9   INFERRED_MENOPAUSAL_STATE      1980 non-null   object 
 10  SEX                            2509 non-null   object 
 11  INTCLUST                       1980 non-null   object 
 12  AGE_AT_DIAGNOSIS               2498 non-null   f

In [4]:
# dtale.show(df_patient)

## 特徴量の型に合わせたエンコーディング

In [5]:
# astypeするためのdict作成
def list2dict(str_dtype: str, columns: list, dic: dict = {}) -> dict:
    for column in columns:
        dic[column] = str_dtype
    return dic

In [6]:
target_columns = [
    "OS_MONTHS",
    "RFS_MONTHS",
    "VITAL_STATUS",
]

int_columns = [
    "LYMPH_NODES_EXAMINED_POSITIVE",
    "OS_MONTHS",
    "RFS_MONTHS",
]

float_columns = [
    "NPI",
    "AGE_AT_DIAGNOSIS",
]

str_columns = []

# 質的変数（順序尺度）
qualitative_ordinal_columns = [
    "CELLULARITY",
    "ER_IHC",
    "HER2_SNP6",
    "INFERRED_MENOPAUSAL_STATE",
]
# 質的変数（名義尺度）
qualitative_name_columns = [
    "COHORT",
    "INTCLUST",
    "CLAUDIN_SUBTYPE",
    "THREEGENE",
    "HISTOLOGICAL_SUBTYPE",
    "BREAST_SURGERY",
    "LATERALITY",
    "VITAL_STATUS",
]

bool_columns = [
    "CHEMOTHERAPY",
    "HORMONE_THERAPY",
    "RADIO_THERAPY",
    "OS_STATUS",
    "RFS_STATUS",
]

meanless_columns = [
    "PATIENT_ID",
    "SEX",
]

print(
    df_patient.shape[1],
    len(int_columns)
    + len(float_columns)
    + len(str_columns)
    + len(qualitative_ordinal_columns)
    + len(qualitative_name_columns)
    + len(bool_columns)
    + len(meanless_columns),
)
assert df_patient.shape[1] == len(int_columns) + len(float_columns) + len(
    str_columns
) + len(qualitative_ordinal_columns) + len(qualitative_name_columns) + len(
    bool_columns
) + len(
    meanless_columns
), "lack or too much columns"

24 24


### 質的変数（順序尺度）のエンコーディング

In [7]:
qualitative_ordinal_columns

['CELLULARITY', 'ER_IHC', 'HER2_SNP6', 'INFERRED_MENOPAUSAL_STATE']

In [8]:
for i in qualitative_ordinal_columns:
    display(df_patient[i].unique())

array([nan, 'High', 'Moderate', 'Low'], dtype=object)

array(['Positve', 'Negative', nan], dtype=object)

array(['NEUTRAL', 'LOSS', nan, 'GAIN', 'UNDEF'], dtype=object)

array(['Post', 'Pre', nan], dtype=object)

In [9]:
df_patient_converted = df_patient.copy()

df_patient_converted["CELLULARITY_converted"] = df_patient_converted["CELLULARITY"].map(
    {"High": 3, "Moderate": 2, "Low": 1}
)
df_patient_converted["ER_IHC_converted"] = df_patient_converted["ER_IHC"].map(
    {"Positve": 1, "Negative": -1}
)
df_patient_converted["HER2_SNP6_converted"] = df_patient_converted["HER2_SNP6"].map(
    {"GAIN": 3, "NEUTRAL": 2, "LOSS": 1, "UNDEF": 0}
)
df_patient_converted["INFERRED_MENOPAUSAL_STATE_converted"] = df_patient_converted[
    "INFERRED_MENOPAUSAL_STATE"
].map({"Post": 1, "Pre": -1})

qualitative_ordinal_columns = [
    "CELLULARITY_converted",
    "ER_IHC_converted",
    "HER2_SNP6_converted",
    "INFERRED_MENOPAUSAL_STATE_converted",
]

df_patient_converted = df_patient_converted.drop(
    ["CELLULARITY", "ER_IHC", "HER2_SNP6", "INFERRED_MENOPAUSAL_STATE"], axis=1
)

### boolのエンコーディング

astype(bool)のみではうまく変換されなかった

In [10]:
bool_columns

['CHEMOTHERAPY', 'HORMONE_THERAPY', 'RADIO_THERAPY', 'OS_STATUS', 'RFS_STATUS']

In [11]:
for i in bool_columns:
    display(df_patient[i].unique())

array(['NO', 'YES', nan], dtype=object)

array(['YES', 'NO', nan], dtype=object)

array(['YES', 'NO', nan], dtype=object)

array(['0:LIVING', '1:DECEASED', nan], dtype=object)

array(['0:Not Recurred', '1:Recurred', nan], dtype=object)

In [12]:
df_patient_converted["CHEMOTHERAPY_converted"] = df_patient_converted[
    "CHEMOTHERAPY"
].map({"YES": 1, "NO": 0})
df_patient_converted["HORMONE_THERAPY_converted"] = df_patient_converted[
    "HORMONE_THERAPY"
].map({"YES": 1, "NO": 0})
df_patient_converted["RADIO_THERAPY_converted"] = df_patient_converted[
    "RADIO_THERAPY"
].map({"YES": 1, "NO": 0})
df_patient_converted["OS_STATUS"] = df_patient_converted["OS_STATUS"].map(
    {"1:DECEASED": 1, "0:LIVING": 0}
)
df_patient_converted["RFS_STATUS"] = df_patient_converted["RFS_STATUS"].map(
    {"1:Recurred": 1, "0:Not Recurred": 0}
)

bool_columns = [
    "CHEMOTHERAPY_converted",
    "HORMONE_THERAPY_converted",
    "RADIO_THERAPY_converted",
    "OS_STATUS",
    "RFS_STATUS",
]

df_patient_converted = df_patient_converted.drop(
    ["CHEMOTHERAPY", "HORMONE_THERAPY", "RADIO_THERAPY"], axis=1
)

In [13]:
astype_dict = {}
astype_dict = list2dict("float", int_columns)
astype_dict = list2dict("float", float_columns)
astype_dict = list2dict("str", str_columns)
astype_dict = list2dict("float", qualitative_ordinal_columns)
astype_dict = list2dict("str", qualitative_name_columns)
astype_dict = list2dict("bool", bool_columns)
display(astype_dict, len(astype_dict))
assert (
    len(astype_dict) + len(meanless_columns) == df_patient_converted.shape[1]
), "lack or too much columns"

{'LYMPH_NODES_EXAMINED_POSITIVE': 'float',
 'OS_MONTHS': 'float',
 'RFS_MONTHS': 'float',
 'NPI': 'float',
 'AGE_AT_DIAGNOSIS': 'float',
 'CELLULARITY_converted': 'float',
 'ER_IHC_converted': 'float',
 'HER2_SNP6_converted': 'float',
 'INFERRED_MENOPAUSAL_STATE_converted': 'float',
 'COHORT': 'str',
 'INTCLUST': 'str',
 'CLAUDIN_SUBTYPE': 'str',
 'THREEGENE': 'str',
 'HISTOLOGICAL_SUBTYPE': 'str',
 'BREAST_SURGERY': 'str',
 'LATERALITY': 'str',
 'VITAL_STATUS': 'str',
 'CHEMOTHERAPY_converted': 'bool',
 'HORMONE_THERAPY_converted': 'bool',
 'RADIO_THERAPY_converted': 'bool',
 'OS_STATUS': 'bool',
 'RFS_STATUS': 'bool'}

22

In [14]:
df_patient_converted = df_patient_converted.astype(astype_dict).copy()
df_patient_converted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2509 entries, 0 to 2508
Data columns (total 24 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   PATIENT_ID                           2509 non-null   object 
 1   LYMPH_NODES_EXAMINED_POSITIVE        2243 non-null   float64
 2   NPI                                  2287 non-null   float64
 3   COHORT                               2509 non-null   object 
 4   SEX                                  2509 non-null   object 
 5   INTCLUST                             2509 non-null   object 
 6   AGE_AT_DIAGNOSIS                     2498 non-null   float64
 7   OS_MONTHS                            1981 non-null   float64
 8   OS_STATUS                            2509 non-null   bool   
 9   CLAUDIN_SUBTYPE                      2509 non-null   object 
 10  THREEGENE                            2509 non-null   object 
 11  VITAL_STATUS                  

In [15]:
df_patient.head()

Unnamed: 0,PATIENT_ID,LYMPH_NODES_EXAMINED_POSITIVE,NPI,CELLULARITY,CHEMOTHERAPY,COHORT,ER_IHC,HER2_SNP6,HORMONE_THERAPY,INFERRED_MENOPAUSAL_STATE,SEX,INTCLUST,AGE_AT_DIAGNOSIS,OS_MONTHS,OS_STATUS,CLAUDIN_SUBTYPE,THREEGENE,VITAL_STATUS,LATERALITY,RADIO_THERAPY,HISTOLOGICAL_SUBTYPE,BREAST_SURGERY,RFS_STATUS,RFS_MONTHS
0,MB-0000,10.0,6.044,,NO,1.0,Positve,NEUTRAL,YES,Post,Female,4ER+,75.65,140.5,0:LIVING,claudin-low,ER-/HER2-,Living,Right,YES,Ductal/NST,MASTECTOMY,0:Not Recurred,138.65
1,MB-0002,0.0,4.02,High,NO,1.0,Positve,NEUTRAL,YES,Pre,Female,4ER+,43.19,84.633333,0:LIVING,LumA,ER+/HER2- High Prolif,Living,Right,YES,Ductal/NST,BREAST CONSERVING,0:Not Recurred,83.52
2,MB-0005,1.0,4.03,High,YES,1.0,Positve,NEUTRAL,YES,Pre,Female,3,48.87,163.7,1:DECEASED,LumB,,Died of Disease,Right,NO,Ductal/NST,MASTECTOMY,1:Recurred,151.28
3,MB-0006,3.0,4.05,Moderate,YES,1.0,Positve,NEUTRAL,YES,Pre,Female,9,47.68,164.933333,0:LIVING,LumB,,Living,Right,YES,Mixed,MASTECTOMY,0:Not Recurred,162.76
4,MB-0008,8.0,6.08,High,YES,1.0,Positve,NEUTRAL,YES,Post,Female,9,76.97,41.366667,1:DECEASED,LumB,ER+/HER2- High Prolif,Died of Disease,Right,YES,Mixed,MASTECTOMY,1:Recurred,18.55


In [16]:
df_patient_converted.head()

Unnamed: 0,PATIENT_ID,LYMPH_NODES_EXAMINED_POSITIVE,NPI,COHORT,SEX,INTCLUST,AGE_AT_DIAGNOSIS,OS_MONTHS,OS_STATUS,CLAUDIN_SUBTYPE,THREEGENE,VITAL_STATUS,LATERALITY,HISTOLOGICAL_SUBTYPE,BREAST_SURGERY,RFS_STATUS,RFS_MONTHS,CELLULARITY_converted,ER_IHC_converted,HER2_SNP6_converted,INFERRED_MENOPAUSAL_STATE_converted,CHEMOTHERAPY_converted,HORMONE_THERAPY_converted,RADIO_THERAPY_converted
0,MB-0000,10.0,6.044,1.0,Female,4ER+,75.65,140.5,False,claudin-low,ER-/HER2-,Living,Right,Ductal/NST,MASTECTOMY,False,138.65,,1.0,2.0,1.0,False,True,True
1,MB-0002,0.0,4.02,1.0,Female,4ER+,43.19,84.633333,False,LumA,ER+/HER2- High Prolif,Living,Right,Ductal/NST,BREAST CONSERVING,False,83.52,3.0,1.0,2.0,-1.0,False,True,True
2,MB-0005,1.0,4.03,1.0,Female,3,48.87,163.7,True,LumB,,Died of Disease,Right,Ductal/NST,MASTECTOMY,True,151.28,3.0,1.0,2.0,-1.0,True,True,False
3,MB-0006,3.0,4.05,1.0,Female,9,47.68,164.933333,False,LumB,,Living,Right,Mixed,MASTECTOMY,False,162.76,2.0,1.0,2.0,-1.0,True,True,True
4,MB-0008,8.0,6.08,1.0,Female,9,76.97,41.366667,True,LumB,ER+/HER2- High Prolif,Died of Disease,Right,Mixed,MASTECTOMY,True,18.55,3.0,1.0,2.0,1.0,True,True,True


In [17]:
# pickleとして保存
make_dir(config.INTERIM_PICKLE_EDA_DIR)
df_patient_converted.to_pickle(
    config.INTERIM_PICKLE_EDA_DIR + "/1.0.0-data_clinical_patient.pkl"
)