In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sweetviz as sv
import dtale

import config

#最大表示列数の指定（ここでは50列を指定）
pd.set_option('display.max_columns', 50)

def make_dir(dir_name: str):
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)

In [2]:
df_patient=pd.read_table(config.RAW_BRCA_METABRIC_DIR+'/data_clinical_patient.txt',header=4)
df_patient.head()

Unnamed: 0,PATIENT_ID,LYMPH_NODES_EXAMINED_POSITIVE,NPI,CELLULARITY,CHEMOTHERAPY,COHORT,ER_IHC,HER2_SNP6,HORMONE_THERAPY,INFERRED_MENOPAUSAL_STATE,SEX,INTCLUST,AGE_AT_DIAGNOSIS,OS_MONTHS,OS_STATUS,CLAUDIN_SUBTYPE,THREEGENE,VITAL_STATUS,LATERALITY,RADIO_THERAPY,HISTOLOGICAL_SUBTYPE,BREAST_SURGERY,RFS_STATUS,RFS_MONTHS
0,MB-0000,10.0,6.044,,NO,1.0,Positve,NEUTRAL,YES,Post,Female,4ER+,75.65,140.5,0:LIVING,claudin-low,ER-/HER2-,Living,Right,YES,Ductal/NST,MASTECTOMY,0:Not Recurred,138.65
1,MB-0002,0.0,4.02,High,NO,1.0,Positve,NEUTRAL,YES,Pre,Female,4ER+,43.19,84.633333,0:LIVING,LumA,ER+/HER2- High Prolif,Living,Right,YES,Ductal/NST,BREAST CONSERVING,0:Not Recurred,83.52
2,MB-0005,1.0,4.03,High,YES,1.0,Positve,NEUTRAL,YES,Pre,Female,3,48.87,163.7,1:DECEASED,LumB,,Died of Disease,Right,NO,Ductal/NST,MASTECTOMY,1:Recurred,151.28
3,MB-0006,3.0,4.05,Moderate,YES,1.0,Positve,NEUTRAL,YES,Pre,Female,9,47.68,164.933333,0:LIVING,LumB,,Living,Right,YES,Mixed,MASTECTOMY,0:Not Recurred,162.76
4,MB-0008,8.0,6.08,High,YES,1.0,Positve,NEUTRAL,YES,Post,Female,9,76.97,41.366667,1:DECEASED,LumB,ER+/HER2- High Prolif,Died of Disease,Right,YES,Mixed,MASTECTOMY,1:Recurred,18.55


In [3]:
df_patient.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2509 entries, 0 to 2508
Data columns (total 24 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   PATIENT_ID                     2509 non-null   object 
 1   LYMPH_NODES_EXAMINED_POSITIVE  2243 non-null   float64
 2   NPI                            2287 non-null   float64
 3   CELLULARITY                    1917 non-null   object 
 4   CHEMOTHERAPY                   1980 non-null   object 
 5   COHORT                         2498 non-null   float64
 6   ER_IHC                         2426 non-null   object 
 7   HER2_SNP6                      1980 non-null   object 
 8   HORMONE_THERAPY                1980 non-null   object 
 9   INFERRED_MENOPAUSAL_STATE      1980 non-null   object 
 10  SEX                            2509 non-null   object 
 11  INTCLUST                       1980 non-null   object 
 12  AGE_AT_DIAGNOSIS               2498 non-null   f

In [4]:
dtale.show(df_patient)



## 特徴量の型に合わせたエンコーディング

In [5]:
df_patient_converted=df_patient.copy()

In [6]:
# astypeするためのdict作成
def list2dict(str_dtype: str, columns:list, dic:dict={}) -> dict:
    for column in columns:
        dic[column]=str_dtype
    return dic

In [26]:
target_columns=[
    'OS_MONTHS',
    'OS_STATUS',
    'RFS_STATUS',
    'RFS_MONTHS',
    'VITAL_STATUS',
]

int_columns=[
    'LYMPH_NODES_EXAMINED_POSITIVE',
]

float_columns=[
    'NPI',
    'AGE_AT_DIAGNOSIS',
]

str_columns=[
]

# 質的変数（順序尺度）
qualitative_ordinal_columns=[
    'CELLULARITY',  
    'ER_IHC',
    'HER2_SNP6',
    'INFERRED_MENOPAUSAL_STATE',
]
# 質的変数（名義尺度）
qualitative_name_columns=[
    'COHORT',
    'INTCLUST',
    'CLAUDIN_SUBTYPE',
    'THREEGENE',
    'HISTOLOGICAL_SUBTYPE',
    'BREAST_SURGERY',
    'LATERALITY',
]

bool_columns=[
    'CHEMOTHERAPY',
    'HORMONE_THERAPY',
    'RADIO_THERAPY',
]

meanless_columns=[
    'PATIENT_ID',
    'SEX',
]

print(df_patient.shape[1],
    len(target_columns)+\
    len(int_columns)+\
    len(float_columns)+\
    len(str_columns)+\
    len(qualitative_ordinal_columns)+\
    len(qualitative_name_columns)+\
    len(bool_columns)+\
    len(meanless_columns))
assert  df_patient.shape[1]==\
    len(target_columns)+\
    len(int_columns)+\
    len(float_columns)+\
    len(str_columns)+\
    len(qualitative_ordinal_columns)+\
    len(qualitative_name_columns)+\
    len(bool_columns)+\
    len(meanless_columns), 'lack or too much columns'

24 24
Executing shutdown due to inactivity...


2022-04-17 00:35:03,463 - INFO     - Executing shutdown due to inactivity...


Executing shutdown...


2022-04-17 00:35:03,504 - INFO     - Executing shutdown...


### 質的変数（順序尺度）のエンコーディング

In [8]:
qualitative_ordinal_columns

['CELLULARITY', 'ER_IHC', 'HER2_SNP6', 'INFERRED_MENOPAUSAL_STATE']

In [9]:
for i in qualitative_ordinal_columns:
    display(df_patient[i].unique())

array([nan, 'High', 'Moderate', 'Low'], dtype=object)

array(['Positve', 'Negative', nan], dtype=object)

array(['NEUTRAL', 'LOSS', nan, 'GAIN', 'UNDEF'], dtype=object)

array(['Post', 'Pre', nan], dtype=object)

In [10]:
df_patient_converted['CELLULARITY_converted']=df_patient_converted['CELLULARITY'].map({'High':3,'Moderate':2,'Low':1})
df_patient_converted['ER_IHC_converted']=df_patient_converted['ER_IHC'].map({'Positve':1, 'Negative':-1})
df_patient_converted['HER2_SNP6_converted']=df_patient_converted['HER2_SNP6'].map({'GAIN':3, 'NEUTRAL':2, 'LOSS':1, 'UNDEF':0})
df_patient_converted['INFERRED_MENOPAUSAL_STATE_converted']=df_patient_converted['INFERRED_MENOPAUSAL_STATE'].map({'Post':1, 'Pre':-1})

qualitative_ordinal_columns=[
    'CELLULARITY_converted',
    'ER_IHC_converted',
    'HER2_SNP6_converted',
    'INFERRED_MENOPAUSAL_STATE_converted',
]

df_patient_converted=df_patient_converted.drop(['CELLULARITY', 'ER_IHC', 'HER2_SNP6', 'INFERRED_MENOPAUSAL_STATE'], axis=1)

### boolのエンコーディング

astype(bool)のみではうまく変換されなかった

In [11]:
bool_columns

['CHEMOTHERAPY', 'HORMONE_THERAPY', 'RADIO_THERAPY']

In [12]:
for i in bool_columns:
    display(df_patient[i].unique())

array(['NO', 'YES', nan], dtype=object)

array(['YES', 'NO', nan], dtype=object)

array(['YES', 'NO', nan], dtype=object)

In [13]:
df_patient_converted['CHEMOTHERAPY_converted']=df_patient_converted['CHEMOTHERAPY'].map({'YES':1,'NO':0})
df_patient_converted['HORMONE_THERAPY_converted']=df_patient_converted['HORMONE_THERAPY'].map({'YES':1,'NO':0})
df_patient_converted['RADIO_THERAPY_converted']=df_patient_converted['RADIO_THERAPY'].map({'YES':1,'NO':0})

bool_columns=[
    'CHEMOTHERAPY_converted',
    'HORMONE_THERAPY_converted',
    'RADIO_THERAPY_converted',
]

df_patient_converted=df_patient_converted.drop(['CHEMOTHERAPY', 'HORMONE_THERAPY', 'RADIO_THERAPY'], axis=1)

In [14]:
astype_dict={}
astype_dict=list2dict('float', int_columns)
astype_dict=list2dict('float', float_columns)
astype_dict=list2dict('str', str_columns)
astype_dict=list2dict('float', qualitative_ordinal_columns)
astype_dict=list2dict('str', qualitative_name_columns)
astype_dict=list2dict('bool', bool_columns)
display(astype_dict, len(astype_dict))
assert len(astype_dict)+len(target_columns)+len(meanless_columns)== df_patient_converted.shape[1], 'lack or too much columns'

{'LYMPH_NODES_EXAMINED_POSITIVE': 'float',
 'NPI': 'float',
 'AGE_AT_DIAGNOSIS': 'float',
 'CELLULARITY_converted': 'float',
 'ER_IHC_converted': 'float',
 'HER2_SNP6_converted': 'float',
 'INFERRED_MENOPAUSAL_STATE_converted': 'float',
 'COHORT': 'str',
 'INTCLUST': 'str',
 'CLAUDIN_SUBTYPE': 'str',
 'THREEGENE': 'str',
 'HISTOLOGICAL_SUBTYPE': 'str',
 'BREAST_SURGERY': 'str',
 'LATERALITY': 'str',
 'CHEMOTHERAPY_converted': 'bool',
 'HORMONE_THERAPY_converted': 'bool',
 'RADIO_THERAPY_converted': 'bool'}

17

In [15]:
df_patient_converted=df_patient_converted.astype(astype_dict)
df_patient_converted.dtypes

PATIENT_ID                              object
LYMPH_NODES_EXAMINED_POSITIVE          float64
NPI                                    float64
COHORT                                  object
SEX                                     object
INTCLUST                                object
AGE_AT_DIAGNOSIS                       float64
OS_MONTHS                              float64
OS_STATUS                               object
CLAUDIN_SUBTYPE                         object
THREEGENE                               object
VITAL_STATUS                            object
LATERALITY                              object
HISTOLOGICAL_SUBTYPE                    object
BREAST_SURGERY                          object
RFS_STATUS                              object
RFS_MONTHS                             float64
CELLULARITY_converted                  float64
ER_IHC_converted                       float64
HER2_SNP6_converted                    float64
INFERRED_MENOPAUSAL_STATE_converted    float64
CHEMOTHERAPY_

In [16]:
df_patient.head()

Unnamed: 0,PATIENT_ID,LYMPH_NODES_EXAMINED_POSITIVE,NPI,CELLULARITY,CHEMOTHERAPY,COHORT,ER_IHC,HER2_SNP6,HORMONE_THERAPY,INFERRED_MENOPAUSAL_STATE,SEX,INTCLUST,AGE_AT_DIAGNOSIS,OS_MONTHS,OS_STATUS,CLAUDIN_SUBTYPE,THREEGENE,VITAL_STATUS,LATERALITY,RADIO_THERAPY,HISTOLOGICAL_SUBTYPE,BREAST_SURGERY,RFS_STATUS,RFS_MONTHS
0,MB-0000,10.0,6.044,,NO,1.0,Positve,NEUTRAL,YES,Post,Female,4ER+,75.65,140.5,0:LIVING,claudin-low,ER-/HER2-,Living,Right,YES,Ductal/NST,MASTECTOMY,0:Not Recurred,138.65
1,MB-0002,0.0,4.02,High,NO,1.0,Positve,NEUTRAL,YES,Pre,Female,4ER+,43.19,84.633333,0:LIVING,LumA,ER+/HER2- High Prolif,Living,Right,YES,Ductal/NST,BREAST CONSERVING,0:Not Recurred,83.52
2,MB-0005,1.0,4.03,High,YES,1.0,Positve,NEUTRAL,YES,Pre,Female,3,48.87,163.7,1:DECEASED,LumB,,Died of Disease,Right,NO,Ductal/NST,MASTECTOMY,1:Recurred,151.28
3,MB-0006,3.0,4.05,Moderate,YES,1.0,Positve,NEUTRAL,YES,Pre,Female,9,47.68,164.933333,0:LIVING,LumB,,Living,Right,YES,Mixed,MASTECTOMY,0:Not Recurred,162.76
4,MB-0008,8.0,6.08,High,YES,1.0,Positve,NEUTRAL,YES,Post,Female,9,76.97,41.366667,1:DECEASED,LumB,ER+/HER2- High Prolif,Died of Disease,Right,YES,Mixed,MASTECTOMY,1:Recurred,18.55


In [17]:
df_patient_converted.head()

Unnamed: 0,PATIENT_ID,LYMPH_NODES_EXAMINED_POSITIVE,NPI,COHORT,SEX,INTCLUST,AGE_AT_DIAGNOSIS,OS_MONTHS,OS_STATUS,CLAUDIN_SUBTYPE,THREEGENE,VITAL_STATUS,LATERALITY,HISTOLOGICAL_SUBTYPE,BREAST_SURGERY,RFS_STATUS,RFS_MONTHS,CELLULARITY_converted,ER_IHC_converted,HER2_SNP6_converted,INFERRED_MENOPAUSAL_STATE_converted,CHEMOTHERAPY_converted,HORMONE_THERAPY_converted,RADIO_THERAPY_converted
0,MB-0000,10.0,6.044,1.0,Female,4ER+,75.65,140.5,0:LIVING,claudin-low,ER-/HER2-,Living,Right,Ductal/NST,MASTECTOMY,0:Not Recurred,138.65,,1.0,2.0,1.0,False,True,True
1,MB-0002,0.0,4.02,1.0,Female,4ER+,43.19,84.633333,0:LIVING,LumA,ER+/HER2- High Prolif,Living,Right,Ductal/NST,BREAST CONSERVING,0:Not Recurred,83.52,3.0,1.0,2.0,-1.0,False,True,True
2,MB-0005,1.0,4.03,1.0,Female,3,48.87,163.7,1:DECEASED,LumB,,Died of Disease,Right,Ductal/NST,MASTECTOMY,1:Recurred,151.28,3.0,1.0,2.0,-1.0,True,True,False
3,MB-0006,3.0,4.05,1.0,Female,9,47.68,164.933333,0:LIVING,LumB,,Living,Right,Mixed,MASTECTOMY,0:Not Recurred,162.76,2.0,1.0,2.0,-1.0,True,True,True
4,MB-0008,8.0,6.08,1.0,Female,9,76.97,41.366667,1:DECEASED,LumB,ER+/HER2- High Prolif,Died of Disease,Right,Mixed,MASTECTOMY,1:Recurred,18.55,3.0,1.0,2.0,1.0,True,True,True


In [18]:
# pickleとして保存
make_dir(config.INTERIM_PICKLE_EDA_DIR)
df_patient_converted.to_pickle(config.INTERIM_PICKLE_EDA_DIR+'/data_clinical_patient_1.0.pkl')

In [19]:
target = 'OS_MONTHS'
target_column=[target]
drop_columns=[i for i in target_columns+meanless_columns if i not in target_column]

データの分割

In [20]:
df_patient_MB=df_patient[df_patient['PATIENT_ID'].str.contains('MB')]
df_patient_MTST=df_patient[df_patient['PATIENT_ID'].str.contains('MTS-T')]

In [21]:
df_patient_MB.describe()

Unnamed: 0,LYMPH_NODES_EXAMINED_POSITIVE,NPI,COHORT,AGE_AT_DIAGNOSIS,OS_MONTHS,RFS_MONTHS
count,1909.0,1984.0,1985.0,1985.0,1981.0,1985.0
mean,2.004715,4.013186,2.616121,61.100544,125.244271,110.117406
std,4.08049,1.163166,1.233252,12.948374,76.111772,76.297209
min,0.0,1.0,1.0,21.93,0.0,0.0
25%,0.0,3.044,1.0,51.45,60.866667,41.74
50%,0.0,4.042,3.0,61.84,116.466667,100.72
75%,2.0,5.04,3.0,70.61,185.133333,167.76
max,45.0,6.68,5.0,96.29,355.2,346.38


In [22]:
df_patient_MTST.describe()

Unnamed: 0,LYMPH_NODES_EXAMINED_POSITIVE,NPI,COHORT,AGE_AT_DIAGNOSIS,OS_MONTHS,RFS_MONTHS
count,334.0,303.0,513.0,513.0,0.0,403.0
mean,1.640719,4.130937,4.0,57.788168,,102.562804
std,3.62875,1.344243,3.370367,13.03744,,77.394361
min,0.0,1.14,1.0,27.0,,0.03
25%,0.0,3.28,1.0,47.55,,37.17
50%,0.0,4.32,1.0,58.0,,86.88
75%,2.0,5.0,7.0,67.0,,166.89
max,30.0,7.2,9.0,89.0,,384.21


nullを含む行の確認

In [23]:
df_patient_MB[df_patient_MB.isnull().any(axis=1)]

Unnamed: 0,PATIENT_ID,LYMPH_NODES_EXAMINED_POSITIVE,NPI,CELLULARITY,CHEMOTHERAPY,COHORT,ER_IHC,HER2_SNP6,HORMONE_THERAPY,INFERRED_MENOPAUSAL_STATE,SEX,INTCLUST,AGE_AT_DIAGNOSIS,OS_MONTHS,OS_STATUS,CLAUDIN_SUBTYPE,THREEGENE,VITAL_STATUS,LATERALITY,RADIO_THERAPY,HISTOLOGICAL_SUBTYPE,BREAST_SURGERY,RFS_STATUS,RFS_MONTHS
0,MB-0000,10.0,6.044,,NO,1.0,Positve,NEUTRAL,YES,Post,Female,4ER+,75.65,140.500000,0:LIVING,claudin-low,ER-/HER2-,Living,Right,YES,Ductal/NST,MASTECTOMY,0:Not Recurred,138.65
2,MB-0005,1.0,4.030,High,YES,1.0,Positve,NEUTRAL,YES,Pre,Female,3,48.87,163.700000,1:DECEASED,LumB,,Died of Disease,Right,NO,Ductal/NST,MASTECTOMY,1:Recurred,151.28
3,MB-0006,3.0,4.050,Moderate,YES,1.0,Positve,NEUTRAL,YES,Pre,Female,9,47.68,164.933333,0:LIVING,LumB,,Living,Right,YES,Mixed,MASTECTOMY,0:Not Recurred,162.76
6,MB-0014,1.0,4.020,Moderate,YES,1.0,Positve,LOSS,YES,Post,Female,3,56.45,164.333333,0:LIVING,LumB,,Living,Right,YES,Ductal/NST,BREAST CONSERVING,0:Not Recurred,162.17
7,MB-0020,,6.130,High,YES,1.0,Negative,NEUTRAL,NO,Post,Female,4ER-,70.00,22.400000,1:DECEASED,Normal,ER-/HER2-,Died of Disease,Left,YES,Lobular,MASTECTOMY,1:Recurred,11.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1979,MB-7294,1.0,4.030,High,NO,4.0,Positve,GAIN,YES,Post,Female,1,59.20,82.733333,1:DECEASED,LumB,ER+/HER2- High Prolif,Died of Disease,,NO,Ductal/NST,MASTECTOMY,1:Recurred,81.64
1981,MB-7296,1.0,5.040,High,NO,4.0,Positve,GAIN,NO,Pre,Female,5,42.88,44.733333,1:DECEASED,LumB,,Died of Disease,,YES,Ductal/NST,MASTECTOMY,1:Recurred,16.09
1982,MB-7297,45.0,6.050,High,NO,4.0,Positve,NEUTRAL,YES,Post,Female,1,62.90,175.966667,1:DECEASED,LumB,,Died of Disease,Left,YES,Ductal/NST,MASTECTOMY,1:Recurred,121.18
1983,MB-7298,12.0,5.050,Moderate,NO,4.0,Positve,NEUTRAL,YES,Post,Female,1,61.16,86.233333,1:DECEASED,LumB,ER+/HER2- High Prolif,Died of Other Causes,,NO,Ductal/NST,MASTECTOMY,0:Not Recurred,85.10


dtypesの修正

In [24]:
# 死因が乳がん以外（と推定されるもの）と、drop_columnsに含まれる列、nanを含む行の削除
df_patient_MB_filtered=df_patient_MB[df_patient_MB['VITAL_STATUS']!='Died of Other Causes']
df_patient_living=df_patient_MB_filtered[df_patient_MB_filtered['VITAL_STATUS']!='Living'].drop(drop_columns, axis=1).dropna()
df_patient_dead=df_patient_MB_filtered[df_patient_MB_filtered['VITAL_STATUS']!='Diof Disease'].drop(drop_columns, axis=1).dropna()