In [17]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sweetviz as sv
import dtale

import config

# Cap the number of columns pandas shows when displaying a DataFrame (50 here).
pd.set_option("display.max_columns", 50)


def make_dir(dir_name: str) -> None:
    """Create the directory ``dir_name``, including any missing parents.

    Does nothing when the directory already exists, so the call is safe to
    repeat.

    Args:
        dir_name: Path of the directory to create.

    Raises:
        FileExistsError: If ``dir_name`` exists but is not a directory.
    """
    # exist_ok=True avoids a separate existence check, and with it the window
    # in which another process could create the directory before makedirs runs.
    os.makedirs(dir_name, exist_ok=True)

In [18]:
# Load the clinical sample table. header=4 takes the row at 0-based index 4 as
# the column names; the rows before it are not part of the resulting frame.
sample_table_path = config.RAW_BRCA_METABRIC_DIR + "/data_clinical_sample.txt"
df_sample = pd.read_table(sample_table_path, header=4)
df_sample.head()

Unnamed: 0,PATIENT_ID,SAMPLE_ID,CANCER_TYPE,CANCER_TYPE_DETAILED,ER_STATUS,HER2_STATUS,GRADE,ONCOTREE_CODE,PR_STATUS,SAMPLE_TYPE,TUMOR_SIZE,TUMOR_STAGE,TMB_NONSYNONYMOUS
0,MB-0000,MB-0000,Breast Cancer,Breast Invasive Ductal Carcinoma,Positive,Negative,3.0,IDC,Negative,Primary,22.0,2.0,0.0
1,MB-0002,MB-0002,Breast Cancer,Breast Invasive Ductal Carcinoma,Positive,Negative,3.0,IDC,Positive,Primary,10.0,1.0,2.615035
2,MB-0005,MB-0005,Breast Cancer,Breast Invasive Ductal Carcinoma,Positive,Negative,2.0,IDC,Positive,Primary,15.0,2.0,2.615035
3,MB-0006,MB-0006,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,Positive,Negative,2.0,MDLC,Positive,Primary,25.0,2.0,1.307518
4,MB-0008,MB-0008,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,Positive,Negative,3.0,MDLC,Positive,Primary,40.0,2.0,2.615035


In [19]:
# Summarize dtypes and non-null counts per column to see where values are missing.
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2509 entries, 0 to 2508
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   PATIENT_ID            2509 non-null   object 
 1   SAMPLE_ID             2509 non-null   object 
 2   CANCER_TYPE           2509 non-null   object 
 3   CANCER_TYPE_DETAILED  2509 non-null   object 
 4   ER_STATUS             2469 non-null   object 
 5   HER2_STATUS           1980 non-null   object 
 6   GRADE                 2388 non-null   float64
 7   ONCOTREE_CODE         2509 non-null   object 
 8   PR_STATUS             1980 non-null   object 
 9   SAMPLE_TYPE           2509 non-null   object 
 10  TUMOR_SIZE            2360 non-null   float64
 11  TUMOR_STAGE           1788 non-null   float64
 12  TMB_NONSYNONYMOUS     2509 non-null   float64
dtypes: float64(4), object(9)
memory usage: 254.9+ KB


In [20]:
# dtale.show(df_sample)

## 特徴量の型に合わせたエンコーディング

In [21]:
# Work on a copy so the raw frame `df_sample` stays untouched for reference.
df_sample_converted = df_sample.copy()

In [22]:
# Assign every column of df_sample to exactly one role. These lists drive the
# encoding and dtype conversion performed in the cells that follow.
target_columns = []

int_columns = []

float_columns = [
    "TUMOR_SIZE",
    "TMB_NONSYNONYMOUS",
]

# Spelled in plain ASCII; later cells refer to this list as `str_columns`.
str_columns = []

# Qualitative variables (ordinal scale)
qualitative_ordinal_columns = [
    "ER_STATUS",
    "HER2_STATUS",
    "GRADE",
    "PR_STATUS",
    "TUMOR_STAGE",
]
# Qualitative variables (nominal scale)
qualitative_name_columns = [
    "CANCER_TYPE",
    "CANCER_TYPE_DETAILED",
    "ONCOTREE_CODE",
]

bool_columns = []

# Columns left out of the dtype mapping built later (identifiers and
# SAMPLE_TYPE) — presumably "meaningless" for the analysis; confirm with author.
meanless_columns = [
    "PATIENT_ID",
    "SAMPLE_ID",
    "SAMPLE_TYPE",
]

# Flatten all role lists once so the printed count and the checks below share
# a single definition of "classified columns".
classified_columns = (
    target_columns
    + int_columns
    + float_columns
    + str_columns
    + qualitative_ordinal_columns
    + qualitative_name_columns
    + bool_columns
    + meanless_columns
)
print(df_sample.shape[1], len(classified_columns))
assert df_sample.shape[1] == len(classified_columns), "columns lack or too much"
# Equal counts cannot reveal a misspelled or duplicated name, so the names
# themselves are compared as well.
assert sorted(classified_columns) == sorted(
    df_sample.columns
), "classified column names do not match df_sample columns"

13 13


### 質的変数（順序特徴量）のエンコーディング

In [23]:
# Show which columns are currently classified as ordinal.
qualitative_ordinal_columns

['ER_STATUS', 'HER2_STATUS', 'GRADE', 'PR_STATUS', 'TUMOR_STAGE']

In [24]:
# List the distinct values (NaN included) of each ordinal column before encoding.
for column_name in qualitative_ordinal_columns:
    distinct_values = df_sample[column_name].unique()
    display(distinct_values)

array(['Positive', 'Negative', nan], dtype=object)

array(['Negative', nan, 'Positive'], dtype=object)

array([ 3.,  2.,  1., nan])

array(['Negative', 'Positive', nan], dtype=object)

array([ 2.,  1.,  4.,  3.,  0., nan])

In [25]:
# Encode the receptor-status columns numerically (Positive -> 1, Negative -> -1).
# Series.map turns values without a mapping entry, NaN included, into NaN.
# GRADE and TUMOR_STAGE are already numeric and are kept as they are.
status_codes = {"Positive": 1, "Negative": -1}
converted_name_by_source = {
    "ER_STATUS": "ER_STATUS_converted",
    "HER2_STATUS": "HER2_STATUS_converted",
    "PR_STATUS": "PR_STATUS_converted",
}
for source_name, converted_name in converted_name_by_source.items():
    df_sample_converted[converted_name] = df_sample_converted[source_name].map(
        status_codes
    )

# From here on the ordinal list refers to the encoded columns.
qualitative_ordinal_columns = [
    "ER_STATUS_converted",
    "HER2_STATUS_converted",
    "GRADE",
    "PR_STATUS_converted",
    "TUMOR_STAGE",
]

# Remove the text columns now that their encoded counterparts exist.
df_sample_converted = df_sample_converted.drop(
    columns=list(converted_name_by_source)
)

### boolのエンコーディング

In [26]:
# Show the boolean column list; it was defined empty, so nothing is encoded here.
bool_columns

[]

In [27]:
def list2dict(str_dtype: str, columns: list, dic: dict = {}) -> dict:
    """Register ``str_dtype`` as the dtype of every name in ``columns``.

    The entries are written into ``dic`` itself (existing keys are
    overwritten) and that same dict object is returned, ready to be handed to
    ``DataFrame.astype``.

    Note:
        When ``dic`` is omitted, the one default dict created at function
        definition time is used. Successive calls without ``dic`` therefore
        accumulate their entries in a single shared mapping.
    """
    dic.update((column, str_dtype) for column in columns)
    return dic

In [28]:
# Build the {column: dtype} mapping consumed by DataFrame.astype in the next
# cell. The accumulator is created here and handed to every call explicitly:
# relying on list2dict's default argument would reuse one dict shared by all
# calls in the kernel, so keys from earlier executions of this cell (for
# example after the column lists changed) would leak into the mapping.
astype_dict = {}
# Integer columns are cast to float too — presumably so missing values can be
# represented; TODO confirm.
astype_dict = list2dict("float", int_columns, astype_dict)
astype_dict = list2dict("float", float_columns, astype_dict)
astype_dict = list2dict("str", str_columns, astype_dict)
astype_dict = list2dict("float", qualitative_ordinal_columns, astype_dict)
astype_dict = list2dict("str", qualitative_name_columns, astype_dict)
astype_dict = list2dict("bool", bool_columns, astype_dict)

display(
    "astype_dict size: ",
    len(astype_dict),
    "target_columns size: ",
    len(target_columns),
    "meanless_columns size: ",
    len(meanless_columns),
    "original size: ",
    df_sample.shape[1],
)

# Every original column must be either converted, a target, or deliberately
# left alone; otherwise a column was forgotten or listed twice.
assert (
    len(astype_dict) + len(target_columns) + len(meanless_columns) == df_sample.shape[1]
), "lack or too much columns"

'astype_dict size: '

10

'target_columns size: '

0

'meanless_columns size: '

3

'original size: '

13

In [29]:
# Apply the dtype mapping; columns not named in astype_dict keep their dtype.
df_sample_converted = df_sample_converted.astype(astype_dict)
# Display the resulting dtypes to verify the conversion.
df_sample_converted.dtypes

PATIENT_ID                object
SAMPLE_ID                 object
CANCER_TYPE               object
CANCER_TYPE_DETAILED      object
GRADE                    float64
ONCOTREE_CODE             object
SAMPLE_TYPE               object
TUMOR_SIZE               float64
TUMOR_STAGE              float64
TMB_NONSYNONYMOUS        float64
ER_STATUS_converted      float64
HER2_STATUS_converted    float64
PR_STATUS_converted      float64
dtype: object

In [30]:
# Display the raw frame for comparison with the converted frame shown next.
# Only the last expression of a cell is rendered, so a preceding
# `df_sample.head()` had no visible effect and has been removed.
df_sample

Unnamed: 0,PATIENT_ID,SAMPLE_ID,CANCER_TYPE,CANCER_TYPE_DETAILED,ER_STATUS,HER2_STATUS,GRADE,ONCOTREE_CODE,PR_STATUS,SAMPLE_TYPE,TUMOR_SIZE,TUMOR_STAGE,TMB_NONSYNONYMOUS
0,MB-0000,MB-0000,Breast Cancer,Breast Invasive Ductal Carcinoma,Positive,Negative,3.0,IDC,Negative,Primary,22.0,2.0,0.000000
1,MB-0002,MB-0002,Breast Cancer,Breast Invasive Ductal Carcinoma,Positive,Negative,3.0,IDC,Positive,Primary,10.0,1.0,2.615035
2,MB-0005,MB-0005,Breast Cancer,Breast Invasive Ductal Carcinoma,Positive,Negative,2.0,IDC,Positive,Primary,15.0,2.0,2.615035
3,MB-0006,MB-0006,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,Positive,Negative,2.0,MDLC,Positive,Primary,25.0,2.0,1.307518
4,MB-0008,MB-0008,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,Positive,Negative,3.0,MDLC,Positive,Primary,40.0,2.0,2.615035
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2504,MTS-T2428,MTS-T2428,Breast Cancer,Invasive Breast Carcinoma,Positive,,1.0,BRCA,,Primary,27.0,1.0,2.615035
2505,MTS-T2429,MTS-T2429,Breast Cancer,Invasive Breast Carcinoma,Positive,,2.0,BRCA,,Primary,28.0,2.0,5.230071
2506,MTS-T2430,MTS-T2430,Breast Cancer,Invasive Breast Carcinoma,,,,BRCA,,Primary,,0.0,7.845106
2507,MTS-T2431,MTS-T2431,Breast Cancer,Invasive Breast Carcinoma,,,,BRCA,,Primary,,0.0,9.152624


In [31]:
# Preview the converted frame: status columns replaced by their *_converted versions.
df_sample_converted.head()

Unnamed: 0,PATIENT_ID,SAMPLE_ID,CANCER_TYPE,CANCER_TYPE_DETAILED,GRADE,ONCOTREE_CODE,SAMPLE_TYPE,TUMOR_SIZE,TUMOR_STAGE,TMB_NONSYNONYMOUS,ER_STATUS_converted,HER2_STATUS_converted,PR_STATUS_converted
0,MB-0000,MB-0000,Breast Cancer,Breast Invasive Ductal Carcinoma,3.0,IDC,Primary,22.0,2.0,0.0,1.0,-1.0,-1.0
1,MB-0002,MB-0002,Breast Cancer,Breast Invasive Ductal Carcinoma,3.0,IDC,Primary,10.0,1.0,2.615035,1.0,-1.0,1.0
2,MB-0005,MB-0005,Breast Cancer,Breast Invasive Ductal Carcinoma,2.0,IDC,Primary,15.0,2.0,2.615035,1.0,-1.0,1.0
3,MB-0006,MB-0006,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,2.0,MDLC,Primary,25.0,2.0,1.307518,1.0,-1.0,1.0
4,MB-0008,MB-0008,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,3.0,MDLC,Primary,40.0,2.0,2.615035,1.0,-1.0,1.0


In [32]:
# Save the converted frame as a pickle, creating the output directory first.
make_dir(config.INTERIM_PICKLE_EDA_DIR)
sample_pickle_path = config.INTERIM_PICKLE_EDA_DIR + "/1.0.1-data_clinical_sample.pkl"
df_sample_converted.to_pickle(sample_pickle_path)