In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sweetviz as sv
import dtale

import config

# Maximum number of columns pandas renders when displaying a frame (50 here)
pd.options.display.max_columns = 50

def make_dir(dir_name: str) -> None:
    """Create the directory ``dir_name``, including missing parents.

    Calling it for a directory that already exists is a no-op.

    Args:
        dir_name: Path of the directory to create.

    Raises:
        FileExistsError: If ``dir_name`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test when another process creates the directory.
    os.makedirs(dir_name, exist_ok=True)

In [2]:
# Load the tab-separated clinical sample table. header=4 takes the column
# names from the row at 0-based index 4; the rows before it are not parsed as data.
sample_path = config.RAW_BRCA_METABRIC_DIR+'/data_clinical_sample.txt'
df_sample = pd.read_csv(sample_path, sep='\t', header=4)
df_sample.head()

Unnamed: 0,PATIENT_ID,SAMPLE_ID,CANCER_TYPE,CANCER_TYPE_DETAILED,ER_STATUS,HER2_STATUS,GRADE,ONCOTREE_CODE,PR_STATUS,SAMPLE_TYPE,TUMOR_SIZE,TUMOR_STAGE,TMB_NONSYNONYMOUS
0,MB-0000,MB-0000,Breast Cancer,Breast Invasive Ductal Carcinoma,Positive,Negative,3.0,IDC,Negative,Primary,22.0,2.0,0.0
1,MB-0002,MB-0002,Breast Cancer,Breast Invasive Ductal Carcinoma,Positive,Negative,3.0,IDC,Positive,Primary,10.0,1.0,2.615035
2,MB-0005,MB-0005,Breast Cancer,Breast Invasive Ductal Carcinoma,Positive,Negative,2.0,IDC,Positive,Primary,15.0,2.0,2.615035
3,MB-0006,MB-0006,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,Positive,Negative,2.0,MDLC,Positive,Primary,25.0,2.0,1.307518
4,MB-0008,MB-0008,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,Positive,Negative,3.0,MDLC,Positive,Primary,40.0,2.0,2.615035


In [3]:
# Summarize column dtypes and non-null counts of the raw frame.
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2509 entries, 0 to 2508
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   PATIENT_ID            2509 non-null   object 
 1   SAMPLE_ID             2509 non-null   object 
 2   CANCER_TYPE           2509 non-null   object 
 3   CANCER_TYPE_DETAILED  2509 non-null   object 
 4   ER_STATUS             2469 non-null   object 
 5   HER2_STATUS           1980 non-null   object 
 6   GRADE                 2388 non-null   float64
 7   ONCOTREE_CODE         2509 non-null   object 
 8   PR_STATUS             1980 non-null   object 
 9   SAMPLE_TYPE           2509 non-null   object 
 10  TUMOR_SIZE            2360 non-null   float64
 11  TUMOR_STAGE           1788 non-null   float64
 12  TMB_NONSYNONYMOUS     2509 non-null   float64
dtypes: float64(4), object(9)
memory usage: 254.9+ KB


In [22]:
# Hand the raw frame to D-Tale for interactive inspection.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours — re-check with Restart Kernel -> Run All.
dtale.show(df_sample)



## 特徴量の型に合わせたエンコーディング

In [5]:
# Work on an independent deep copy so the raw frame df_sample stays untouched.
df_sample_converted = df_sample.copy(deep=True)

In [6]:
# Every column of df_sample is assigned to exactly one of the groups below.
target_columns=[
]

int_columns=[
]

float_columns=[
    'TUMOR_SIZE',
    'TMB_NONSYNONYMOUS',
]

# Spelled with ASCII letters. The previous full-width spelling is NFKC-normalized
# by Python to this very identifier, so other cells keep working unchanged.
str_columns=[
]

# Qualitative variables (ordinal scale)
qualitative_ordinal_columns=[
    'ER_STATUS',
    'HER2_STATUS',
    'GRADE',
    'PR_STATUS',
    'TUMOR_STAGE',
]
# Qualitative variables (nominal scale)
qualitative_name_columns=[
    'CANCER_TYPE',
    'CANCER_TYPE_DETAILED',
    'ONCOTREE_CODE',
]

bool_columns=[
]

# Columns that belong to none of the typed groups above.
meanless_columns=[
    'PATIENT_ID',
    'SAMPLE_ID',
    'SAMPLE_TYPE',
]

# Flatten all groups once so the printed figure and the checks use the same value.
column_groups=[
    target_columns,
    int_columns,
    float_columns,
    str_columns,
    qualitative_ordinal_columns,
    qualitative_name_columns,
    bool_columns,
    meanless_columns,
]
classified_columns=[column for group in column_groups for column in group]

print(df_sample.shape[1], len(classified_columns))
assert df_sample.shape[1]==len(classified_columns), 'columns lack or too much'
# Equal counts alone would not catch a misspelled or duplicated name, so the
# names themselves are compared as well.
assert set(classified_columns)==set(df_sample.columns), \
    'columns lack or too much: {}'.format(sorted(set(classified_columns)^set(df_sample.columns)))

13 13


### 質的変数（順序特徴量）のエンコーディング

In [7]:
# Show the columns currently classified as ordinal.
qualitative_ordinal_columns

['ER_STATUS', 'HER2_STATUS', 'GRADE', 'PR_STATUS', 'TUMOR_STAGE']

In [8]:
# Display the distinct values (NaN included) of each ordinal column.
for column_name in qualitative_ordinal_columns:
    distinct_values = df_sample[column_name].unique()
    display(distinct_values)

array(['Positive', 'Negative', nan], dtype=object)

array(['Negative', nan, 'Positive'], dtype=object)

array([ 3.,  2.,  1., nan])

array(['Negative', 'Positive', nan], dtype=object)

array([ 2.,  1.,  4.,  3.,  0., nan])

In [9]:
# Encode the Positive/Negative status columns as +1/-1. Values that are not
# keys of the mapping (NaN included) come out as NaN.
status_encoding={'Positive':1, 'Negative':-1}
status_columns=['ER_STATUS', 'HER2_STATUS', 'PR_STATUS']
for status_column in status_columns:
    # Read from the untouched df_sample so this cell can be re-run even after
    # the raw status columns were dropped from df_sample_converted.
    df_sample_converted[status_column+'_converted']=df_sample[status_column].map(status_encoding)
# GRADE and TUMOR_STAGE are already float columns and are kept as they are.

# From here on the ordinal group refers to the encoded status columns.
qualitative_ordinal_columns=[
    'ER_STATUS_converted',
    'HER2_STATUS_converted',
    'GRADE',
    'PR_STATUS_converted',
    'TUMOR_STAGE',
]

# errors='ignore' keeps the drop harmless when the cell is executed again.
df_sample_converted=df_sample_converted.drop(status_columns, axis=1, errors='ignore')

### boolのエンコーディング

In [10]:
# Show the columns classified as boolean.
bool_columns

[]

In [11]:
# Helper that builds the column -> dtype dict passed to DataFrame.astype
def list2dict(str_dtype: str, columns:list, dic:dict={}) -> dict:
    """Register ``str_dtype`` as the dtype of every name in ``columns``.

    Args:
        str_dtype: Dtype name stored as the value for each column.
        columns: Column names used as keys.
        dic: Dict that is updated in place and returned.

    Returns:
        The same ``dic`` object, with one entry per name in ``columns``.

    Note:
        When ``dic`` is omitted, the single default dict object is shared by
        all such calls, so entries accumulate from call to call. Pass an
        explicit dict to start from a known state.
    """
    dic.update((name, str_dtype) for name in columns)
    return dic

In [12]:
# Build the column -> dtype mapping for DataFrame.astype.
# The dict is passed to every call explicitly: relying on list2dict's default
# argument would accumulate into one hidden shared dict, which keeps stale
# keys when this cell is re-run after the column lists have changed.
astype_dict={}
# NOTE(review): int_columns are cast to 'float', presumably because NaN cannot
# be stored in a plain int column — confirm.
astype_dict=list2dict('float', int_columns, astype_dict)
astype_dict=list2dict('float', float_columns, astype_dict)
astype_dict=list2dict('str', str_columns, astype_dict)
astype_dict=list2dict('float', qualitative_ordinal_columns, astype_dict)
astype_dict=list2dict('str', qualitative_name_columns, astype_dict)
astype_dict=list2dict('bool', bool_columns, astype_dict)

display('astype_dict size: ', len(astype_dict), 
        'target_columns size: ', len(target_columns), 
        'meanless_columns size: ',len(meanless_columns),
        'original size: ', df_sample.shape[1])

# Every original column must be either converted, a target, or deliberately left out.
assert len(astype_dict)+len(target_columns)+len(meanless_columns)== df_sample.shape[1], 'lack or too much columns'

'astype_dict size: '

10

'target_columns size: '

0

'meanless_columns size: '

3

'original size: '

13

In [13]:
# Cast the columns listed in astype_dict to their dtypes, then show the resulting dtypes.
df_sample_converted=df_sample_converted.astype(astype_dict)
df_sample_converted.dtypes

PATIENT_ID                object
SAMPLE_ID                 object
CANCER_TYPE               object
CANCER_TYPE_DETAILED      object
GRADE                    float64
ONCOTREE_CODE             object
SAMPLE_TYPE               object
TUMOR_SIZE               float64
TUMOR_STAGE              float64
TMB_NONSYNONYMOUS        float64
ER_STATUS_converted      float64
HER2_STATUS_converted    float64
PR_STATUS_converted      float64
dtype: object

In [14]:
# Preview the first rows of the raw frame. The earlier version evaluated
# head() and discarded it, then displayed the entire frame.
df_sample.head()

Unnamed: 0,PATIENT_ID,SAMPLE_ID,CANCER_TYPE,CANCER_TYPE_DETAILED,ER_STATUS,HER2_STATUS,GRADE,ONCOTREE_CODE,PR_STATUS,SAMPLE_TYPE,TUMOR_SIZE,TUMOR_STAGE,TMB_NONSYNONYMOUS
0,MB-0000,MB-0000,Breast Cancer,Breast Invasive Ductal Carcinoma,Positive,Negative,3.0,IDC,Negative,Primary,22.0,2.0,0.0
1,MB-0002,MB-0002,Breast Cancer,Breast Invasive Ductal Carcinoma,Positive,Negative,3.0,IDC,Positive,Primary,10.0,1.0,2.615035
2,MB-0005,MB-0005,Breast Cancer,Breast Invasive Ductal Carcinoma,Positive,Negative,2.0,IDC,Positive,Primary,15.0,2.0,2.615035
3,MB-0006,MB-0006,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,Positive,Negative,2.0,MDLC,Positive,Primary,25.0,2.0,1.307518
4,MB-0008,MB-0008,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,Positive,Negative,3.0,MDLC,Positive,Primary,40.0,2.0,2.615035


In [15]:
# Preview the first rows of the converted frame.
df_sample_converted.head()

Unnamed: 0,PATIENT_ID,SAMPLE_ID,CANCER_TYPE,CANCER_TYPE_DETAILED,GRADE,ONCOTREE_CODE,SAMPLE_TYPE,TUMOR_SIZE,TUMOR_STAGE,TMB_NONSYNONYMOUS,ER_STATUS_converted,HER2_STATUS_converted,PR_STATUS_converted
0,MB-0000,MB-0000,Breast Cancer,Breast Invasive Ductal Carcinoma,3.0,IDC,Primary,22.0,2.0,0.0,1.0,-1.0,-1.0
1,MB-0002,MB-0002,Breast Cancer,Breast Invasive Ductal Carcinoma,3.0,IDC,Primary,10.0,1.0,2.615035,1.0,-1.0,1.0
2,MB-0005,MB-0005,Breast Cancer,Breast Invasive Ductal Carcinoma,2.0,IDC,Primary,15.0,2.0,2.615035,1.0,-1.0,1.0
3,MB-0006,MB-0006,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,2.0,MDLC,Primary,25.0,2.0,1.307518,1.0,-1.0,1.0
4,MB-0008,MB-0008,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,3.0,MDLC,Primary,40.0,2.0,2.615035,1.0,-1.0,1.0


In [16]:
# Save the converted frame as a pickle, creating the output directory first
make_dir(config.INTERIM_PICKLE_EDA_DIR)
pickle_path = config.INTERIM_PICKLE_EDA_DIR+'/data_clinical_sample_1.1.pkl'
df_sample_converted.to_pickle(pickle_path)

In [17]:
# Split the samples by PATIENT_ID prefix. startswith anchors the match at the
# beginning of the ID and treats the text literally, whereas str.contains does
# a regex search anywhere in the string. na=False keeps the mask boolean even
# if an ID is missing.
df_sample_MB=df_sample[df_sample['PATIENT_ID'].str.startswith('MB', na=False)]
df_sample_MTST=df_sample[df_sample['PATIENT_ID'].str.startswith('MTS-T', na=False)]

In [18]:
# Summary statistics of the numeric columns for the df_sample_MB subset.
df_sample_MB.describe()

Unnamed: 0,GRADE,TUMOR_SIZE,TUMOR_STAGE,TMB_NONSYNONYMOUS
count,1897.0,1959.0,1470.0,1985.0
mean,2.414338,26.273486,1.736735,6.993408
std,0.648891,15.368776,0.641813,5.437249
min,1.0,1.0,0.0,0.0
25%,2.0,17.0,1.0,3.922553
50%,3.0,23.0,2.0,6.537589
75%,3.0,30.0,2.0,9.152624
max,3.0,182.0,4.0,104.601416


In [19]:
# Summary statistics of the numeric columns for the df_sample_MTST subset.
df_sample_MTST.describe()

Unnamed: 0,GRADE,TUMOR_SIZE,TUMOR_STAGE,TMB_NONSYNONYMOUS
count,491.0,401.0,318.0,524.0
mean,2.403259,25.959252,1.606918,6.305529
std,0.651771,15.397713,0.705688,4.818393
min,1.0,1.0,0.0,0.0
25%,2.0,15.0,1.0,2.615035
50%,2.0,22.0,2.0,5.230071
75%,3.0,30.0,2.0,7.845106
max,3.0,130.0,4.0,45.76312


nullを含む行の確認

In [20]:
# Rows of df_sample_MB that have a missing value in at least one column.
df_sample_MB.loc[lambda frame: frame.isna().any(axis='columns')]

Unnamed: 0,PATIENT_ID,SAMPLE_ID,CANCER_TYPE,CANCER_TYPE_DETAILED,ER_STATUS,HER2_STATUS,GRADE,ONCOTREE_CODE,PR_STATUS,SAMPLE_TYPE,TUMOR_SIZE,TUMOR_STAGE,TMB_NONSYNONYMOUS
9,MB-0025,MB-0025,Breast Cancer,Breast Invasive Ductal Carcinoma,Positive,,3.0,IDC,,Primary,34.0,2.0,6.537589
42,MB-0110,MB-0110,Breast Cancer,Invasive Breast Carcinoma,Negative,Negative,,BRCA,Negative,Primary,70.0,0.0,0.000000
72,MB-0144,MB-0144,Breast Cancer,Breast Invasive Ductal Carcinoma,Positive,Negative,3.0,IDC,Positive,Primary,26.0,,6.537589
81,MB-0153,MB-0153,Breast Cancer,Invasive Breast Carcinoma,Negative,Negative,,BRCA,Negative,Primary,28.0,,0.000000
87,MB-0159,MB-0159,Breast Cancer,Invasive Breast Carcinoma,Negative,Positive,,BRCA,Negative,Primary,23.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1980,MB-7295,MB-7295,Breast Cancer,Breast Invasive Lobular Carcinoma,Positive,Negative,3.0,ILC,Positive,Primary,25.0,,5.230071
1981,MB-7296,MB-7296,Breast Cancer,Breast Invasive Ductal Carcinoma,Positive,Positive,3.0,IDC,Negative,Primary,20.0,,7.845106
1982,MB-7297,MB-7297,Breast Cancer,Breast Invasive Ductal Carcinoma,Positive,Negative,3.0,IDC,Positive,Primary,25.0,,5.230071
1983,MB-7298,MB-7298,Breast Cancer,Breast Invasive Ductal Carcinoma,Positive,Negative,2.0,IDC,Positive,Primary,25.0,,19.612766
