In [4]:
import os
import numpy as np
import pandas as pd

import config

# Cap pandas' rich display at 50 rows and 50 columns so that wide or long
# frames stay readable in the notebook output.
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 50)


def make_dir(dir_name: str) -> None:
    """Create the directory ``dir_name`` (including missing parents) if needed.

    Calling this on a directory that already exists is a no-op, so it is safe
    to call repeatedly.

    Args:
        dir_name: Path of the directory to create.

    Raises:
        FileExistsError: If ``dir_name`` exists but is not a directory.
        OSError: If the directory cannot be created (e.g. empty path,
            missing permissions).
    """
    # exist_ok=True replaces the separate "exists?" check, which could race
    # with another process creating the directory between check and creation.
    os.makedirs(dir_name, exist_ok=True)


dfの読み込み

In [5]:
# Load the preprocessed dataframe that keeps only the rows whose PATIENT_ID
# contains "MB".
# NOTE: read_pickle can execute arbitrary code while unpickling; only load
# pickle files produced by this project.
mb_dropped_pickle_path = config.INTERIM_PICKLE_PREPROCESSED_DIR+'/2.1-df_MB_dropped.pkl'
df_MB_dropped = pd.read_pickle(mb_dropped_pickle_path)
df_MB_dropped.shape

(1124, 36)

# 各columnの整理
dfの各columnについて、型と値の整理を行う

In [6]:
# Columns used as prediction targets. Every name here is also listed in one of
# the groups below (e.g. OS_MONTHS is in int_columns), so target_columns is
# deliberately NOT part of the column partition check at the end of this cell.
target_columns=[
    # patient
    'OS_MONTHS',
    'RFS_MONTHS',
    'OS_STATUS',
    'RFS_STATUS',
    'VITAL_STATUS',
]

int_columns=[
    # patient
    'LYMPH_NODES_EXAMINED_POSITIVE',

    'OS_MONTHS',
    'RFS_MONTHS',
]

float_columns=[
    # patient
    'NPI',
    'AGE_AT_DIAGNOSIS',
    # sample
    'TUMOR_SIZE',
    'TMB_NONSYNONYMOUS',
]

str_columns=[
]

# Qualitative variables (ordinal scale)
qualitative_ordinal_columns=[
    # patient
    'CELLULARITY',
    'ER_IHC',
    'HER2_SNP6',
    'INFERRED_MENOPAUSAL_STATE',
    # sample
    'ER_STATUS',
    'HER2_STATUS',
    'GRADE',
    'PR_STATUS',
    'TUMOR_STAGE',
]
# Qualitative variables (nominal scale)
qualitative_name_columns=[
    # patient
    'COHORT',
    'INTCLUST',
    'CLAUDIN_SUBTYPE',
    'THREEGENE',
    'HISTOLOGICAL_SUBTYPE',
    'BREAST_SURGERY',
    'LATERALITY',

    'VITAL_STATUS',
    # sample
    'CANCER_TYPE',
    'CANCER_TYPE_DETAILED',
    'ONCOTREE_CODE',

]

bool_columns=[
    # patient
    'CHEMOTHERAPY',
    'HORMONE_THERAPY',
    'RADIO_THERAPY',
]

meanless_columns=[
    # patient
    'PATIENT_ID',
    'SEX',

    'OS_STATUS',
    'RFS_STATUS',
    # sample
    #'PATIENT_ID',  (already listed under "patient" above)
    'SAMPLE_ID',
    'SAMPLE_TYPE',
]

# Every column of df_MB_dropped is expected to belong to exactly one of these
# groups. The group names are kept only to make the intent readable.
column_groups = {
    'int_columns': int_columns,
    'float_columns': float_columns,
    'str_columns': str_columns,
    'qualitative_ordinal_columns': qualitative_ordinal_columns,
    'qualitative_name_columns': qualitative_name_columns,
    'bool_columns': bool_columns,
    'meanless_columns': meanless_columns,
}
grouped_columns = [name for group in column_groups.values() for name in group]

# Quick visual check: number of dataframe columns vs. number of grouped names.
print(df_MB_dropped.shape[1], len(grouped_columns))
assert df_MB_dropped.shape[1] == len(grouped_columns), 'lack or too much columns'

# Equal counts alone can hide a duplicated name cancelling out a missing one,
# so also verify that the names themselves form an exact partition.
duplicated_columns = sorted({name for name in grouped_columns if grouped_columns.count(name) > 1})
assert not duplicated_columns, f'columns listed in more than one group: {duplicated_columns}'

missing_columns = sorted(set(df_MB_dropped.columns) - set(grouped_columns))
unknown_columns = sorted(set(grouped_columns) - set(df_MB_dropped.columns))
assert not missing_columns and not unknown_columns, (
    f'columns missing from the groups: {missing_columns}; '
    f'grouped columns not in the dataframe: {unknown_columns}'
)

36 36


## 質的変数（順序尺度）のエンコーディング

qualitative_ordinal_columnsは、順序のあるstrの変数で構成されている。  
そこで大小関係に対応するようstrをintに変換する。  

In [7]:
# Peek at the first rows of the ordinal columns before they are encoded.
df_MB_dropped.loc[:, qualitative_ordinal_columns].head()

Unnamed: 0,CELLULARITY,ER_IHC,HER2_SNP6,INFERRED_MENOPAUSAL_STATE,ER_STATUS,HER2_STATUS,GRADE,PR_STATUS,TUMOR_STAGE
1,High,Positve,NEUTRAL,Pre,Positive,Negative,3.0,Positive,1.0
4,High,Positve,NEUTRAL,Post,Positive,Negative,3.0,Positive,2.0
5,Moderate,Positve,NEUTRAL,Post,Positive,Negative,3.0,Positive,4.0
10,Moderate,Positve,GAIN,Post,Positive,Negative,3.0,Negative,2.0
11,High,Negative,LOSS,Post,Positive,Negative,2.0,Negative,2.0


In [8]:
# List the category frequencies of every ordinal column so the raw labels
# that need an integer encoding are visible.
for column in qualitative_ordinal_columns:
    category_counts = df_MB_dropped[column].value_counts()
    print(column)
    display(category_counts)
    print('-----'*10)

CELLULARITY


High        564
Moderate    428
Low         132
Name: CELLULARITY, dtype: int64

--------------------------------------------------
ER_IHC


Positve     875
Negative    249
Name: ER_IHC, dtype: int64

--------------------------------------------------
HER2_SNP6


NEUTRAL    813
GAIN       251
LOSS        58
UNDEF        2
Name: HER2_SNP6, dtype: int64

--------------------------------------------------
INFERRED_MENOPAUSAL_STATE


Post    866
Pre     258
Name: INFERRED_MENOPAUSAL_STATE, dtype: int64

--------------------------------------------------
ER_STATUS


Positive    870
Negative    254
Name: ER_STATUS, dtype: int64

--------------------------------------------------
HER2_STATUS


Negative    984
Positive    140
Name: HER2_STATUS, dtype: int64

--------------------------------------------------
GRADE


3.0    587
2.0    448
1.0     89
Name: GRADE, dtype: int64

--------------------------------------------------
PR_STATUS


Positive    584
Negative    540
Name: PR_STATUS, dtype: int64

--------------------------------------------------
TUMOR_STAGE


2.0    640
1.0    381
3.0     95
4.0      8
Name: TUMOR_STAGE, dtype: int64

--------------------------------------------------


In [9]:
# Preview only: map() returns a new Series, so df_MB_dropped itself is not
# modified by this cell.
df_MB_dropped['CELLULARITY'].map(
    {'High': 3, 'Moderate': 2, 'Low': 1}
)

1       3
4       3
5       2
10      2
11      3
       ..
1697    3
1698    3
1700    3
1702    3
1743    3
Name: CELLULARITY, Length: 1124, dtype: int64

In [12]:
# Integer codes for the ordinal string columns; a larger code means a
# "higher" category. GRADE and TUMOR_STAGE are already numeric in the data,
# so they need no mapping.
ordinal_encodings = {
    'CELLULARITY': {'High':3,'Moderate':2,'Low':1},
    # 'Positve' (sic) is the spelling that actually occurs in the ER_IHC data.
    'ER_IHC': {'Positve':1, 'Negative':-1},
    'HER2_SNP6': {'GAIN':3, 'NEUTRAL':2, 'LOSS':1, 'UNDEF':0},
    'INFERRED_MENOPAUSAL_STATE': {'Post':1, 'Pre':-1},
    'ER_STATUS': {'Positive':1, 'Negative':-1},
    'HER2_STATUS': {'Positive':1, 'Negative':-1},
    'PR_STATUS': {'Positive':1, 'Negative':-1},
}

for column, encoding in ordinal_encodings.items():
    # Assign the result back instead of calling replace(..., inplace=True) on
    # df[column]: the chained in-place form acts on a temporary Series and
    # does not update the dataframe under pandas Copy-on-Write.
    # replace() leaves already-encoded values untouched, so re-running this
    # cell is safe; to_numeric() guarantees a numeric dtype and raises if an
    # unexpected (unmapped) label is still present.
    df_MB_dropped[column] = pd.to_numeric(df_MB_dropped[column].replace(encoding))

# Show the value counts again to confirm the encoding.
for column in qualitative_ordinal_columns:
    print(column)
    display(df_MB_dropped[column].value_counts())
    print('-----'*10)

CELLULARITY


3    564
2    428
1    132
Name: CELLULARITY, dtype: int64

--------------------------------------------------
ER_IHC


 1    875
-1    249
Name: ER_IHC, dtype: int64

--------------------------------------------------
HER2_SNP6


2    813
3    251
1     58
0      2
Name: HER2_SNP6, dtype: int64

--------------------------------------------------
INFERRED_MENOPAUSAL_STATE


 1    866
-1    258
Name: INFERRED_MENOPAUSAL_STATE, dtype: int64

--------------------------------------------------
ER_STATUS


 1    870
-1    254
Name: ER_STATUS, dtype: int64

--------------------------------------------------
HER2_STATUS


-1    984
 1    140
Name: HER2_STATUS, dtype: int64

--------------------------------------------------
GRADE


3.0    587
2.0    448
1.0     89
Name: GRADE, dtype: int64

--------------------------------------------------
PR_STATUS


 1    584
-1    540
Name: PR_STATUS, dtype: int64

--------------------------------------------------
TUMOR_STAGE


2.0    640
1.0    381
3.0     95
4.0      8
Name: TUMOR_STAGE, dtype: int64

--------------------------------------------------
