In [1]:
import os

import numpy as np
import pandas as pd

import config

# Notebook display limits: show at most 50 rows and 50 columns when a
# DataFrame is rendered.
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 50)


def make_dir(dir_name: str) -> None:
    """Create ``dir_name`` (and any missing parent directories) if needed.

    Calling this for a directory that already exists is a no-op.

    Args:
        dir_name: Path of the directory to create.

    Raises:
        FileExistsError: If ``dir_name`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of
    # `if not os.path.exists(...): os.makedirs(...)`, where another process
    # could create the directory between the two calls.
    os.makedirs(dir_name, exist_ok=True)


# basic process

基本的なプロセス  
2つのデータを読み込み、PATIENT_IDカラムをキーとして結合する  
その後、PATIENT_IDが「MB~」と「MTS-T~」の2系統に分かれているので、データを分割し、それぞれ保存する

In [2]:
# Read the patient-level and sample-level clinical tables from the raw
# METABRIC directory configured in `config`.
# header=4: the row at 0-based index 4 supplies the column names; the rows
# before it are not read as data.
df_patient, df_sample = (
    pd.read_table(config.RAW_BRCA_METABRIC_DIR + file_suffix, header=4)
    for file_suffix in (
        "/data_clinical_patient.txt",
        "/data_clinical_sample.txt",
    )
)


In [3]:
# (rows, columns) of each raw table, patient table first.
tuple(frame.shape for frame in (df_patient, df_sample))

((2509, 24), (2509, 13))

In [4]:
# Check for duplicated columns: place both tables side by side, then flag
# every column whose values are identical to those of an earlier column.
df_tmp = pd.concat((df_patient, df_sample), axis="columns")
display(df_tmp.shape)
df_tmp.transpose().duplicated()

(2509, 37)

PATIENT_ID                       False
LYMPH_NODES_EXAMINED_POSITIVE    False
NPI                              False
CELLULARITY                      False
CHEMOTHERAPY                     False
COHORT                           False
ER_IHC                           False
HER2_SNP6                        False
HORMONE_THERAPY                  False
INFERRED_MENOPAUSAL_STATE        False
SEX                              False
INTCLUST                         False
AGE_AT_DIAGNOSIS                 False
OS_MONTHS                        False
OS_STATUS                        False
CLAUDIN_SUBTYPE                  False
THREEGENE                        False
VITAL_STATUS                     False
LATERALITY                       False
RADIO_THERAPY                    False
HISTOLOGICAL_SUBTYPE             False
BREAST_SURGERY                   False
RFS_STATUS                       False
RFS_MONTHS                       False
PATIENT_ID                        True
SAMPLE_ID                

df_patientとdf_sampleを列方向にconcatした場合、2つのPATIENT_ID列とSAMPLE_ID列は内容が完全に同じ

In [5]:
# Join the patient-level and sample-level tables on PATIENT_ID.
# The key is named explicitly: with no `on=`, pd.merge joins on every column
# the two frames share, so a newly shared column would silently change the
# join. validate="one_to_one" makes pandas raise MergeError if PATIENT_ID is
# not unique in either table, instead of silently multiplying rows.
df_merged = pd.merge(
    df_patient,
    df_sample,
    on="PATIENT_ID",
    how="inner",
    validate="one_to_one",
)
df_merged.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2509 entries, 0 to 2508
Data columns (total 36 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   PATIENT_ID                     2509 non-null   object 
 1   LYMPH_NODES_EXAMINED_POSITIVE  2243 non-null   float64
 2   NPI                            2287 non-null   float64
 3   CELLULARITY                    1917 non-null   object 
 4   CHEMOTHERAPY                   1980 non-null   object 
 5   COHORT                         2498 non-null   float64
 6   ER_IHC                         2426 non-null   object 
 7   HER2_SNP6                      1980 non-null   object 
 8   HORMONE_THERAPY                1980 non-null   object 
 9   INFERRED_MENOPAUSAL_STATE      1980 non-null   object 
 10  SEX                            2509 non-null   object 
 11  INTCLUST                       1980 non-null   object 
 12  AGE_AT_DIAGNOSIS               2498 non-null   f

In [6]:
# Baseline before any cleaning: shape of the merged table and the number of
# missing values in each column.
(df_merged.shape, df_merged.isna().sum())

((2509, 36),
 PATIENT_ID                         0
 LYMPH_NODES_EXAMINED_POSITIVE    266
 NPI                              222
 CELLULARITY                      592
 CHEMOTHERAPY                     529
 COHORT                            11
 ER_IHC                            83
 HER2_SNP6                        529
 HORMONE_THERAPY                  529
 INFERRED_MENOPAUSAL_STATE        529
 SEX                                0
 INTCLUST                         529
 AGE_AT_DIAGNOSIS                  11
 OS_MONTHS                        528
 OS_STATUS                        528
 CLAUDIN_SUBTYPE                  529
 THREEGENE                        745
 VITAL_STATUS                     529
 LATERALITY                       639
 RADIO_THERAPY                    529
 HISTOLOGICAL_SUBTYPE             135
 BREAST_SURGERY                   554
 RFS_STATUS                        21
 RFS_MONTHS                       121
 SAMPLE_ID                          0
 CANCER_TYPE                        0

In [7]:
# Split the merged table into two groups by PATIENT_ID prefix.
# str.startswith is used instead of str.contains: contains() performs an
# unanchored regex search, so it would also match the text in the middle of
# an ID and could put one row into both groups.
df_MB = df_merged[df_merged["PATIENT_ID"].str.startswith("MB")]
df_MTST = df_merged[df_merged["PATIENT_ID"].str.startswith("MTS-T")]

# The two prefixes cannot both match one ID, so equal totals mean every row
# landed in exactly one group.
assert len(df_MB) + len(df_MTST) == len(df_merged), (
    "PATIENT_ID prefixes 'MB' and 'MTS-T' do not cover every row of df_merged"
)
df_MB.shape, df_MTST.shape

((1985, 36), (524, 36))

In [8]:
# Persist the merged table and both subsets as pickles in the interim
# directory, creating that directory first if it does not exist.
pickle_dir = config.INTERIM_PICKLE_PREPROCESSED_DIR
make_dir(pickle_dir)
df_merged.to_pickle(pickle_dir + '/2.0-df_merged.pkl')
df_MB.to_pickle(pickle_dir + '/2.0-df_MB.pkl')
df_MTST.to_pickle(pickle_dir + '/2.0-df_MTST.pkl')