In [1]:
import os


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from seaborn_analyzer import CustomPairPlot
from IPython.display import HTML
import sweetviz as sv

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# The experimental enable_iterative_imputer import must come before the
# sklearn.impute import below; keep these two statements in this order.
from sklearn.experimental import (
    enable_iterative_imputer,
)  # required in order to import IterativeImputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

from tqdm import tqdm


import config

# Random seed shared by the notebook, taken from the local config module.
SEED = config.SEED


# Star-import of the project's helper module. It intentionally stays after the
# SEED assignment, exactly as in the original cell, because a star-import can
# rebind names that already exist.
from functions import *

# fix_seed is not defined in this cell; presumably it is provided by the
# star-import above -- verify in functions.py.
fix_seed(SEED)

# Display limits for pandas output: at most 50 columns and 50 rows are shown,
# and the console width is 2000 characters.
pd.options.display.max_columns = 50
pd.options.display.max_rows = 50
pd.options.display.width = 2000

  y: pd.Series(),


In [4]:
# Prerequisite: 2.0.0 must already have been run
# (presumably the step that writes the X.pkl read below -- TODO confirm).
df_clinical = pd.read_pickle(
    config.INTERIM_PICKLE_PREPROCESSED_OS5YEARS_CLINICAL_DIR + "/X.pkl"
)

# Read the raw expression table with its first column as the index, transpose
# it, remove the "Entrez_Gene_Id" row that the transpose produces, and order
# the remaining rows by their index labels.
df_mrna_agilent_microarray = (
    pd.read_table(
        config.RAW_BRCA_METABRIC_DIR + "/data_mrna_agilent_microarray.txt",
        index_col=0,
    )
    .transpose()
    .drop(index="Entrez_Gene_Id")
    .sort_index()
)

In [5]:
# Display the last five rows of the clinical table.
df_clinical.tail(n=5)

Unnamed: 0,LYMPH_NODES_EXAMINED_POSITIVE,NPI,CELLULARITY,CHEMOTHERAPY,INFERRED_MENOPAUSAL_STATE,AGE_AT_DIAGNOSIS,GRADE,PR_STATUS,TMB_NONSYNONYMOUS,ER_IHC,HER2_SNP6,ER_STATUS,HER2_STATUS,TUMOR_SIZE,HORMONE_THERAPY,RADIO_THERAPY,RFS_STATUS,RFS_MONTHS,COHORT_2.0,COHORT_3.0,COHORT_4.0,COHORT_5.0,INTCLUST_10,INTCLUST_2,INTCLUST_3,...,INTCLUST_6,INTCLUST_7,INTCLUST_8,INTCLUST_9,CLAUDIN_SUBTYPE_Her2,CLAUDIN_SUBTYPE_LumA,CLAUDIN_SUBTYPE_LumB,CLAUDIN_SUBTYPE_NC,CLAUDIN_SUBTYPE_Normal,CLAUDIN_SUBTYPE_claudin-low,HISTOLOGICAL_SUBTYPE_Lobular,HISTOLOGICAL_SUBTYPE_Medullary,HISTOLOGICAL_SUBTYPE_Mixed,HISTOLOGICAL_SUBTYPE_Mucinous,HISTOLOGICAL_SUBTYPE_Other,HISTOLOGICAL_SUBTYPE_Tubular/ cribriform,BREAST_SURGERY_MASTECTOMY,CANCER_TYPE_DETAILED_Breast Invasive Ductal Carcinoma,CANCER_TYPE_DETAILED_Breast Invasive Lobular Carcinoma,CANCER_TYPE_DETAILED_Breast Invasive Mixed Mucinous Carcinoma,CANCER_TYPE_DETAILED_Breast Mixed Ductal and Lobular Carcinoma,ONCOTREE_CODE_IDC,ONCOTREE_CODE_ILC,ONCOTREE_CODE_IMMC,ONCOTREE_CODE_MDLC
1578,1.0,5.05,3,0,0,43.1,3,1,5.230071,1,2,1,0,25.0,1,1,0,194.28,0,0,1,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0
1579,1.0,5.04,3,0,0,42.88,3,0,7.845106,1,3,1,1,20.0,0,1,1,16.09,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0
1580,45.0,6.05,3,0,1,62.9,3,1,5.230071,1,2,1,0,25.0,1,1,1,121.18,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0
1581,12.0,5.05,2,0,1,61.16,2,1,19.612766,1,2,1,0,25.0,1,0,0,85.1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0
1582,1.0,5.04,3,0,1,60.02,3,0,3.922553,1,2,1,0,20.0,1,1,0,199.24,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0


In [6]:
# Display the last five rows of the transposed expression table.
df_mrna_agilent_microarray.tail(n=5)

Hugo_Symbol,RERE,RNF165,CD049690,BC033982,PHF7,CIDEA,PAPD4,AI082173,SLC17A3,SDS,ATP6V1C2,F3,FAM71C,AK055082,BU687559,LIN52,PCOTH,GRM1,FXN,SLC9A1,PML,CR749394,CD164,DB337918,MOBKL2A,...,TMEM146,SLIT3-AS1,PSMC3IP,TEAD1,LOC253724,BM725158,GRHPR,EGLN2,TBC1D4,ANGPTL2,AF086457,AK124197,ENOX1,SBF2-AS1,VN1R4,BX115874,BX107598,UGCGL1,VPS72,CSMD3,CC2D1A,CB986545,IGSF9,DA110839,FAM71A
MB-7295,8.589374,6.002483,5.116077,4.996112,5.802742,5.47455,8.25169,5.373561,5.546402,6.532649,5.535139,6.28202,5.440358,5.637957,5.172188,6.486128,6.074573,5.281556,6.05833,8.722802,5.891944,5.408865,6.315133,5.54577,6.707501,...,5.394435,5.324246,6.132406,5.471503,5.372043,5.36438,8.455392,10.303605,6.686198,9.987552,5.354638,5.529167,5.722099,5.240904,5.521765,5.488675,5.461927,7.536208,8.3145,5.3373,6.278034,5.426899,6.923887,5.251896,5.168953
MB-7296,8.40266,6.104059,5.443955,5.224513,5.256086,6.659117,8.641838,5.48837,5.522964,7.102716,5.353956,5.546199,5.289827,5.272168,5.424468,6.491748,5.865518,5.564988,6.132467,8.814733,6.236368,5.237184,6.25771,5.360478,6.223216,...,5.304696,5.396539,6.221306,5.315972,5.300309,5.268524,8.523291,11.353435,6.623708,8.441388,5.443857,5.644035,5.453321,5.171671,5.441341,5.37591,5.393811,7.709596,8.105717,5.397931,6.325456,5.217163,6.524268,5.505418,5.252479
MB-7297,8.236918,5.40287,5.533742,5.468793,5.571897,5.439574,8.979375,5.28346,5.804127,6.59389,5.353883,5.705768,5.5313,5.313886,5.212158,6.906297,6.116738,5.132988,6.031882,8.53087,5.744546,5.287748,6.865706,5.380624,6.589864,...,5.466731,5.291783,7.066555,5.382979,5.550651,5.335161,8.286103,10.428968,6.769603,6.564861,5.591439,5.597712,5.334787,5.18292,5.339665,5.487068,5.406575,7.729912,7.944622,5.412713,6.254337,5.405552,6.121864,5.46227,5.357823
MB-7298,8.376571,5.617954,5.375647,5.305559,5.631592,5.734358,8.628511,5.232599,5.472525,6.175189,5.473054,5.963092,5.308848,5.491198,5.050838,6.507257,5.484692,5.355723,6.003971,9.396055,5.921776,5.54512,6.611871,5.362488,6.783883,...,5.294014,5.389626,6.250953,5.489047,5.725481,5.3823,8.473675,10.452393,6.571327,7.32375,5.334621,5.560818,5.406241,5.351516,5.361063,5.214711,5.689935,7.329023,8.348807,5.474224,6.415853,5.344851,7.029076,5.519022,5.51229
MB-7299,9.283964,7.137196,5.221194,5.54935,5.462977,5.222914,8.25169,5.36538,5.660058,6.683335,5.327812,5.818325,5.066311,5.361826,5.256033,6.292127,5.387614,5.342313,5.98311,8.88337,6.408964,5.444065,6.55785,5.416992,6.794207,...,5.281974,5.253435,6.125892,5.466416,5.467026,5.384024,8.202645,10.394254,6.886155,8.423223,5.312663,5.772291,5.646755,5.479193,5.709304,5.227597,5.382574,7.763496,8.053959,5.238813,6.898575,5.353198,6.76996,5.46461,5.232599


In [8]:
# (rows, columns) of the clinical table and of the expression table, as a pair.
tuple(frame.shape for frame in (df_clinical, df_mrna_agilent_microarray))

((1583, 53), (1904, 24368))

# clinicalデータとgene expressionデータの結合

In [14]:
# Attach the expression columns to the clinical table with an inner join on the
# row index of both frames, then remove the "index" column (the former row
# labels of the expression table, moved into a column by reset_index()).
# NOTE(review): reset_index() renumbers the expression rows 0..N-1, so the two
# tables are paired by row position, not by sample ID. The recorded output shows
# 1583 clinical rows against 1904 expression rows; confirm that the integer
# index of df_clinical really refers to the matching rows of the sorted
# expression table, otherwise samples are misaligned.
df_merged = df_clinical.merge(
    df_mrna_agilent_microarray.reset_index(),
    how="inner",
    left_index=True,
    right_index=True,
).drop(columns="index")

# Print the shapes of both inputs and of the merged result on one line.
print(
    *(
        frame.shape
        for frame in (df_clinical, df_mrna_agilent_microarray, df_merged)
    )
)

# Display the last five rows of the merged table.
df_merged.tail(n=5)

(1583, 53) (1904, 24368) (1583, 24421)


Unnamed: 0,LYMPH_NODES_EXAMINED_POSITIVE,NPI,CELLULARITY,CHEMOTHERAPY,INFERRED_MENOPAUSAL_STATE,AGE_AT_DIAGNOSIS,GRADE,PR_STATUS,TMB_NONSYNONYMOUS,ER_IHC,HER2_SNP6,ER_STATUS,HER2_STATUS,TUMOR_SIZE,HORMONE_THERAPY,RADIO_THERAPY,RFS_STATUS,RFS_MONTHS,COHORT_2.0,COHORT_3.0,COHORT_4.0,COHORT_5.0,INTCLUST_10,INTCLUST_2,INTCLUST_3,...,TMEM146,SLIT3-AS1,PSMC3IP,TEAD1,LOC253724,BM725158,GRHPR,EGLN2,TBC1D4,ANGPTL2,AF086457,AK124197,ENOX1,SBF2-AS1,VN1R4,BX115874,BX107598,UGCGL1,VPS72,CSMD3,CC2D1A,CB986545,IGSF9,DA110839,FAM71A
1578,1.0,5.05,3,0,0,43.1,3,1,5.230071,1,2,1,0,25.0,1,1,0,194.28,0,0,1,0,0,0,1,...,5.368408,5.216533,5.999093,5.394435,5.389051,5.152642,8.249881,10.816712,6.454274,9.25692,5.528259,5.771137,5.747687,5.396607,5.431527,5.556651,5.515717,7.535728,8.487352,5.287668,6.156895,5.348169,7.806515,5.189215,5.2986
1579,1.0,5.04,3,0,0,42.88,3,0,7.845106,1,3,1,1,20.0,0,1,1,16.09,0,0,1,0,0,0,0,...,5.384797,5.723339,6.304016,5.408074,5.346244,5.259019,7.798771,10.889767,6.421828,8.713788,5.454481,5.683612,5.381797,5.557409,5.484585,5.42592,5.313217,7.854612,7.832333,5.430415,6.013555,5.313546,6.806361,5.648131,5.060495
1580,45.0,6.05,3,0,1,62.9,3,1,5.230071,1,2,1,0,25.0,1,1,1,121.18,0,0,1,0,0,0,0,...,5.379413,5.644884,5.808844,5.195478,5.245868,5.449873,8.260991,10.649259,6.973637,8.629244,5.59347,5.469359,5.521638,5.642379,5.441612,5.278494,5.402007,7.631791,8.254785,5.280922,5.969769,5.109117,7.247896,5.323828,5.30824
1581,12.0,5.05,2,0,1,61.16,2,1,19.612766,1,2,1,0,25.0,1,0,0,85.1,0,0,1,0,0,0,0,...,5.324631,5.550131,6.053629,5.399861,5.444989,5.296182,7.519804,12.472484,6.729247,7.620289,5.381888,6.207269,5.269649,5.46059,5.353742,5.666947,5.469093,8.205037,8.492129,5.46507,6.385105,5.392199,6.928579,5.439369,5.274503
1582,1.0,5.04,3,0,1,60.02,3,0,3.922553,1,2,1,0,20.0,1,1,0,199.24,0,0,1,0,1,0,0,...,5.346286,5.266171,6.216931,5.574385,5.469051,5.372375,7.836978,11.220022,6.986047,9.266545,5.382705,6.368075,5.849225,5.581379,5.536659,5.358802,5.518456,7.797744,7.763496,5.248346,6.447561,5.264026,6.990406,5.308361,5.511644


# 目的クラスの生成

## 考えうる組み合わせ
1. 治療法とOSの組み合わせ（先行研究）
2. CLAUDIN_SUBTYPEとOSの組み合わせ（LumAなどが重要？）
3. 再発の有無で場合分け（再発していない場合の死亡事例は皆無である）