# Preprocess PanCancer data

In [197]:
import pandas as pd
import numpy as np

# Load the expression matrix (tab-separated, first column used as the index),
# then sort both the row labels and the column labels.
df_gene_exp = pd.read_table("./tcga_RSEM_gene_tpm", sep='\t', index_col=0)
df_gene_exp = df_gene_exp.sort_index(axis='rows').sort_index(axis='columns')

# Rows are genes and columns are samples
n_genes, n_samples = df_gene_exp.shape
print("Genes={}; Samples={};".format(n_genes, n_samples))
df_gene_exp.head()

Genes=60498; Samples=10535;


Unnamed: 0_level_0,TCGA-02-0047-01,TCGA-02-0055-01,TCGA-02-2483-01,TCGA-02-2485-01,TCGA-04-1331-01,TCGA-04-1332-01,TCGA-04-1337-01,TCGA-04-1338-01,TCGA-04-1341-01,TCGA-04-1343-01,...,TCGA-ZR-A9CJ-01,TCGA-ZS-A9CD-01,TCGA-ZS-A9CE-01,TCGA-ZS-A9CF-01,TCGA-ZS-A9CF-02,TCGA-ZS-A9CG-01,TCGA-ZT-A8OM-01,TCGA-ZU-A8S4-01,TCGA-ZU-A8S4-11,TCGA-ZX-AA5X-01
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003.14,5.4712,5.1498,5.6448,6.1709,5.7911,4.1907,4.3463,3.9856,4.0251,4.2921,...,4.4784,6.033,5.4845,5.1363,5.1583,7.1371,1.5998,4.656,5.323,4.8115
ENSG00000000005.5,-3.1714,4.1652,-5.5735,-3.1714,-2.6349,-2.3147,-5.0116,-5.5735,-4.2934,0.9115,...,-5.0116,-9.9658,-9.9658,-5.5735,-4.6082,-0.2671,-1.1172,-9.9658,-9.9658,-3.1714
ENSG00000000419.12,4.6753,6.0251,5.8263,5.1768,5.7963,4.3169,6.8252,5.243,4.9031,6.5546,...,6.7702,5.067,4.6611,4.5261,4.6317,4.8798,2.8321,5.5874,4.0037,5.2192
ENSG00000000457.13,2.0742,2.1013,1.9564,2.4198,2.1988,0.8246,1.1641,1.5013,0.5955,0.3685,...,2.1988,1.8762,2.128,3.0428,3.5473,2.1313,-0.6873,1.787,0.9642,2.5061
ENSG00000000460.16,2.2573,2.4571,2.5036,3.0995,2.8442,1.4281,1.0007,1.4174,0.7407,0.9419,...,3.0498,0.044,0.2522,1.8036,2.4623,3.0825,2.1444,2.6208,0.5955,2.6624


In [198]:
# True if any cell of the expression matrix is missing
df_gene_exp.isna().to_numpy().any()

False

# Data exploration

We now explore the sample types (tumor or normal), and then some of the clinical information associated with the samples.

## Tumor-Normal binary variable

We first load a dataset that contains information about the PanCancer sample types and diseases.

In [200]:
# Load the sample-type / disease table (tab-separated, first column as index) sorted by sample id
df_pancan_sample = pd.read_csv("./TCGA_phenotype_denseDataOnlyDownload.tsv", sep='\t', index_col=0)
df_pancan_sample = df_pancan_sample.sort_index(axis='rows')
pancan_sample = df_pancan_sample.index

# Table dimensions, then whether any sample id appears more than once
print(df_pancan_sample.shape)
print(pancan_sample.duplicated().any())
df_pancan_sample.head()

(12804, 3)
False


Unnamed: 0_level_0,sample_type_id,sample_type,_primary_disease
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TCGA-01-0628-11,11.0,Solid Tissue Normal,ovarian serous cystadenocarcinoma
TCGA-01-0629-11,,,ovarian serous cystadenocarcinoma
TCGA-01-0630-11,11.0,Solid Tissue Normal,ovarian serous cystadenocarcinoma
TCGA-01-0631-11,11.0,Solid Tissue Normal,ovarian serous cystadenocarcinoma
TCGA-01-0633-11,11.0,Solid Tissue Normal,ovarian serous cystadenocarcinoma


There are no duplicated samples.

We select the PanCancer samples contained both in the expression and samples type datasets:

In [201]:
# Sample ids present both as expression-matrix columns and in the sample-type table
pancan_sample_common = df_gene_exp.columns.intersection(pancan_sample)
len(pancan_sample_common)

10534

In [203]:
# Restrict the sample-type table to the samples shared with the expression matrix
df_pancan_sample = df_pancan_sample.loc[pancan_sample_common, :]
df_pancan_sample.shape

(10534, 3)

In [204]:
# Per-column flag: does the column contain at least one missing value?
df_pancan_sample.isna().any()

sample_type_id      False
sample_type         False
_primary_disease    False
dtype: bool

In [205]:
# Absolute number of samples per sample type
df_pancan_sample["sample_type"].value_counts()

Primary Tumor                                      9185
Solid Tissue Normal                                 727
Metastatic                                          392
Primary Blood Derived Cancer - Peripheral Blood     173
Recurrent Tumor                                      45
Additional - New Primary                             11
Additional Metastatic                                 1
Name: sample_type, dtype: int64

We create a tumor/normal binary variable using the sample type, with no NA values in the column:

In [206]:
# Binary tumor/normal label: "Solid Tissue Normal" -> "Normal", every other value
# (including a missing sample type, should one occur) -> "Tumor".
# Vectorised with np.where instead of a row-wise DataFrame.apply, which calls a
# Python lambda once per row.
df_pancan_sample["tumor_normal"] = np.where(
    df_pancan_sample["sample_type"] == "Solid Tissue Normal", "Normal", "Tumor")

In [207]:
# Absolute number of tumor and normal samples
df_pancan_sample["tumor_normal"].value_counts()

Tumor     9807
Normal     727
Name: tumor_normal, dtype: int64

## Clinical variables

In [208]:
# Load the clinical / survival table (first column as index) sorted by sample id
df_pancan_clinical = pd.read_table("./Survival_SupplementalTable_S1_20171025_xena_sp.tsv", index_col=0)
df_pancan_clinical = df_pancan_clinical.sort_index(axis='rows')

print(df_pancan_clinical.shape)
df_pancan_clinical.head()

(12591, 35)


Unnamed: 0_level_0,_PATIENT,cancer type abbreviation,age_at_initial_pathologic_diagnosis,gender,race,ajcc_pathologic_tumor_stage,clinical_stage,histological_type,histological_grade,initial_pathologic_dx_year,...,_TIME_TO_EVENT,OS,OS.time,DSS,DSS.time,DFI,DFI.time,PFI,PFI.time,Redaction
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-02-0001-01,TCGA-02-0001,GBM,44.0,FEMALE,WHITE,,,Untreated primary (de novo) GBM,,2002.0,...,358.0,1.0,358.0,1.0,358.0,,,1.0,137.0,
TCGA-02-0003-01,TCGA-02-0003,GBM,50.0,MALE,WHITE,,,Untreated primary (de novo) GBM,,2003.0,...,144.0,1.0,144.0,1.0,144.0,,,1.0,40.0,
TCGA-02-0006-01,TCGA-02-0006,GBM,56.0,FEMALE,WHITE,,,Untreated primary (de novo) GBM,,2002.0,...,558.0,1.0,558.0,1.0,558.0,,,1.0,302.0,
TCGA-02-0007-01,TCGA-02-0007,GBM,40.0,FEMALE,WHITE,,,Treated primary GBM,,2002.0,...,705.0,1.0,705.0,1.0,705.0,,,1.0,518.0,
TCGA-02-0009-01,TCGA-02-0009,GBM,61.0,FEMALE,WHITE,,,Untreated primary (de novo) GBM,,2003.0,...,322.0,1.0,322.0,1.0,322.0,,,1.0,264.0,


In [209]:
# Sample ids of the clinical table, and whether any of them appears more than once
pancan_clinical = df_pancan_clinical.index
pancan_clinical.duplicated().any()

False

We select the PanCancer samples contained both in the expression and clinical datasets:

In [210]:
# Sample ids present both as expression-matrix columns and in the clinical table
pancan_clinical_common = df_gene_exp.columns.intersection(pancan_clinical)
len(pancan_clinical_common)

10496

In [211]:
# Restrict the clinical table to the samples shared with the expression matrix
df_pancan_clinical = df_pancan_clinical.loc[pancan_clinical_common, :]
df_pancan_clinical.shape

(10496, 35)

In [212]:
# Overall survival: number of samples with a recorded value, then class proportions
variable = "OS"
n_with_info = df_pancan_clinical[variable].count()  # non-missing entries only
print("Number of samples with this information:", n_with_info)

df_pancan_clinical[variable].value_counts(normalize=True)

Number of samples with this information: 10489


0.0    0.687196
1.0    0.312804
Name: OS, dtype: float64

In [213]:
# Progression-free interval: number of samples with a recorded value, then class proportions
variable = "PFI"
n_with_info = df_pancan_clinical[variable].count()  # non-missing entries only
print("Number of samples with this information:", n_with_info)

df_pancan_clinical[variable].value_counts(normalize=True)

Number of samples with this information: 10316


0.0    0.653742
1.0    0.346258
Name: PFI, dtype: float64

In [214]:
# Disease-specific survival: number of samples with a recorded value, then class proportions
variable = "DSS"
n_with_info = df_pancan_clinical[variable].count()  # non-missing entries only
print("Number of samples with this information:", n_with_info)

df_pancan_clinical[variable].value_counts(normalize=True)

Number of samples with this information: 10013


0.0    0.785978
1.0    0.214022
Name: DSS, dtype: float64

In [215]:
# Disease-free interval: number of samples with a recorded value, then class proportions
variable = "DFI"
n_with_info = df_pancan_clinical[variable].count()  # non-missing entries only
print("Number of samples with this information:", n_with_info)

df_pancan_clinical[variable].value_counts(normalize=True)

Number of samples with this information: 5335


0.0    0.797751
1.0    0.202249
Name: DFI, dtype: float64

In [216]:
# Per-gene standard deviation across samples: df_gene_exp is genes x samples at this
# point, so it is transposed to put one gene per column and reduced along axis 0.
# NOTE(review): np.std defaults to ddof=0 (population std) — confirm this is intended.
gene_std = np.std(df_gene_exp.T, axis=0)
# How many genes share each distinct standard-deviation value
gene_std.value_counts()

1.776357e-15    5055
2.442082e+00      16
2.824627e-01      11
1.057468e+00      11
1.037920e-01      10
                ... 
2.651409e+00       1
2.901854e+00       1
6.161371e-01       1
3.080871e+00       1
2.216237e+00       1
Length: 55068, dtype: int64

In [218]:
# Genes whose standard deviation is numerically zero (below 1e-10) are treated as constant
is_constant = gene_std < 1e-10
constant_genes = gene_std.loc[is_constant]
constant_genes.shape

(5055,)

In [219]:
# Samples x genes matrix with the constant genes removed (a new frame; df_gene_exp is untouched)
df_filter_gene_exp = df_gene_exp.T.drop(columns=constant_genes.index)

In [220]:
# (samples, genes) after dropping the constant genes
df_filter_gene_exp.shape

(10535, 55443)

In [222]:
# NOTE(review): consider moving this import to the notebook's top import cell.
from statsmodels import robust

# Median absolute deviation of every gene (column) across samples,
# re-labelled with the gene identifiers of the filtered matrix.
mad_genes = pd.Series(
    robust.scale.mad(df_filter_gene_exp, axis=0),
    index=df_filter_gene_exp.columns,
)
print(mad_genes.shape)
mad_genes.describe()

(55443,)


count    55443.000000
mean         0.940348
std          1.401372
min          0.000000
25%          0.000000
50%          0.000000
75%          1.466294
max         11.234270
dtype: float64

In [223]:
# Count and fraction of genes whose MAD is exactly zero, then keep only the remaining genes
is_zero_mad = mad_genes == 0
print(is_zero_mad.sum(), is_zero_mad.sum() / len(mad_genes))
no_zero_mad_genes = mad_genes[~is_zero_mad]
no_zero_mad_genes.shape

28662 0.5169633677831286


(26781,)

In [224]:
# Distribution of the MAD values once the zero-MAD genes are excluded
no_zero_mad_genes.describe()

count    26781.000000
mean         1.946742
std          1.451355
min          0.402230
25%          0.914469
50%          1.504990
75%          2.363564
max         11.234270
dtype: float64

In [225]:
# Rank genes by decreasing MAD and keep the first 20000 positions
mad_ranked = no_zero_mad_genes.sort_values(ascending=False)
filter_mad_genes = mad_ranked.iloc[:20000]
filter_mad_genes.shape

(20000,)

In [226]:
# Distribution of the MAD values of the retained genes
filter_mad_genes.describe()

count    20000.000000
mean         2.363739
std          1.459286
min          0.920844
25%          1.367404
50%          1.895210
75%          2.814276
max         11.234270
dtype: float64

In [227]:
# Expression matrix restricted to the MAD-selected genes, columns ordered by decreasing MAD
df_filter_gene_exp_20 = df_filter_gene_exp.loc[:, filter_mad_genes.index]
df_filter_gene_exp_20.shape

(10535, 20000)

In [228]:
# NOTE(review): this rebinds df_gene_exp from the original genes x samples matrix to the
# filtered samples x genes matrix. Re-running the earlier cells that transpose df_gene_exp
# after this point would operate on the wrong orientation; restart the kernel before re-running.
df_gene_exp = df_filter_gene_exp_20
df_gene_exp.head()

sample,ENSG00000160182.2,ENSG00000279009.1,ENSG00000257767.2,ENSG00000211935.3,ENSG00000105388.14,ENSG00000129455.15,ENSG00000143556.8,ENSG00000230937.9,ENSG00000242371.1,ENSG00000131002.11,...,ENSG00000179454.13,ENSG00000099974.7,ENSG00000168807.16,ENSG00000146067.15,ENSG00000114127.10,ENSG00000233476.3,ENSG00000169241.17,ENSG00000184428.12,ENSG00000275202.1,ENSG00000267544.1
TCGA-02-0047-01,-9.9658,-9.9658,5.1811,-1.4305,-4.035,-3.1714,-2.7274,-9.9658,0.346,3.483,...,2.5213,1.5709,2.2051,4.8059,2.1606,4.0454,3.7614,2.9432,1.1833,0.1124
TCGA-02-0055-01,-9.9658,1.2756,1.7532,-0.9132,-9.9658,-9.9658,-2.2447,-9.9658,0.1519,-4.6082,...,0.8246,1.9712,2.4386,4.6697,1.8282,4.9842,5.3509,3.3856,0.5955,0.4447
TCGA-02-2483-01,-9.9658,2.7314,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,3.6042,...,1.5064,0.605,1.2333,3.8611,1.3679,3.9883,4.1211,4.6277,-0.013,0.9493
TCGA-02-2485-01,-9.9658,-9.9658,-9.9658,-2.8262,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,3.863,...,1.8524,1.766,1.5758,4.889,2.167,4.1587,3.5399,4.4176,0.9862,1.6604
TCGA-04-1331-01,-1.1172,-9.9658,-9.9658,-9.9658,-9.9658,7.5607,-9.9658,4.1764,-1.2481,-9.9658,...,1.1577,0.7321,2.4753,5.9693,1.7009,2.2573,5.2235,4.2795,-0.4521,-0.6193


In [229]:
# Sample ids of the (already subset) clinical table, and whether any id is duplicated
survival_sample = df_pancan_clinical.index
survival_sample.duplicated().any()

False

In [230]:
# Dimensions and first rows of the clinical table
print(df_pancan_clinical.shape)
df_pancan_clinical.head()

(10496, 35)


Unnamed: 0,_PATIENT,cancer type abbreviation,age_at_initial_pathologic_diagnosis,gender,race,ajcc_pathologic_tumor_stage,clinical_stage,histological_type,histological_grade,initial_pathologic_dx_year,...,_TIME_TO_EVENT,OS,OS.time,DSS,DSS.time,DFI,DFI.time,PFI,PFI.time,Redaction
TCGA-02-0047-01,TCGA-02-0047,GBM,78.0,MALE,WHITE,,,Untreated primary (de novo) GBM,,2005.0,...,448.0,1.0,448.0,1.0,448.0,,,1.0,57.0,
TCGA-02-0055-01,TCGA-02-0055,GBM,62.0,FEMALE,WHITE,,,Untreated primary (de novo) GBM,,2005.0,...,76.0,1.0,76.0,1.0,76.0,,,1.0,6.0,
TCGA-02-2483-01,TCGA-02-2483,GBM,43.0,MALE,ASIAN,,,Untreated primary (de novo) GBM,,2008.0,...,466.0,0.0,466.0,0.0,466.0,,,0.0,466.0,
TCGA-02-2485-01,TCGA-02-2485,GBM,53.0,MALE,BLACK OR AFRICAN AMERICAN,,,Untreated primary (de novo) GBM,,2009.0,...,470.0,0.0,470.0,0.0,470.0,,,1.0,186.0,
TCGA-04-1331-01,TCGA-04-1331,OV,78.0,FEMALE,WHITE,,Stage IIIC,Serous Cystadenocarcinoma,G3,2004.0,...,1336.0,1.0,1336.0,1.0,1336.0,1.0,459.0,1.0,459.0,Redacted


In [231]:
# Number of samples per cancer type, with missing values counted as their own category
cancer_type = df_pancan_clinical['cancer type abbreviation']
cancer_type.value_counts(dropna=False)

BRCA    1211
KIRC     603
LUAD     574
THCA     571
HNSC     564
LUSC     548
PRAD     548
LGG      522
SKCM     470
STAD     450
OV       427
BLCA     426
LIHC     421
COAD     329
KIRP     321
CESC     309
SARC     264
ESCA     195
UCEC     194
PCPG     185
PAAD     183
LAML     173
GBM      165
TGCT     137
THYM     121
READ     102
KICH      91
MESO      87
UVM       79
ACC       77
UCS       57
DLBC      47
CHOL      45
Name: cancer type abbreviation, dtype: int64

In [232]:
# Lung samples (LUAD and LUSC) are selected from the survival dataset, as we are interested in survival analysis.
# Series.isin replaces the per-element Python lambda; a missing cancer type is not selected.
lung_sample = df_pancan_clinical[df_pancan_clinical['cancer type abbreviation'].isin(["LUAD", "LUSC"])].index
lung_sample.shape

(1122,)

In [233]:
# Dimensions and first rows of the sample-type table (now including the tumor_normal column)
print(df_pancan_sample.shape)
df_pancan_sample.head()

(10534, 4)


Unnamed: 0,sample_type_id,sample_type,_primary_disease,tumor_normal
TCGA-02-0047-01,1.0,Primary Tumor,glioblastoma multiforme,Tumor
TCGA-02-0055-01,1.0,Primary Tumor,glioblastoma multiforme,Tumor
TCGA-02-2483-01,1.0,Primary Tumor,glioblastoma multiforme,Tumor
TCGA-02-2485-01,1.0,Primary Tumor,glioblastoma multiforme,Tumor
TCGA-04-1331-01,1.0,Primary Tumor,ovarian serous cystadenocarcinoma,Tumor


In [234]:
# Expression rows of the lung samples
df_gene_exp_lung = df_gene_exp.loc[lung_sample, :]
df_gene_exp_lung.shape

(1122, 20000)

In [235]:
# Sample-type rows of the lung samples
df_sample_lung = df_pancan_sample.loc[lung_sample, :]
df_sample_lung.shape

(1122, 4)

In [236]:
# Clinical / survival rows of the lung samples
df_survival_lung = df_pancan_clinical.loc[lung_sample]
df_survival_lung.shape

(1122, 35)

In [237]:
# Tumor/normal variable
variable = "tumor_normal"
# Count only non-missing entries: the previous sum over value_counts(dropna=False)
# would have counted missing values as samples "with this information".
print("Number of samples with this information:",
      df_sample_lung[variable].count())

# Full breakdown, with missing values (if any) shown as their own category
df_sample_lung[variable].value_counts(normalize=False, dropna=False)

Number of samples with this information: 1122


Tumor     1013
Normal     109
Name: tumor_normal, dtype: int64

In [238]:
# Sample type variable
variable = "sample_type"
# Count only non-missing entries: the previous sum over value_counts(dropna=False)
# would have counted missing values as samples "with this information".
print("Number of samples with this information:",
      df_sample_lung[variable].count())

# Full breakdown, with missing values (if any) shown as their own category
df_sample_lung[variable].value_counts(normalize=False, dropna=False)

Number of samples with this information: 1122


Primary Tumor          1011
Solid Tissue Normal     109
Recurrent Tumor           2
Name: sample_type, dtype: int64

In [239]:
# Overall survival in the lung subset: non-missing count, then counts including missing values
lung_os = df_survival_lung["OS"]
print("Number of samples with this information:", lung_os.count())

lung_os.value_counts(dropna=False)

Number of samples with this information: 1122


0.0    663
1.0    459
Name: OS, dtype: int64

In [240]:
# Disease-specific survival in the lung subset: non-missing count, then counts including missing values
lung_dss = df_survival_lung["DSS"]
print("Number of samples with this information:", lung_dss.count())

lung_dss.value_counts(dropna=False)

Number of samples with this information: 1023


0.0    791
1.0    232
NaN     99
Name: DSS, dtype: int64

In [241]:
# Progression-free interval in the lung subset: non-missing count, then counts including missing values
lung_pfi = df_survival_lung['PFI']
print("Number of samples with this information:", lung_pfi.count())

lung_pfi.value_counts(dropna=False)

Number of samples with this information: 1122


0.0    726
1.0    396
Name: PFI, dtype: int64

In [242]:
# Disease-free interval in the lung subset: non-missing count, then counts including missing values
lung_dfi = df_survival_lung['DFI']
print("Number of samples with this information:", lung_dfi.count())

lung_dfi.value_counts(dropna=False)

Number of samples with this information: 664


0.0    502
NaN    458
1.0    162
Name: DFI, dtype: int64

In [243]:
# Clinical-table samples that are not in the lung (LUAD/LUSC) selection
no_lung_sample = df_pancan_clinical.index.difference(lung_sample)
len(no_lung_sample)

9374

In [244]:
# Sanity check: the non-lung samples are exactly what remains of the clinical samples
# once the lung samples are removed
n_expected_no_lung = df_pancan_clinical.shape[0] - df_gene_exp_lung.shape[0]
len(no_lung_sample) == n_expected_no_lung

True

In [245]:
# Expression rows of the non-lung samples
df_gene_exp_no_lung = df_gene_exp.loc[no_lung_sample, :]
df_gene_exp_no_lung.shape

(9374, 20000)

In [246]:
# Non-lung samples that also have sample-type annotations.
# `sample_no_lung_common` is not defined in any earlier cell of this notebook, so this
# cell raised NameError on a fresh Restart & Run All. It is now built here in the same
# way as `survival_no_lung_common` is built for the clinical table.
sample_no_lung_common = df_gene_exp_no_lung.index.intersection(df_pancan_sample.index)
df_sample_no_lung = df_pancan_sample.loc[sample_no_lung_common]
df_sample_no_lung.shape

(9374, 4)

In [247]:
# Proportion of each sample type among the non-lung samples (missing values kept as a category)
df_sample_no_lung["sample_type"].value_counts(normalize=True, dropna=False)

Primary Tumor                                      0.869639
Solid Tissue Normal                                0.064327
Metastatic                                         0.041818
Primary Blood Derived Cancer - Peripheral Blood    0.018455
Recurrent Tumor                                    0.004587
Additional - New Primary                           0.001067
Additional Metastatic                              0.000107
Name: sample_type, dtype: float64

In [248]:
# Proportion of tumor vs normal among the non-lung samples (missing values kept as a category)
df_sample_no_lung["tumor_normal"].value_counts(normalize=True, dropna=False)

Tumor     0.935673
Normal    0.064327
Name: tumor_normal, dtype: float64

In [249]:
# Non-lung samples that also appear in the clinical table
survival_no_lung_common = df_gene_exp_no_lung.index.intersection(df_pancan_clinical.index)
len(survival_no_lung_common)

9374

In [250]:
# Clinical / survival rows of the non-lung samples
df_survival_no_lung = df_pancan_clinical.loc[survival_no_lung_common, :]
df_survival_no_lung.shape

(9374, 35)

In [251]:
# Overall survival in the non-lung subset: non-missing count, then proportions including missing values
variable = "OS"
n_with_info = df_survival_no_lung[variable].count()  # non-missing entries only
print("Number of samples with this information:", n_with_info)

df_survival_no_lung[variable].value_counts(normalize=True, dropna=False)

Number of samples with this information: 9367


0.0    0.698208
1.0    0.301045
NaN    0.000747
Name: OS, dtype: float64

In [252]:
# Progression-free interval in the non-lung subset: non-missing count, then proportions including missing values
variable = "PFI"
n_with_info = df_survival_no_lung[variable].count()  # non-missing entries only
print("Number of samples with this information:", n_with_info)

df_survival_no_lung[variable].value_counts(normalize=True, dropna=False)

Number of samples with this information: 9194


0.0    0.641988
1.0    0.338809
NaN    0.019202
Name: PFI, dtype: float64

In [253]:
# Disease-specific survival in the non-lung subset: non-missing count, then proportions including missing values
variable = "DSS"
n_with_info = df_survival_no_lung[variable].count()  # non-missing entries only
print("Number of samples with this information:", n_with_info)

df_survival_no_lung[variable].value_counts(normalize=True, dropna=False)

Number of samples with this information: 8990


0.0    0.755174
1.0    0.203862
NaN    0.040964
Name: DSS, dtype: float64

In [254]:
# Disease-free interval in the non-lung subset: non-missing count, then proportions including missing values
variable = "DFI"
n_with_info = df_survival_no_lung[variable].count()  # non-missing entries only
print("Number of samples with this information:", n_with_info)

df_survival_no_lung[variable].value_counts(normalize=True, dropna=False)

Number of samples with this information: 4671


NaN    0.501707
0.0    0.400469
1.0    0.097824
Name: DFI, dtype: float64

In [257]:
# Persist every derived table as a pickle file, written in the order listed
pickle_targets = [
    ('./mad_filter_pancan_all_TCGA_20.pkl', df_filter_gene_exp_20),
    ('./Lung_pancan_exp.pkl', df_gene_exp_lung),
    ('./Lung_pancan_sample.pkl', df_sample_lung),
    ('./Lung_pancan_survival.pkl', df_survival_lung),
    ('./non_Lung_pancan_exp.pkl', df_gene_exp_no_lung),
    ('./non_Lung_pancan_sampletype.pkl', df_sample_no_lung),
    ('./non_Lung_pancan_sampleclinical.pkl', df_survival_no_lung),
]
for out_path, out_frame in pickle_targets:
    out_frame.to_pickle(out_path)