# TCGA: Data load, filter, transform

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Imports
import pandas as pd

from tcga_rna import *
from util import *

In [3]:
path_raw = Path('data/tcga/raw')

cancer_types = ['ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'COAD'
                , 'COADREAD', 'COADREAD', 'DLBC', 'ESCA', 'GBM', 'GBMLGG'
                , 'HNSC', 'KICH', 'KIPAN', 'KIRC', 'KIRP', 'LAML', 'LGG'
                , 'LIHC', 'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG'
                , 'PRAD', 'READ', 'READ', 'SARC', 'SKCM', 'STAD', 'STES'
                , 'TGCT', 'THCA', 'THYM', 'UCEC', 'UCEC', 'UCS', 'UVM']

for cancer_type in cancer_types:
    print(f"\nCancer type: {cancer_type}")
    files = path_raw.find_files(f"*{cancer_type}*RSEM_genes_normalized*.txt")
    if len(files) != 1:
        print(f"ERROR: Cancer type '{cancer_type}', expecting one result, got {len(files)}: {files}")
    else:
        df = load_filter_transform(files[0])
        if df.shape[1] == 0:
            print(f"WARNING: No genes left after filtering,dataframe shape: {df.shape}")
        else:
            rnaseq_save(df, cancer_type, "rna.filtered_transformed")


Cancer type: ACC
Loading file: 'data/tcga/raw/gdac.broadinstitute.org_ACC.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2016012800.0.0/ACC.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt'
Applying filter_invalid_genes, x.shape: (20531, 79)
Applying filter_duplicated_gene_names, x.shape: (20502, 79)
Applying filter_low_normals_count, x.shape: (20500, 79)
Applying filter_too_many_missing, x.shape: (0, 79)
Applying filter_non_normals, x.shape: (0, 79)
Applying logp1_normalize, x.shape: (0, 79)
Applying rename_genes, x.shape: (0, 79)

Cancer type: BLCA
Loading file: 'data/tcga/raw/gdac.broadinstitute.org_BLCA.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2016012800.0.0/BLCA.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt'
Applying filter_invalid_genes, x.shape: (20531, 427)
Applying filter_duplicated_gene_names, x

ERROR: Cancer type 'GBM', expecting one result, got 2: [PosixPath('data/tcga/raw/gdac.broadinstitute.org_GBM.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2016012800.0.0/GBM.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt'), PosixPath('data/tcga/raw/gdac.broadinstitute.org_GBMLGG.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2016012800.0.0/GBMLGG.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt')]

Cancer type: GBMLGG
Loading file: 'data/tcga/raw/gdac.broadinstitute.org_GBMLGG.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2016012800.0.0/GBMLGG.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt'
Applying filter_invalid_genes, x.shape: (20531, 701)
Applying filter_duplicated_gene_names, x.shape: (20502, 701)
Applying filter_low_n


Cancer type: MESO
Loading file: 'data/tcga/raw/gdac.broadinstitute.org_MESO.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2016012800.0.0/MESO.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt'
Applying filter_invalid_genes, x.shape: (20531, 87)
Applying filter_duplicated_gene_names, x.shape: (20502, 87)
Applying filter_low_normals_count, x.shape: (20500, 87)
Applying filter_too_many_missing, x.shape: (0, 87)
Applying filter_non_normals, x.shape: (0, 87)
Applying logp1_normalize, x.shape: (0, 87)
Applying rename_genes, x.shape: (0, 87)

Cancer type: OV
Loading file: 'data/tcga/raw/gdac.broadinstitute.org_OV.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2016012800.0.0/OV.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt'
Applying filter_invalid_genes, x.shape: (20531, 307)
Applying filter_duplicated_gene_names, x.sh

Applying filter_non_normals, x.shape: (15928, 646)
Applying logp1_normalize, x.shape: (11512, 646)
Applying rename_genes, x.shape: (11512, 646)
Saving to file: 'data/tcga/STES.rna.filtered_transformed.csv'

Cancer type: TGCT
Loading file: 'data/tcga/raw/gdac.broadinstitute.org_TGCT.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2016012800.0.0/TGCT.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt'
Applying filter_invalid_genes, x.shape: (20531, 156)
Applying filter_duplicated_gene_names, x.shape: (20502, 156)
Applying filter_low_normals_count, x.shape: (20500, 156)
Applying filter_too_many_missing, x.shape: (0, 156)
Applying filter_non_normals, x.shape: (0, 156)
Applying logp1_normalize, x.shape: (0, 156)
Applying rename_genes, x.shape: (0, 156)

Cancer type: THCA
Loading file: 'data/tcga/raw/gdac.broadinstitute.org_THCA.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__