# Preprocessing TCGA Data

In [1]:
import os
import numpy as np
import pandas as pd
import glob
from matplotlib import pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_style('ticks')
sns.set_context('paper')

from tabulate import tabulate
from sklearn.preprocessing import MinMaxScaler

In [6]:
# Root of the local TCGA data directory; the other locations are spelled relative to it.
tcga_path = "/mnt/c/Users/owlsa/Documents/GitHub/cits4010/data/TCGA"
raw_data_path = f"{tcga_path}/raw_data"                  # per-project raw downloads
processed_data_path = f"{tcga_path}/processed_data"      # outputs written by this notebook
clinical_data_path = f"{tcga_path}/clinical/clinical.tsv"  # clinical table for all projects

# Verifying files

Reading the clinical data for the entire TCGA project

## Preprocessing the Clinical Data

In [6]:
# Columns of the clinical table that the rest of the notebook uses.
clinical_columns = [
    "case_submitter_id", "project_id", "age_at_index", "gender", "race",
    "tissue_or_organ_of_origin", "site_of_resection_or_biopsy",
    "primary_diagnosis", "ajcc_pathologic_stage",
]

# Read only the needed columns instead of loading everything and dropping the
# rest; `usecols` keeps the columns in file order and raises if one is missing.
clinical_data = pd.read_table(clinical_data_path, usecols=clinical_columns)

# Encode missing data with the string '0'.
# NOTE(review): this also turns a missing age into 0, which then takes part in
# the min-max scaling of `age_at_index` further down - confirm this is intended.
clinical_data = clinical_data.replace(["'--", "Not Reported", "not reported"], '0')
clinical_data = clinical_data.drop_duplicates(ignore_index=True)

# Convert dtypes: parse the age column in one vectorised call (rather than
# element by element), then let pandas pick nullable dtypes for every column.
clinical_data['age_at_index'] = pd.to_numeric(clinical_data['age_at_index'])
clinical_data = clinical_data.convert_dtypes(infer_objects=True)

clinical_data.head()

Unnamed: 0,case_submitter_id,project_id,age_at_index,gender,race,ajcc_pathologic_stage,primary_diagnosis,site_of_resection_or_biopsy,tissue_or_organ_of_origin
0,TCGA-DD-AAVP,TCGA-LIHC,48,male,asian,Stage I,"Hepatocellular carcinoma, NOS",Liver,Liver
1,TCGA-KK-A7B2,TCGA-PRAD,68,male,white,0,"Adenocarcinoma, NOS",Prostate gland,Prostate gland
2,TCGA-DC-6158,TCGA-READ,70,male,white,Stage I,"Adenocarcinoma, NOS","Rectum, NOS","Rectum, NOS"
3,TCGA-DD-A4NP,TCGA-LIHC,32,male,white,Stage I,"Hepatocellular carcinoma, NOS",Liver,Liver
4,TCGA-HQ-A5ND,TCGA-BLCA,78,male,0,Stage IV,Transitional cell carcinoma,"Bladder, NOS","Bladder, NOS"


Helper function that counts the values in each categorical column of the DataFrame and prints each result as a table.

In [7]:
categorical_columns = ['gender', 'race', 'ajcc_pathologic_stage', 'primary_diagnosis',
                       'site_of_resection_or_biopsy', 'tissue_or_organ_of_origin']


def get_column_counts(df, columns=tuple(categorical_columns)):
    """Print a value-count table for each requested column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``columns``.
    columns : iterable of str, optional
        Columns to summarise. The default is a snapshot of the
        ``categorical_columns`` list taken when the function is defined, so
        the helper keeps working even though a later cell rebinds the name
        ``categorical_columns`` to something else.

    Returns
    -------
    None
        The column name and a ``tabulate`` grid are printed for each column.
    """
    for col in columns:
        data = pd.DataFrame(df[col].value_counts())
        print(col)
        print(tabulate(data, headers=['Field', 'Count'], tablefmt='simple_grid'))

get_column_counts(clinical_data)

gender
┌─────────┬─────────┐
│ Field   │   Count │
├─────────┼─────────┤
│ female  │    5365 │
├─────────┼─────────┤
│ male    │    4982 │
├─────────┼─────────┤
│ 0       │      39 │
└─────────┴─────────┘
race
┌───────────────────────────────────────────┬─────────┐
│ Field                                     │   Count │
├───────────────────────────────────────────┼─────────┤
│ white                                     │    7791 │
├───────────────────────────────────────────┼─────────┤
│ 0                                         │     973 │
├───────────────────────────────────────────┼─────────┤
│ black or african american                 │     923 │
├───────────────────────────────────────────┼─────────┤
│ asian                                     │     660 │
├───────────────────────────────────────────┼─────────┤
│ american indian or alaska native          │      26 │
├───────────────────────────────────────────┼─────────┤
│ native hawaiian or other pacific islander │      13 │
└─────

Scale Age Data

In [5]:
# Min-max scale the age column; the scaler wants a 2-D (n_samples, 1) input,
# hence the reshape of the column's values.
scaler = MinMaxScaler()
age_as_column_vector = clinical_data['age_at_index'].values.reshape(-1, 1)
clinical_data['age_at_index'] = scaler.fit_transform(age_as_column_vector)

One-hot encoding text data

In [6]:
# Columns carried over unchanged; every other column is one-hot encoded.
passthrough_columns = ['age_at_index', 'case_submitter_id', 'project_id']

# The encoded frame gets its own name so that the `categorical_columns` list of
# column names read by get_column_counts is not overwritten with a DataFrame.
# drop_first=True omits the indicator for the first category of each column.
encoded_columns = pd.get_dummies(clinical_data.drop(columns=passthrough_columns), drop_first=True)
clinical_data = pd.concat([clinical_data[passthrough_columns], encoded_columns], axis=1)

## Splitting the data by project

Collecting the TCGA project codes (e.g. ACC, OV) found in the clinical data; later steps process the data one project at a time.

In [8]:
# separate the clinical data by project
project_clinical_data = {}
project_names = sorted([project.replace('TCGA-', '') for project in set(clinical_data['project_id'].values)])

In [9]:
# Display the sorted project codes derived from the clinical data.
project_names

['ACC',
 'BLCA',
 'BRCA',
 'CESC',
 'CHOL',
 'COAD',
 'DLBC',
 'ESCA',
 'GBM',
 'HNSC',
 'KICH',
 'KIRC',
 'KIRP',
 'LAML',
 'LGG',
 'LIHC',
 'LUAD',
 'LUSC',
 'MESO',
 'OV',
 'PAAD',
 'PCPG',
 'PRAD',
 'READ',
 'SARC',
 'SKCM',
 'STAD',
 'TGCT',
 'THCA',
 'THYM',
 'UCEC',
 'UCS',
 'UVM']

Reading the sample sheets from each TCGA directory (there is one per project)

In [13]:
sample_sheets = []

for project_name in project_names:
    file_pattern = os.path.join(raw_data_path, project_name, 'gdc_sample_sheet.*.tsv')
    # sorted() makes the pick deterministic if a directory holds several sheets.
    matching_files = sorted(glob.glob(file_pattern))
    if not matching_files:
        # Fail with the offending pattern instead of a bare IndexError.
        raise FileNotFoundError(f"No sample sheet found matching {file_pattern!r}")
    sample_sheets.append(pd.read_table(matching_files[0]))

# join='inner' keeps only the columns shared by every sheet; each sheet's own
# row index is retained, so index values repeat across projects.
sample_sheet = pd.concat(sample_sheets, axis=0, join='inner')

sample_sheet = sample_sheet.drop(columns=['Data Category', 'Data Type'])
sample_sheet = sample_sheet.replace(["'--", "Not Reported", "not reported"], '0') # encode missing data with 0

# One-hot encode the sample type and drop the original text column.
sample_sheet = sample_sheet.convert_dtypes(infer_objects=True)
sample_sheet = pd.concat([sample_sheet, pd.get_dummies(sample_sheet['Sample Type'], prefix='Sample Type')], axis=1)
sample_sheet = sample_sheet.drop(columns=['Sample Type'])
sample_sheet

Unnamed: 0,File ID,File Name,Project ID,Case ID,Sample ID,Sample Type_Additional - New Primary,Sample Type_Additional Metastatic,Sample Type_Metastatic,Sample Type_Primary Blood Derived Cancer - Peripheral Blood,Sample Type_Primary Tumor,Sample Type_Recurrent Tumor,Sample Type_Solid Tissue Normal
0,674f8521-6181-49a9-b32c-729d83d0b8bd,4100a850-68a9-4bb9-ae91-dd63d64cb1aa.rna_seq.a...,TCGA-ACC,TCGA-OR-A5K2,TCGA-OR-A5K2-01A,False,False,False,False,True,False,False
1,7069c2e2-9e61-45ba-9f14-75bd90b5d6ec,8026738f-cc73-4c9e-be15-28366f272caa.rna_seq.a...,TCGA-ACC,TCGA-OR-A5LN,TCGA-OR-A5LN-01A,False,False,False,False,True,False,False
2,251e5d31-2d8d-4858-9b82-c78548e6987b,dd76dd00-b1e5-4245-8059-6b2966cf257d.rna_seq.a...,TCGA-ACC,TCGA-OR-A5JG,TCGA-OR-A5JG-01A,False,False,False,False,True,False,False
3,66e177ca-c09a-462d-a025-1e9d003ee9e9,925136ec-3825-49cd-8aa3-1c536e4a410c.rna_seq.a...,TCGA-ACC,TCGA-OR-A5KX,TCGA-OR-A5KX-01A,False,False,False,False,True,False,False
4,a3365732-0ae5-4536-aa3a-5f57447ffffb,2efca6e0-4edb-42a2-9d3a-ee8f25624bf9.rna_seq.a...,TCGA-ACC,TCGA-OR-A5LR,TCGA-OR-A5LR-01A,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
75,8c5e6f18-3a16-4065-ab12-d3391da154e9,61b1ca49-d26b-432d-bc34-98c4644f8b4f.rna_seq.a...,TCGA-UVM,TCGA-VD-A8KK,TCGA-VD-A8KK-01A,False,False,False,False,True,False,False
76,ea09a6c8-0521-4b9e-8cc0-ba0e7f855aa8,8acb6203-ddd0-4edc-8017-7e3b2953b7a2.rna_seq.a...,TCGA-UVM,TCGA-V4-A9EJ,TCGA-V4-A9EJ-01A,False,False,False,False,True,False,False
77,ddf84756-4ed2-4deb-9321-9fc456232cdc,a199f5da-dda7-406f-967b-f9c5ff771a40.rna_seq.a...,TCGA-UVM,TCGA-V4-A9EW,TCGA-V4-A9EW-01A,False,False,False,False,True,False,False
78,b5213c67-a73f-4b3d-ac9b-deedfbc08160,97aede8f-8315-4d49-8f04-41b4c266d78c.rna_seq.a...,TCGA-UVM,TCGA-VD-AA8P,TCGA-VD-AA8P-01A,False,False,False,False,True,False,False


In [14]:
# Attach each sample's clinical features. how='left' keeps every sample-sheet
# row; validate='many_to_one' makes the merge fail if a case id occurs more
# than once in the clinical data. Merging on columns already yields a fresh
# 0..n-1 index, so no reset_index is needed.
joined_data = sample_sheet.merge(clinical_data, left_on='Case ID', right_on='case_submitter_id', how='left', validate='many_to_one')
joined_data = joined_data.drop(columns=['File ID', 'project_id', 'Project ID', 'Case ID', 'Sample ID', 'case_submitter_id'])
# regex=False: the suffix contains dots, which must be matched literally
# regardless of the pandas version's default for `regex`.
joined_data['File Name'] = joined_data['File Name'].str.replace('.rna_seq.augmented_star_gene_counts.tsv', '', regex=False)
joined_data = joined_data.set_index('File Name')
joined_data

Unnamed: 0_level_0,Sample Type_Additional - New Primary,Sample Type_Additional Metastatic,Sample Type_Metastatic,Sample Type_Primary Blood Derived Cancer - Peripheral Blood,Sample Type_Primary Tumor,Sample Type_Recurrent Tumor,Sample Type_Solid Tissue Normal,age_at_index,gender_female,gender_male,...,tissue_or_organ_of_origin_Trigone of bladder,tissue_or_organ_of_origin_Unknown primary site,tissue_or_organ_of_origin_Upper Gum,"tissue_or_organ_of_origin_Upper lobe, lung",tissue_or_organ_of_origin_Upper third of esophagus,tissue_or_organ_of_origin_Upper-inner quadrant of breast,tissue_or_organ_of_origin_Upper-outer quadrant of breast,tissue_or_organ_of_origin_Ureteric orifice,"tissue_or_organ_of_origin_Uterus, NOS","tissue_or_organ_of_origin_Ventral surface of tongue, NOS"
File Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4100a850-68a9-4bb9-ae91-dd63d64cb1aa,False,False,False,False,True,False,False,0.355556,True,False,...,False,False,False,False,False,False,False,False,False,False
8026738f-cc73-4c9e-be15-28366f272caa,False,False,False,False,True,False,False,0.344444,True,False,...,False,False,False,False,False,False,False,False,False,False
dd76dd00-b1e5-4245-8059-6b2966cf257d,False,False,False,False,True,False,False,0.677778,False,True,...,False,False,False,False,False,False,False,False,False,False
925136ec-3825-49cd-8aa3-1c536e4a410c,False,False,False,False,True,False,False,0.277778,True,False,...,False,False,False,False,False,False,False,False,False,False
2efca6e0-4edb-42a2-9d3a-ee8f25624bf9,False,False,False,False,True,False,False,0.333333,True,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61b1ca49-d26b-432d-bc34-98c4644f8b4f,False,False,False,False,True,False,False,0.600000,False,True,...,False,False,False,False,False,False,False,False,False,False
8acb6203-ddd0-4edc-8017-7e3b2953b7a2,False,False,False,False,True,False,False,0.422222,False,True,...,False,False,False,False,False,False,False,False,False,False
a199f5da-dda7-406f-967b-f9c5ff771a40,False,False,False,False,True,False,False,0.488889,True,False,...,False,False,False,False,False,False,False,False,False,False
97aede8f-8315-4d49-8f04-41b4c266d78c,False,False,False,False,True,False,False,0.711111,True,False,...,False,False,False,False,False,False,False,False,False,False


In [15]:
# Create the output directory if needed so the write does not fail on a fresh checkout.
clinical_output_dir = os.path.join(processed_data_path, 'clinical_data')
os.makedirs(clinical_output_dir, exist_ok=True)
joined_data.to_pickle(os.path.join(clinical_output_dir, 'clinical_data.pkl'))

# Collating Gene Expression

In [10]:
# getting a list of genes
gene_info = pd.read_table('C:/Users/owlsa/Documents/GitHub/cits4010/data/TCGA/raw_data/BRCA/0a7dc8b9-4196-41d3-ada1-a50cb36bfd2b/3c9c665e-9327-45f0-b412-224e8948deb2.rna_seq.augmented_star_gene_counts.tsv', header=1, usecols=['gene_id', 'gene_name', 'gene_type'])
gene_info = gene_info.drop(index=[0, 1, 2, 3])
gene_info = gene_info.set_index('gene_id', drop=True)
gene_info.to_csv('gene_info.csv')

In [11]:
# for project_name in project_names:
#     project_dir = os.path.join(raw_data_path, project_name)

#     # get file names from directory
#     file_pattern = os.path.join(project_dir,"*", '*.rna_seq.augmented_star_gene_counts.tsv')
#     file_list = glob.glob(file_pattern)
    
#     num_merges = 0
#     for file in file_list:
#         df = pd.read_table(file, header=1, usecols=['gene_id', 'unstranded'], index_col='gene_id')
#         df = df.drop(index=['N_unmapped', 'N_multimapping', 'N_noFeature', 'N_ambiguous'])
#         df = df.rename(columns={'unstranded': str(os.path.basename(file)).replace('.rna_seq.augmented_star_gene_counts.tsv', '')})
#         gene_info = gene_info.merge(df, how='left', left_index=True, right_index=True, validate='one_to_one')
#         num_merges += 1
        
#     print(f"Added {num_merges} {project_name} files to dataframe")

In [12]:
def normalise(df, housekeeping_genes=None):
    """Scale each column by its mean housekeeping-gene value, then take log(1 + x).

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table indexed by gene id, one column per sample. It is not
        modified.
    housekeeping_genes : sequence of str, optional
        Gene ids used as the per-column reference. Ids missing from
        ``df.index`` are ignored. Defaults to the four ids listed below.

    Returns
    -------
    pandas.DataFrame
        Float frame with the same labels as ``df``: every column divided by the
        mean of its housekeeping rows, then passed through ``log1p``.

    Raises
    ------
    ValueError
        If none of the housekeeping genes is present in ``df.index`` (this
        previously surfaced as a ZeroDivisionError).
    """
    if housekeeping_genes is None:
        housekeeping_genes = ["ENSG00000111640.15", "ENSG00000075624.17", "ENSG00000134644.16", "ENSG00000150991.15"]

    present = [gene for gene in housekeeping_genes if gene in df.index]
    if not present:
        raise ValueError("none of the housekeeping genes are present in df.index; cannot normalise")

    # astype returns a new frame, so the caller's data is left untouched.
    exp = df.astype(float)
    # One reference value per column: the mean over the housekeeping rows.
    reference = exp.loc[present].mean(axis=0)
    exp = exp.div(reference, axis='columns')

    # log1p(x) == log(x + 1), computed accurately for the small ratios produced above.
    return np.log1p(exp)

In [13]:
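# NOTE: this cell is commented out, but it is the code that writes the
# per-project `gene_expression_data/{project_name}.pkl` files which the final
# section of the notebook reads back. Re-enable it to (re)generate them.
# for project_name in project_names: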
#     project_dir = os.path.join(raw_data_path, project_name)
#     file_pattern = os.path.join(project_dir,"*", '*.rna_seq.augmented_star_gene_counts.tsv')
#     file_list = glob.glob(file_pattern)

#     dfs = []
#     for file in file_list:
#         df = pd.read_table(file, header=1, usecols=['gene_id', 'unstranded'], index_col='gene_id')
#         df = df.drop(index=['N_unmapped', 'N_multimapping', 'N_noFeature', 'N_ambiguous'])
#         df = df.rename(columns={'unstranded': str(os.path.basename(file)).replace('.rna_seq.augmented_star_gene_counts.tsv', '')})
#         dfs.append(df)
#     print(f"Read {len(dfs)} files.")

#     df = pd.concat([d for d in dfs], axis=1, join='inner')
#     df = normalise(df)
#     df.to_pickle(processed_data_path+f'/gene_expression_data/{project_name}.pkl')
#     print(f"Saved {project_name} gene expression data")

# Concatenating Gene Expression and Clinical Data

In [8]:
# Reload the processed clinical table that was pickled earlier in the notebook
# (indexed by file name).
clinical_pickle_path = processed_data_path + '/clinical_data/clinical_data.pkl'
clinical_data = pd.read_pickle(clinical_pickle_path)

In [2]:
# TCGA project codes, in alphabetical order.
project_names = [
    'ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA', 'GBM',
    'HNSC', 'KICH', 'KIRC', 'KIRP', 'LAML', 'LGG', 'LIHC', 'LUAD', 'LUSC',
    'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'READ', 'SARC', 'SKCM', 'STAD',
    'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM',
]

In [4]:
import gc
import pandas as pd
import numpy as np

In [10]:
dfs = []
def join_clinical_data(project_name):
    """Load one project's gene-expression pickle and join it with the clinical data.

    Parameters
    ----------
    project_name : str
        Project code used to build the pickle file name under
        ``processed_data_path``.

    Returns
    -------
    pandas.DataFrame
        The transposed expression table joined on its index with the global
        ``clinical_data`` frame; how='inner' keeps only rows present in both.
    """
    gene_exp = pd.read_pickle(processed_data_path+f'/gene_expression_data/{project_name}.pkl')
    # Transpose so the expression table's column labels become its row index,
    # which is what the join with clinical_data matches on.
    gene_exp = gene_exp.T
    joined = gene_exp.join(clinical_data, how='inner')
    return joined

for project_name in project_names:
    # Keep each result as a DataFrame: the pd.concat that follows only accepts
    # pandas objects, and converting to a NumPy array would drop the labels.
    dfs.append(join_clinical_data(project_name))
    print(f"Added {project_name}")
    gc.collect()
    # NOTE(review): the recorded output of this cell stops after MESO; if the
    # kernel is running out of memory, consider a smaller dtype or writing each
    # project to disk instead of holding all of them in `dfs`.

Added ACC
Added BLCA
Added BRCA
Added CESC
Added CHOL
Added COAD
Added DLBC
Added ESCA
Added GBM
Added HNSC
Added KICH
Added KIRC
Added KIRP
Added LAML
Added LGG
Added LIHC
Added LUAD
Added LUSC
Added MESO


: 

In [18]:
# pd.concat only accepts pandas objects, so wrap every per-project table in a
# DataFrame first. Entries that already are DataFrames keep their labels;
# plain arrays get positional integer column labels.
# join='inner' keeps only the columns common to all projects.
VAE_data = pd.concat([pd.DataFrame(part) for part in dfs], axis=0, join='inner')
VAE_data.to_pickle('VAE_data')