# Extracting Data from TCGA Datasets (ENSG)

# Import, Set working directory

In [1]:
import pandas as pd
import re
import time

In [2]:
import os
def main():
    print("Current Working Directory " , os.getcwd())
    if os.path.exists("C:/Users/micha.DESKTOP-8HA2IGV/OneDrive/Programming/Propulsion Project/intelligencia_backup/intelligencia") :
        # Change the current working Directory    
        os.chdir("C:/Users/micha.DESKTOP-8HA2IGV/OneDrive/Programming/Propulsion Project/intelligencia_backup/intelligencia")
        print("New Working Directory " , os.getcwd())
    else:
        print("Can't change the Current Working Directory")    
        print("Current Working Directory " , os.getcwd())
if __name__ == '__main__':
    main()

Current Working Directory  C:\Users\micha.DESKTOP-8HA2IGV\OneDrive\Programming\Propulsion Project\intelligencia_backup\intelligencia\Mike
New Working Directory  C:\Users\micha.DESKTOP-8HA2IGV\OneDrive\Programming\Propulsion Project\intelligencia_backup\intelligencia


Preprocess a provided CSV file to smaller chunks with relevant data for our analysis.

## Data Exploration

In [2]:
df_phenotype = pd.read_csv('Data/TcgaTargetGTEX_phenotype.txt', '\t')
df_phenotype.head()

Unnamed: 0,sample,detailed_category,primary disease or tissue,_primary_site,_sample_type,_gender,_study
0,TCGA-V4-A9EE-01,Uveal Melanoma,Uveal Melanoma,Eye,Primary Tumor,Male,TCGA
1,TCGA-VD-AA8N-01,Uveal Melanoma,Uveal Melanoma,Eye,Primary Tumor,Male,TCGA
2,TCGA-V4-A9EI-01,Uveal Melanoma,Uveal Melanoma,Eye,Primary Tumor,Male,TCGA
3,TCGA-VD-AA8O-01,Uveal Melanoma,Uveal Melanoma,Eye,Primary Tumor,Male,TCGA
4,TCGA-WC-A888-01,Uveal Melanoma,Uveal Melanoma,Eye,Primary Tumor,Male,TCGA


In [45]:
df_phenotype.shape

(19131, 7)

In [15]:
df = pd.read_csv('Data/TcgaTargetGtex_rsem_gene_tpm.txt', delimiter='\t', usecols=['sample'])
df.head()

Unnamed: 0,sample
0,ENSG00000242268.2
1,ENSG00000259041.1
2,ENSG00000270112.3
3,ENSG00000167578.16
4,ENSG00000278814.1


In [210]:
df.shape

(60498, 1)

## Filter for Protein-Coding Genes

Get all protein coding genes to be able to filter only meaningful ensembl samples.

In [4]:
# Unique Protein Coding Genes
def create_upcg():
    df_mart = pd.read_csv('Data/mart_export.txt')
    return set(df_mart[df_mart['Gene type'] == 'protein_coding']['Gene stable ID'].unique())

## Class: Data Filter
Retrieve data according to 1. primary site, 2. gender, 3. sample type, 4. detailed category
- get columns
- get output path
- split

In [5]:
class DataFilter:
    def __init__(self, data_filepath, phenotype_filepath, upcg=None, data_output='Output'):
        self.data_output = data_output
        self.data_filepath = data_filepath
        self.phenotype_filepath = phenotype_filepath
        self.upcg = upcg
        self.df_phenotype = pd.read_csv(phenotype_filepath, delimiter='\t')

    def get_columns(self, sites, types=["Primary Tumor", "Normal Tissue"], genders=["Female", "Male"], categories=None):
        cond = self.df_phenotype['_primary_site'].isin(sites)
        cond &= self.df_phenotype['_gender'].isin(genders)
        cond &= self.df_phenotype['_sample_type'].isin(types)
        
        if categories != None:
            cond &= self.df_phenotype['detailed_category'].isin(categories)

        return ['sample'] + self.df_phenotype[cond]['sample'].tolist()
    
    def get_output_path(self, sites, categories):
        items = sites if categories == None else categories
        name = '_'.join([re.sub('\s', '', s) for s in items])        
        return '{}/Chunk_{}.csv'.format(self.data_output, name)

    def split(self, sites, **kwargs):
        start = time.time()

        # Filter columns to load a smaller data subset.
        filtered_cols = self.get_columns(sites, **kwargs)

        print('Processing data with {} samples.'.format(len(filtered_cols)))
        
        # Load the original big fat data file with specific columns for a final chunk.
        df = pd.read_csv(self.data_filepath, delimiter='\t', usecols=filtered_cols)
        
        # Filter all ensembls that are not representing protein-coding genes.
        if self.upcg != None:
            df = df[df['sample'].str.replace(r'\.\d+', '').isin(self.upcg)]

        # Transpose the matrix to get ensembls as headers.
        df = df.set_index('sample').transpose()
        df.insert(loc=0, column='label', value=df.index.astype(str).str.contains('TCGA-').astype(int))

        output_path = self.get_output_path(sites, kwargs.get('categories'))
        df.to_csv(output_path)
        
        print('Finished in {:.1f} min\n'.format((time.time() - start) / 60))
        return df

In [6]:
data_filter = DataFilter('Data/TcgaTargetGtex_rsem_gene_tpm.txt', 'Data/TcgaTargetGTEX_phenotype.txt', create_upcg())
#data_filter.split(["Lung"], categories=["Lung Squamous Cell Carcinoma", "Lung"])
#data_filter.split(["Lung"], categories=["Lung Adenocarcinoma", "Lung"])
#data_filter.split(["Thyroid", "Thyroid Gland"])
data_filter.split(["Colon"])
#data_filter.split(["Skin"])
#data_filter.split(["Breast"], genders=["Female"])

Processing data with 595 samples.
Finished in 2.5 min



sample,label,ENSG00000167578.16,ENSG00000078237.5,ENSG00000146083.11,ENSG00000158486.13,ENSG00000198242.13,ENSG00000134108.12,ENSG00000172137.18,ENSG00000276644.4,ENSG00000094963.13,...,ENSG00000107863.16,ENSG00000213782.7,ENSG00000146707.14,ENSG00000158417.10,ENSG00000089177.17,ENSG00000186115.12,ENSG00000009694.13,ENSG00000123685.8,ENSG00000105063.18,ENSG00000181518.3
GTEX-WQUQ-2526-SM-4MVNO,0,5.212600,2.160600,5.148200,-0.734600,10.230200,5.091700,4.894400,0.001400,3.366100,...,4.512300,5.529900,4.349200,4.821300,2.543700,-0.940600,-0.375200,2.087800,5.262300,-9.965800
TCGA-D5-6929-01,1,5.078600,3.385600,4.507200,0.791600,10.872100,4.524300,1.245500,2.909000,-3.625900,...,3.325000,5.210300,1.526600,5.458200,3.784600,2.307700,-9.965800,0.264200,4.954200,-9.965800
TCGA-AA-3511-01,1,4.883200,3.140900,5.675800,0.264200,10.566800,4.269900,0.227700,2.879900,-2.052900,...,3.766700,5.041800,3.249600,5.189900,3.591100,4.461500,-6.506400,0.465700,5.140400,-9.965800
GTEX-13G51-2126-SM-5IJD9,0,4.474500,1.967500,4.510400,-6.506400,10.240100,5.040100,0.465700,-0.940600,2.330800,...,4.642200,5.313600,3.697200,6.021900,2.737900,-9.965800,1.064200,1.328300,4.947200,-9.965800
TCGA-QG-A5YX-01,1,4.719800,3.400700,4.534900,0.434000,10.736500,4.779800,-4.608200,1.064200,-3.307600,...,4.542900,4.996900,1.770200,5.642400,4.145800,1.189700,-9.965800,-0.182800,5.802700,-9.965800
GTEX-1122O-1526-SM-5N9CL,0,5.858000,1.310900,5.649900,2.014700,9.521200,4.684300,2.299000,-0.103100,-0.432500,...,4.735600,4.314000,1.967500,4.053200,4.015300,3.928000,-3.171400,-0.150400,4.790800,-9.965800
GTEX-UPIC-1726-SM-4IHKG,0,5.509700,1.131600,5.757800,1.705300,9.819300,4.370200,1.787000,0.138800,1.220900,...,4.291400,4.429700,3.258700,4.240400,3.443700,2.653300,-1.685000,0.824600,4.942600,-9.965800
TCGA-CM-6164-01,1,4.943500,2.648700,5.280300,0.537000,10.660300,5.155100,1.043300,0.971600,-1.354800,...,4.366700,5.109800,2.189400,5.379600,3.586300,3.127800,-5.573500,1.189700,5.289900,-9.965800
TCGA-A6-A567-01,1,5.084500,3.079100,5.775900,1.949000,10.916300,5.291300,1.334000,3.204900,-3.625900,...,3.455600,5.412800,2.459700,5.293900,4.077300,0.537000,-6.506400,0.723300,5.303800,-9.965800
TCGA-G4-6320-01,1,4.955600,1.345400,4.522400,-0.687300,10.842100,4.171600,-3.046900,2.316400,-6.506400,...,3.037600,4.684300,3.162000,4.404700,3.085900,-1.055900,-9.965800,0.099000,5.427600,-9.965800


# Create new Chunks

## Create All cancers Chunk

In [None]:
chunk_df = pd.DataFrame()
for chunk, cancer in zip(chunks, cancers):
    chunk.index = chunk.iloc[:,0]
    chunk.drop(columns = "Unnamed: 0", inplace = True)
    chunk.columns = [(re.sub('\.\d+', '', gene)) for gene in chunk.columns]
    
    chunk["label"].replace(1, "1_" + cancer, inplace = True)
    chunk["label"].replace(0, "0_" + cancer, inplace = True)
    
    chunk_df = chunk_df.append(chunk)

In [20]:
chunk_df = pd.read_csv("Output/Chunk_AllCancers.csv", index_col = "Unnamed: 0"); chunk_df.head()

Unnamed: 0_level_0,label,ENSG00000167578,ENSG00000078237,ENSG00000146083,ENSG00000158486,ENSG00000198242,ENSG00000134108,ENSG00000172137,ENSG00000276644,ENSG00000094963,...,ENSG00000107863,ENSG00000213782,ENSG00000146707,ENSG00000158417,ENSG00000089177,ENSG00000186115,ENSG00000009694,ENSG00000123685,ENSG00000105063,ENSG00000181518
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-C8-A1HL-01,1_breast,4.9782,2.6624,3.958,-0.3752,10.7877,5.7415,0.6239,3.345,-0.1993,...,3.2251,4.9016,2.4727,5.6511,5.1094,-9.9658,-6.5064,1.0293,4.4095,-9.9658
TCGA-EW-A2FS-01,1_breast,5.7035,1.2696,4.2189,-4.2934,10.146,5.7801,1.4011,4.0532,2.3193,...,3.5863,4.9327,3.1556,5.851,4.4277,-6.5064,-3.0469,1.46,4.4081,-9.9658
TCGA-B6-A402-01,1_breast,4.1252,1.6234,5.018,-2.8262,9.8935,4.8929,2.9729,-3.6259,6.0603,...,5.1107,5.3013,3.9801,5.8592,3.1669,-9.9658,-5.0116,2.5061,4.449,-9.9658
TCGA-A2-A3XX-01,1_breast,4.8734,1.5998,4.1859,-2.114,9.8904,4.7247,9.4821,-3.458,5.2126,...,4.1676,5.1169,4.4223,5.7195,3.0393,-6.5064,-5.5735,2.0707,4.1078,-9.9658
TCGA-Z7-A8R5-01,1_breast,5.4973,1.3846,3.3017,-5.5735,11.1398,4.6651,3.208,1.46,2.8522,...,2.6114,4.7176,4.0875,3.6578,2.1509,-9.9658,-4.6082,2.8522,4.49,-9.9658


## Create All cancers 0 vs. 1 Chunk

In [207]:
chunk_df_all.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4781 entries, TCGA-C8-A1HL-01 to GTEX-OOBK-2626-SM-2HMKY
Columns: 19664 entries, label to ENSG00000181518
dtypes: float64(19663), object(1)
memory usage: 717.3+ MB


In [33]:
chunk_df_all = chunk_df.copy()
chunk_df_all['label'] = chunk_df_all['label'].astype(str).str[0]

In [35]:
chunk_df_all.to_csv("Output/Chunk_AllCancers_0vs1.csv")

## Create Lung_A 1 vs Lung_S 1 Chunk

1_lung_a = 1
1_lung_s = 0

In [38]:
Chunk_LungA1_vs_LungS1 = chunk_df[chunk_df["label"].str.contains("1_lung")]; Chunk_LungA1_vs_LungS1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1011 entries, TCGA-05-4420-01 to TCGA-77-6844-01
Columns: 19664 entries, label to ENSG00000181518
dtypes: float64(19663), object(1)
memory usage: 151.7+ MB


In [40]:
Chunk_LungA1_vs_LungS1["label"] = Chunk_LungA1_vs_LungS1["label"].replace("1_lung_s", "0")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [41]:
Chunk_LungA1_vs_LungS1["label"] = Chunk_LungA1_vs_LungS1['label'].astype(str).str[0]
Chunk_LungA1_vs_LungS1.to_csv("Output/Chunk_LungA1_vs_LungS1.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


# Testing Functions

In [189]:
pd.read_csv('Output/Chunk_Bladder.csv', index_col=0).head(1)

Unnamed: 0,label,ENSG00000242268.2,ENSG00000259041.1,ENSG00000270112.3,ENSG00000167578.16,ENSG00000278814.1,ENSG00000078237.5,ENSG00000269416.5,ENSG00000263642.1,ENSG00000146083.11,...,ENSG00000009694.13,ENSG00000238244.3,ENSG00000216352.1,ENSG00000123685.8,ENSG00000267117.1,ENSG00000273233.1,ENSG00000105063.18,ENSG00000231119.2,ENSG00000280861.1,ENSG00000181518.3
GTEX-U3ZM-0826-SM-4DXU6,0,-1.0262,-9.9658,-9.9658,4.8525,-9.9658,2.1213,-9.9658,-9.9658,4.4771,...,0.03,-9.9658,-9.9658,0.03,-9.9658,-9.9658,4.7437,-3.458,-9.9658,-9.9658


In [5]:
xx = pd.read_csv('Output/Chunk_Colon.csv', index_col=0).T.iloc[1:].reset_index()
xx.head()

Unnamed: 0,index,GTEX-WQUQ-2526-SM-4MVNO,TCGA-D5-6929-01,TCGA-AA-3511-01,GTEX-13G51-2126-SM-5IJD9,TCGA-QG-A5YX-01,GTEX-1122O-1526-SM-5N9CL,GTEX-UPIC-1726-SM-4IHKG,TCGA-CM-6164-01,TCGA-A6-A567-01,...,GTEX-11P82-1026-SM-5BC5J,TCGA-A6-2684-01,GTEX-ZLV1-1226-SM-4WWBX,TCGA-AZ-4315-01,TCGA-CM-4751-01,TCGA-AA-3675-01,GTEX-X15G-1326-SM-4PQZJ,TCGA-G4-6317-01,TCGA-DM-A28A-01,TCGA-AA-3660-01
0,ENSG00000167578.16,5.2126,5.0786,4.8832,4.4745,4.7198,5.858,5.5097,4.9435,5.0845,...,5.323,2.881222,5.337,4.6474,5.3629,4.366,5.1883,4.3723,4.6866,4.8739
1,ENSG00000078237.5,2.1606,3.3856,3.1409,1.9675,3.4007,1.3109,1.1316,2.6487,3.0791,...,1.7229,2.513734,1.4494,4.3695,3.4197,2.4857,1.7446,2.4649,3.0358,5.4741
2,ENSG00000146083.11,5.1482,4.5072,5.6758,4.5104,4.5349,5.6499,5.7578,5.2803,5.7759,...,4.8635,3.709065,5.5925,3.6646,5.5435,4.9341,5.1482,4.9012,6.0418,5.076
3,ENSG00000158486.13,-0.7346,0.7916,0.2642,-6.5064,0.434,2.0147,1.7053,0.537,1.949,...,0.4657,0.342166,1.2394,-0.9686,0.4865,-0.5756,1.2576,0.2277,0.9115,0.4552
4,ENSG00000198242.13,10.2302,10.8721,10.5668,10.2401,10.7365,9.5212,9.8193,10.6603,10.9163,...,9.2025,8.704179,9.471,10.128,10.5024,11.2458,9.7463,12.4733,10.7132,10.4904


In [3]:
xx.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19653,19654,19655,19656,19657,19658,19659,19660,19661,19662
index,ENSG00000167578.16,ENSG00000078237.5,ENSG00000146083.11,ENSG00000158486.13,ENSG00000198242.13,ENSG00000134108.12,ENSG00000172137.18,ENSG00000276644.4,ENSG00000094963.13,ENSG00000182141.9,...,ENSG00000107863.16,ENSG00000213782.7,ENSG00000146707.14,ENSG00000158417.10,ENSG00000089177.17,ENSG00000186115.12,ENSG00000009694.13,ENSG00000123685.8,ENSG00000105063.18,ENSG00000181518.3
GTEX-WQUQ-2526-SM-4MVNO,5.2126,2.1606,5.1482,-0.7346,10.2302,5.0917,4.8944,0.0014,3.3661,1.0222,...,4.5123,5.5299,4.3492,4.8213,2.5437,-0.9406,-0.3752,2.0878,5.2623,-9.9658
TCGA-D5-6929-01,5.0786,3.3856,4.5072,0.7916,10.8721,4.5243,1.2455,2.909,-3.6259,1.5165,...,3.325,5.2103,1.5266,5.4582,3.7846,2.3077,-9.9658,0.2642,4.9542,-9.9658
TCGA-AA-3511-01,4.8832,3.1409,5.6758,0.2642,10.5668,4.2699,0.2277,2.8799,-2.0529,1.3109,...,3.7667,5.0418,3.2496,5.1899,3.5911,4.4615,-6.5064,0.4657,5.1404,-9.9658
GTEX-13G51-2126-SM-5IJD9,4.4745,1.9675,4.5104,-6.5064,10.2401,5.0401,0.4657,-0.9406,2.3308,1.2085,...,4.6422,5.3136,3.6972,6.0219,2.7379,-9.9658,1.0642,1.3283,4.9472,-9.9658
TCGA-QG-A5YX-01,4.7198,3.4007,4.5349,0.434,10.7365,4.7798,-4.6082,1.0642,-3.3076,1.2992,...,4.5429,4.9969,1.7702,5.6424,4.1458,1.1897,-9.9658,-0.1828,5.8027,-9.9658
GTEX-1122O-1526-SM-5N9CL,5.858,1.3109,5.6499,2.0147,9.5212,4.6843,2.299,-0.1031,-0.4325,1.5612,...,4.7356,4.314,1.9675,4.0532,4.0153,3.928,-3.1714,-0.1504,4.7908,-9.9658
GTEX-UPIC-1726-SM-4IHKG,5.5097,1.1316,5.7578,1.7053,9.8193,4.3702,1.787,0.1388,1.2209,1.595,...,4.2914,4.4297,3.2587,4.2404,3.4437,2.6533,-1.685,0.8246,4.9426,-9.9658
TCGA-CM-6164-01,4.9435,2.6487,5.2803,0.537,10.6603,5.1551,1.0433,0.9716,-1.3548,1.8036,...,4.3667,5.1098,2.1894,5.3796,3.5863,3.1278,-5.5735,1.1897,5.2899,-9.9658
TCGA-A6-A567-01,5.0845,3.0791,5.7759,1.949,10.9163,5.2913,1.334,3.2049,-3.6259,1.0007,...,3.4556,5.4128,2.4597,5.2939,4.0773,0.537,-6.5064,0.7233,5.3038,-9.9658
