# Splicing Datasets (ENSG)

Preprocess a provided CSV file to smaller chunks with relevant data for our analysis.

In [1]:
import pandas as pd
import re
import time

## Data Exploration

In [2]:
df_phenotype = pd.read_csv('Data/TcgaTargetGTEX_phenotype.txt', '\t')
df_phenotype.head()

Unnamed: 0,sample,detailed_category,primary disease or tissue,_primary_site,_sample_type,_gender,_study
0,TCGA-V4-A9EE-01,Uveal Melanoma,Uveal Melanoma,Eye,Primary Tumor,Male,TCGA
1,TCGA-VD-AA8N-01,Uveal Melanoma,Uveal Melanoma,Eye,Primary Tumor,Male,TCGA
2,TCGA-V4-A9EI-01,Uveal Melanoma,Uveal Melanoma,Eye,Primary Tumor,Male,TCGA
3,TCGA-VD-AA8O-01,Uveal Melanoma,Uveal Melanoma,Eye,Primary Tumor,Male,TCGA
4,TCGA-WC-A888-01,Uveal Melanoma,Uveal Melanoma,Eye,Primary Tumor,Male,TCGA


In [45]:
df_phenotype.shape

(19131, 7)

In [15]:
df = pd.read_csv('Data/TcgaTargetGtex_rsem_gene_tpm.txt', delimiter='\t', usecols=['sample'])
df.head()

Unnamed: 0,sample
0,ENSG00000242268.2
1,ENSG00000259041.1
2,ENSG00000270112.3
3,ENSG00000167578.16
4,ENSG00000278814.1


In [210]:
df.shape

(60498, 1)

## Gene Filter

Get all protein coding genes to be able to filter only meaningful ensembl samples.

In [4]:
# Unique Protein Coding Genes
def create_upcg():
    df_mart = pd.read_csv('Data/mart_export.txt')
    return set(df_mart[df_mart['Gene type'] == 'protein_coding']['Gene stable ID'].unique())

## Data Filter

In [5]:
class DataFilter:
    def __init__(self, data_filepath, phenotype_filepath, upcg=None, data_output='Output'):
        self.data_output = data_output
        self.data_filepath = data_filepath
        self.phenotype_filepath = phenotype_filepath
        self.upcg = upcg
        self.df_phenotype = pd.read_csv(phenotype_filepath, delimiter='\t')

    def get_columns(self, sites, types=["Primary Tumor", "Normal Tissue"], genders=["Female", "Male"], categories=None):
        cond = self.df_phenotype['_primary_site'].isin(sites)
        cond &= self.df_phenotype['_gender'].isin(genders)
        cond &= self.df_phenotype['_sample_type'].isin(types)
        
        if categories != None:
            cond &= self.df_phenotype['detailed_category'].isin(categories)

        return ['sample'] + self.df_phenotype[cond]['sample'].tolist()
    
    def get_output_path(self, sites, categories):
        items = sites if categories == None else categories
        name = '_'.join([re.sub('\s', '', s) for s in items])        
        return '{}/Chunk_{}.csv'.format(self.data_output, name)

    def split(self, sites, **kwargs):
        start = time.time()

        # Filter columns to load a smaller data subset.
        filtered_cols = self.get_columns(sites, **kwargs)

        print('Processing data with {} samples.'.format(len(filtered_cols)))
        
        # Load the original big fat data file with specific columns for a final chunk.
        df = pd.read_csv(self.data_filepath, delimiter='\t', usecols=filtered_cols)
        
        # Filter all ensembls that are not representing protein-coding genes.
        if self.upcg != None:
            df = df[df['sample'].str.replace(r'\.\d+', '').isin(self.upcg)]

        # Transpose the matrix to get ensembls as headers.
        df = df.set_index('sample').transpose()
        df.insert(loc=0, column='label', value=df.index.astype(str).str.contains('TCGA-').astype(int))

        output_path = self.get_output_path(sites, kwargs.get('categories'))
        df.to_csv(output_path)
        
        print('Finished in {:.1f} min\n'.format((time.time() - start) / 60))
        return df

In [6]:
data_filter = DataFilter('Data/TcgaTargetGtex_rsem_gene_tpm.txt', 'Data/TcgaTargetGTEX_phenotype.txt', create_upcg())
#data_filter.split(["Lung"], categories=["Lung Squamous Cell Carcinoma", "Lung"])
#data_filter.split(["Lung"], categories=["Lung Adenocarcinoma", "Lung"])
#data_filter.split(["Thyroid", "Thyroid Gland"])
data_filter.split(["Colon"])
#data_filter.split(["Skin"])
#data_filter.split(["Breast"], genders=["Female"])

Processing data with 595 samples.
Finished in 2.5 min



sample,label,ENSG00000167578.16,ENSG00000078237.5,ENSG00000146083.11,ENSG00000158486.13,ENSG00000198242.13,ENSG00000134108.12,ENSG00000172137.18,ENSG00000276644.4,ENSG00000094963.13,...,ENSG00000107863.16,ENSG00000213782.7,ENSG00000146707.14,ENSG00000158417.10,ENSG00000089177.17,ENSG00000186115.12,ENSG00000009694.13,ENSG00000123685.8,ENSG00000105063.18,ENSG00000181518.3
GTEX-WQUQ-2526-SM-4MVNO,0,5.212600,2.160600,5.148200,-0.734600,10.230200,5.091700,4.894400,0.001400,3.366100,...,4.512300,5.529900,4.349200,4.821300,2.543700,-0.940600,-0.375200,2.087800,5.262300,-9.965800
TCGA-D5-6929-01,1,5.078600,3.385600,4.507200,0.791600,10.872100,4.524300,1.245500,2.909000,-3.625900,...,3.325000,5.210300,1.526600,5.458200,3.784600,2.307700,-9.965800,0.264200,4.954200,-9.965800
TCGA-AA-3511-01,1,4.883200,3.140900,5.675800,0.264200,10.566800,4.269900,0.227700,2.879900,-2.052900,...,3.766700,5.041800,3.249600,5.189900,3.591100,4.461500,-6.506400,0.465700,5.140400,-9.965800
GTEX-13G51-2126-SM-5IJD9,0,4.474500,1.967500,4.510400,-6.506400,10.240100,5.040100,0.465700,-0.940600,2.330800,...,4.642200,5.313600,3.697200,6.021900,2.737900,-9.965800,1.064200,1.328300,4.947200,-9.965800
TCGA-QG-A5YX-01,1,4.719800,3.400700,4.534900,0.434000,10.736500,4.779800,-4.608200,1.064200,-3.307600,...,4.542900,4.996900,1.770200,5.642400,4.145800,1.189700,-9.965800,-0.182800,5.802700,-9.965800
GTEX-1122O-1526-SM-5N9CL,0,5.858000,1.310900,5.649900,2.014700,9.521200,4.684300,2.299000,-0.103100,-0.432500,...,4.735600,4.314000,1.967500,4.053200,4.015300,3.928000,-3.171400,-0.150400,4.790800,-9.965800
GTEX-UPIC-1726-SM-4IHKG,0,5.509700,1.131600,5.757800,1.705300,9.819300,4.370200,1.787000,0.138800,1.220900,...,4.291400,4.429700,3.258700,4.240400,3.443700,2.653300,-1.685000,0.824600,4.942600,-9.965800
TCGA-CM-6164-01,1,4.943500,2.648700,5.280300,0.537000,10.660300,5.155100,1.043300,0.971600,-1.354800,...,4.366700,5.109800,2.189400,5.379600,3.586300,3.127800,-5.573500,1.189700,5.289900,-9.965800
TCGA-A6-A567-01,1,5.084500,3.079100,5.775900,1.949000,10.916300,5.291300,1.334000,3.204900,-3.625900,...,3.455600,5.412800,2.459700,5.293900,4.077300,0.537000,-6.506400,0.723300,5.303800,-9.965800
TCGA-G4-6320-01,1,4.955600,1.345400,4.522400,-0.687300,10.842100,4.171600,-3.046900,2.316400,-6.506400,...,3.037600,4.684300,3.162000,4.404700,3.085900,-1.055900,-9.965800,0.099000,5.427600,-9.965800


## Just Testing 

In [189]:
pd.read_csv('Output/Chunk_Bladder.csv', index_col=0).head(1)

Unnamed: 0,label,ENSG00000242268.2,ENSG00000259041.1,ENSG00000270112.3,ENSG00000167578.16,ENSG00000278814.1,ENSG00000078237.5,ENSG00000269416.5,ENSG00000263642.1,ENSG00000146083.11,...,ENSG00000009694.13,ENSG00000238244.3,ENSG00000216352.1,ENSG00000123685.8,ENSG00000267117.1,ENSG00000273233.1,ENSG00000105063.18,ENSG00000231119.2,ENSG00000280861.1,ENSG00000181518.3
GTEX-U3ZM-0826-SM-4DXU6,0,-1.0262,-9.9658,-9.9658,4.8525,-9.9658,2.1213,-9.9658,-9.9658,4.4771,...,0.03,-9.9658,-9.9658,0.03,-9.9658,-9.9658,4.7437,-3.458,-9.9658,-9.9658


In [249]:
xx = pd.read_csv('Output/Chunk_Bladder.csv', index_col=0).T.iloc[1:].reset_index()
xx.head()

Unnamed: 0,index,GTEX-U3ZM-0826-SM-4DXU6,GTEX-SE5C-1026-SM-4BRUG,GTEX-U4B1-1226-SM-4DXT7,GTEX-SNMC-0826-SM-4DM66,GTEX-S4Q7-0926-SM-4AD5D,GTEX-SNOS-0526-SM-4DM54,GTEX-S4UY-0926-SM-4AD6O,GTEX-S3XE-1226-SM-4AD4L,GTEX-S32W-1126-SM-4AD5V
0,ENSG00000242268.2,-1.0262,-9.9658,-9.9658,-0.9686,-1.1811,-9.9658,-0.6873,-3.816,-9.9658
1,ENSG00000259041.1,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658
2,ENSG00000270112.3,-9.9658,-9.9658,-9.9658,-2.7274,-9.9658,-2.3884,-6.5064,-2.5479,-4.6082
3,ENSG00000167578.16,4.8525,4.7236,5.0175,5.6473,5.3449,5.3886,5.3702,4.9538,5.3555
4,ENSG00000278814.1,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658
