### Data processing

In [4]:
## The all_cancers_manifest.txt file is the file that has the file_id and file_name (columns are like so: id\tfilename\tmd5\tsize\tstate).
## The metadata JSON file is a list of dicts (one per file), each containing the file_id, case_id, and project_id (cancer type).
## The hierarchy is [{..., "file_id": 123, "associated_entities": [{..., "case_id": "6cb6f179-defd-4661-af0a-c353b74c0c49"}, ...], 
##                    "cases": [{..., "project": {"project_id": "TCGA-BRCA"}, ...}], ...}, {<NEXT ENTRY>}, ...].
## The data are stored in folders named after the file_id.

import pandas as pd
import numpy as np
from scipy.sparse import lil_matrix
import os
import re
import pickle
import simplejson as json
import csv
import gc
import datetime as dt
import psutil
from IPython.display import clear_output

def metaDataGatherer(dtype, data_root=None, metadata_path='../data/data_from_tcga/metadata_master.json'):
    """Collect per-file metadata for every downloaded file of one data type.

    Parameters
    ----------
    dtype : str
        One of 'mrna', 'mirna', 'mirna_iso', 'methyl'.
    data_root : str, optional
        Directory containing one sub-folder per file_id. When omitted, the
        notebook-level global `data_dir` is used (as the original code did).
    metadata_path : str, optional
        Path of the metadata JSON file (a list of per-file dicts).
        NOTE(review): the original literal was '..data/data_from_tcga/...',
        which lacks the '/' used by every other path in this notebook; it is
        corrected here. Pass the old value explicitly if it was intentional.

    Returns
    -------
    list of [file_path, data_type, case_id + '_' + file_id, cancer_type]
        One entry per sub-folder of the data directory whose data type matches
        `dtype`, ordered by file_id.

    Raises
    ------
    KeyError
        If `dtype` is not recognised (checked before any file is read), or if a
        sub-folder name has no matching 'file_id' in the metadata JSON.
    """
    data_type_encoder = {'mrna': 'RNA-Seq', 'mirna': 'miRNA-Seq', 'mirna_iso': 'miRNA-Seq_isoforms', 'methyl': 'Methylation Array'}
    ## Fail fast on a bad dtype instead of after scanning the whole data directory.
    if dtype not in data_type_encoder:
        raise KeyError('Unknown dtype %r; expected one of %s' % (dtype, sorted(data_type_encoder)))
    wanted_type = data_type_encoder[dtype]
    ## data_dir is an initially empty directory where the gdc-client then downloaded the files.
    root = data_dir if data_root is None else data_root
    ## Make a dict with all the metadata for each file (case id, cancer type, etc.)
    with open(metadata_path, 'r') as f:
        metadata_master = json.load(f)
    metadata_dict = {entry['file_id']: entry for entry in metadata_master}
    ## Get list of file ids (the data are stored in folders named after the file_id).
    ## Sorted so the sample order does not depend on the arbitrary order of os.listdir.
    file_ids = sorted(name for name in os.listdir(root) if os.path.isdir(os.path.join(root, name)))
    ## Gather data
    files_metadata = {type_name: [] for type_name in data_type_encoder.values()}
    for file_id in file_ids:
        entry = metadata_dict[file_id]
        case_id = entry['associated_entities'][0]['case_id']
        ## project_id looks like 'TCGA-BRCA'; keep only the part after the first dash.
        cancer_type = entry['cases'][0]['project']['project_id'].split('-')[1]
        data_type = entry['experimental_strategy']
        file_name = entry['file_name']
        ## miRNA isoform files share the experimental strategy of plain miRNA files
        ## and are told apart by their file name.
        if 'isoforms' in file_name:
            data_type += '_isoforms'
        file_path = os.path.join(root, file_id, file_name)
        ## setdefault: a folder of some other data type must not abort the whole scan.
        files_metadata.setdefault(data_type, []).append([file_path, data_type, case_id+'_'+file_id, cancer_type])
    print('There are', len(files_metadata[wanted_type]), 'files.')
    return files_metadata[wanted_type]

def _readSampleTable(file_path, file_data_type, spec, col_dtypes):
    """Read one tab-separated sample file into a DataFrame indexed by feature name.

    Rows whose value column (the last entry of spec['col_names']) is NaN are dropped.
    """
    ## RNA-Seq files are read as header-less; for every other type the first row is
    ## treated as a header and replaced by spec['col_names'].
    header = None if file_data_type == 'RNA-Seq' else 0
    table = pd.read_csv(file_path, sep = '\t', index_col = 0, header = header,
                        usecols = spec['col_indices'],
                        names = spec['col_names'],
                        dtype = col_dtypes)
    if file_data_type == 'miRNA-Seq_isoforms':
        ## Index isoform rows by miRNA ID and isoform coordinates joined with '_'.
        table = table.reset_index(drop = False)
        table['miRNA_isoform_ID'] = table['miRNA_ID']+'_'+table['isoform_coords']
        table = table.set_index('miRNA_isoform_ID', drop = True)
    return table.dropna(subset = [spec['col_names'][-1]])


def _printProgress(i, total, proc, last_start_time):
    """Print the per-sample progress banner (timing and memory from the 2nd sample on)."""
    if i > 0:
        clear_output()
        print('Last iteration took', dt.datetime.now() - last_start_time)
        mem_now = proc.memory_info().rss
        print("Memory usage:", float(mem_now)/1e6, 'MB')
    print('Now processing', i, 'of', total)


def sparseProcessor(full_metadata_list, metadata_list_for_processing, data_type, feat_file = '', sparse = 0,
                    out_dir = '../data/'):
    """Assemble per-sample files into one samples-by-features table written to disk.

    Parameters
    ----------
    full_metadata_list : list
        Entries of the form [file_path, data_type_name, sample_name, cancer_type]
        (as returned by metaDataGatherer). Only used to discover the feature
        list when `feat_file` is empty.
    metadata_list_for_processing : list
        Entries of the same form; these are the samples written to the output.
    data_type : str
        One of 'mrna', 'mirna', 'mirna_iso', 'methyl'.
    feat_file : str, optional
        Path of a one-row CSV holding the feature names. When empty, the
        features are collected by reading every file in `full_metadata_list`.
    sparse : int, optional
        1 -> build a scipy lil_matrix and pickle it (new_<type>_X.pkl) together
        with a pickled label DataFrame (new_<type>_y.pkl); features absent from a
        sample are left unset in the sparse matrix.
        Anything else -> stream rows to new_<type>_X.csv / new_<type>_y.csv;
        features absent from a sample are written as NaN.
    out_dir : str, optional
        Directory receiving the output files (default '../data/').

    The feature list is always (re)written to new_<type>_feats.csv in `out_dir`.

    Raises
    ------
    KeyError
        If `data_type` is unknown, or a processed file contains a feature that
        is not in the feature list (possible when `feat_file` is supplied).
    """
    metadata_dict = {'mrna': {'name': 'RNA-Seq', 'col_indices': [0,1], 'col_names': ['gene','value'],
                              'col_dtypes': [str, float]},
                     'mirna': {'name': 'miRNA-Seq', 'col_indices': [0,2], 'col_names': ['miRNA_ID','value'],
                               'col_dtypes': [str, float]},
                     'mirna_iso': {'name': 'miRNA-Seq_isoforms', 'col_indices': [0,1,3],
                                   'col_names': ['miRNA_ID','isoform_coords','value'], 'col_dtypes': [str, str, float]},
                     'methyl': {'name': 'Methylation Array', 'col_indices': [0,1],
                                'col_names': ['element_id','value'], 'col_dtypes': [str, float]}
                    }
    if data_type not in metadata_dict:
        raise KeyError('Unknown data_type %r; expected one of %s' % (data_type, sorted(metadata_dict)))
    spec = metadata_dict[data_type]
    col_dtypes = dict(zip(spec['col_names'], spec['col_dtypes']))
    out_prefix = os.path.join(out_dir, 'new_'+data_type)
    n_samples = len(metadata_list_for_processing)

    if len(feat_file) == 0:
        ## Go through files and get all features first.
        ## New features are appended to the end of the list in order of first
        ## appearance, so the column order is reproducible from run to run.
        data_feats = []
        seen_feats = set()
        for i, each in enumerate(full_metadata_list):
            clear_output()
            print('Now getting features from file', i, 'of', len(full_metadata_list))
            temp_data = _readSampleTable(each[0], each[1], spec, col_dtypes)
            for feat in temp_data.index.tolist():
                if feat not in seen_feats:
                    seen_feats.add(feat)
                    data_feats.append(feat)
            del temp_data
            gc.collect()
    else:
        ## .tolist() matters: with a NumPy array, ['']+data_feats below would be an
        ## element-wise add instead of a list concatenation and the csv header would
        ## lose its leading blank cell. dtype=str keeps numeric-looking names as text.
        data_feats = pd.read_csv(feat_file, index_col = None, header = None, dtype = str).values[0].tolist()
    pd.DataFrame(data_feats).T.to_csv(out_prefix+'_feats.csv', header = False, index = False)

    ## Now go through again and put values into the array
    label_encoder = dict(zip(data_feats, range(len(data_feats))))
    print('Done getting features!')
    print('There are a total of', n_samples, 'samples and',len(data_feats), 'features.')
    start_time = dt.datetime.now()
    proc = psutil.Process(os.getpid())
    temp_start_time = None
    if sparse == 1:
        print('Sparse mode...')
        gc.collect()
        data_array = lil_matrix((n_samples, len(data_feats)), dtype = float)
        data_ys = []
        data_names = []
        for i, each in enumerate(metadata_list_for_processing):
            _printProgress(i, n_samples, proc, temp_start_time)
            temp_start_time = dt.datetime.now()
            temp_data = _readSampleTable(each[0], each[1], spec, col_dtypes)
            ## Column positions of this sample's features; an explicit integer dtype
            ## keeps an empty sample from producing a float index array.
            temp_feats_enc = np.array([label_encoder[x] for x in temp_data.index.tolist()], dtype = np.intp)
            if temp_feats_enc.size:
                data_array[i, temp_feats_enc] = temp_data.iloc[:,-1].astype(float).values
            data_ys.append(each[3])
            data_names.append(each[2])
            del temp_data, temp_feats_enc
            gc.collect()
        with open(out_prefix+'_X.pkl', 'wb') as f:
            pickle.dump(data_array, f)
        y = pd.DataFrame(data_ys, index = data_names, columns = ['y'])
        with open(out_prefix+'_y.pkl', 'wb') as f:
            pickle.dump(y, f)
    else:
        print('csv mode...')
        ## newline='' is what the csv module asks for when writing files.
        with open(out_prefix+'_X.csv', 'w', newline = '') as f, \
             open(out_prefix+'_y.csv', 'w', newline = '') as g:
            csvwriter_X = csv.writer(f, delimiter = ',')
            csvwriter_y = csv.writer(g, delimiter = ',')
            csvwriter_X.writerow(['']+data_feats)
            csvwriter_y.writerow(['', 'y'])
            for i, each in enumerate(metadata_list_for_processing):
                _printProgress(i, n_samples, proc, temp_start_time)
                temp_start_time = dt.datetime.now()
                temp_data = _readSampleTable(each[0], each[1], spec, col_dtypes)
                temp_feats_enc = np.array([label_encoder[x] for x in temp_data.index.tolist()], dtype = np.intp)
                ## One dense row per sample; features missing from the file stay NaN.
                data_row = np.full(len(data_feats), np.nan)
                data_row[temp_feats_enc] = temp_data.iloc[:,-1].astype(float).values
                csvwriter_X.writerow([each[2]]+data_row.tolist())
                csvwriter_y.writerow(each[2:4])
                del temp_data, temp_feats_enc, data_row
                gc.collect()
    print('Total time elapsed:', dt.datetime.now() - start_time)

### mRNA

In [None]:
## mRNA: discover the features from the files themselves and write the csv outputs.
files_metadata = metaDataGatherer(dtype='mrna')
sparseProcessor(full_metadata_list=files_metadata,
                metadata_list_for_processing=files_metadata,
                data_type='mrna', feat_file='', sparse=0)

### miRNA

In [None]:
## miRNA: discover the features from the files themselves and write the csv outputs.
files_metadata = metaDataGatherer(dtype='mirna')
sparseProcessor(full_metadata_list=files_metadata,
                metadata_list_for_processing=files_metadata,
                data_type='mirna', feat_file='', sparse=0)

### miRNA isoforms

In [None]:
## miRNA isoforms: discover the features from the files and pickle a sparse matrix.
files_metadata = metaDataGatherer(dtype='mirna_iso')
sparseProcessor(full_metadata_list=files_metadata,
                metadata_list_for_processing=files_metadata,
                data_type='mirna_iso', feat_file='', sparse=1)

### Methylation

In [5]:
## Methylation: reuse the previously saved feature list and write the csv outputs.
files_metadata = metaDataGatherer(dtype='methyl')
sparseProcessor(full_metadata_list=files_metadata,
                metadata_list_for_processing=files_metadata,
                data_type='methyl', feat_file='../data/new_methyl_feats.csv', sparse=0)

Last iteration took 0:00:01.871378
Memory usage: 1125.982208 MB
Now processing 1538 of 1539
Total time elapsed: 0:37:32.010133
