In [3]:
import requests
import json
import re
import gzip
import pandas as pd
import tarfile
import os
import sys
import shutil
import numpy as np
import tarfile
import io
from io import StringIO

In [18]:
class gdc_data:
    '''
    Downloads RNA-Seq "HTSeq - Counts" files for one TCGA project from the
    NCI Genomic Data Commons (GDC) API and assembles them into one matrix.

    Attributes:
        c_type    -- TCGA project suffix used for the query, e.g. "LIHC"
        data_dir  -- directory where the archive is written and extracted
        data      -- DataFrame indexed by the first column of the count files,
                     with one column per count file (named after the file)
        manifest  -- DataFrame read from MANIFEST.txt inside data_dir
        file_name -- archive name reported by the last download (or None)
        response  -- HTTP response object of the last download (or None)
    '''

    #Suffix used to recognise the gzip-compressed count files (*.htseq.counts.gz)
    COUNTS_SUFFIX = "s.gz"

    def __init__(self, c_type, data_dir="./test_data/"):
        '''
        c_type   -- type of cancer for the database query (e.g. "LIHC")
        data_dir -- working directory for downloaded/extracted files;
                    defaults to the "./test_data/" folder used previously

        No network or disk access happens here.
        '''
        #Initialize the type of cancer for the database query
        self.c_type = c_type
        #Directory holding the archive, MANIFEST.txt and the extracted files
        self.data_dir = data_dir
        #Initialize gene expression data matrix
        self.data = pd.DataFrame()
        #Initialize empty manifest data matrix
        self.manifest = pd.DataFrame()
        #Filled in by data_write()
        self.file_name = None
        self.response = None

    def gdc_query(self, cancer, size=10):
        '''
        Performs a query of the NCI genomic portal given a type of cancer.
        Hepatocellular Carcinoma - LIHC

        cancer -- TCGA project suffix; the query targets "TCGA-" + cancer
        size   -- maximum number of files to request (10 by default, which
                  is the small development setting used so far)

        Returns a tuple (file_name, response): the archive name taken from
        the Content-Disposition header, and the response object whose
        .content holds the binary archive.

        Raises requests.HTTPError on a failed request, KeyError if the
        download has no Content-Disposition header, and ValueError if the
        query matches no files or the header carries no file name.
        '''
        files_endpt = "https://api.gdc.cancer.gov/files"

        #Filters for the query, receiving all RNA-Seq, HTSeq-Count files for a specific cancer
        filters = {
            "op": "and",
            "content":[
                {
                "op": "in",
                "content":{
                    "field": "cases.project.project_id",
                    "value": ["TCGA-"+cancer]
                    }
                },
                {
                "op": "in",
                "content":{
                    "field": "files.experimental_strategy",
                    "value": ["RNA-Seq"]
                    }
                },
                {
                "op": "in",
                "content":{
                    "field": "files.analysis.workflow_type",
                    "value": ["HTSeq - Counts"]
                    }
                }
            ]
        }

        # Here a GET is used, so the filter parameters should be passed as a JSON string.
        params = {
            "filters": json.dumps(filters),
            "fields": "file_id",
            "format": "JSON",
            "size": str(size)
            }

        response = requests.get(files_endpt, params = params)
        #Fail loudly on HTTP errors instead of trying to parse an error page
        response.raise_for_status()

        # This step populates the download list with the file_ids from the previous query
        file_uuid_list = [file_entry["file_id"] for file_entry in response.json()["data"]["hits"]]
        if not file_uuid_list:
            raise ValueError("No 'HTSeq - Counts' files found for project TCGA-" + cancer)

        data_endpt = "https://api.gdc.cancer.gov/data"

        params = {"ids": file_uuid_list}

        response = requests.post(data_endpt, data = json.dumps(params), headers = {"Content-Type": "application/json"})
        response.raise_for_status()

        response_head_cd = response.headers["Content-Disposition"]

        match = re.search("filename=(.+)", response_head_cd)
        if match is None:
            raise ValueError("No file name in Content-Disposition header: " + response_head_cd)
        #Drop surrounding whitespace/quotes that may wrap the header value
        file_name = match.group(1).strip().strip('"')

        return file_name,response

    @staticmethod
    def _safe_extract(tar, dest):
        '''
        Extracts an opened tar archive into dest, refusing any member whose
        resolved path would land outside of dest (raises ValueError).
        '''
        dest_root = os.path.realpath(dest)
        for member in tar.getmembers():
            target = os.path.realpath(os.path.join(dest_root, member.name))
            if os.path.commonpath([dest_root, target]) != dest_root:
                raise ValueError("Refusing to extract '%s' outside of '%s'" % (member.name, dest))
        #Use the standard library's extraction filter where this Python provides it
        if hasattr(tarfile, "data_filter"):
            tar.extractall(dest, filter="data")
        else:
            tar.extractall(dest)

    def data_write(self):
        '''
        Queries GDC for self.c_type, writes the returned archive into
        self.data_dir, extracts it there and loads the result through
        data_save(), so self.data and self.manifest are populated afterwards.
        '''
        #Acquires the file_name and the data from the query in "response"
        self.file_name,self.response = self.gdc_query(self.c_type)

        os.makedirs(self.data_dir, exist_ok=True)

        #Location of compressed data file; basename() keeps a server-supplied
        #name from pointing outside of data_dir
        comp_data = os.path.join(self.data_dir, os.path.basename(self.file_name))

        #Writes the contents of the query to the archive file
        with open(comp_data, "wb") as output_file:
            output_file.write(self.response.content)

        #Unpacks the tar.gz file that was generated into the data directory
        with tarfile.open(comp_data) as tar:
            self._safe_extract(tar, self.data_dir)

        #Kept from the previous version, which always created this folder
        os.makedirs(os.path.join(self.data_dir, "uncompressed_data"), exist_ok=True)

        #Parse the manifest and count files that were just extracted
        self.data_save()

    def data_save(self):
        '''
        (Re)builds self.manifest and self.data from the files already present
        in self.data_dir. Requires MANIFEST.txt to exist there.

        Every file below data_dir whose name ends in COUNTS_SUFFIX is read as
        a headerless, tab-separated, gzip-compressed table; its first column
        becomes the index and its value column is named after the file.
        Directories and files are visited in sorted order so the column
        order is reproducible. self.data stays empty if no file matches.
        '''
        #initialize/clear gene expression data matrix
        self.data = pd.DataFrame()
        #Stores the manifest of the data
        self.manifest = pd.read_table(os.path.join(self.data_dir, "MANIFEST.txt"),sep="\t")

        frames = []
        for subdir, dirs, files in os.walk(self.data_dir):
            #Sorting in place makes os.walk descend in a deterministic order
            dirs.sort()
            for file in sorted(files):
                if file.endswith(self.COUNTS_SUFFIX):
                    with gzip.open(os.path.join(subdir,file),'rb') as f:
                        file_content = f.read().decode("utf-8")
                    df = pd.read_csv(StringIO(file_content),sep="\t",header=None).set_index(0)
                    #Label the column with the file being read, not the first
                    #entry of the directory listing
                    df.columns = [file]
                    frames.append(df)

        #One concat at the end instead of re-copying the matrix for every file
        if frames:
            self.data = pd.concat(frames,axis=1)

    def data_add(self,data):
        '''
        Replaces the stored expression matrix with the given object.
        '''
        self.data = data

In [19]:
#Create the handler for the TCGA-LIHC project; the constructor only sets attributes
test = gdc_data("LIHC")

In [14]:
#Load MANIFEST.txt and the extracted count files from ./test_data/ into test.manifest / test.data
#NOTE(review): needs ./test_data/ to be populated already; no visible cell creates it, so a fresh kernel run may fail here
test.data_save()

In [16]:
#Preview the first rows of the assembled matrix (one column per count file)
test.data.head()

Unnamed: 0_level_0,7807318b-32d2-480e-805a-911d90f43e52.htseq.counts.gz,f6ae6ac1-3e00-4021-a6e7-fbb0d5f12836.htseq.counts.gz,554f6de3-63c7-47b1-a75a-dcfc73f54e96.htseq.counts.gz,687e7d1d-99eb-4bf9-9fa3-49e324ef32c3.htseq.counts.gz,e8aae9c3-1e7d-4f47-bd78-c96f55f992db.htseq.counts.gz,a2a12535-77ea-414b-bd36-d9328e019d05.htseq.counts.gz,f32c1def-c5c6-4076-966e-ae5f7233060a.htseq.counts.gz,7097ca31-8322-44a6-8fe7-f0c67bbf88fe.htseq.counts.gz,5fe28ffa-63af-4a8d-8512-b0742b4cded4.htseq.counts.gz,0fc6f38a-62da-4c2f-8a72-5c34b77656e5.htseq.counts.gz
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ENSG00000000003.13,2194,2893,3600,3920,16166,3093,8685,1976,4740,4455
ENSG00000000005.5,0,1,0,13,1,0,1,2,0,1
ENSG00000000419.11,1313,1048,2769,2116,1151,1425,2054,502,3645,955
ENSG00000000457.12,432,936,442,854,912,441,779,431,492,432
ENSG00000000460.15,153,392,237,2657,1669,76,191,134,611,213
