In [3]:
import requests
import json
import re
import gzip
import pandas as pd
import tarfile
import os
import sys
import shutil
import numpy as np
import tarfile
import io
from io import StringIO

In [8]:
def gdc_query(cancer):
    '''
    Performs a query of the NCI genomic portal given a type of cancer.
    Hepatocellular Carcinoma - LIHC
    Returns the name of compressed tar.gz file, and a binary data file

    '''
    files_endpt = "https://api.gdc.cancer.gov/files"

    #Filters for the query, recieving all RNA-Seq, HTSeq-Count files for a specific cancer
    filters = {
        "op": "and",
        "content":[
            {
            "op": "in",
            "content":{
                "field": "cases.project.project_id",
                "value": ["TCGA-"+cancer]
                }
            },
            {
            "op": "in",
            "content":{
                "field": "files.experimental_strategy",
                "value": ["RNA-Seq"]
                }
            },
            {
            "op": "in",
            "content":{
                "field": "files.analysis.workflow_type",
                "value": ["HTSeq - Counts"]
                }
            }
        ]
    }

    # Here a GET is used, so the filter parameters should be passed as a JSON string.
    params = {
        "filters": json.dumps(filters),
        "fields": "file_id",
        "format": "JSON",
        "size": "10" #Set to the first 10 files for developing
        }

    response = requests.get(files_endpt, params = params)
    file_uuid_list = []

    # This step populates the download list with the file_ids from the previous query
    for file_entry in json.loads(response.content.decode("utf-8"))["data"]["hits"]:
        file_uuid_list.append(file_entry["file_id"])

    data_endpt = "https://api.gdc.cancer.gov/data"

    params = {"ids": file_uuid_list}

    response = requests.post(data_endpt, data = json.dumps(params), headers = {"Content-Type": "application/json"})

    response_head_cd = response.headers["Content-Disposition"]

    file_name = re.findall("filename=(.+)", response_head_cd)[0]

    #with open(file_name, "wb") as output_file:
     #   output_file.write(response.content)

    return file_name,response

In [8]:
apple = pd.DataFrame(np.random.rand(10,10))
apple.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.735781,0.518014,0.271766,0.396838,0.292724,0.600595,0.687681,0.078576,0.048284,0.636957
1,0.252816,0.173767,0.172138,0.900633,0.990428,0.72527,0.828101,0.45377,0.491147,0.197787
2,0.155883,0.018023,0.849573,0.999709,0.093454,0.629949,0.276623,0.759548,0.381217,0.953016
3,0.333801,0.337605,0.48395,0.014518,0.116226,0.349993,0.377736,0.108378,0.381762,0.297364
4,0.163961,0.828238,0.26829,0.214402,0.392169,0.773153,0.595262,0.931022,0.642625,0.103817


In [11]:
apple.columns

RangeIndex(start=0, stop=10, step=1)

In [9]:
#Acquires the file_name and the data from the query in "response"
file_name,response = gdc_query("LIHC")

In [318]:
#location of compressed data file
comp_data = os.path.join("./test_data/",file_name)

In [319]:
#Opens a file named after the file_name, and writes the contents of the query to the file
if not os.path.exists("./test_data/"):
    os.makedirs("./test_data/")
with open(comp_data, "wb") as output_file:
    output_file.write(response.content)

In [320]:
#Unzips the tar.gz file that was generated into a test data subdirectory
with tarfile.open(comp_data) as tar:
    tar.extractall("./test_data/")
    tar.close()

In [321]:
#unzips .gz files
#with gzip.open(file_name, 'rb') as f:
#    file_content = f.read()

In [322]:
#Stores the manifest of the data 
manifest = pd.read_table("./test_data/MANIFEST.txt",sep="\t")

In [323]:
os.listdir("./test_data")

['407950b7-f2b7-4db5-bc3c-f50934cb98ac',
 '40f80afd-b6aa-4ec2-8f0c-63d950213323',
 '4d7b644e-68ee-4405-af57-2af04313e87a',
 '4db947b5-44bc-49ac-812b-ec7d6faa4d30',
 '5d06b081-fd28-4fd7-90c7-2c00539f427b',
 '71011088-1831-47d5-83d9-d32029c67430',
 'a281dee6-0cc3-44a2-b323-ebfd9b805f7e',
 'b1bd62fa-e635-4b48-ad1b-66bb3103ab11',
 'e19fc1b6-7957-4c6e-9e54-b6ea3319cedf',
 'fd550345-ee15-4388-8a33-aae89d5312e1',
 'gdc_download_20180226_225051.tar.gz',
 'MANIFEST.txt']

In [13]:
uncomp_data = os.path.join(os.getcwd(),"./test_data/uncompressed_data/")
data = pd.DataFrame()
if not os.path.exists(uncomp_data):
    os.makedirs(("./test_data/uncompressed_data"))
for subdir, dirs, files in os.walk("./test_data/"):
    for file in files:
        if file[-4:] == "s.gz":
            with gzip.open(os.path.join(subdir,file),'rb') as f:
                file_content = f.read().decode("utf-8")
                df = pd.read_csv(StringIO(file_content),sep="\t",header=None).set_index(0)
                df.columns = [files[0]]
                data = pd.concat([data,df],axis=1)
                #df.to_csv(os.path.join("./test_data/uncompressed_data/",file[:-3]),header=False,sep=",",index=False)

NameError: name 'StringIO' is not defined

In [14]:
data.head()

In [414]:
manifest

Unnamed: 0,id,filename,md5,size,state
0,407950b7-f2b7-4db5-bc3c-f50934cb98ac,407950b7-f2b7-4db5-bc3c-f50934cb98ac/7807318b-...,37df987c6491bd2953a84d7f1a0115c0,242525,live
1,40f80afd-b6aa-4ec2-8f0c-63d950213323,40f80afd-b6aa-4ec2-8f0c-63d950213323/f6ae6ac1-...,a5778e0fbeea4c07b88b48303b889b50,252517,live
2,4d7b644e-68ee-4405-af57-2af04313e87a,4d7b644e-68ee-4405-af57-2af04313e87a/554f6de3-...,bd21167b0a2099bfb56bcb96cd58a7e6,246883,live
3,4db947b5-44bc-49ac-812b-ec7d6faa4d30,4db947b5-44bc-49ac-812b-ec7d6faa4d30/687e7d1d-...,a94014b1f2bd7164b67368b6b9049857,248646,live
4,5d06b081-fd28-4fd7-90c7-2c00539f427b,5d06b081-fd28-4fd7-90c7-2c00539f427b/e8aae9c3-...,5fff9a50b12ae7f7b97e99e4b9d500a2,246383,live
5,71011088-1831-47d5-83d9-d32029c67430,71011088-1831-47d5-83d9-d32029c67430/a2a12535-...,97ee461819d716703953f09678e712fb,244379,live
6,a281dee6-0cc3-44a2-b323-ebfd9b805f7e,a281dee6-0cc3-44a2-b323-ebfd9b805f7e/f32c1def-...,04076649e9a05d161ffba7afb9dc0da7,247240,live
7,b1bd62fa-e635-4b48-ad1b-66bb3103ab11,b1bd62fa-e635-4b48-ad1b-66bb3103ab11/7097ca31-...,8f0aa13e5f67ab793e654185592a843b,239686,live
8,e19fc1b6-7957-4c6e-9e54-b6ea3319cedf,e19fc1b6-7957-4c6e-9e54-b6ea3319cedf/5fe28ffa-...,6f56d29547220bdf70ec34d018352dad,251961,live
9,fd550345-ee15-4388-8a33-aae89d5312e1,fd550345-ee15-4388-8a33-aae89d5312e1/0fc6f38a-...,e5df4067c4421ceae1e2b68190c5df82,248192,live
