In [2]:
import requests
import json
import pandas as pd
from io import StringIO

## HTSeq - Counts: Summary

In [20]:
# Query the GDC files endpoint for every file produced by the
# "HTSeq - Counts" workflow, together with case-level metadata.
files_endpt = "https://api.gdc.cancer.gov/files"

# Metadata fields requested for each file (dotted paths as sent to the API).
field_names = [
    "file_name",
    "cases.samples.sample_type",
    "cases.submitter_id",
    "cases.disease_type",
    "cases.project.project_id",
    "cases.primary_site",
    "cases.demographic.gender",
    "cases.diagnoses.vital_status",
    "cases.demographic.year_of_birth",
    "cases.demographic.race",
    "cases.diagnoses.tumor_stage",
    "cases.diagnoses.age_at_diagnosis",
    "cases.diagnoses.morphology",
    "cases.diagnoses.days_to_last_follow_up",
    "cases.diagnoses.days_to_birth",
    "cases.diagnoses.days_to_death",
    "cases.exposures.bmi",
    "cases.exposures.cigarettes_per_day",
    "cases.exposures.alcohol_history",
    "cases.exposures.height",
    "cases.exposures.weight",
    "cases.exposures.years_smoked"
    ]
# The request carries the field list as a single comma-separated string.
fields_files = ",".join(field_names)

# Filter: keep only files whose analysis workflow type is "HTSeq - Counts".
filters = {
            "op":"in",
            "content":{
                "field": "files.analysis.workflow_type",
                "value": ["HTSeq - Counts"]
                }
        }

# With a GET request, the filters parameter needs to be converted
# from a dictionary to a JSON-formatted string.
params_files = {
    "filters": json.dumps(filters),
    "fields": fields_files,
    "format": "TSV",
    # Maximum number of rows requested; presumably chosen to exceed the
    # number of matching files so nothing is truncated -- TODO confirm.
    "size": 150000
    }

# timeout=(connect, read) in seconds: without it a stalled connection would
# hang the notebook indefinitely. The read value is generous because the
# response is large.
response_files = requests.get(files_endpt, params=params_files, timeout=(10, 300))

# Fail here on an HTTP error rather than letting a later cell parse an
# error body as if it were TSV data.
response_files.raise_for_status()

In [21]:
# Decode the raw response bytes as UTF-8 and parse the tab-separated text
# into a DataFrame, then preview the first rows.
tsv_text = response_files.content.decode("utf-8")
files = pd.read_csv(StringIO(tsv_text), sep="\t")
files.head()

Unnamed: 0,cases.0.exposures.0.alcohol_history,cases.0.exposures.0.height,cases.0.diagnoses.0.tumor_stage,cases.0.diagnoses.0.vital_status,cases.0.project.project_id,cases.0.disease_type,cases.0.demographic.year_of_birth,cases.0.exposures.0.cigarettes_per_day,cases.0.diagnoses.0.days_to_last_follow_up,cases.0.diagnoses.0.days_to_birth,...,cases.0.diagnoses.0.age_at_diagnosis,cases.0.samples.0.sample_type,cases.0.diagnoses.0.morphology,cases.0.exposures.0.weight,cases.0.exposures.0.years_smoked,cases.0.demographic.race,file_name,cases.0.demographic.gender,cases.0.primary_site,cases.0.exposures.0.bmi
0,,,not reported,alive,TCGA-SARC,Sarcoma,1959.0,,969.0,-19088.0,...,19088.0,Primary Tumor,8858/3,,,white,81cab804-bfb1-44f0-ac2d-923e0766ac61.htseq.cou...,male,Soft Tissue,
1,,,not reported,alive,TCGA-LGG,Brain Lower Grade Glioma,1981.0,,182.0,-11637.0,...,11637.0,Primary Tumor,9382/3,,,not reported,155f0f8e-9af5-452a-92ee-8b49d5df864e.htseq.cou...,female,Brain,
2,,,stage i,alive,TCGA-THCA,Thyroid Carcinoma,1973.0,,1922.0,-12714.0,...,12714.0,Primary Tumor,8340/3,,,white,49b5b772-6a7b-4aae-a9ed-8a95a0d9a28d.htseq.cou...,female,Thyroid,
3,,,not reported,alive,TCGA-OV,Ovarian Serous Cystadenocarcinoma,1948.0,,547.0,-21147.0,...,21147.0,Primary Tumor,8441/3,,,white,f9dc7237-e186-40e8-ab4c-d142dba7cf3d.htseq.cou...,female,Ovary,
4,,,stage iii,dead,TCGA-MESO,Mesothelioma,1932.0,,,-29464.0,...,29464.0,Primary Tumor,9053/3,,,white,86d7188b-2d97-4dca-b033-34778e5a6aa1.htseq.cou...,male,Pleura,


In [22]:
# Preview the rows whose project ID is TCGA-LIHC.
files.loc[files['cases.0.project.project_id'].eq("TCGA-LIHC")].head()

Unnamed: 0,cases.0.exposures.0.alcohol_history,cases.0.exposures.0.height,cases.0.diagnoses.0.tumor_stage,cases.0.diagnoses.0.vital_status,cases.0.project.project_id,cases.0.disease_type,cases.0.demographic.year_of_birth,cases.0.exposures.0.cigarettes_per_day,cases.0.diagnoses.0.days_to_last_follow_up,cases.0.diagnoses.0.days_to_birth,...,cases.0.diagnoses.0.age_at_diagnosis,cases.0.samples.0.sample_type,cases.0.diagnoses.0.morphology,cases.0.exposures.0.weight,cases.0.exposures.0.years_smoked,cases.0.demographic.race,file_name,cases.0.demographic.gender,cases.0.primary_site,cases.0.exposures.0.bmi
385,,157.0,stage i,alive,TCGA-LIHC,Liver Hepatocellular Carcinoma,1964.0,,608.0,-18009.0,...,18009.0,Primary Tumor,8170/3,62.0,,asian,7097ca31-8322-44a6-8fe7-f0c67bbf88fe.htseq.cou...,female,Liver,25.15315
404,,168.0,stage i,alive,TCGA-LIHC,Liver Hepatocellular Carcinoma,1935.0,,409.0,-27756.0,...,27756.0,Primary Tumor,8170/3,71.0,,black or african american,0fc6f38a-62da-4c2f-8a72-5c34b77656e5.htseq.cou...,female,Liver,25.155896
411,,167.0,stage iiia,alive,TCGA-LIHC,Liver Hepatocellular Carcinoma,1954.0,,1989.0,-20288.0,...,20288.0,Primary Tumor,8170/3,60.0,,asian,e8aae9c3-1e7d-4f47-bd78-c96f55f992db.htseq.cou...,male,Liver,21.513859
414,,163.0,stage ii,dead,TCGA-LIHC,Liver Hepatocellular Carcinoma,1948.0,,,-24020.0,...,24020.0,Primary Tumor,8170/3,80.0,,white,a2a12535-77ea-414b-bd36-d9328e019d05.htseq.cou...,female,Liver,30.110279
423,,176.0,stage i,alive,TCGA-LIHC,Liver Hepatocellular Carcinoma,1942.0,,2245.0,-24090.0,...,24090.0,Primary Tumor,8170/3,109.0,,white,f32c1def-c5c6-4076-966e-ae5f7233060a.htseq.cou...,male,Liver,35.188533


### HTSeq - Counts: Cases to Files Ratio

In [None]:
# Distinct project IDs found in the file listing.
# NOTE: despite its name, `cases` holds project IDs (e.g. "TCGA-LIHC"),
# not individual patient cases; the name is kept because later cells use it.
project_ids = files['cases.0.project.project_id']
cases = project_ids.unique()
len(cases)  # number of distinct projects (the original note recorded 37)

In [None]:
# Per-project file counts, split by sample type.
#   numFiles   - all files in each project
#   numHealthy - files whose sample type is "Solid Tissue Normal"
#   numCases   - the remaining (non-normal) files, i.e. numFiles - numHealthy
# All three are Series indexed by the project IDs in `cases`, in that order.
project_col = 'cases.0.project.project_id'
only_healthy = files['cases.0.samples.0.sample_type'] == "Solid Tissue Normal"  # boolean mask

# reindex() is used instead of label-indexing with [cases]: a project with no
# normal-tissue files is absent from the healthy value_counts(), and selecting
# a missing label with [...] raises KeyError on current pandas. reindex fills
# those projects with an explicit 0 (and keeps the counts as integers).
numFiles = files[project_col].value_counts().reindex(cases, fill_value=0)
numHealthy = files.loc[only_healthy, project_col].value_counts().reindex(cases, fill_value=0)
numCases = numFiles - numHealthy

In [None]:
# Assemble the per-project summary table: total files, non-normal files
# ("cases"), normal-tissue files ("healthy") and the files-to-cases ratio.
summary_parts = [numFiles, numCases, numHealthy, numFiles / numCases]
summary_labels = ['files', 'cases', 'healthy', 'ratio']
cases_summary = pd.concat(summary_parts, axis=1, keys=summary_labels)
cases_summary

#### Based on the availability of healthy and tumour sequence data for each project, the ideal projects are:
* TCGA-THCA: Thyroid
* TCGA-KICH: Kidney
* TCGA-KIRC: Kidney
* TCGA-HNSC*: Head and Neck
* TCGA-PRAD*: Prostate
* TCGA-LIHC: Liver
* TCGA-CHOL: Bile Duct
* TCGA-LUAD*: Lung
* TCGA-KIRP*: Kidney
* TCGA-BRCA: Breast
* TCGA-LUSC*: Lung