In [3]:
import requests
import json
import pandas as pd
from io import StringIO

## HTSeq - Counts: Summary

In [70]:
# Query the GDC files endpoint for every "HTSeq - Counts" file, together with
# the case-level metadata used by the summaries further down.
files_endpt = "https://api.gdc.cancer.gov/files"

# Fields to return for each file record
fields_files = [
    "file_name",
    "cases.samples.sample_type",
    "cases.disease_type",
    "cases.project.project_id",
    "cases.primary_site",
    "cases.demographic.gender",
    "cases.diagnoses.vital_status"
    ]
fields_files = ",".join(fields_files)  # sent as one comma-separated string

# Filter: keep only files whose analysis workflow type is "HTSeq - Counts"
filters = {
            "op":"in",
            "content":{
                "field": "files.analysis.workflow_type",
                "value": ["HTSeq - Counts"]
                }
        }

# With a GET request, the filters parameter needs to be converted
# from a dictionary to JSON-formatted string
params_files = {
    "filters": json.dumps(filters),
    "fields": fields_files,
    "format": "TSV",
    # NOTE(review): presumably chosen to exceed the total number of matching
    # files so that everything arrives in a single response -- confirm.
    "size": 150000
    }

response_files = requests.get(files_endpt, params=params_files)

# Fail here on an HTTP error (4xx/5xx) instead of letting a later cell try to
# parse an error body as if it were the TSV payload.
response_files.raise_for_status()

In [69]:
# Decode the response body as UTF-8 and parse the tab-separated text into a DataFrame.
tsv_text = response_files.content.decode("utf-8")
files = pd.read_csv(StringIO(tsv_text), sep="\t")
files.head()

Unnamed: 0,cases.0.samples.0.sample_type,file_name,cases.0.project.project_id,cases.0.primary_site,cases.0.diagnoses.0.vital_status,cases.0.disease_type,cases.0.demographic.gender,id
0,Primary Tumor,81cab804-bfb1-44f0-ac2d-923e0766ac61.htseq.cou...,TCGA-SARC,Soft Tissue,alive,Sarcoma,male,0934a0b3-858d-430c-a5c9-ea459104a392
1,Primary Tumor,155f0f8e-9af5-452a-92ee-8b49d5df864e.htseq.cou...,TCGA-LGG,Brain,alive,Brain Lower Grade Glioma,female,e7643723-dbbc-4c6c-981a-f0ccccf2aa9f
2,Primary Tumor,49b5b772-6a7b-4aae-a9ed-8a95a0d9a28d.htseq.cou...,TCGA-THCA,Thyroid,alive,Thyroid Carcinoma,female,39a74fe1-e4e0-4118-8489-75aefb4735b3
3,Primary Tumor,f9dc7237-e186-40e8-ab4c-d142dba7cf3d.htseq.cou...,TCGA-OV,Ovary,alive,Ovarian Serous Cystadenocarcinoma,female,dd9bc896-11ad-4156-9fd2-e85be97507e4
4,Primary Tumor,86d7188b-2d97-4dca-b033-34778e5a6aa1.htseq.cou...,TCGA-MESO,Pleura,dead,Mesothelioma,male,e1617a0f-a75d-471b-ad45-c78b78ba1615


### HTSeq - Counts: Cases to Files Ratio

In [145]:
# Distinct project IDs, in order of first appearance in `files`.
# NOTE: despite its name, `cases` holds project IDs (e.g. TCGA-SARC), not individual cases.
cases = pd.unique(files['cases.0.project.project_id'])
len(cases)  # number of distinct projects

37

In [148]:
# Per-project file counts, split into "Solid Tissue Normal" files and all others.
# `cases` is the array of distinct project IDs computed in an earlier cell.
only_healthy = files['cases.0.samples.0.sample_type'] == "Solid Tissue Normal"  # boolean mask

project_ids = files['cases.0.project.project_id']

# Use reindex rather than [...]: projects with no normal-tissue files are absent
# from the value_counts() index, and selecting absent labels with [...] raises
# KeyError on current pandas. reindex fills them with 0 and keeps integer counts.
numFiles = project_ids.value_counts().reindex(cases, fill_value=0)
numHealthy = project_ids[only_healthy].value_counts().reindex(cases, fill_value=0)

# Files that are not normal tissue (the notebook labels this column "cases").
numCases = numFiles - numHealthy

In [140]:
# One row per project: total files, non-normal files, normal-tissue files,
# and the ratio of total files to non-normal files.
cases_summary = pd.concat(
    [numFiles, numCases, numHealthy, numFiles / numCases],
    axis=1,
    keys=['files', 'cases', 'healthy', 'ratio'],
)
cases_summary

Unnamed: 0,files,cases,healthy,ratio
TCGA-SARC,265,263.0,2.0,1.007605
TCGA-LGG,529,529.0,0.0,1.0
TCGA-THCA,568,510.0,58.0,1.113725
TCGA-OV,379,379.0,0.0,1.0
TCGA-MESO,86,86.0,0.0,1.0
TARGET-AML,187,187.0,0.0,1.0
TCGA-READ,177,167.0,10.0,1.05988
TARGET-NBL,157,157.0,0.0,1.0
TCGA-ACC,79,79.0,0.0,1.0
TCGA-SKCM,472,471.0,1.0,1.002123


#### Based on the availability of healthy and tumour sequence data for each project, the ideal projects are:
* TCGA-THCA: Thyroid
* TCGA-KICH: Kidney
* TCGA-KIRC: Kidney
* TCGA-HNSC*: Head and Neck
* TCGA-PRAD*: Prostate
* TCGA-LIHC: Liver
* TCGA-CHOL: Bile Duct
* TCGA-LUAD*: Lung
* TCGA-KIRP*: Kidney
* TCGA-BRCA: Breast
* TCGA-LUSC*: Lung