In [20]:
import requests
import json
import pandas as pd
import numpy as np
import os.path

# Step 1: downloading the data
To get the data, we use the [GDC API](https://docs.gdc.cancer.gov/API/Users_Guide/). Please refer to the link for additional information.

In [21]:
# --- Local folders ---------------------------------------------------------
pload_p = "payload/"  # request payload files, one per dataset key
data_p = "data/"      # downloaded TSV files are cached here

# --- GDC API ---------------------------------------------------------------
root = "https://api.gdc.cancer.gov/"
headers = {"Content-Type": "application/json"}

# Dataset key -> endpoint path (appended to `root`).
endpt_l = dict(
    case="cases",
    ssm="ssm_occurrences",
    cnv="cnv_occurrences",
    gene="analysis/top_mutated_genes_by_project",
)

# Dataset keys, in the same order as the endpoint table above.
pload_l = list(endpt_l)

def dl(endpt, data):
    """Thin wrapper around ``requests.post`` with the GDC arguments filled in.

    Args:
        endpt: Endpoint path appended to ``root`` (e.g. a value of ``endpt_l``).
        data: Request body, forwarded unchanged as the ``data`` argument of
            ``requests.post``.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection failure or timeout.
    """
    response = requests.post(root + endpt, timeout=5, headers=headers, data=data)
    # Fail loudly on HTTP errors: the returned text gets written to a cache
    # file by `load`, so an error body must never be mistaken for data.
    response.raise_for_status()
    return response.text

In [28]:
def fill_filter_gene():
    """Restrict the ssm/cnv request payloads to the genes listed in gene.tsv.

    Reads the ``symbol`` column of ``<data_p>gene.tsv`` and, for the ``ssm``
    and ``cnv`` payload files under ``pload_p``, overwrites
    ``payload["filters"]["content"][0]["content"]["value"]`` with that list.
    Each payload file is rewritten in place as compact JSON.

    Requires ``gene.tsv`` to exist already (i.e. ``load("gene")`` has run).
    """
    # `sep` must be passed by keyword: positional use was deprecated in
    # pandas 1.3 and is a TypeError in pandas 2.x.
    # `.tolist()` yields plain Python objects, which json can always serialise.
    top_genes = pd.read_csv(data_p + "gene.tsv", sep="\t")["symbol"].tolist()
    for req in ("ssm", "cnv"):
        with open(pload_p + req, "r+") as f:
            payload = json.load(f)
            payload["filters"]["content"][0]["content"]["value"] = top_genes
            # Rewind and overwrite, then cut off any leftover tail in case the
            # new JSON is shorter than the old content.
            f.seek(0)
            json.dump(payload, f, separators=(",", ":"))
            f.truncate()

In [23]:
def load(fname, force_dl=False):
    """Return dataset ``fname`` as a DataFrame, downloading it only if needed.

    The dataset is cached as ``<data_p><fname>.tsv``. When the cache file is
    missing, or ``force_dl`` is true, the payload file ``<pload_p><fname>`` is
    POSTed to the endpoint ``endpt_l[fname]`` and the response text is written
    to the cache file before it is parsed.

    Args:
        fname: Dataset key; must be a key of ``endpt_l`` and the name of a
            payload file under ``pload_p``.
        force_dl: If true, download again even when a cached file exists.

    Returns:
        pandas.DataFrame parsed from the tab-separated cache file.
    """
    fp = data_p + fname + ".tsv"
    if force_dl or not os.path.exists(fp):
        print(f"file {fp} not found... Downloading...")
        # Context manager so the payload handle is closed once the request is
        # done; binary mode so the body is streamed byte-for-byte.
        with open(pload_p + fname, "rb") as payload:
            text = dl(endpt_l[fname], payload)
        # Make sure the cache folder exists on a fresh checkout.
        os.makedirs(os.path.dirname(fp) or ".", exist_ok=True)
        with open(fp, "w") as f:
            f.write(text)

    return pd.read_csv(fp, sep="\t", low_memory=False)

In [29]:
# Order matters below: fill_filter_gene() reads data/gene.tsv, so it must run
# after load("gene"); and it rewrites the ssm/cnv payload files, so it must run
# before those two datasets are requested.
casedf = load("case")
genedf = load("gene")
fill_filter_gene() # restrict the ssm/cnv filters to the top genes only
ssmdf = load("ssm")
cnvdf = load("cnv")

file data/case.tsv not found... Downloading...
file data/gene.tsv not found... Downloading...
file data/ssm.tsv not found... Downloading...
file data/cnv.tsv not found... Downloading...
