In [1]:
import requests
import os

API Endpoints

In [2]:
# Production GraphQL endpoint.
prod = "https://hub.datacommons.cancer.gov/api/graphql"
# Dev2 GraphQL endpoint; reaching it requires a VPN connection through the NIH firewall.
dev2 = "https://hub-dev2.datacommons.cancer.gov/api/graphql"

It is highly recommended that you keep your API key somewhere not visible.  For the purposes of this demonstration it's been set in an environment variable named "DEV2API".

In [3]:
# Read the API key from the DEV2API environment variable (KeyError if it is unset).
dev2APIKey = os.environ["DEV2API"]

In [6]:
def apiQuery(url, query, variables=None, timeout=60):
    """Send a GraphQL query or mutation to ``url`` and return the response.

    The bearer token is read from the ``DEV2API`` environment variable on
    every call, so a missing variable raises ``KeyError`` before any request
    is made.

    Args:
        url: GraphQL endpoint to POST to.
        query: GraphQL document (query or mutation) as a string.
        variables: Optional dict of GraphQL variables; left out of the
            request payload when ``None``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body when the server answers 200, the raw response
        bytes for any other status code (the status is also printed), or an
        ``"HTTP Error: ..."`` string when the request itself fails.
    """
    token = os.environ['DEV2API']
    headers = {"Authorization": f"Bearer {token}"}
    payload = {"query": query}
    if variables is not None:
        payload["variables"] = variables
    try:
        result = requests.post(url=url, headers=headers, json=payload, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # HTTPError is only raised by raise_for_status(); connection failures
        # and timeouts are other RequestException subclasses, so catch the base.
        return f"HTTP Error: {e}"
    if result.status_code == 200:
        return result.json()
    print(f"Error: {result.status_code}")
    return result.content

Let's assume that this is our first submission using the API, so the first thing we need to do is list the studies that our organization is approved for so we can submit to the correct study. That's done with the *listApprovedStudiesOfMyOrganization* query.

In [9]:
# GraphQL query: for each entry returned by listApprovedStudiesOfMyOrganization,
# request originalOrg, dbGaPID, studyAbbreviation, studyName and _id.
org_query = """
{
  listApprovedStudiesOfMyOrganization{
    originalOrg
    dbGaPID
    studyAbbreviation
    studyName
    _id
  }
}
"""

Note that the actual results returned by this query will vary for each organization.  These are examples only and shouldn't be used.

In [10]:
# Run the approved-studies query against Dev2; this query takes no variables.
org_res = apiQuery(url=dev2, query=org_query, variables=None)
print(org_res)

{'data': {'listApprovedStudiesOfMyOrganization': [{'originalOrg': 'Purdue Center for Cancer Research', 'dbGaPID': '', 'studyAbbreviation': 'UBC01', 'studyName': 'Antitumor Activity and Molecular Effects of Vemurafenib in Dogs with BRAF-mutant Bladder Cancer', '_id': 'b9e9ab79-d90b-4ec1-83b7-f83a5a75f5b5'}, {'originalOrg': 'Comparative Molecular Characterization Program', 'dbGaPID': '', 'studyAbbreviation': 'OSA01', 'studyName': 'A Multi-Platform Sequencing Analysis of Canine Appendicular Osteosarcoma', '_id': 'e3feefe9-cc70-4ae0-be06-9df7f29d84e8'}, {'originalOrg': 'Comparative Molecular Characterization Program', 'dbGaPID': '', 'studyAbbreviation': 'TCL01', 'studyName': 'Whole exome sequencing analysis of canine cancer cell lines', '_id': '6c7fa436-efa3-42c6-af4c-7f5b70a1d35d'}, {'originalOrg': 'NCI BBRB', 'dbGaPID': '', 'studyAbbreviation': 'CMB', 'studyName': 'Cancer Moonshot Biobank', '_id': '4c2b6522-20b8-4841-8c7a-318b325c99b4'}, {'originalOrg': 'CCDI', 'dbGaPID': 'phs003432', 's

For the purposes of this demonstration, we'll use the CCDI TALLsc study as the example.  The first step is to create a new submission within the study (do NOT do this if you're continuing with an existing submission).

In [21]:
# Pick the study to submit to.  If several approved studies have
# originalOrg == 'CCDI', the last one listed is used.
ccdi_studies = [
    study
    for study in org_res['data']['listApprovedStudiesOfMyOrganization']
    if study['originalOrg'] == 'CCDI'
]
if not ccdi_studies:
    # Fail here rather than later with a NameError (or a stale value left in
    # the kernel) when the submission variables are assembled.
    raise LookupError("No approved study with originalOrg == 'CCDI' was returned")

selected_study = ccdi_studies[-1]
org = selected_study['originalOrg']
dbgap = selected_study['dbGaPID']
abbrev = selected_study['studyAbbreviation']
# Kept under its own name so the submission name assigned below does not overwrite it.
study_name = selected_study['studyName']
studyid = selected_study['_id']

# Settings for the new submission.
dc = "CDS"
name = "Demo create submission Jupyter"
intention = "New/Update"
datatype = "Metadata and Data Files"


Creating submissions requires the use of a mutation that calls createSubmission.  There are multiple required variables that have to be provided in a GraphQL-compatible way:
studyID:  This is the Study ID that can be obtained from the graphical interface
dbGaPID: Obtained when registering the study at dbGaP.  This is required for all controlled access studies
dataCommons: This is the CRDC Data Commons the submissions will be deposited in
name: This can be anything that allows you to identify this specific submission
intention: Can be "New/Update" if you are adding information to the submission or "Delete" if you are removing information from the submission
dataType: Can be either "Metadata and Data Files" or "Metadata Only".  Which one is selected depends on whether or not data files will be included in the submission

In [33]:
# GraphQL mutation CreateNewSubmission: passes six required String variables
# through to createSubmission and asks for the same six fields plus status
# in the response.
create_submission_query = """
mutation CreateNewSubmission(
  $studyID: String!,
  $dbGaPID: String!,
  $dataCommons: String!,
  $name: String!,
  $intention:String!,
  $dataType: String!,
){
  createSubmission(
    studyID: $studyID,
    dbGaPID: $dbGaPID,
    dataCommons: $dataCommons,
    name: $name,
    intention: $intention,
    dataType: $dataType
  ){
    studyID
    dbGaPID
    dataCommons
    name
    intention
    dataType
    status
  }
}"""

In [34]:
# Values for the CreateNewSubmission mutation, keyed by GraphQL variable name.
variables = {
    "studyID": studyid,
    "dbGaPID": dbgap,
    "dataCommons": dc,
    "name": name,
    "intention": intention,
    "dataType": datatype,
}

In [36]:
# Send the createSubmission mutation to Dev2.
create_res = apiQuery(url=dev2, query=create_submission_query, variables=variables)

In [37]:
# Show the response returned for the createSubmission mutation.
print(create_res)

{'data': {'createSubmission': {'studyID': '49a69fef-71f8-44e6-ad3b-f7a62d91e348', 'dbGaPID': 'phs003432', 'dataCommons': 'CDS', 'name': 'Demo create submission Jupyter', 'intention': 'New/Update', 'dataType': 'Metadata and Data Files', 'status': 'New'}}}


Now that we've created the submission, we need to grab the ID since that isn't returned.

In [44]:
# GraphQL query GetSubmissionList: calls listSubmissions with a required
# status string and requests the _id and name of each submission.
submission_list_query = """
query GetSubmissionList($status:String!)
{
 listSubmissions(status:$status){
   submissions{
     _id
     name
   }
 }
}
"""

In [45]:
# Status value passed to the listSubmissions query.
status = "New"
status_variables = dict(status=status)

In [46]:
# List the submissions that have the requested status.
res = apiQuery(url=dev2, query=submission_list_query, variables=status_variables)
print(res)

{'data': {'listSubmissions': {'submissions': [{'_id': 'f41aea9c-bb76-4b48-8b53-27028317b434', 'name': 'Demo create submission Jupyter'}, {'_id': '107ba083-f107-4a2f-a848-824bb8746a01', 'name': 'Demo create submission 1'}, {'_id': '181432cd-e915-46ff-b62e-1f167abb7e2f', 'name': 'API Demonstration'}]}}}


In [47]:
# Look up the ID of the submission created above by its name.  If several
# submissions share that name, the last one listed is used.
matching_ids = [
    submission['_id']
    for submission in res['data']['listSubmissions']['submissions']
    if submission['name'] == name
]
if not matching_ids:
    # Fail here instead of printing an undefined (or stale) submissionid.
    raise LookupError(f"No submission named {name!r} was found in the listSubmissions result")
submissionid = matching_ids[-1]
print(name)
print(submissionid)

Demo create submission Jupyter
f41aea9c-bb76-4b48-8b53-27028317b434


At this point if you go to the graphical interface you should see that a new submission has been created using the name provided in the query

Once the submission is created, the next step is to start uploading metadata and data to the system.  There are two ways of accomplishing this upload:
1) Using the Upload CLI Tool : This is generally the easiest method and can be used to upload both the metadata templates and the data files.  The use of the Upload CLI Tool is documented elsewhere (TODO: add link here)
2) Using the API : If you wish to provide metadata only via a program, the API can be used as will be demonstrated in this notebook.  Note that while the API can be used to upload metadata, the actual data files MUST be uploaded with the Upload CLI Tool

The first step of submitting metadata via the API is to use the createTempCredentials mutation to get credentials that allow the submission.

In [48]:
# GraphQL mutation CreateTempCredentials: takes a required submissionID and
# requests accessKeyId, secretAccessKey and sessionToken in the response.
get_temp_cred_query = """
 mutation CreateTempCredentials(
        $submissionID: ID!
    ){
        createTempCredentials(submissionID: $submissionID){
          accessKeyId
          secretAccessKey
          sessionToken          
        }
    }
"""

In [49]:
# The temporary credentials are requested for the submission found above.
cred_variables = dict(submissionID=submissionid)

In [53]:
# Send the createTempCredentials mutation to Dev2.
cred_res = apiQuery(url=dev2, query=get_temp_cred_query, variables=cred_variables)

In [51]:
# Pull the three credential fields out of the createTempCredentials response.
temp_credentials = cred_res['data']['createTempCredentials']
accessKeyID = temp_credentials['accessKeyId']
secretKey = temp_credentials['secretAccessKey']
sessionToken = temp_credentials['sessionToken']

Let's set up the list of metadata files we want to upload.  Each entry must follow the FileInput spec, which uses fileName and size as its keys.  The type field is either "metadata" or "data file"; because "data file" isn't allowed outside of the Upload CLI Tool, we'll set it to "metadata".

In [75]:
# One entry per metadata file to upload, each with a fileName and a size.
metadatafiles = [
    {"fileName": "/media/vmshare/PDXNet_participant.tsv", "size": 2106},
    {"fileName": "/media/vmshare/PDXNet_sample.tsv", "size": 12416},
]
# NOTE: this name shadows the built-in type() for the rest of the notebook.
type = "metadata"

Now that we've got credentials, we create a "batch", which is the term for one or more files uploaded at the same time.  This is done with a mutation that calls the createBatch endpoint

In [76]:
# GraphQL mutation CreateBatch: passes submissionID, type and a list of
# FileInput entries to createBatch and requests the batch _id plus each
# file's fileName and signedURL.  Note that the GraphQL variable is named
# $file while the createBatch argument it feeds is named files.
create_batch_query = """
mutation CreateBatch($submissionID: ID!, $type: String!, $file: [FileInput]) {
  createBatch(submissionID: $submissionID, type: $type, files: $file) {
    _id
    files {
      fileName
      signedURL
    }
  }
}
"""

In [77]:
# Values for the CreateBatch mutation, keyed by GraphQL variable name.
create_batch_variables = {
    "submissionID": submissionid,
    "type": type,
    "file": metadatafiles,
}
print(create_batch_variables)

{'submissionID': 'f41aea9c-bb76-4b48-8b53-27028317b434', 'type': 'metadata', 'file': [{'fileName': '/media/vmshare/PDXNet_participant.tsv', 'size': 2106}, {'fileName': '/media/vmshare/PDXNet_sample.tsv', 'size': 12416}]}


# NOTE(review): looks like a leftover debug line — it prints `variables`
# (the createSubmission variables), not create_batch_variables.
print(variables)

In [83]:
# Send the createBatch mutation to Dev2 and show the response.
create_batch_res = apiQuery(url=dev2, query=create_batch_query, variables=create_batch_variables)
print(create_batch_res)

{'data': {'createBatch': {'_id': 'e1937bd6-f659-4275-a5b7-9ac387f4fde8', 'files': [{'fileName': '/media/vmshare/PDXNet_participant.tsv', 'signedURL': 'https://crdc-hub-dev2-submission.s3.amazonaws.com/6681e23e-c091-40b0-9dfe-b1e415d97cd7/f41aea9c-bb76-4b48-8b53-27028317b434/metadata/1724706210804//media/vmshare/PDXNet_participant.tsv?AWSAccessKeyId=ASIA3MJN7XTZ2OHVAC3I&Content-Disposition=attachment%3B%20filename%3D%22%2Fmedia%2Fvmshare%2FPDXNet_participant.tsv%22&Content-Type=text%2Ftab-separated-values&Expires=1724709810&Signature=w0qzTWadklZ0b8KskT9zL8vco5Y%3D&x-amz-acl=private&x-amz-security-token=IQoJb3JpZ2luX2VjEOz%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLWVhc3QtMSJHMEUCID%2BLr2xwJd6FClNCQ9gKQBhZKzqZv3LOKkPKy4Bfk0QGAiEA6b9kVS%2FVITT7%2BvCA61z59FoE1iHD6BvRXgH9jnfEcaYqggQI9f%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FARAAGgw3ODIzMTczNzA2MTEiDKclq6wsAq7NfvD13irWA9X34dp91kIfmnI9Wg4nKimaaio4z3sQfcFGVnTNU9FUj5qgYPbzzbbjUxKP1k6yBpmixfKQgZTvlTtRDiMCEgZ9J89fQEd6vu9lyi%2FMGYosV832Q1b%2B9bejtC1Vwff4VUunO7HJj

Now upload the files with the signed URL

In [84]:
# Keep the ID of the batch that createBatch returned.
batchid = create_batch_res["data"]["createBatch"]["_id"]


In [85]:
def fileUpload(file, signedurl, timeout=60):
    """POST a local file to ``signedurl`` as a multipart form upload.

    Args:
        file: Path of the file to upload; it is sent under the form field
            name ``"file"``.
        signedurl: URL to POST the file to.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body for a 200 response (or the raw response bytes
        when that body is not JSON), the raw response bytes for any other
        status code (the status is also printed), or an
        ``"HTTP Error: ..."`` string when the request itself fails.
    """
    try:
        # The context manager closes the handle even if the request raises.
        with open(file, 'rb') as handle:
            res = requests.post(signedurl, files={"file": handle}, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # Connection failures and timeouts are RequestException subclasses,
        # not HTTPError, so catch the base class.
        return f"HTTP Error: {e}"
    if res.status_code == 200:
        try:
            return res.json()
        except ValueError:
            # A successful response is not guaranteed to carry a JSON body.
            return res.content
    print(f"Error: {res.status_code}")
    return res.content


In [90]:
def awsFileUpload(file, signedurl, timeout=60):
    """PUT the contents of a local file to a pre-signed URL.

    Args:
        file: Path of the file to upload.
        signedurl: Pre-signed URL to PUT the file contents to.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body for a 200 response (or the raw response bytes
        when that body is empty or not JSON), the raw response bytes for any
        other status code (the status is also printed), or an
        ``"HTTP error: ..."`` string when the request itself fails.
    """
    #https://docs.aws.amazon.com/AmazonS3/latest/userguide/example_s3_Scenario_PresignedUrl_section.html
    # Read as bytes: text mode decodes the file and translates newlines, so
    # the uploaded payload could differ from the bytes on disk.
    with open(file, 'rb') as f:
        payload = f.read()
    try:
        res = requests.put(signedurl, data=payload, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # Connection failures and timeouts are RequestException subclasses,
        # not HTTPError, so catch the base class.
        return f"HTTP error: {e}"
    if res.status_code == 200:
        try:
            return res.json()
        except ValueError:
            # A successful PUT may come back with an empty (non-JSON) body.
            return res.content
    print(f"Error: {res.status_code}")
    return res.content

In [92]:
# For every local metadata file, find the batch entries with the same
# fileName and PUT the file to each such entry's signed URL.
for entry in metadatafiles:
    for metadatafile in create_batch_res['data']['createBatch']['files']:
        if entry['fileName'] != metadatafile['fileName']:
            continue
        print(entry['fileName'])
        print(metadatafile['fileName'])
        signed_url = metadatafile['signedURL']
        print(signed_url)
        metares = awsFileUpload(metadatafile['fileName'], signed_url)
        print(metares)

/media/vmshare/PDXNet_participant.tsv
/media/vmshare/PDXNet_participant.tsv
https://crdc-hub-dev2-submission.s3.amazonaws.com/6681e23e-c091-40b0-9dfe-b1e415d97cd7/f41aea9c-bb76-4b48-8b53-27028317b434/metadata/1724706210804//media/vmshare/PDXNet_participant.tsv?AWSAccessKeyId=ASIA3MJN7XTZ2OHVAC3I&Content-Disposition=attachment%3B%20filename%3D%22%2Fmedia%2Fvmshare%2FPDXNet_participant.tsv%22&Content-Type=text%2Ftab-separated-values&Expires=1724709810&Signature=w0qzTWadklZ0b8KskT9zL8vco5Y%3D&x-amz-acl=private&x-amz-security-token=IQoJb3JpZ2luX2VjEOz%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLWVhc3QtMSJHMEUCID%2BLr2xwJd6FClNCQ9gKQBhZKzqZv3LOKkPKy4Bfk0QGAiEA6b9kVS%2FVITT7%2BvCA61z59FoE1iHD6BvRXgH9jnfEcaYqggQI9f%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FARAAGgw3ODIzMTczNzA2MTEiDKclq6wsAq7NfvD13irWA9X34dp91kIfmnI9Wg4nKimaaio4z3sQfcFGVnTNU9FUj5qgYPbzzbbjUxKP1k6yBpmixfKQgZTvlTtRDiMCEgZ9J89fQEd6vu9lyi%2FMGYosV832Q1b%2B9bejtC1Vwff4VUunO7HJjZWKbwM7%2FSrGzGBxy41VqXeUop8pW%2BGaokVq%2FmCLqNgbte9bb8KmBFGf8aTcCljybUS0bR

After files have been uploaded, the next step is to update the batch by calling the updateBatch mutation