In [127]:
import requests
import os
import json
import ast
import pandas as pd

The libraries below are for display purposes only

In [128]:
from IPython.display import display, Markdown
import pprint

In [129]:
def apiQuery(tier, query, variables, timeout=120):
    """Send a GraphQL request to the Data Hub API and return the response.

    Parameters
    ----------
    tier : str
        'prod' or 'stage'. Selects the endpoint URL and the environment
        variable that holds the bearer token (PRODAPI or STAGEAPI).
    query : str
        GraphQL query text.
    variables : dict or None
        Query variables. When None, the "variables" key is left out of the
        request body.
    timeout : float or None, optional
        Seconds to wait for the server before giving up (default 120).
        Pass None to wait indefinitely.

    Returns
    -------
    dict
        Decoded JSON body when the server answers with HTTP 200.
    bytes
        Raw response body for any other status code (the code is printed).
    str
        A message when `tier` is not recognised or when the request could
        not be completed (connection failure, timeout, ...).

    Raises
    ------
    KeyError
        If the token environment variable for the chosen tier is not set.
    """
    if tier == 'prod':
        url = 'https://hub.datacommons.cancer.gov/api/graphql'
        token = os.environ['PRODAPI']
    elif tier == 'stage':
        #Note that use of Stage is for example purposes only, actual submissions should use the production URL.  If you wish to run tests on Stage, please contact the helpdesk.
        url = 'https://hub-stage.datacommons.cancer.gov/api/graphql'
        token = os.environ['STAGEAPI']
    else:
        return('Please provide either "stage" or "prod" as tier values')
    headers = {"Authorization": f"Bearer {token}"}
    # Build the body once; "variables" is only sent when the caller supplied some.
    payload = {"query": query}
    if variables is not None:
        payload["variables"] = variables
    try:
        # Without a timeout a stalled server would block the notebook forever.
        result = requests.post(url=url, headers=headers, json=payload, timeout=timeout)
        if result.status_code == 200:
            return result.json()
        else:
            print(f"Error: {result.status_code}")
            return result.content
    except requests.exceptions.HTTPError as e:
        return(f"HTTP Error: {e}")
    except requests.exceptions.RequestException as e:
        # Connection failures and timeouts are not HTTPError; report them in the
        # same "return a message" style instead of letting them escape.
        print(f"Request Error: {e}")
        return(f"Request Error: {e}")

Queries needed to get the list of errors

In [130]:
# GraphQL document for the retrieveReleasedDataByID query.
# Variables: submissionID, nodeType and nodeID (all required strings).
# Requested fields per record: submissionID, status, dataCommons,
# dataCommonsDisplayName, studyID, nodeType, nodeID and props.
error_query = """
query retrieveReleasedDataByID(
    $submissionID: String!,
    $nodeType: String!
    $nodeID: String!
){
retrieveReleasedDataByID(
    submissionID: $submissionID,
    nodeType: $nodeType
    nodeID: $nodeID
){
    submissionID
    status
    dataCommons
    dataCommonsDisplayName
    studyID
    nodeType
    nodeID
    props
}
}
"""

In [131]:
# GraphQL document for the getSubmissionNodes query.
# Variables: _id (sent as submissionID) and nodeType are required; status,
# first, offset, orderBy and sortDirection are optional.
# The response carries total, IDPropName, properties and a `nodes` list whose
# items expose nodeID, nodeType, status and props.
submission_nodes_query = """
query getSubmissionNodes(
    $_id: String!,
    $nodeType: String!, 
    $status: String,
    $first: Int, 
    $offset:Int, 
    $orderBy: String, 
    $sortDirection:String
) {
getSubmissionNodes(
    submissionID: $_id
    nodeType: $nodeType
    status: $status
    first: $first
    offset: $offset
    orderBy: $orderBy
    sortDirection: $sortDirection
) {
    total
    IDPropName
    properties
    nodes {
        nodeID
        nodeType
        status
        props
    }
    }
}
"""

In [132]:
def _parseProps(raw):
    """Decode the `props` string of an API record into a Python object."""
    try:
        # JSON is tried first because literal_eval rejects null/true/false.
        return json.loads(raw)
    except (TypeError, ValueError):
        # Fall back to the original parser for Python-literal style strings.
        return ast.literal_eval(raw)


def diffDataFrame(nodetype, nodeID, tier, query):
    """Return the property rows that differ between two submissions of a node.

    Runs `query` (expected to be the retrieveReleasedDataByID query) for the
    given node and builds a one-row DataFrame per returned submission, indexed
    by its submissionID. The first two submissions are concatenated and every
    row that appears in both is dropped, leaving only the rows that differ.

    The submission ID sent to the API is read from the notebook-level
    variable `subid`, which must be set before this function is called.

    Parameters
    ----------
    nodetype : str
        Node type sent as the `nodeType` query variable.
    nodeID : str
        Node ID sent as the `nodeID` query variable.
    tier : str
        API tier forwarded to apiQuery ('stage' or 'prod').
    query : str
        GraphQL query text.

    Returns
    -------
    pandas.DataFrame or None
        The differing rows (possibly empty when the two submissions are
        identical), or None when the API call failed, reported errors, or
        returned fewer than two submissions to compare.
    """
    variables = {'submissionID': subid , 'nodeType': nodetype, 'nodeID': nodeID}
    diffres = apiQuery(tier, query, variables)
    # apiQuery hands back str/bytes rather than a dict when the request fails.
    if not isinstance(diffres, dict) or 'errors' in diffres:
        return None
    entries = (diffres.get('data') or {}).get('retrieveReleasedDataByID') or []
    dfcollection = {}
    for entry in entries:
        propstuff = _parseProps(entry['props'])
        dfcollection[entry['submissionID']] = pd.DataFrame(propstuff, index=[entry['submissionID']])
    # A diff needs two submissions; pd.concat would raise on an empty list.
    if len(dfcollection) < 2:
        return None
    # Compare once, after collection, so extra entries do not duplicate the diff.
    df1, df2 = list(dfcollection.values())[:2]
    report_df = pd.concat([df1, df2]).drop_duplicates(keep=False)
    return report_df

The cell below contains the variables that need to be set by the user:
- subid: The submission ID to check (can be obtained from the graphical interface).
- severity: Should be set to 'All'.
- nodelist: The names of the nodes containing the warnings to be checked.
- outputdirectory: The directory where the summary spreadsheets should be saved. Must end with '/'.

In [133]:
# User settings read by the cells below.
# subid: submission ID sent to the API (as submissionID / _id).
subid = 'c0ceb350-9318-4a48-875a-dd956a73e211'
# severity: sent as the `status` argument of the getSubmissionNodes query.
severity = 'All'
# nodelist: node types that the report loop processes one at a time.
nodelist = ['genomic_info', 'image', 'proteomic']
# outputdirectory: prefix prepended to each "<node>_warning_diffs.csv" file name.
outputdirectory = '/media/sf_VMShare/WarningSummary/'

In [134]:
# Variables for a single getSubmissionNodes request.
# `nodetype` is only assigned inside the report loop further down, so using it
# here raised NameError on a fresh kernel (Restart & Run All). The first entry
# of nodelist is used as the node type instead.
submission_nodes_vars = {'_id':subid, 'nodeType':nodelist[0], 'status':severity, 'first':-1, 'offset':0, 'orderBy':'studyID', 'sortDirection':'desc'}

In [135]:
# Run the getSubmissionNodes query against the Stage tier with the variables
# prepared in submission_nodes_vars and keep the raw response.
subnodes_res = apiQuery(
    tier='stage',
    query=submission_nodes_query,
    variables=submission_nodes_vars,
)

In [136]:
# For every node type in nodelist: list the submitted nodes, diff each one
# against its other submission via diffDataFrame, and write the combined diff
# to "<outputdirectory>/<node>_warning_diffs.csv".
# NOTE: the file is written tab-separated (sep="\t") despite the .csv extension.
if outputdirectory:
    # Make sure the target directory exists before the first write.
    os.makedirs(outputdirectory, exist_ok=True)
for node in nodelist:
    node_vars = {'_id':subid, 'nodeType':node, 'status':severity, 'first':-1, 'offset':0, 'orderBy':'studyID', 'sortDirection':'desc'}
    nodedata_res = apiQuery('stage', submission_nodes_query, node_vars)
    # apiQuery returns str/bytes instead of a dict when the request fails.
    if not isinstance(nodedata_res, dict):
        print(f"Node: {node} - could not retrieve submission nodes, skipping")
        continue
    if 'errors' in nodedata_res:
        print(f"Node: {node} - API reported errors: {nodedata_res['errors']}")
    submitted_nodes = ((nodedata_res.get('data') or {}).get('getSubmissionNodes') or {}).get('nodes') or []
    #Set up the dataframe needed to query for errors
    nodedflist = []
    for result in submitted_nodes:
        nodetype = result['nodeType']
        nodeid = result['nodeID']
        record_df = diffDataFrame(nodetype, nodeid, 'stage', error_query)
        if record_df is not None:
            nodedflist.append(record_df)
    # pd.concat raises ValueError on an empty list; skip nodes without diffs
    # so the remaining node types are still processed.
    if not nodedflist:
        print(f"Node: {node} - no differences found, no file written")
        continue
    report_df = pd.concat(nodedflist)
    report_df.index.name = 'submission_id'
    # os.path.join works whether or not outputdirectory ends with '/'.
    report_df.to_csv(os.path.join(outputdirectory, f"{node}_warning_diffs.csv"), sep="\t")
    #Uncomment the lines below if you want to display the output tables in this notebook.
    #print(f"Node: {node}")
    #display(Markdown(report_df.to_markdown()))
        
    