In [15]:
import requests
import os
import json
import pandas as pd

The libraries below are for display purposes only

In [16]:
from IPython.display import display, Markdown
import pprint

In [17]:
def apiQuery(tier, query, variables):
    """POST a GraphQL query to the API endpoint of the requested tier.

    Parameters
    ----------
    tier : str
        'prod' or 'stage'. Selects the endpoint URL and the environment
        variable holding the bearer token (PRODAPI or STAGEAPI).
    query : str
        The GraphQL query document.
    variables : dict or None
        Query variables. When None, only the query is sent.

    Returns
    -------
    dict
        The decoded JSON body when the server answers with status 200.
    bytes
        The raw response body for any other status code (the code is printed).
    str
        A message when `tier` is invalid or the request itself fails.

    Raises
    ------
    KeyError
        If the token environment variable for the tier is not set.
    """
    if tier == 'prod':
        url = 'https://hub.datacommons.cancer.gov/api/graphql'
        token = os.environ['PRODAPI']
    elif tier == 'stage':
        #Note that use of Stage is for example purposes only, actual submissions should use the production URL.  If you wish to run tests on Stage, please contact the helpdesk.
        url = 'https://hub-stage.datacommons.cancer.gov/api/graphql'
        token = os.environ['STAGEAPI']
    else:
        return('Please provide either "stage" or "prod" as tier values')
    headers = {"Authorization": f"Bearer {token}"}
    # Only include "variables" in the payload when the caller supplied them.
    payload = {"query": query}
    if variables is not None:
        payload["variables"] = variables
    try:
        result = requests.post(url = url, headers = headers, json = payload)
    except requests.exceptions.RequestException as e:
        # requests.post never raises HTTPError on its own (raise_for_status is
        # not called), so catch the base class to also cover connection
        # failures and timeouts.
        return(f"HTTP Error: {e}")
    if result.status_code == 200:
        return result.json()
    print(f"Error: {result.status_code}")
    return result.content

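Queries needed to get the data to compare: `error_query` (`retrieveReleasedDataByID`) for a single node, and `submission_nodes_query` (`getSubmissionNodes`) for the list of nodes in a submission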

In [18]:
# GraphQL document for the retrieveReleasedDataByID operation.
# Variables: submissionID, nodeType and nodeID (all required strings).
# The `props` field of each returned entry is parsed as JSON by diffDataFrame,
# which receives this string as its `query` argument in the processing loop.
error_query = """
query retrieveReleasedDataByID(
    $submissionID: String!,
    $nodeType: String!
    $nodeID: String!
){
retrieveReleasedDataByID(
    submissionID: $submissionID,
    nodeType: $nodeType
    nodeID: $nodeID
){
    submissionID
    status
    dataCommons
    dataCommonsDisplayName
    studyID
    nodeType
    nodeID
    props
}
}
"""

In [19]:
# GraphQL document for the getSubmissionNodes operation.
# Variables: _id (mapped to submissionID) and nodeType are required; status,
# first, offset, orderBy and sortDirection are optional.
# The processing loop reads nodeType and nodeID from each item under `nodes`.
submission_nodes_query = """
query getSubmissionNodes(
    $_id: String!,
    $nodeType: String!, 
    $status: String,
    $first: Int, 
    $offset:Int, 
    $orderBy: String, 
    $sortDirection:String
) {
getSubmissionNodes(
    submissionID: $_id
    nodeType: $nodeType
    status: $status
    first: $first
    offset: $offset
    orderBy: $orderBy
    sortDirection: $sortDirection
) {
    total
    IDPropName
    properties
    nodes {
        nodeID
        nodeType
        status
        props
    }
    }
}
"""

In [20]:
def diffDataFrame(subid, nodetype, nodeID, tier, query):
    """Return the rows that differ between the first two returned versions of a node.

    Runs `query` through apiQuery with the variables submissionID, nodeType
    and nodeID. Each entry under data.retrieveReleasedDataByID has its JSON
    `props` string parsed into a one-row DataFrame indexed by the entry's
    submissionID. The frames of the first two distinct submissionIDs are then
    stacked and every row that has an exact duplicate is dropped.

    Parameters
    ----------
    subid : str
        Submission ID sent as the `submissionID` variable.
    nodetype : str
        Node type sent as the `nodeType` variable.
    nodeID : str
        Node ID sent as the `nodeID` variable.
    tier : str
        Tier forwarded to apiQuery ('prod' or 'stage').
    query : str
        GraphQL document to execute.

    Returns
    -------
    pandas.DataFrame or None
        The non-duplicated rows of the first two entries (empty when both
        rows are identical). None when the response is not a dict, contains
        'errors', or holds fewer than two distinct submissionIDs.
    """
    variables = {'submissionID': subid , 'nodeType': nodetype, 'nodeID': nodeID}
    diffres = apiQuery(tier, query, variables)
    # apiQuery hands back str/bytes instead of a dict when the request failed.
    if not isinstance(diffres, dict) or 'errors' in diffres:
        return None
    entries = (diffres.get('data') or {}).get('retrieveReleasedDataByID') or []
    dfcollection = {}
    for entry in entries:
        propstuff = json.loads(entry['props'])
        dfcollection[entry['submissionID']] = pd.DataFrame(propstuff, index=[entry['submissionID']])
    # Nothing to compare against: avoid pd.concat([]) raising ValueError.
    if len(dfcollection) < 2:
        return None
    # Compare once, after all entries are collected, so extra entries do not
    # append the same diff repeatedly.
    df1, df2 = list(dfcollection.values())[:2]
    return pd.concat([df1, df2]).drop_duplicates(keep=False)

The cell below contains the variables that need to be set by the user:
- subid: The submission ID to check (can be obtained in the graphical interface)
- severity: Should be set to 'All'
- nodelist: The node names containing the warnings that are to be checked
- outputdirectory: Where the summary spreadsheets should be saved. Must have '/' at the end
- tier: Must be either 'prod' or 'stage'

In [21]:
# Submission ID; sent as `_id` to getSubmissionNodes and as `submissionID` to retrieveReleasedDataByID.
subid = 'c0ceb350-9318-4a48-875a-dd956a73e211'
# Sent as the `status` variable of getSubmissionNodes.
severity = 'All'
# Node types to process; one output file is written per entry.
nodelist = ['genomic_info', 'image', 'proteomic']
# Prefix joined directly to the output file name, hence the required trailing '/'.
outputdirectory = '/media/sf_VMShare/WarningSummary/'
# Passed to apiQuery to pick the endpoint and token: 'prod' or 'stage'.
tier = 'stage'

In [22]:
# For every node type in nodelist: list the submission's nodes, collect the
# per-node diff frames from diffDataFrame, write them to one file per node
# type and show the combined table.
for node in nodelist:
    nodedflist = []
    node_vars = {'_id':subid, 'nodeType':node, 'status':severity, 'first':-1, 'offset':0, 'orderBy':'studyID', 'sortDirection':'desc'}
    nodedata_res = apiQuery(tier, submission_nodes_query, node_vars)
    # apiQuery returns str/bytes instead of a dict when the request failed.
    if not isinstance(nodedata_res, dict) or 'errors' in nodedata_res:
        print(f"Node: {node}\nUnable to retrieve submission nodes: {nodedata_res}\n")
        continue
    #Collect one diff dataframe per node returned for this node type
    for result in nodedata_res['data']['getSubmissionNodes']['nodes']:
        nodetype = result['nodeType']
        nodeid = result['nodeID']
        # Use the user-selected tier (previously hard-coded to 'stage').
        node_diff_df = diffDataFrame(subid, nodetype, nodeid, tier, error_query)
        if node_diff_df is not None:
            nodedflist.append(node_diff_df)
    # pd.concat raises ValueError on an empty list, so skip nodes with nothing to report.
    if not nodedflist:
        print(f"Node: {node}\nDiff report: no diff data returned\n")
        continue
    report_df = pd.concat(nodedflist)
    report_df.index.name = 'submission_id'
    # Note: the file is tab-separated even though it carries a .csv extension.
    report_df.to_csv(f"{outputdirectory}{node}_warning_diffs.csv", sep="\t")
    #Comment out the two lines below if you do not want to display the output tables in this notebook.
    print(f"Node: {node}\nDiff report:")
    # display() returns None, so it must not be embedded in the f-string above.
    display(Markdown(report_df.to_markdown()))
        
    

| submission_id                        | genomic_info_id   | library_id                             |   bases |   number_of_reads |   avg_read_length |   coverage | reference_genome_assembly   | custom_assembly_fasta_file_for_alignment    | design_description                     | library_strategy   | library_layout   | library_selection   | platform   | instrument_model         | sequence_alignment_software                  | reporter_label        | methylation_platform                        | library_source_material   | library_source_molecule   | crdc_id                                      |
|:-------------------------------------|:------------------|:---------------------------------------|--------:|------------------:|------------------:|-----------:|:----------------------------|:--------------------------------------------|:---------------------------------------|:-------------------|:-----------------|:--------------------|:-----------|:-------------------------|:---------------------------------------------|:----------------------|:--------------------------------------------|:--------------------------|:--------------------------|:---------------------------------------------|
| c0ceb350-9318-4a48-875a-dd956a73e211 | GC-GENO-115990    | Gallia_est_omnis_divisa_in_partes_tres |     720 |               863 |            846.52 |          8 | Not specified in data       | scientia_et_labor                           | damnant_quod_non_intellegunt           | Spatial-tx         | Paired-End       | Unspecified         | BGISEQ     | 454 GS FLX+              | e_causa_ignota                               | e_causa_ignota        | scientiae_et_patriae                        | Bulk Cells                | Genomic DNA               | nascentes_morimur_finisque_ab_origine_pendet |
| 807cdd3c-1066-49ad-ae58-95b6a7987744 | GC-GENO-115990    | Gallia_est_omnis_divisa_in_partes_tres |     720 |               863 |            846.52 |          8 | Not specified in data       | scientia_et_labor                           | Illumina NextSeq 2500                  | Spatial-tx         | Paired-End       | Unspecified         | BGISEQ     | 454 GS FLX+              | e_causa_ignota                               | e_causa_ignota        | scientiae_et_patriae                        | Bulk Cells                | Genomic DNA               | nascentes_morimur_finisque_ab_origine_pendet |
| c0ceb350-9318-4a48-875a-dd956a73e211 | GC-GENO-185213    | scientia_et_labor                      |     552 |               620 |            808.98 |          0 | GRCh37                      | Macte_animo_Generose_puer_sic_itur_ad_astra | e_causa_ignota                         | CLONE              | Single-indexed   | PCR                 | Ultima     | Illumina NextSeq 500     | scientia_imperii_decus_et_tutamen            | a_bene_placito        | a_bene_placito                              | Single-nuclei             | Not Reported              | barba_crescit_caput_nescit                   |
| 807cdd3c-1066-49ad-ae58-95b6a7987744 | GC-GENO-185213    | scientia_et_labor                      |     552 |               620 |            808.98 |          0 | GRCh37                      | Macte_animo_Generose_puer_sic_itur_ad_astra | Illumina NextSeq 2500                  | CLONE              | Single-indexed   | PCR                 | Ultima     | Illumina NextSeq 500     | scientia_imperii_decus_et_tutamen            | a_bene_placito        | a_bene_placito                              | Single-nuclei             | Not Reported              | barba_crescit_caput_nescit                   |
| c0ceb350-9318-4a48-875a-dd956a73e211 | GC-GENO-645746    | scientia_imperii_decus_et_tutamen      |     525 |               253 |            469.13 |          3 | GRCh38                      | saltus_in_demonstrando                      | cacatum_non_est_pictum                 | scRNA-Seq          | Single-indexed   | ChIP                | Helicos    | PacBio Sequel II         | nascentes_morimur_finisque_ab_origine_pendet | sapiens_qui_prospicit | Macte_animo_Generose_puer_sic_itur_ad_astra | Not Reported              | Transcriptome             | cacatum_non_est_pictum                       |
| 807cdd3c-1066-49ad-ae58-95b6a7987744 | GC-GENO-645746    | scientia_imperii_decus_et_tutamen      |     525 |               253 |            469.13 |          3 | GRCh38                      | saltus_in_demonstrando                      | Illumina NextSeq 2500                  | scRNA-Seq          | Single-indexed   | ChIP                | Helicos    | PacBio Sequel II         | nascentes_morimur_finisque_ab_origine_pendet | sapiens_qui_prospicit | Macte_animo_Generose_puer_sic_itur_ad_astra | Not Reported              | Transcriptome             | cacatum_non_est_pictum                       |
| c0ceb350-9318-4a48-875a-dd956a73e211 | GC-GENO-808685    | cacatum_non_est_pictum                 |     961 |               492 |            948.83 |          1 | GRCh37-lite                 | labor_ipse_voluptas                         | Gallia_est_omnis_divisa_in_partes_tres | FL-cDNA            | Single-indexed   | Unspecified         | Helicos    | AB 3130 Genetic Analyzer | id_quod_plerumque_accidit                    | scientia_et_labor     | scientia,_aere_perennius                    | Single-nuclei             | Not Reported              | a_bene_placito                               |
| 807cdd3c-1066-49ad-ae58-95b6a7987744 | GC-GENO-808685    | cacatum_non_est_pictum                 |     961 |               492 |            948.83 |          1 | GRCh37-lite                 | labor_ipse_voluptas                         | Illumina NextSeq 2500                  | FL-cDNA            | Single-indexed   | Unspecified         | Helicos    | AB 3130 Genetic Analyzer | id_quod_plerumque_accidit                    | scientia_et_labor     | scientia,_aere_perennius                    | Single-nuclei             | Not Reported              | a_bene_placito                               |

Node: genomic_info
Diff report: None



| submission_id                        | study_link_id   | de_identification_method_type   | de_identification_method_description         | de_identification_software   | license   | citation_or_DOI                   | species        | image_modality   | imaging_equipment_manufacturer   | imaging_equipment_model   | imaging_software                       | imaging_protocol         | organ_or_tissue                         | performed_imaging_study_typeCode       | longitudinal_temporal_event_type   |   longitudinal_temporal_event_offset | crdc_id              |
|:-------------------------------------|:----------------|:--------------------------------|:---------------------------------------------|:-----------------------------|:----------|:----------------------------------|:---------------|:-----------------|:---------------------------------|:--------------------------|:---------------------------------------|:-------------------------|:----------------------------------------|:---------------------------------------|:-----------------------------------|-------------------------------------:|:---------------------|
| c0ceb350-9318-4a48-875a-dd956a73e211 | GC-IMAGE-666518 | Automatic                       | pace_tua                                     | imperium_in_imperio          | CC BY 3.0 | nanos_gigantum_humeris_insidentes | a_bene_placito | GM               | a_bene_placito                   | scientiae_cedit_mare      | Gallia_est_omnis_divisa_in_partes_tres | scientia,_aere_perennius | Sigmoid colon                           | Gallia_est_omnis_divisa_in_partes_tres | Baseline                           |                               904.75 | labor_ipse_voluptas  |
| 807cdd3c-1066-49ad-ae58-95b6a7987744 | GC-IMAGE-666518 | Automatic                       | pace_tua                                     | imperium_in_imperio          | CC BY 3.0 | nanos_gigantum_humeris_insidentes | a_bene_placito | GM               | Orbitrap                         | Orbitrap                  | Gallia_est_omnis_divisa_in_partes_tres | scientia,_aere_perennius | Sigmoid colon                           | Gallia_est_omnis_divisa_in_partes_tres | Baseline                           |                               904.75 | labor_ipse_voluptas  |
| c0ceb350-9318-4a48-875a-dd956a73e211 | GC-IMAGE-715541 | Semiautomatic                   | nascentes_morimur_finisque_ab_origine_pendet | id_quod_plerumque_accidit    | CC BY 3.0 | a_bene_placito                    | a_bene_placito | DM               | barba_crescit_caput_nescit       | scientia_et_sapientia     | saltus_in_demonstrando                 | sapiens_qui_prospicit    | Overlapping lesion of ill-defined sites | scientia,_aere_perennius               | Enrollment                         |                               778.1  | scientiae_cedit_mare |
| 807cdd3c-1066-49ad-ae58-95b6a7987744 | GC-IMAGE-715541 | Semiautomatic                   | nascentes_morimur_finisque_ab_origine_pendet | id_quod_plerumque_accidit    | CC BY 3.0 | a_bene_placito                    | a_bene_placito | DM               | Orbitrap                         | Orbitrap                  | saltus_in_demonstrando                 | sapiens_qui_prospicit    | Overlapping lesion of ill-defined sites | scientia,_aere_perennius               | Enrollment                         |                               778.1  | scientiae_cedit_mare |

Node: image
Diff report: None



| submission_id                        | proteomic_info_id   | aliquot_id          | analytical_fractions   | instrument_make                 | proteomic_instrument_model   | proteomic_design_description           | manufacturer_model_name   | crdc_id                      |
|:-------------------------------------|:--------------------|:--------------------|:-----------------------|:--------------------------------|:-----------------------------|:---------------------------------------|:--------------------------|:-----------------------------|
| c0ceb350-9318-4a48-875a-dd956a73e211 | GC-PROTEOMIC-270724 | imperium_in_imperio | Proteome               | faber_est_suae_quisque_fortunae | labor_ipse_voluptas          | haec_olim_meminisse_iuvabit            | Orbitrap                  | saltus_in_demonstrando       |
| 807cdd3c-1066-49ad-ae58-95b6a7987744 | GC-PROTEOMIC-270724 | imperium_in_imperio | Proteome               | faber_est_suae_quisque_fortunae | Orbitrap                     | haec_olim_meminisse_iuvabit            | Orbitrap                  | nan                          |
| c0ceb350-9318-4a48-875a-dd956a73e211 | GC-PROTEOMIC-393569 | scientia_et_labor   | Phosphoproteome        | damnant_quod_non_intellegunt    | scientia_et_labor            | Gallia_est_omnis_divisa_in_partes_tres | Orbitrap                  | damnant_quod_non_intellegunt |
| 807cdd3c-1066-49ad-ae58-95b6a7987744 | GC-PROTEOMIC-393569 | scientia_et_labor   | Phosphoproteome        | damnant_quod_non_intellegunt    | Orbitrap                     | Gallia_est_omnis_divisa_in_partes_tres | Orbitrap                  | nan                          |

Node: proteomic
Diff report: None

