In [146]:
import requests
import os
import json
import ast

The libraries below are used to tabulate, compare, and display the results.

In [147]:
import pandas as pd
from IPython.display import display, Markdown
import pprint

In [148]:
def apiQuery(tier, query, variables, timeout=60):
    """Send a GraphQL query to the Data Hub API and return the response.

    Parameters
    ----------
    tier : str
        Either 'prod' or 'stage'. Selects the endpoint URL and the
        environment variable ('PRODAPI' or 'STAGEAPI') holding the API token.
    query : str
        The GraphQL query text.
    variables : dict or None
        Variables for the query. When None, no "variables" key is sent.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 60).
        Without a timeout a stalled connection would block indefinitely.

    Returns
    -------
    dict
        The decoded JSON body when the server answers with status 200.
    bytes
        The raw response body for any other status code (the status code
        is also printed).
    str
        A message when `tier` is invalid, or "HTTP Error: ..." when the
        request itself fails (connection error, timeout, ...).

    Raises
    ------
    KeyError
        If the token environment variable for the chosen tier is not set.
    """
    if tier == 'prod':
        url = 'https://hub.datacommons.cancer.gov/api/graphql'
        token = os.environ['PRODAPI']
    elif tier == 'stage':
        #Note that use of Stage is for example purposes only, actual submissions should use the production URL.  If you wish to run tests on Stage, please contact the helpdesk.
        url = 'https://hub-stage.datacommons.cancer.gov/api/graphql'
        token = os.environ['STAGEAPI']
    else:
        return('Please provide either "stage" or "prod" as tier values')
    headers = {"Authorization": f"Bearer {token}"}
    # Build the payload once; "variables" is only included when supplied.
    payload = {"query": query}
    if variables is not None:
        payload["variables"] = variables
    try:
        result = requests.post(url=url, headers=headers, json=payload, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # RequestException is the base class of HTTPError, ConnectionError,
        # Timeout, etc. requests.post() does not raise HTTPError on its own,
        # so catching only HTTPError let every real transport failure escape.
        return(f"HTTP Error: {e}")
    if result.status_code == 200:
        return result.json()
    print(f"Error: {result.status_code}")
    return result.content

In [149]:
# ID of the submission under review; sent as the submission ID variable in
# both GraphQL queries used in this notebook.
subid = '9ba3152c-4194-4a30-ba9d-10bbba735302'

In [170]:
# GraphQL query text for retrieveReleasedDataByID. It declares three required
# string variables (submissionID, nodeType, nodeID) and, for each returned
# entry, requests the submission/study identifiers, status, data commons
# names, the node type/ID and the node's `props` (parsed later from a string).
neweq = """
query retrieveReleasedDataByID(
    $submissionID: String!,
    $nodeType: String!
    $nodeID: String!
){
retrieveReleasedDataByID(
    submissionID: $submissionID,
    nodeType: $nodeType
    nodeID: $nodeID
){
    submissionID
    status
    dataCommons
    dataCommonsDisplayName
    studyID
    nodeType
    nodeID
    props
}
}
"""

In [171]:
# GraphQL query text for getSubmissionNodes. The `$_id` variable is passed to
# the API as `submissionID`; `$_id` and `$nodeType` are required, the
# remaining variables (status filter, paging, ordering) are optional.
# The selection returns the total count, the ID property name, the property
# list, and for every node its ID, type, status and `props`.
submission_nodes_query = """
query getSubmissionNodes(
    $_id: String!,
    $nodeType: String!, 
    $status: String,
    $first: Int, 
    $offset:Int, 
    $orderBy: String, 
    $sortDirection:String
) {
getSubmissionNodes(
    submissionID: $_id
    nodeType: $nodeType
    status: $status
    first: $first
    offset: $offset
    orderBy: $orderBy
    sortDirection: $sortDirection
) {
    total
    IDPropName
    properties
    nodes {
        nodeID
        nodeType
        status
        props
    }
    }
}
"""

In [172]:
# Value sent as the `status` variable of the getSubmissionNodes query
# (see submission_nodes_vars).
severity = 'All'

In [173]:
# Node type to request from the submission. Previously `nodetype` was never
# defined before this cell: it was only bound inside the comparison loop
# further down, so a fresh kernel (Restart & Run All) failed here with NameError.
# NOTE(review): 'genomic_info' is inferred from the rendered report output
# (its rows carry genomic_info_id) -- confirm this is the intended node type.
nodetype = 'genomic_info'

# Variables for the getSubmissionNodes query.
submission_nodes_vars = {
    '_id': subid,
    'nodeType': nodetype,
    'status': severity,
    'first': -1,
    'offset': 0,
    'orderBy': 'studyID',
    'sortDirection': 'desc',
}

In [174]:
# Fetch the submission's nodes from the Stage tier. apiQuery returns the
# decoded JSON dict on HTTP 200; on failure it returns a str or bytes instead,
# so downstream subscripting of the result assumes the call succeeded.
subnodes_res = apiQuery('stage', submission_nodes_query, submission_nodes_vars)

In [175]:
# Column layout for the table of submission nodes: the two node identifiers
# followed by the node properties to keep.
columns = [
    "nodeID",
    "nodeType",
    "sample_id",
    "sample_type",
    "sample_description",
    "sample_type_category",
    "sample_tumor_status",
    "sample_anatomic_site",
    "sample_age_at_collection",
    "derived_from_specimen",
    "biosample_accession",
    "participant.study_participant_id",
]
# Start from an empty frame that already carries the expected columns.
subnodes_df = pd.DataFrame(columns=columns)

In [176]:
# apiQuery returns a str/bytes instead of a dict when the request fails, and a
# GraphQL error response carries no usable "data"; fail with a clear message
# rather than an opaque TypeError on the subscripts below.
if not isinstance(subnodes_res, dict) or not subnodes_res.get('data'):
    raise RuntimeError(f"getSubmissionNodes query failed: {subnodes_res!r}")

# Collect one record per node (its parsed props plus nodeID/nodeType) and build
# the frame in a single step. Rebuilding from scratch keeps this cell
# idempotent: appending to the existing frame duplicated every row on re-run.
node_records = []
for node in subnodes_res['data']['getSubmissionNodes']['nodes']:
    record = json.loads(node['props'])
    record['nodeID'] = node['nodeID']
    record['nodeType'] = node['nodeType']
    node_records.append(record)

# Restricting to `columns` keeps the layout defined above; props that are not
# listed are dropped and listed columns missing from a node become NaN.
subnodes_df = pd.DataFrame(node_records, columns=columns)

In [177]:
# Optional: uncomment to preview the retrieved nodes as a Markdown table.
#display(Markdown(subnodes_df.to_markdown()))

In [190]:
# For every node in the submission, fetch the released data recorded for the
# same node ID and keep the rows that differ between the first two
# submissions returned.
difflist = []
for nodetype, nodeID in zip(subnodes_df['nodeType'], subnodes_df['nodeID']):
    # NOTE(review): 'first' is not declared by the retrieveReleasedDataByID
    # query; presumably the server ignores it -- confirm before removing.
    variables = {'submissionID': subid , 'nodeType': nodetype, 'nodeID': nodeID, 'first': -1}
    diffres = apiQuery('stage', neweq, variables)

    # apiQuery returns a str/bytes on failure; skip the node instead of
    # crashing the whole run on a subscript error.
    if not isinstance(diffres, dict) or not diffres.get('data'):
        print(f"Skipping {nodetype} {nodeID}: no usable API response")
        continue
    entries = diffres['data'].get('retrieveReleasedDataByID') or []

    # One single-row frame per submission ID, indexed by that submission ID.
    dfcollection = {}
    for entry in entries:
        try:
            # props is parsed as JSON elsewhere in this notebook; JSON literals
            # such as true/false/null are not valid input for ast.literal_eval.
            propstuff = json.loads(entry['props'])
        except json.JSONDecodeError:
            # Fall back to the previous parser for Python-literal style strings.
            propstuff = ast.literal_eval(entry['props'])
        dfcollection[entry['submissionID']] = pd.DataFrame(propstuff, index=[entry['submissionID']])

    # Compare once per node, after all entries are collected. Doing this inside
    # the entry loop appended the same diff again for every entry past the second.
    keylist = list(dfcollection.keys())
    if len(keylist) >= 2:
        df1 = dfcollection[keylist[0]]
        df2 = dfcollection[keylist[1]]
        # keep=False drops both rows when they are identical, leaving only
        # the pairs that actually differ.
        diff_df = pd.concat([df1, df2]).drop_duplicates(keep=False)
        difflist.append(diff_df)

In [191]:
# Stack the per-node diffs into one report. pd.concat raises ValueError on an
# empty list, so fall back to an empty frame when nothing was collected.
if difflist:
    report_df = pd.concat(difflist)
else:
    report_df = pd.DataFrame()

if report_df.empty:
    display(Markdown("No differences to report."))
else:
    display(Markdown(report_df.to_markdown()))

|                                      | reference_genome_assembly   | design_description                                                                                                                                                                                                                                                                                                                                                                          | library_id   | library_strategy   | library_source_material   | library_source_molecule   | library_selection   | platform   | instrument_model      | genomic_info_id                           |   bases |   number_of_reads |   avg_read_length |   coverage | library_layout   | sequence_alignment_software                      |
|:-------------------------------------|:----------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-------------|:-------------------|:--------------------------|:--------------------------|:--------------------|:-----------|:----------------------|:------------------------------------------|--------:|------------------:|------------------:|-----------:|:-----------------|:-------------------------------------------------|
| 9ba3152c-4194-4a30-ba9d-10bbba735302 | GRCh38                      | WXS Hybrid Selection paired-end                                                                                                                                                                                                                                                                                                                                                             | lib1         | ChIP-Seq           | Bulk Tissue               | Viral RNA                 | MSLL                | Illumina   | Illumina NovaSeq 6000 | dg.4DFC/840c7e9c-8900-4d1d-96667ce99c1e_1 |     648 |               116 |             65.22 |          1 | Paired-End       | Nucleotide Sequence Alignment Software Name Text |
| 1d75dfe1-f4e9-4445-9836-e11f55d92761 | GRCh38                      | WXS Hybrid Selection paired-end                                                                                                                                                                                                                                                                                                                                                             | nan          | ChIP-Seq           | Bulk Tissue               | Viral RNA                 | MSLL                | Illumina   | Illumina NovaSeq 6000 | dg.4DFC/840c7e9c-8900-4d1d-96667ce99c1e_1 |     648 |               116 |             65.22 |          1 | Paired-End       | nan                                              |
| 9ba3152c-4194-4a30-ba9d-10bbba735302 | GRCh37                      | Anchored Multiplex PCR (AMP(TM)) chemistry is purpose-built to accurately identify both simple and complex genetic mutations by leveraging the power of bidirectional primers and molecular barcodes, or MBCs, from low nucleic acid input in tissue or blood. AMP chemistry is also flexible, so it can be used for applications in DNA, RNA, and ctDNA sequencing across most tumor type. | lib2         | CTS                | Bulk Tissue               | Viral RNA                 | MSLL                | BGISEQ     | Illumina MiSeq        | dg.4DFC/840c7e9c-8900-4d1d-96667ce99c1e_2 |     879 |               530 |             41.91 |          2 | Paired-End       | test1                                            |
| 1d75dfe1-f4e9-4445-9836-e11f55d92761 | GRCh37                      | Anchored Multiplex PCR (AMP(TM)) chemistry is purpose-built to accurately identify both simple and complex genetic mutations by leveraging the power of bidirectional primers and molecular barcodes, or MBCs, from low nucleic acid input in tissue or blood. AMP chemistry is also flexible, so it can be used for applications in DNA, RNA, and ctDNA sequencing across most tumor type. | nan          | CTS                | Bulk Tissue               | Viral RNA                 | MSLL                | BGISEQ     | Illumina MiSeq        | dg.4DFC/840c7e9c-8900-4d1d-96667ce99c1e_2 |     879 |               530 |             41.91 |          2 | Paired-End       | nan                                              |
| 9ba3152c-4194-4a30-ba9d-10bbba735302 | GRCh38                      | WXS Hybrid Selection paired-end                                                                                                                                                                                                                                                                                                                                                             | lib3         | CLONEEND           | Bulk Tissue               | Transcriptome             | MSLL                | LS 454     | Illumina NovaSeq 6000 | dg.4DFC/840c7e9c-8900-4d1d-96667ce99c1e_3 |     127 |               527 |            151.42 |          3 | Paired-End       | test2                                            |
| 1d75dfe1-f4e9-4445-9836-e11f55d92761 | GRCh38                      | WXS Hybrid Selection paired-end                                                                                                                                                                                                                                                                                                                                                             | nan          | CLONEEND           | Bulk Tissue               | Transcriptome             | MSLL                | LS 454     | Illumina NovaSeq 6000 | dg.4DFC/840c7e9c-8900-4d1d-96667ce99c1e_3 |     127 |               527 |            151.42 |          3 | Paired-End       | nan                                              |
| 9ba3152c-4194-4a30-ba9d-10bbba735302 | GRCh37                      | Anchored Multiplex PCR (AMP(TM)) chemistry is purpose-built to accurately identify both simple and complex genetic mutations by leveraging the power of bidirectional primers and molecular barcodes, or MBCs, from low nucleic acid input in tissue or blood. AMP chemistry is also flexible, so it can be used for applications in DNA, RNA, and ctDNA sequencing across most tumor type. | lib4         | snATAC-Seq         | Bulk Tissue               | Transcriptome             | MSLL                | Ultima     | Illumina MiSeq        | dg.4DFC/840c7e9c-8900-4d1d-96667ce99c1e_4 |     762 |               943 |            148.27 |          4 | Paired-End       | seq3                                             |
| 1d75dfe1-f4e9-4445-9836-e11f55d92761 | GRCh37                      | Anchored Multiplex PCR (AMP(TM)) chemistry is purpose-built to accurately identify both simple and complex genetic mutations by leveraging the power of bidirectional primers and molecular barcodes, or MBCs, from low nucleic acid input in tissue or blood. AMP chemistry is also flexible, so it can be used for applications in DNA, RNA, and ctDNA sequencing across most tumor type. | nan          | snATAC-Seq         | Bulk Tissue               | Transcriptome             | MSLL                | Ultima     | Illumina MiSeq        | dg.4DFC/840c7e9c-8900-4d1d-96667ce99c1e_4 |     762 |               943 |            148.27 |          4 | Paired-End       | nan                                              |
| 9ba3152c-4194-4a30-ba9d-10bbba735302 | GRCh37                      | Anchored Multiplex PCR (AMP(TM)) chemistry is purpose-built to accurately identify both simple and complex genetic mutations by leveraging the power of bidirectional primers and molecular barcodes, or MBCs, from low nucleic acid input in tissue or blood. AMP chemistry is also flexible, so it can be used for applications in DNA, RNA, and ctDNA sequencing across most tumor type. | lib5         | FINISHING          | Bulk Tissue               | Transcriptome             | Other               | Illumina   | Illumina MiSeq        | dg.4DFC/840c7e9c-8900-4d1d-96667ce99c1e_5 |     668 |               843 |            403.05 |          5 | Single-indexed   | seq4                                             |
| 1d75dfe1-f4e9-4445-9836-e11f55d92761 | GRCh37                      | Anchored Multiplex PCR (AMP(TM)) chemistry is purpose-built to accurately identify both simple and complex genetic mutations by leveraging the power of bidirectional primers and molecular barcodes, or MBCs, from low nucleic acid input in tissue or blood. AMP chemistry is also flexible, so it can be used for applications in DNA, RNA, and ctDNA sequencing across most tumor type. | nan          | FINISHING          | Bulk Tissue               | Transcriptome             | Other               | Illumina   | Illumina MiSeq        | dg.4DFC/840c7e9c-8900-4d1d-96667ce99c1e_5 |     668 |               843 |            403.05 |          5 | Single-indexed   | nan                                              |

In [192]:
# Write the report as tab-separated text. The destination defaults to the
# original location but can be overridden with the WARNINGDIFFS_PATH
# environment variable so the notebook runs on other machines.
# Note: the default file name ends in .csv although the separator is a tab.
report_path = os.environ.get('WARNINGDIFFS_PATH', '/home/pihl/temp/warningdiffs.csv')
# Make sure the target directory exists before writing.
os.makedirs(os.path.dirname(report_path) or '.', exist_ok=True)
report_df.to_csv(report_path, sep="\t")