The purpose of this notebook is to try out vmMatch with CDA examples and see where it leads.  The basic idea is to provide a list of terms and see if I can tease out the CDEs they belong to.

In [46]:
import pandas as pd
import requests
import pprint
import json
import pandas as pd

caDSR Swagger interface:  https://cadsrapi.cancer.gov/NCIAPI/1.0/index.html

In [47]:
# URL of the caDSR vmMatch endpoint that the search terms are POSTed to.
vmatchprodurl = (
    "https://cadsrapi.cancer.gov/rad/vmMatch/v1/vmMatch"
)

In [48]:
# Full set of terms to look up with vmMatch. The strings are the query
# terms themselves, so they are kept exactly as written.
cdaSpeciesList = [
    "canis familiaris",
    "home sapeins",
    "homo sapiens; mus musculus",
    "internal reference-pooled sample",
    "jhu-qc",
    "mus musculus",
    "normal only ir",
    "not reported",
    "pnnl-jhu ref",
    "ref",
    "taiwanese ir",
    "tumor only ir",
]

# Three-term subset for quick trial runs.
testlist = [
    "canis familiaris",
    "jhu-qc",
    "homo sapiens; mus musculus",
]

# Single-term list (presumably a term expected to give trouble -- confirm).
bumlist = ["jhu-qc"]

In [49]:
# HTTP headers sent with the vmMatch POST: the body is JSON, and the
# 'matchType' / 'function' entries are passed along as request headers.
headers = {
    'Content-Type': 'application/json',
    'matchType': 'Restricted',
    'function': 'Concepts Only',
}

In [64]:
 def runPostQuery(url, query, headers):
     """POST ``query`` to the vmMatch service and return its match results.

     Parameters:
         url: the vmMatch URL.
         query: a list of dictionaries; it is serialised to JSON for the
             request body.
         headers: HTTP headers to send with the request.

     Returns:
         The value stored under ``'matchResults'`` in the JSON response.

     Raises:
         requests.exceptions.RequestException: if the request cannot be
             completed or the service answers with a 4xx/5xx status. The
             error is pretty-printed before being re-raised.
     """
     try:
         response = requests.post(url, data=json.dumps(query), headers=headers)
         # requests.post() does not raise for 4xx/5xx on its own; without
         # this call the except clause could never see an HTTPError.
         response.raise_for_status()
     except requests.exceptions.RequestException as e:
         pprint.pprint(e)
         # Re-raise: there is no response to decode, so continuing would
         # only fail later with a less helpful error.
         raise
     return response.json()['matchResults']

In [51]:
def dataElementQuery(publicid):
    """Fetch one caDSR Data Element record by its public ID.

    Returns the decoded JSON body when the service answers with HTTP 200,
    and None for any other status code.
    """
    endpoint = "https://cadsrapi.cancer.gov/rad/NCIAPI/1.0/api/DataElement/{}".format(publicid)
    response = requests.get(endpoint, headers={"accept" : "application/json"})
    if response.status_code != 200:
        return None
    return json.loads(response.content.decode())

In [52]:
def conceptCodeQuery(conceptcode):
    """Look up the caDSR data elements associated with a concept code.

    Parameters:
        conceptcode: the concept code to search for (e.g. "C14201").

    Returns:
        The value stored under ``'DataElementQueryResults'`` in the JSON
        response, or an empty list when the service does not answer with
        HTTP 200, so that callers iterating over the result simply skip
        that concept code.
    """
    url = "https://cadsrapi.cancer.gov/rad/NCIAPI/1.0/api/DataElement/query/Concept?conceptCode={}".format(conceptcode)
    headers = {"accept" : "application/json"}
    response = requests.get(url, headers = headers)
    # Same status guard as dataElementQuery: an error response is not
    # expected to carry the results payload, so do not try to decode it.
    if response.status_code != 200:
        return []
    payload = json.loads(response.content.decode())
    return payload['DataElementQueryResults']

In [53]:
def dictOfArray(dictionary, key, value):
    """Append ``value`` to the list stored under ``key``.

    If ``key`` is not yet present it is added with a new one-element list.
    The dictionary is modified in place and also returned.
    """
    dictionary.setdefault(key, []).append(value)
    return dictionary

vmMatch takes a list of dictionaries, each with "name" and "userTip" defined.  List lengths in the 10-30 range are probably OK; getting into the hundreds may cause system errors.

This is where things can get funky.  The concept id can be used in a caDSR Concept query (/DataElement/query/Concept,  curl -X GET "https://cadsrapi.cancer.gov/rad/NCIAPI/1.0/api/DataElement/query/Concept?conceptCode=C14201" -H "accept: application/json").  The records returned from that contain a publicID that can then be used
in a Data Element query (curl -X GET "https://cadsrapi.cancer.gov/rad/NCIAPI/1.0/api/DataElement/5729594" -H "accept: application/json")

So step one will be to collect the concept IDs.

In [57]:
# Build one {"name", "userTip"} entry per search term, using the term for
# both fields. Swap cdaSpeciesList for testlist or bumlist to send a
# smaller request while experimenting.
querylist = [{"name": term, "userTip": term} for term in cdaSpeciesList]
bigres = runPostQuery(vmatchprodurl, querylist, headers)

[{'name': 'canis familiaris', 'userTip': 'canis familiaris'}, {'name': 'home sapeins', 'userTip': 'home sapeins'}, {'name': 'homo sapiens; mus musculus', 'userTip': 'homo sapiens; mus musculus'}, {'name': 'internal reference-pooled sample', 'userTip': 'internal reference-pooled sample'}, {'name': 'jhu-qc', 'userTip': 'jhu-qc'}, {'name': 'mus musculus', 'userTip': 'mus musculus'}, {'name': 'normal only ir', 'userTip': 'normal only ir'}, {'name': 'not reported', 'userTip': 'not reported'}, {'name': 'pnnl-jhu ref', 'userTip': 'pnnl-jhu ref'}, {'name': 'ref', 'userTip': 'ref'}, {'name': 'taiwanese ir', 'userTip': 'taiwanese ir'}, {'name': 'tumor only ir', 'userTip': 'tumor only ir'}]



For each entry we need to look at the 'name' field to find out what query it's related to. 

In [58]:
# Group the concept codes returned by vmMatch under the search term they
# belong to; terms that produced no match at all go to nohitlist.
conceptiddict = {}
nohitlist = []
for result in bigres:
    term = result['name']
    if int(result['numberOfMatches']) <= 0:
        nohitlist.append(term)
        continue
    for hit in result['matches']:
        conceptiddict = dictOfArray(conceptiddict, term, hit['concept'])
        

In [59]:
#pprint.pprint(conceptiddict)
#pprint.pprint(nohitlist)

For each of the concept codes, hit the Concept endpoint and see what comes back

In [60]:
# For every concept code collected per search term, query the Concept
# endpoint and gather the publicId of each returned record under that
# same search term.
publiciddict = {}
# The loop variables are deliberately not called `list` / `id`: notebook
# cells share one global namespace, so those names would replace the
# builtins for every cell executed afterwards.
for testname, conceptids in conceptiddict.items():
    for conceptid in conceptids:
        conceptres = conceptCodeQuery(conceptid)
        for entry in conceptres:
            publicid = entry['publicId']
            publiciddict = dictOfArray(publiciddict, testname, publicid)

In [61]:
#pprint.pprint(publiciddict)

In [62]:
# Small hand-written mapping (search term -> public IDs) that can be used
# in place of publiciddict when trying out the Data Element step.
tempiddict = {
    'canis familiaris': [
        '2452737',
        '2452741',
    ],
}

Lastly, use the publicID in a CDE Query.  From this we'll want the context and preferredName to start

In [63]:
# Look up every collected public ID with the Data Element endpoint and
# keep, per search term, a small summary dictionary for each record found.
# IDs whose lookup returned nothing are collected in unknownids.
cdemapping = {}
unknownids = {}
# `publicids` (not `list`) so the builtin stays usable in later cells.
#for key, publicids in tempiddict.items():
for key, publicids in publiciddict.items():
    for publicid in publicids:
        cderes = dataElementQuery(publicid)
        if cderes is None:
            # dataElementQuery returns None for any non-200 answer;
            # store the IDs that likely generated a 500 error.
            unknownids = dictOfArray(unknownids, key, publicid)
            continue
        # NOTE(review): every field below, including publicId, is read from
        # the record's ConceptualDomain rather than from the data element
        # itself -- confirm that this is the intended level.
        domain = cderes['DataElement']['DataElementConcept']['ConceptualDomain']
        holding = {
            "publicId": domain['publicId'],
            "context": domain['context'],
            "preferredName": domain['preferredName'],
            'workflowStatus': domain['workflowStatus'],
        }
        cdemapping = dictOfArray(cdemapping, key, holding)

    

In [38]:
#pprint.pprint(cdemapping)

In [39]:
#pprint.pprint(unknownids)

In [67]:
# Flatten cdemapping (search term -> list of summary dicts) into one table
# with a row per summary dict.
columns = ['OriginalSearchTerm', 'Context', 'PreferredName', 'PublicID','Status']
# Collect all rows first and build the frame once: calling pd.concat inside
# the loop copies the whole frame on every iteration. The loop variable is
# `entries` rather than `list` so the builtin is not shadowed.
records = [
    {
        'OriginalSearchTerm': key,
        'Context': entry['context'],
        'PreferredName': entry['preferredName'],
        'PublicID': entry['publicId'],
        'Status': entry['workflowStatus'],
    }
    for key, entries in cdemapping.items()
    for entry in entries
]
# Passing `columns` keeps the column order and still yields an empty frame
# with the right headers when there are no records.
cdedf = pd.DataFrame(records, columns=columns)

In [68]:
# Preview the first five rows; print(cdedf.to_string()) would show the
# whole table instead.
cdedf.head(n=5)

Unnamed: 0,OriginalSearchTerm,Context,PreferredName,PublicID,Status
0,canis familiaris,CCR,Veterinary Study,2452699,RELEASED
1,canis familiaris,CCR,Veterinary Study,2452699,RELEASED
2,canis familiaris,CCR,Veterinary Study,2452699,RELEASED
3,canis familiaris,CCR,Veterinary Study,2452699,RELEASED
4,canis familiaris,CCR,Veterinary Study,2452699,RELEASED


In [69]:
# Group on every column so identical rows collapse into one, and report
# how many times each distinct row occurred in the `size` column.
cdedf.groupby(by=cdedf.columns.tolist(), as_index=False).size()

Unnamed: 0,OriginalSearchTerm,Context,PreferredName,PublicID,Status,size
0,canis familiaris,CCR,Person Measure/Instrument Testing,2524082,RELEASED,2
1,canis familiaris,CCR,Veterinary Study,2452699,RELEASED,5
2,canis familiaris,CTEP,Assessment Results,2008556,RELEASED,1
3,canis familiaris,CTEP,Individuals,2008532,RELEASED,2
4,canis familiaris,CTEP,Specimen Characteristics,2008547,RELEASED,1
...,...,...,...,...,...,...
342,tumor only ir,SPOREs,Tissue Banking,2230266,RELEASED,5
343,tumor only ir,TEST,MethodDevice,3594779,RETIRED ARCHIVED,2
344,tumor only ir,caCORE,Findings,2179409,DRAFT NEW,1
345,tumor only ir,caCORE,Techniques,2008582,DRAFT NEW,1
