The purpose of this notebook is to try out vmMatch with CDA examples and see where it leads.  The basic idea is to provide a list of terms and see if I can tease out the CDEs they belong to.

In [18]:
import pandas as pd
import requests
import pprint
import json
import time

caDSR Swagger interface:  https://cadsrapi.cancer.gov/NCIAPI/1.0/index.html

In [19]:
# URL of the caDSR vmMatch endpoint; passed to runPostQuery as the POST target.
vmatchprodurl = "https://cadsrapi.cancer.gov/rad/vmMatch/v1/vmMatch"

In [20]:
# Search terms to send to vmMatch.
# NOTE(review): "home sapeins" is kept exactly as written; presumably it mirrors
# the spelling in the CDA source data -- confirm before "correcting" it.
cdaSpeciesList = [
    "canis familiaris",
    "home sapeins",
    "homo sapiens; mus musculus",
    "internal reference-pooled sample",
    "jhu-qc",
    "mus musculus",
    "normal only ir",
    "not reported",
    "pnnl-jhu ref",
    "ref",
    "taiwanese ir",
    "tumor only ir",
]

# Smaller term lists that can be swapped in for cdaSpeciesList while experimenting.
testlist = [
    "canis familiaris",
    "jhu-qc",
    "homo sapiens; mus musculus",
]
bumlist = ["jhu-qc"]
singlelist = ["canis familiaris"]

In [21]:
# HTTP headers sent with the vmMatch POST request.
# NOTE(review): 'matchType' and 'function' look like vmMatch options; check the
# caDSR Swagger documentation for the accepted values.
headers = {
    "Content-Type": "application/json",
    "matchType": "Restricted",
    "function": "Concepts Only",
}

In [24]:
 def runPostQuery(url, query, headers):
     """POST a vmMatch query and return the 'matchResults' part of the reply.

     Parameters
     ----------
     url : str
         The vmMatch URL.
     query : list of dict
         The query entries; serialized to JSON for the request body.
     headers : dict
         HTTP headers to send with the request.

     Returns
     -------
     The value stored under 'matchResults' in the decoded JSON reply, or an
     empty list if the request failed, the reply was not valid JSON, or the
     key was missing. The failure is pretty-printed so it is not silent.
     """
     try:
         response = requests.post(url, data=json.dumps(query), headers=headers)
         # requests.post() does not raise on 4xx/5xx by itself; without this
         # call an HTTP error would only surface later as a confusing
         # JSON/KeyError.
         response.raise_for_status()
         payload = json.loads(response.content.decode())
         return payload['matchResults']
     except (requests.exceptions.RequestException, ValueError, KeyError) as e:
         # RequestException covers connection problems, timeouts and the
         # HTTPError raised above; ValueError covers undecodable JSON.
         pprint.pprint(e)
         return []

In [25]:
def dataElementQuery(publicid):
    """Fetch a single caDSR Data Element record by its public ID.

    Sends a GET to the DataElement endpoint asking for JSON. Returns the
    decoded JSON body when the server answers 200, otherwise None.
    """
    endpoint = "https://cadsrapi.cancer.gov/rad/NCIAPI/1.0/api/DataElement/{}".format(publicid)
    response = requests.get(endpoint, headers={"accept" : "application/json"})
    # Anything other than a 200 is reported to the caller as None.
    if response.status_code != 200:
        return None
    return json.loads(response.content.decode())

In [26]:
def conceptCodeQuery(conceptcode):
    """Look up the caDSR Data Elements associated with a concept code.

    Sends a GET to the DataElement/query/Concept endpoint with the concept
    code as the 'conceptCode' query parameter.

    Returns the list stored under 'DataElementQueryResults' in the reply.
    If the server does not answer 200, the body is not valid JSON, or the
    key is absent, a short message is printed and an empty list is returned
    so that a caller iterating over many codes is not aborted by one failure.
    """
    url = "https://cadsrapi.cancer.gov/rad/NCIAPI/1.0/api/DataElement/query/Concept"
    headers = {"accept" : "application/json"}
    # Let requests build the query string so the code is URL-encoded.
    response = requests.get(url, params={"conceptCode": conceptcode}, headers=headers)
    if response.status_code != 200:
        print(("conceptCodeQuery: HTTP %s for concept code %s") % (response.status_code, conceptcode))
        return []
    try:
        payload = json.loads(response.content.decode())
    except ValueError:
        print(("conceptCodeQuery: undecodable reply for concept code %s") % (conceptcode,))
        return []
    if not isinstance(payload, dict):
        return []
    # 'or []' also normalizes an explicit null value to an empty list.
    return payload.get('DataElementQueryResults') or []

In [27]:
def dictOfArray(dictionary, key, value):
    """Append value to the list stored under key, creating the list if needed.

    The dictionary is modified in place and also returned, so the call can be
    used either as a statement or in an assignment.
    """
    dictionary.setdefault(key, []).append(value)
    return dictionary

vmMatch takes a list of dictionaries, each with "name" and "userTip" defined.  List lengths in the 10-30 range are probably OK; getting into the hundreds may cause system errors.

This is where things can get funky.  The concept id can be used in a caDSR Concept query (/DataElement/query/Concept,  curl -X GET "https://cadsrapi.cancer.gov/rad/NCIAPI/1.0/api/DataElement/query/Concept?conceptCode=C14201" -H "accept: application/json").  The records returned from that contain a publicID that can then be used
in a Data Element query (curl -X GET "https://cadsrapi.cancer.gov/rad/NCIAPI/1.0/api/DataElement/5729594" -H "accept: application/json")

So step one will be to collect the concept IDs.

In [28]:
# Build one vmMatch entry per search term, using the term as both the name and
# the userTip. To experiment with a smaller sample, swap cdaSpeciesList for
# testlist, bumlist or singlelist.
querylist = [{"name": term, "userTip": term} for term in cdaSpeciesList]
bigres = runPostQuery(vmatchprodurl, querylist, headers)
#pprint.pprint(bigres)

For each entry we need to look at the 'name' field to find out which query it relates to.  Note that each match in this result already includes the concept code, so no separate lookup is needed to obtain it.

In [29]:
# Map each search term to the concept codes vmMatch returned for it, and keep
# a separate list of the terms that produced no matches.
conceptiddict = {}
nohitlist = []
for result in bigres:
    term = result['name']
    if int(result['numberOfMatches']) <= 0:
        nohitlist.append(term)
        continue
    for hit in result['matches']:
        conceptiddict = dictOfArray(conceptiddict, term, hit['concept'])
        

In [30]:
#pprint.pprint(conceptiddict)
#pprint.pprint(nohitlist)

For each of the concept codes, hit the Concept endpoint and see what comes back.  The publicID is what links the concept to CDEs.

In [31]:
# Terms with more concept codes than this are reported and skipped rather than
# queried one code at a time.
MAX_CONCEPTS_PER_TERM = 50

# Map each search term to the public IDs returned by the Concept endpoint.
# The loop variables deliberately avoid the names `list` and `id`: rebinding
# those at cell level would shadow the builtins for the rest of the kernel
# session.
publiciddict = {}
for testname, conceptids in conceptiddict.items():
    if len(conceptids) > MAX_CONCEPTS_PER_TERM:
        # %a formats the repr of the string, which is why the count is
        # printed in quotes.
        print(("%s list length: %a")%(testname, str(len(conceptids))))
    else:
        for conceptid in conceptids:
            conceptres = conceptCodeQuery(conceptid)
            for entry in conceptres:
                publiciddict = dictOfArray(publiciddict, testname, entry['publicId'])

ref list length: '1948'


In [32]:
#pprint.pprint(conceptres)
#pprint.pprint(publiciddict)

In [33]:
# Small hand-written term -> public ID mapping; can be substituted for
# publiciddict in the CDE lookup loop for a quick test run.
tempiddict = {'canis familiaris': ['2452737','2452741']}

Lastly, use the publicID in a CDE Query.  From this we'll want the context and preferredName to start.  May not need to do this.  Previous result has a preferredName

In [34]:
# Terms with more public IDs than this are reported and skipped instead of
# being queried one ID at a time.
# NOTE(review): the original literal here was `00`, which skipped every term
# (dictOfArray never stores an empty list), yet the recorded outputs show some
# terms were processed and the smallest skipped list had 267 IDs. 250 is an
# assumed value consistent with that output -- confirm the intended limit.
MAX_PUBLICIDS_PER_TERM = 250

unknownids = {}
columns = ['OriginalSearchTerm', 'Context', 'PreferredName', 'PublicID','Status']
# Rows are collected in a plain list and turned into a DataFrame once at the
# end; concatenating a one-row frame per result copies the whole table each
# time.
cderecords = []
#for key, publicids in tempiddict.items():
for key, publicids in publiciddict.items():
    if len(publicids) > MAX_PUBLICIDS_PER_TERM:
        print(("%s PublicID List length:\t%s\n")%(key, str(len(publicids))))
        continue
    for publicid in publicids:
        cderes = dataElementQuery(publicid)
        if cderes is None:
            # Store the IDs that likely generated a 500 error
            unknownids = dictOfArray(unknownids, key, publicid)
            continue
        # NOTE(review): all four fields are read from the ConceptualDomain
        # sub-record, so 'PublicID' holds that record's publicId rather than
        # the data element ID that was queried -- confirm this is intended.
        domain = cderes['DataElement']['DataElementConcept']['ConceptualDomain']
        cderecords.append({
            'OriginalSearchTerm': key,
            'Context': domain['context'],
            'PreferredName': domain['preferredName'],
            'PublicID': domain['publicId'],
            'Status': domain['workflowStatus'],
        })
cdedf = pd.DataFrame(cderecords, columns=columns)

    

internal reference-pooled sample PublicID List length:	267

normal only ir PublicID List length:	26698

not reported PublicID List length:	640

tumor only ir PublicID List length:	3295



In [35]:
#pprint.pprint(cdemapping)

In [40]:
# Show the public IDs, grouped by search term, for which dataElementQuery returned None.
pprint.pprint(unknownids)

{'canis familiaris': ['2452737',
                      '2452741',
                      '2453731',
                      '2614959',
                      '3130966',
                      '3770708'],
 'home sapeins': ['2437505',
                  '2483052',
                  '2954138',
                  '3812905',
                  '3812920',
                  '3812936',
                  '3812977',
                  '4731731',
                  '4732283',
                  '5773872',
                  '6013119',
                  '6407813',
                  '6407875']}


In [38]:
# Preview the first rows of the collected table.
cdedf.head()
#print(cdedf.to_string())

Unnamed: 0,OriginalSearchTerm,Context,PreferredName,PublicID,Status
0,canis familiaris,CCR,Veterinary Study,2452699,RELEASED
1,canis familiaris,CCR,Veterinary Study,2452699,RELEASED
2,canis familiaris,CCR,Veterinary Study,2452699,RELEASED
3,canis familiaris,CCR,Veterinary Study,2452699,RELEASED
4,canis familiaris,CCR,Veterinary Study,2452699,RELEASED


In [39]:
# Group on every column so identical rows collapse into one, with 'size'
# giving the number of times each distinct row occurred.
cdedf.groupby(cdedf.columns.tolist(), as_index=False).size()

Unnamed: 0,OriginalSearchTerm,Context,PreferredName,PublicID,Status,size
0,canis familiaris,CCR,Person Measure/Instrument Testing,2524082,RELEASED,2
1,canis familiaris,CCR,Veterinary Study,2452699,RELEASED,5
2,canis familiaris,CTEP,Assessment Results,2008556,RELEASED,1
3,canis familiaris,CTEP,Individuals,2008532,RELEASED,2
4,canis familiaris,CTEP,Specimen Characteristics,2008547,RELEASED,1
5,canis familiaris,caCORE,UML DEFAULT CD,2222502,RELEASED,2
6,home sapeins,CCR,Person Measure/Instrument Testing,2524082,RELEASED,5
7,home sapeins,CTEP,Assessments,2008551,RELEASED,14
8,home sapeins,CTEP,Data Source,2008576,RELEASED,2
9,home sapeins,CTEP,Eligibility,2008525,RELEASED,5
