<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
def getCMAP(cmapID='1QBMKWB9H-P56N1M-6XJ',baseURL = 'https://cmapscloud.ihmc.us/resources/rid='):
    """
    Given a CMAP ID and possibly a Server URL, read a CMAP from a CMAP server, the default server being the CMAPs cloud server
    Note we are using this webservice: https://cmap.ihmc.us/xml/Cmapserver-HTTP-API.pdf
    """
    import base64, sys, requests
    import getpass
    from urllib import request
    from urllib.request import urlopen
    from urllib.error import URLError
    
    #For CMaps GET commands: get.resmeta, get.cmap
#    getCMAPmetadata = '/?cmd=get.resmeta' 
#    getCMAP = '/?cmd=get.cmap' 
#    getPermissions = '/?cmd=get.permissions' 
#    url = baseURL + cmapID + getCMAP
#    pwd = getpass.getpass(prompt='Password: ', stream=None) 
#    requests = requests.get(url, auth=(email, pwd), allow_redirects=True, verify=True)
#    cxlDoc = requests.text
    
    getMetacom = '/?cmd=get.resmeta' 
    getCMAPcom = '/?cmd=get.cmap' 
    getPermscom = '/?cmd=get.permissions' 
    url = baseURL + cmapID + getCMAPcom
    authEmail='mclange@ucdavis.edu'
    pwd = getpass.getpass(prompt='Password: ', stream=None) 
    r = requests.get(url, auth=(authEmail, pwd), allow_redirects=True, verify=True)
    cxlDoc = r.text
    
    return(cxlDoc)


In [4]:
from sqlalchemy import create_engine

def CMAP2DF(cmapID,baseURL = 'https://cmapscloud.ihmc.us/resources/rid='):
    """Reads a CMAP on a remote server and puts its contents into data frames"""

    
    cxlDoc=getCMAP(cmapID, baseURL)
    #print (cxlDoc)
    parseCMAP(cxlDoc)



def parseCMAP(cxlDoc=getCMAP()):
    """
    Given a cmap cxl (xml) document, parse it and shove it into a data frame.
    We'll start with the resource metadata (ResMeta), followed by:
    concepts, link phrases, connections, their appearances, and any associated resources
    """
    engine = create_engine('postgresql://milk:milkDB@localhost:5432/dbAnimalMilks')
    from IPython.display import display
    import xmltodict
    import pandas as pd
    ###################################
    def dicKeyRenamer (oldDic, dicRenamer):
        """
        A dic key renaming function we'll use for renaming variables from oldDic 
        with a dicRenamer containing oldnames:newnames. Putting the renamed k,v pairs
        in a newDic
        """
        newDic = {}
        for k,v in oldDic.items():
            for k2,v2 in dicRenamer.items():
                if k==k2:
                    newDic[v2] = v
        return(newDic)
    ###################################
    def countDicItems (d):
        """
        takes a (sub) dictionary and returns the number of items (lists or dics)
        """
        count = 0
        for x in enumerate(d.items()): 
            if isinstance(x[1][1], list): 
                count += len(x[1][1]) 
        return (count)
    ###################################
    def subDic2DF (rootDicName, rootDicVal, dicLoc, nodParent, nodFamily, dicColRenames={}):
        """
        takes a parent (sub) dicionary (node) and makes the homogeneous child nodes into a dataframe 
        with ability to rename columns if dicColRenames provided in form of {dicName:colName}
        nodeText and 
        """
        if (isinstance(nodParent, int)) or \
            (nodParent is None) or \
            (nodFamily is None) or \
            (nodParent in dicLoc): 
            if nodParent is None:
                #print("nodParent is None!")
                dicParent = dicLoc
            else:
                dicParent = dicLoc[nodParent]
                #print(dicParent)
                nodeCount  = countDicItems(dicParent)
                #print('nodeCount:' + str(nodeCount))
                keyCount = len(dicParent)
            #print('keyCount:' + str(keyCount))
            if nodFamily is None: #make a df for the keys at this level
                #print("nodFamily is None!")
                keyCount = len(dicParent)
                if keyCount>0:
                    print('')
                df = pd.DataFrame([dicParent])
            else:
                if nodeCount>1:
                    df = pd.DataFrame(dicParent[nodFamily]) # turn dic into dataframe
                else:
                    df = pd.DataFrame([dicParent[nodFamily]] )#accomodate singleton as a non-scalar with extra '[]'
                df.rename(columns= dicColRenames,inplace=True)    #get rid of the dict names and make more SQL friendly
            for column in df: #rename columns to remove/replace special characters
                ampNum = str(column).find('@')  #eventually update this to find all special characters that may be in a dict name
                if ampNum >-1:
                    newCol = str(column)[ampNum+1:]
                    df.rename(columns= {column:newCol},inplace=True)
                newCol = str(column).replace(':','-')
                df.rename(columns= {column:newCol},inplace=True)
            df.insert(loc=0,column=rootDicName, value=rootDicVal) # add rootID as a column   
            if  dicLoc != dicMapResPeople and nodFamily != 'resource':
                if nodFamily is not None:
                    display(nodFamily + 's')
                    display(df)
            return(df)
    #############################################
    #First transform cxl into dic
    dicDoc = xmltodict.parse (cxlDoc) 
    #set-up the mapID and associated variables
    #print(cxlDoc)
    mapID = dicDoc['cmap']['res-meta']['dc:source'].split(':')[-1]
    rootDicName = 'mapID' #we'll use rootDicName as we parse the dics
    rootDicVal = mapID    #we'll use rootDicVal as we parse the dics
    dicLoc = None
    nodParent = None
    nodFamily = None
    dicColRenames = {}
    tupDic2DF = (rootDicName, rootDicVal, dicLoc, nodParent, nodFamily,dicColRenames )
    #############################################
    #Make a df of the map (resource) metadata
    #first make a list people and orgs to pull out their sub-dics 
    listResMetaPeopleOrgs = ['dc:creator', 'dc:contributor', 'dcterms:rightsHolder'] 
    #create new dics with & without people/orgs 
    dicMapResMeta = {key:val for key, val in dict(dicDoc['cmap']['res-meta']).items() if key not in listResMetaPeopleOrgs}
    dicMapResPeople = {key:val for key, val in dict(dicDoc['cmap']['res-meta']).items() if key in listResMetaPeopleOrgs}

    #Now we'll make a DF from the resmeta dic sans people/org references
    dicLoc = dicMapResMeta
    tupDic2DF = (rootDicName, rootDicVal, dicLoc, nodParent, nodFamily)
    dfMapResMeta = subDic2DF(*tupDic2DF)
    dfMapResMeta.to_sql('tbl_MapResMeta', engine, if_exists='append', chunksize=1000)

    #############################################
    #make a df of all the people sans metadata
    dicLoc = dicMapResPeople
    dfMapResPeople = None
    for item in listResMetaPeopleOrgs:
        nodParent = item
        tupDic2DF = (rootDicName, rootDicVal, dicLoc, nodParent, nodFamily)
        if dfMapResPeople is not None:
            dfMapResPeople = dfMapResPeople.append(subDic2DF(*tupDic2DF))
            dfMapResPeople['role'].iat[-1] =item.split(':')[-1]
        else:
            dfMapResPeople = subDic2DF(*tupDic2DF)
            dfMapResPeople.insert(loc=1, column='role', value=item.split(':')[-1])
        if 'vcard-ORG' in dfMapResPeople.columns:
            dfMapResPeople['vcard-ORG'].iat[-1] = str(dfMapResPeople['vcard-ORG'].iat[-1]).split(", '",1)[-1][:-4] 
    display(dfMapResPeople)
    dfMapResPeople.to_sql('tbl_People', engine, if_exists='append', chunksize=1000)
    ###################################
    #Variables for Map-level dict extraction
    dicLoc = dicDoc['cmap']['map'] 
    #############################################
    #Get the Concepts
    nodParent = 'concept-list'
    nodFamily = 'concept'
    dfConcepts = subDic2DF(rootDicName, rootDicVal, dicLoc, nodParent, nodFamily)
    numConcepts = len(dfConcepts.index)
    dfConcepts.to_sql('tbl_Concepts', engine, if_exists='append', chunksize=1000)
    #############################################
    #Get the Concept-appearances
    nodParent = 'concept-appearance-list'
    nodFamily = 'concept-appearance'
    dfConceptsAprncs = subDic2DF(rootDicName, rootDicVal, dicLoc, nodParent, nodFamily)
    dfConceptsAprncs.to_sql('tbl_ConceptsAprncs', engine, if_exists='append', chunksize=1000)
    #############################################
    #Get the Linking Phrases 
    nodParent = 'linking-phrase-list'
    nodFamily = 'linking-phrase'
    dfLinkingPhrases = subDic2DF(rootDicName, rootDicVal, dicLoc, nodParent, nodFamily)
    dfLinkingPhrases.to_sql('tbl_LinkingPhrases', engine, if_exists='append', chunksize=1000)
    #############################################
    #Get the linking-phrase-appearances
    nodParent = 'linking-phrase-appearance-list'
    nodFamily = 'linking-phrase-appearance'
    dfLinkingPhraseAprncs = subDic2DF(rootDicName, rootDicVal, dicLoc, nodParent, nodFamily)
    dfLinkingPhraseAprncs.to_sql('tbl_LinkingPhraseAprncs', engine, if_exists='append', chunksize=1000)
    #############################################
    #Get the connections
    nodParent = 'connection-list'
    nodFamily = 'connection'
    dfCnxns = subDic2DF(rootDicName, rootDicVal, dicLoc, nodParent, nodFamily)
    dfCnxns.to_sql('tbl_Cnxns', engine, if_exists='append', chunksize=1000)
    #############################################
    #Get the connections-appearances
    nodParent = 'connection-appearance-list'
    nodFamily = 'connection-appearance'
    dfCnxnAprncs = subDic2DF(rootDicName, rootDicVal, dicLoc, nodParent, nodFamily)
    dfCnxnAprncs.to_sql('tbl_CnxnAprncs', engine, if_exists='append', chunksize=1000)
    #############################################
    #Get the resource-groups
    nodParent = 'resource-group-list'
    nodFamily = 'resource-group'
    dfResGrps = subDic2DF(rootDicName, rootDicVal, dicLoc, nodParent, nodFamily)
    dfResGrps['resource']
    dfResGrps.to_sql('tbl_ResGrps',engine, if_exists='append', chunksize=1000)
    #############################################
    #Get the resources
    dicLoc = dicDoc['cmap']['map']['resource-group-list']['resource-group']
    rgCount = len(dfResGrps.index)
    rgCounter = 0 #the counter for iterating through each of the ResourceGroups
    dfResources = None
    while rgCounter < rgCount:
        nodParent = rgCounter
        nodFamily = 'resource'
        if dfResources is not None:
            dfResources = dfResources.append(subDic2DF(rootDicName, rootDicVal, dicLoc, nodParent, nodFamily))
            dfResources['parent-id'].iat[-1] = dicLoc[rgCounter].get("@parent-id")
        else:
            dfResources = subDic2DF(rootDicName, rootDicVal, dicLoc, nodParent, nodFamily)
            dfResources.insert(loc=1, column='parent-id', value=dicLoc[rgCounter].get("@parent-id"))
        rgCounter +=1
    display(dfResources)
    dfResources.to_sql('tbl_Resources', engine, if_exists='append', chunksize=1000)
    #############################################
    #Get the resource-appearances
    dicLoc = dicDoc['cmap']['map']
    nodParent = 'resource-appearance-list'
    nodFamily = 'resource-appearance'
    dfResAprncs = subDic2DF(rootDicName, rootDicVal, dicLoc, nodParent, nodFamily)
    dfResAprncs.to_sql('tbl_ResAprncs', engine, if_exists='append', chunksize=1000)
    #############################################
    #Get the stylesheets
    nodParent = 'style-sheet-list'
    nodFamily = 'style-sheet'
    dfStyleSheets = subDic2DF(rootDicName, rootDicVal, dicLoc, nodParent, nodFamily)
    dfStyleSheets.to_sql('tbl_StyleSheets', engine, if_exists='append', chunksize=1000)
    #############################################
    #Get the styles within stylesheets
    listStyles = ['map-style', 'concept-style', 'linking-phrase-style', 'connection-style', 'resource-style']
    dicLoc = dicDoc['cmap']['map']['style-sheet-list']['style-sheet']
    dfStyles = None
    rgCount = len(dfStyleSheets.index)
    rgCounter = 0 #the counter for iterating through each of the style-sheets
    while rgCounter < rgCount:
        nodParent = rgCounter
        for item in listStyles:
            nodFamily = item
            if item in (dicLoc[nodParent]):
                if dfStyles is not None:
                    dfStyles = dfStyles.append(subDic2DF(rootDicName, rootDicVal, dicLoc, nodParent, nodFamily))
                    dfStyles['style-sheet-id'].iat[-1] = dicLoc[rgCounter].get("@id")
                    dfStyles['style-type'].iat[-1] = str(item)
                else:
                    dfStyles = subDic2DF(rootDicName, rootDicVal, dicLoc, nodParent, nodFamily)
                    dfStyles.insert(loc=1, column='style-sheet-id', value = dicLoc[rgCounter].get("@id"))
                    dfStyles.insert(loc=1, column='style-type', value = str(item))
            display(dfStyles)
        rgCounter +=1
    dfStyles.to_sql('tbl_Styles', engine, if_exists='append', chunksize=1000)
    #############################################    #Get the extra-properties
    dicLoc = dicDoc['cmap']['map']
    nodParent = 'extra-properties-list'
    nodFamily = 'properties-list'
    dfExtraProperties = subDic2DF(rootDicName, rootDicVal, dicLoc, nodParent, nodFamily)
    display(dfExtraProperties)
    dfExtraProperties.to_sql('tbl_ExtraProperties', engine, if_exists='append', chunksize=1000)
    ############################################# #Get the images
    nodParent = 'image-list'
    nodFamily = 'image'
    dfExtraProperties = subDic2DF(rootDicName, rootDicVal, dicLoc, nodParent, nodFamily)
    display(dfExtraProperties)
    dfExtraProperties.to_sql('tbl_ExtraProperties', engine, if_exists='append', chunksize=1000)

    #############################################

Password: ········


In [6]:
CMAP2DF('1Y4GQRF92-182W7S7-8VB')#'1W3PQRS9F-JF73RT-C6K')#,'https://cmapspublic3.ihmc.us:443/resources/rid=')#('1W0WVLFBD-TS6DJK-DSG')#('1W30W801X-V1NSJ8-3R4') #('1SZD25T3-FT7FSN-FZ')#

Password: ········



KeyError: 'dc:creator'