# Class Objects for Guid Core Metadata

### Requirements
- common interface with fixed commands
    - postAPI
    - postNeo

In [113]:
import json
import os
import re

import requests
from flask import Response
from neo4j.v1 import GraphDatabase

In [57]:
class NeoConn():
    """Connection settings and driver handle for the Neo4j cache.

    The Bolt host and the credentials are read from the environment once,
    when the class body is executed:

    - ``NEO_URL``: host name of the Neo4j server; ``localhost`` when unset.
    - ``NEO_USER`` / ``NEO_PASSWORD``: credentials; when either one is unset
      the driver is opened without authentication.

    Each instance opens exactly one driver, available as ``self.driver``.
    This class never closes the driver itself.
    """
    uri = "".join(["bolt://", os.environ.get('NEO_URL', 'localhost'), ":7687"])
    user = os.environ.get('NEO_USER')
    password = os.environ.get('NEO_PASSWORD')

    def __init__(self):
        # Open a single driver: creating a throw-away localhost driver first
        # and then overwriting it would leave the first one unclosed.
        if self.user is None or self.password is None:
            self.driver = GraphDatabase.driver(uri = self.uri)
        else:
            self.driver = GraphDatabase.driver(uri = self.uri, auth = (self.user, self.password) )

In [675]:
def processAnvl(anvl):
    ''' Turn a parsed ANVL dictionary into json-ld in one call - simplifies the Get path

    Applies, in this order: removeProfileFormat, recursiveUnpack, formatJson.
    '''
    without_profile = removeProfileFormat(anvl)
    nested = recursiveUnpack(without_profile)
    return formatJson(nested)
    

In [145]:
# HTTP Basic credentials handed to postAPI / deleteAPI in the cells below.
# NOTE(review): 'apitest' looks like a shared test account - confirm before
# reusing this outside of testing.
ezid_auth = requests.auth.HTTPBasicAuth('apitest','apitest')

In [664]:
class CoreMetadata():
    """Base class for identifier metadata objects.

    Subclasses set ``required_keys`` (metadata keys that must be present) and
    ``endpoint`` (URL prefix the identifier is appended to for API calls).
    """

    # Defaults so the base class can be used on its own; subclasses override.
    required_keys = set()
    endpoint = ""

    def __init__(self, payload):
        ''' Core metadata Object

        Read in Data
        Determine if any options are set
        Validate all necessary Keys are Present

        payload is either the metadata dictionary itself, or a wrapper
        {"metadata": {...}, "options": {...}} in which "options" may be
        omitted. Options supplied by the caller replace the defaults entirely.

        Raises ValueError when required keys are missing; the exception
        message is a JSON document naming the GUID and the missing keys.
        '''
        # load in the identifier level metadata, determine if user has set options
        metadata = payload.get("metadata")
        if metadata is None:
            self.data = payload
            user_options = None
        else:
            self.data = metadata
            user_options = payload.get("options")

        # make sure all required keys are present (checked before anything
        # else touches the metadata, and without assert so it survives -O)
        missing_keys = set(self.required_keys).difference(self.data.keys())
        if missing_keys:
            missing = ", ".join(sorted(missing_keys))
            missing_message = {
                            "message": "Missing Required Keys",
                            "GUID": self.data.get('@id', None),
                            "missingKeys": " ".join(["[", missing, "]"])
                            }
            # __init__ cannot return a value, so the problem is raised; the
            # caller can turn the JSON message into an HTTP 400 response.
            raise ValueError(json.dumps(missing_message))

        if user_options is None:
            self.options = {
                "_target": "https://ors.datacite.org/" + self.data.get('@id', ''),
                "_status": "reserved"
            }
        else:
            self.options = user_options

    def outputANVL(self):
        ''' Translation to Anvl with required keychanges
        '''
        # Operates on this object's own metadata instead of a module-level
        # variable. NOTE(review): formatJson re-adds the '@' prefixes that
        # recursiveFlatten strips - confirm this is the intended output.
        return formatJson(recursiveFlatten(self.data))

    def postAPI(self, auth):
        """ Send a put request to the endpoint for the identifier

        The metadata is flattened, prefixed via profileFormat, combined with
        self.options and sent as text/plain ANVL. Returns the raw response
        body (bytes).
        """
        target = "".join([self.endpoint, self.data.get('@id',None) ])

        # profileFormat builds a new dictionary, so adding the options here
        # does not modify self.data
        payload = profileFormat(recursiveFlatten(self.data))
        payload.update(self.options)

        response = requests.put(
                auth = auth,
                url=target,
                headers = {'Content-Type': 'text/plain; charset=UTF-8'},
                data = outputAnvl(payload)
            )

        return response.content

    def deleteAPI(self, auth):
        """ Send a delete request to the endpoint for the identifier

        Returns the raw response body (bytes).
        """
        target = "".join([self.endpoint, self.data.get('@id',None) ])
        response = requests.delete(
            auth = auth,
            url=target
        )
        return response.content
        

## Data Catalogs

DataCatalogs are no longer objects; instead they hold metadata about namespaces within GUID registry services.

Does EZID have a root GUID for every namespace?

Fields (all are mandatory)
- @id
- @type
- identifier
- name
- url

In [665]:
class DataCatalog(CoreMetadata):
    """Metadata object for a data catalog, cached in Neo4j as a dataCatalog node."""
    required_keys = set(['@id', '@type', 'name','url'])
    endpoint = "https://ezid.cdlib.org/id/"

    def postNeo(self):
        ''' Post to Neo database

        Creates one dataCatalog node holding the guid, name and url of this
        object, with type fixed to 'DataCatalog'.
        '''
        neo_driver = NeoConn()
        try:
            with neo_driver.driver.session() as session:
                with session.begin_transaction() as tx:
                    tx.run("CREATE (d:dataCatalog {guid: $guid, name: $name, url: $url, type: 'DataCatalog'} )",
                           guid=self.data.get('@id', None),
                           name=self.data.get('name', None),
                           url=self.data.get('url', None)
                          )
        finally:
            # a new driver is opened on every call, so release it here
            neo_driver.driver.close()
        
        
    

In [357]:
# ark regex pattern
# r'ark:/(\d{5})/(.*)'

# doi regex pattern
    

In [670]:
# Example data catalog metadata; it carries exactly the four keys listed in
# DataCatalog.required_keys.
exDataCatalog = {
    "@id": "ark:/99999/fk4wkwod",
    "@type": "DataCatalog",
    "name": "Rat Genome Database",
    "url": "http://rgd.mcw.edu/"
}

In [671]:
# Build a DataCatalog from the bare metadata dictionary (no "metadata"/"options" wrapper).
myDC = DataCatalog(exDataCatalog)

In [674]:
# Send the catalog to the endpoint with an HTTP PUT; the cell output is the raw response body.
myDC.postAPI(ezid_auth)

b'success: ark:/99999/fk4wkwod'

In [673]:
# Remove the identifier again with an HTTP DELETE; the cell output is the raw response body.
myDC.deleteAPI(ezid_auth)

b'success: ark:/99999/fk4wkwod'

In [364]:
# Write the catalog to Neo4j as a dataCatalog node.
myDC.postNeo()

In [554]:
# Look the catalog up in the Neo cache by its guid; the node properties are shown below.
getCache(myDC.data['@id'])

{'guid': 'ark:/99999/fk4MIR',
 'name': 'Rat Genome Database',
 'type': 'DataCatalog',
 'url': 'http://rgd.mcw.edu/'}

In [555]:
# Delete the catalog node from the cache; the properties of the deleted node are returned.
deleteCache(myDC.data['@id'])

{'guid': 'ark:/99999/fk4MIR',
 'name': 'Rat Genome Database',
 'type': 'DataCatalog',
 'url': 'http://rgd.mcw.edu/'}

In [557]:
def importDC(target):
    """ Fetch a data catalog record from `target` and post it to the Neo cache

    Returns True when the GET request answered with status 200 (the parsed
    record has then been posted to Neo), otherwise False.
    """
    response = requests.get(target)
    if response.status_code != 200:
        return False

    anvl_text = str(response.content.decode('utf-8'))
    catalog = DataCatalog(ingestAnvl(anvl_text))
    catalog.postNeo()
    return True

In [None]:
# testing anvl -> object conversion: fetch the record of an existing ARK;
# `response` is reused by the ingestAnvl(...) cell further down
response = requests.get(
    url = 'https://ezid.cdlib.org/id/ark:/88120/r8059v'
)

In [657]:
# test import from API endpoint; True means the GET returned status 200 and
# the parsed record was posted to Neo
importDC("https://ezid.cdlib.org/id/ark:/99999/fk4MIR")

True

## ARKs

endpoint = https://ezid.cdlib.org/id/ark:/...

Changes 
- includedInDataCatalog as a required key
- contentUrl as a required key as a string
    - set to object {'aws': 's3://..', 'gpc': 's3://...'}
- includedInDataCatalog is a guid string
- type is default to Dataset
- checksum is an object: the hashing method is the key and the resulting digest is the value

Required Fields
- @id
- identifier
- url: landing page url or other content
- dateCreated
- name
- author
- includedInDataCatalog

Optional Fields
- @type
- contentUrl
- expires

In [568]:
# The three payload shapes CoreMetadata.__init__ distinguishes: bare metadata,
# wrapped metadata without options, and wrapped metadata with options.
just_metadata = exArk
no_options = {"metadata": exArk}
full = {"metadata": exArk, "options": {"target": "random", "expiration": "3 minutes"}}

In [574]:
# function to determine which is which
# (a bare metadata payload has no "metadata" key, so .get returns None)
just_metadata.get("metadata")

#no_options.get("metadata")


data = full.get("metadata")
settings = full.get("options")

# NOTE(review): the line inside this `if` is a comparison (==), not an
# assignment, and `self` does not exist at cell level; it is only skipped here
# because `full` carries options. Looks like a sketch of the default-options
# fallback in CoreMetadata.__init__ - confirm and fix or delete.
if settings == None:
    settings == self.Default


In [573]:
# show the options taken from the `full` payload
settings

{'expiration': '3 minutes', 'target': 'random'}

In [656]:
class Ark(CoreMetadata):
    """Metadata object for an ARK, cached in Neo4j as an Ark node linked to its data catalog."""
    required_keys = set(['@id', 'identifier', 'url', 'dateCreated', 'name','author','includedInDataCatalog','contentUrl'])
    optional_keys = set(['@type', 'expires'])
    endpoint = "https://ezid.cdlib.org/id/"

    def postNeo(self):
        ''' Post to Neo database

        Steps:
        1. Count dataCatalog nodes whose guid equals includedInDataCatalog.
        2. If the count is not exactly 1, try to import the catalog through
           importDC; raise InvalidParent when that import reports failure.
        3. Create the Ark node, its includedIn relationship to the catalog,
           and one AWSdownload and one GPCdownload node fed from contentUrl.
        '''
        neo_driver = NeoConn()
        try:
            # query for parent with guid
            with neo_driver.driver.session() as session:
                with session.begin_transaction() as tx:
                    node = tx.run("MATCH (node:dataCatalog) "
                          "WHERE node.guid = $guid "
                           "RETURN count(node)",
                           guid = self.data.get('includedInDataCatalog'))
                    count = node.single().data().get('count(node)')

            # if the data catalog is not found, attempt to import it
            if count!= 1:
                target = self.endpoint + self.data.get('includedInDataCatalog')
                parent = importDC(target)
                if parent==False:
                    raise InvalidParent(self.data.get('includedInDataCatalog'))

            with neo_driver.driver.session() as session:
                with session.begin_transaction() as tx:
                    # create node with required properties
                    # create the downloads as separate nodes linked to it
                    tx.run("MATCH (parent:dataCatalog) WHERE parent.guid=$parent "
                           "CREATE (node:Ark {guid: $guid, name: $name, author: $author, dateCreated: $dateCreated, url: $url, type: 'Dataset'}) "
                           "CREATE (node)-[parentRel:includedIn]->(parent) "
                           "CREATE (aws:AWSdownload {url: $awsUrl}) "
                           "CREATE (node)-[awsRel:download]->(aws) "
                           "CREATE (gpc:GPCdownload {url: $gpcUrl}) "
                           "CREATE (node)-[gpcRel:download]->(gpc) ",
                           parent=self.data.get('includedInDataCatalog'),
                           guid=self.data.get('@id', None),
                           name=self.data.get('name', None),
                           dateCreated = self.data.get('dateCreated', None),
                           author = self.data.get('author',None).get('name',None),
                           url=self.data.get('url',None),
                           awsUrl = self.data.get('contentUrl').get('aws'),
                           gpcUrl = self.data.get('contentUrl').get('gpc')
                          )
        finally:
            # a new driver is opened on every call, so release it here
            neo_driver.driver.close()
                
                
                
                
                

In [585]:
# Sample input with two levels of nesting, used to try out the flat-ness check below.
# Note that the name `flatten` is also used for the flatten() function defined further down.
flatten = {"@contentUrl": {"nested": {"hiding": "hello"}, "aws":"http://bd2k.ini.usc.edu/assets/all-hands-meeting/minid_v0.1_Nov_2015.pdf"}}

In [599]:
# True only when no value is a dict, i.e. the dictionary is already flat
# (the same test recursiveFlatten uses as its loop condition)
all([ isinstance(value, dict)==False for value in flatten.values()])

False

In [651]:
def recursiveFlatten(nestedAnvl):
    """ Apply flatten() repeatedly until no value of the dictionary is itself a dictionary

    A dictionary that is already flat is returned unchanged (the same object).
    """
    current = nestedAnvl
    while any(isinstance(value, dict) for value in current.values()):
        current = flatten(current)
    return current

In [652]:
def flatten(anvlDict):
    """ Flatten one level of nesting of a metadata dictionary

    - every '@' is removed from the top-level key
    - a dict value becomes one "key.subKey" entry per sub-key
    - a str value is copied as is
    - a list of strings is joined with ';'
    - a list containing dicts is replaced by the flattened form of its
      last element (NOTE(review): earlier elements are overwritten)
    Values of any other type are left out of the result.
    """
    output = {}
    for key, value in anvlDict.items():
        key = re.sub("@","", key)
        if isinstance(value, dict):
            for subKey, subValue in value.items():
                # build each name from the parent key; reassigning `key`
                # here would chain the sub-keys together (a.b, a.b.c, ...)
                output[".".join([key, subKey])] = subValue

        elif isinstance(value, str):
            output[key] = value

        elif isinstance(value, list):
            # if a list of strings
            if all([isinstance(el, str) for el in value]):
                output[key] = ";".join(value)

            # if a list of objects, must all be flattened and added as keys
            if any([isinstance(el, dict) for el in value]):
                for item in value:
                    output[key] = flatten(item)

    return output

In [653]:
# Flatten the example ARK and prefix every key with "NIHdc."
formatted_anvl = profileFormat(recursiveFlatten(exArk))

In [654]:
# Reverse step: take the "NIHdc." prefix off the keys again
anvl = removeProfileFormat(formatted_anvl)

In [642]:
def unpack(anvl):
    """ Expand one level of dotted keys into nested dictionaries

    A key "a.b" becomes {"a": {"b": value}}; several keys sharing the same
    prefix are merged into one nested dictionary. Keys without a dot are
    copied unchanged. The input dictionary is not modified.
    """
    output = {}
    for key, value in anvl.items():
        split_key, dot, split_val = key.partition(".")
        if not dot:
            output[key] = value
            continue

        existing = output.get(split_key)
        if isinstance(existing, dict):
            # merge into a copy so a dict that came from the input is untouched
            merged = dict(existing)
            merged[split_val] = value
            output[split_key] = merged
        else:
            output[split_key] = {split_val: value}
    return output

def recursiveUnpack(anvl):
    """ Expand dotted keys at every depth

    "a.b.c" ends up as {"a": {"b": {"c": value}}}. A new dictionary is
    returned; the input is not modified.
    """
    output = anvl
    while any("." in key for key in output):
        output = unpack(output)

    # unpack only splits on the first dot, so descend into nested dictionaries
    return {key: recursiveUnpack(value) if isinstance(value, dict) else value
            for key, value in output.items()}

In [621]:
# add NIHdc to first tag
def profileFormat(anvlDict):
    """ Return a copy of the dictionary in which every key is prefixed with "NIHdc." """
    return {".".join(["NIHdc", name]): entry for name, entry in anvlDict.items()}
        
def removeProfileFormat(anvlDict):
    """ Return a copy of the dictionary with the leading "NIHdc." removed from its keys

    Only a literal "NIHdc." at the start of a key is removed; keys without
    that prefix are copied unchanged.
    """
    prefix = "NIHdc."
    output = {}
    for key, value in anvlDict.items():
        # plain prefix test: a regex with an unescaped '.' would also match
        # "NIHdc" followed by any character, anywhere in the key
        if key.startswith(prefix):
            key = key[len(prefix):]
        output[key] = value
    return output

In [649]:
# add appropriate @ symbols to important keys
def formatJson(anvl):
    """ Rename the keys 'context', 'id' and 'type' to '@context', '@id' and '@type'

    The dictionary is modified in place and also returned. Keys that are
    not present are skipped.
    """
    for plain_key in ('context', 'id', 'type'):
        if plain_key in anvl:
            anvl['@' + plain_key] = anvl.pop(plain_key)

    return anvl


In [550]:
# Example ARK metadata. contentUrl only has an "aws" entry, and checksum is a
# list holding one {method: digest} object.
exArk = {
  "@context": "http://schema.org",
  "@id": "ark:/88120/r8059v",
  "@type": "CreativeWork",
  "identifier": "ark:/88120/r8059v",
  "url": "http://minid.bd2k.org/minid/landingpage/ark:/88120/r8059v",
  "contentUrl": {"aws":"http://bd2k.ini.usc.edu/assets/all-hands-meeting/minid_v0.1_Nov_2015.pdf"},
  "name": "minid: A BD2K Minimal Viable Identifier Pilot v0.1",
  "author": {
    "@id": "http://orcid.org/0000-0003-2129-5269",
    "@type": "Person",
    "name": "Ian Foster"
  },
  "dateCreated": "2015-11-10T04:44:44.387671Z",
    "includedInDataCatalog": "ark:/99999/fk4MIR",
    "checksum": [{"sha-256": "cacc1abf711425d3c554277a5989df269cefaa906d27f1aaa72205d30224ed5f"}]    
}


In [559]:
# Build an Ark object from the example metadata
testArk = Ark(exArk)

In [446]:
# the metadata is kept on the object as `data`
testArk.data.get('@id')

'ark:/88120/r8059v'

In [558]:
# To test importing: delete the catalog from the cache first, so that
# testArk.postNeo() has to fetch it from the endpoint.
deleteCache(myDC.data['@id'])

In [560]:
# Post the ARK to Neo; a missing parent catalog is imported along the way
testArk.postNeo()

In [561]:
# the parent catalog is in the cache again
getCache(myDC.data['@id'])

{'guid': 'ark:/99999/fk4MIR',
 'name': 'Rat Genome Database',
 'type': 'DataCatalog',
 'url': 'http://rgd.mcw.edu/'}

In [553]:
# properties stored on the Ark node
getCache(testArk.data.get('@id'))

{'author': 'Ian Foster',
 'dateCreated': '2015-11-10T04:44:44.387671Z',
 'guid': 'ark:/88120/r8059v',
 'name': 'minid: A BD2K Minimal Viable Identifier Pilot v0.1',
 'type': 'Dataset',
 'url': 'http://minid.bd2k.org/minid/landingpage/ark:/88120/r8059v'}

In [544]:
# remove the Ark node; the properties of the deleted node are returned
deleteCache(testArk.data.get('@id'))

{'author': 'Ian Foster',
 'dateCreated': '2015-11-10T04:44:44.387671Z',
 'guid': 'ark:/88120/r8059v',
 'name': 'minid: A BD2K Minimal Viable Identifier Pilot v0.1',
 'type': 'Dataset',
 'url': 'http://minid.bd2k.org/minid/landingpage/ark:/88120/r8059v'}

In [567]:
# show the example ARK metadata
exArk

{'@context': 'http://schema.org',
 '@id': 'ark:/88120/r8059v',
 '@type': 'CreativeWork',
 'author': {'@id': 'http://orcid.org/0000-0003-2129-5269',
  '@type': 'Person',
  'name': 'Ian Foster'},
 'checksum': [{'sha-256': 'cacc1abf711425d3c554277a5989df269cefaa906d27f1aaa72205d30224ed5f'}],
 'contentUrl': {'aws': 'http://bd2k.ini.usc.edu/assets/all-hands-meeting/minid_v0.1_Nov_2015.pdf'},
 'dateCreated': '2015-11-10T04:44:44.387671Z',
 'identifier': 'ark:/88120/r8059v',
 'includedInDataCatalog': 'ark:/99999/fk4MIR',
 'name': 'minid: A BD2K Minimal Viable Identifier Pilot v0.1',
 'url': 'http://minid.bd2k.org/minid/landingpage/ark:/88120/r8059v'}

In [565]:
# parse the body of the `response` fetched earlier into a flat dictionary
ingestAnvl(str(response.content.decode('utf-8')) )

{'_created': '1447130687',
 '_export': 'yes',
 '_owner': 'isi-isrd',
 '_ownergroup': 'isi-isrd',
 '_profile': 'erc',
 '_status': 'public',
 '_target': 'http://minid.bd2k.org/minid/landingpage/ark:/88120/r8059v',
 '_updated': '1450366519',
 'erc.what': '2015-11-10 04:44:44.387671',
 'erc.when': 'minid: A BD2K Minimal Viable Identifier Pilot v0.1',
 'erc.who': 'Ian Foster',
 'minid.checksum': '23',
 'success': 'ark:/88120/r8059v'}

## DOIs
Required Fields 
- @id
- @type
- identifier
- url
- includedInDataCatalog
- name
- author
- datePublished

Optional Fields
- dateCreated
- additionalType
- description
- keywords
- license
- version
- citation
- isBasedOn
- predecessorOf
- successorOf
- hasPart
- isPartOf
- funder
- contentSize
- fileFormat
- contentUrl

In [None]:
class Doi(CoreMetadata):
    """Metadata object for a DOI, cached in Neo4j as a Doi node linked to its data catalog.

    NOTE(review): unlike Ark and DataCatalog this class defines no `endpoint`
    of its own - confirm which service DOIs are sent to.
    """
    required_keys = set(['@id', '@type', 'identifier',
                        'url', 'includedInDataCatalog', 'name', 'author',
                        'datePublished', 'contentUrl'])
    optional_keys = set(['dateCreated', 'additionalType', 'description',
                        'keywords', 'license', 'version', 'citation', 'isBasedOn',
                        'predecessorOf', 'successorOf', 'hasPart', 'isPartOf', 'funder',
                        'contentSize', 'fileFormat'])

    def postNeo(self):
        ''' Post Doi to Neo database

        Steps:
        1. Count dataCatalog nodes whose guid equals includedInDataCatalog.
        2. If the count is not exactly 1, try to import the catalog through
           importDC; raise InvalidParent when that import reports failure.
        3. Create the Doi node, its includedIn relationship to the catalog,
           and one AWSdownload and one GPCdownload node fed from contentUrl.
        '''
        neo_driver = NeoConn()
        try:
            # query for parent with guid
            with neo_driver.driver.session() as session:
                with session.begin_transaction() as tx:
                    node = tx.run("MATCH (node:dataCatalog) "
                          "WHERE node.guid = $guid "
                           "RETURN count(node)",
                           guid = self.data.get('includedInDataCatalog'))
                    count = node.single().data().get('count(node)')

            # if the data catalog is not found, attempt to import it
            if count!= 1:
                # importDC performs an HTTP GET, so it needs the catalog's
                # URL (catalog endpoint + guid), not the bare guid
                target = DataCatalog.endpoint + self.data.get('includedInDataCatalog')
                parent = importDC(target)
                # importDC answers True/False, never None
                if not parent:
                    raise InvalidParent(self.data.get('includedInDataCatalog'))

            with neo_driver.driver.session() as session:
                with session.begin_transaction() as tx:
                    # create node with required properties
                    # create the downloads as separate nodes linked to it
                    tx.run("MATCH (parent:dataCatalog) WHERE parent.guid=$parent "
                           "CREATE (node:Doi {guid: $guid, name: $name, author: $author, dateCreated: $dateCreated, url: $url, type: 'Dataset'}) "
                           "CREATE (node)-[parentRel:includedIn]->(parent) "
                           "CREATE (aws:AWSdownload {url: $awsUrl}) "
                           "CREATE (node)-[awsRel:download]->(aws) "
                           "CREATE (gpc:GPCdownload {url: $gpcUrl}) "
                           "CREATE (node)-[gpcRel:download]->(gpc) ",
                           parent=self.data.get('includedInDataCatalog'),
                           guid=self.data.get('@id', None),
                           name=self.data.get('name', None),
                           dateCreated = self.data.get('dateCreated', None),
                           author = self.data.get('author',None).get('name',None),
                           url=self.data.get('url',None),
                           awsUrl = self.data.get('contentUrl').get('aws'),
                           gpcUrl = self.data.get('contentUrl').get('gpc')
                          )
        finally:
            # a new driver is opened on every call, so release it here
            neo_driver.driver.close()
                
                

In [None]:
# Example DOI metadata.
# NOTE(review): this example has no "contentUrl", which Doi.required_keys
# lists, and its "includedInDataCatalog" is an object where Doi.postNeo uses
# the value as a guid - confirm which of the two is intended.
exDoi = {
  "@context": "http://schema.org",
  "@type": "Dataset",
  "@id": "https://doi.org/10.25491/5e92-ht74",
  "identifier": "https://doi.org/10.25491/5e92-ht74",
  "additionalType": "Data dictionary",
  "name": "A data dictionary that describes each variable in the GTEx_v7_Annotations_SubjectPhenotypesDS.txt",
  "author": {
    "@type": "Organization",
    "name": "The GTEx Consortium"
  },
  "keywords": "gtex, annotation, phenotype, gene regulation, transcriptomics",
  "datePublished": "2017",
  "includedInDataCatalog": {
    "@type": "Organization",
    "name": "GTEx"
  },
  "version": "v7",
  "url": "https://www.gtexportal.org/home/datasets",
  "contentSize": "5.4 Mb",
  "fileFormat": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
  "funder": {
   "@type": "Organization",
   "@id": "https://doi.org/10.13039/100000050",
   "name": "National Heart, Lung, and Blood Institute"
  }
}

## Neo Interactions

### Get Downloads From Cache

### Retrieve Object from Cache

### Delete Object from Cache

### Import Object from External Service into Cache

In [342]:
def getCache(guid):
    """ Retrieve an Object by its Guid from the Cache

    Returns the property dictionary of the first node whose guid matches,
    or None when no node matches.
    """
    neo_driver = NeoConn()
    try:
        with neo_driver.driver.session() as session:
            with session.begin_transaction() as tx:
                node = tx.run("MATCH (node) "
                       "WHERE node.guid=$guid "
                       "RETURN properties(node)",
                       guid = guid)
                node_data = node.data()
                if node_data == []:
                    return None
                else:
                    return node_data[0].get('properties(node)', None)
    finally:
        # a new driver is opened on every call, so release it here
        neo_driver.driver.close()

In [474]:
# list the download urls cached for this guid (loc defaults to 'aws')
guid = 'ark:/88120/r8059v'
getDownloads(guid)

['http://bd2k.ini.usc.edu/assets/all-hands-meeting/minid_v0.1_Nov_2015.pdf']

In [473]:
# Find 
def getDownloads(guid, loc='aws'):
    """ Return the download urls reachable from the node with the given guid

    loc == 'aws' selects AWSdownload nodes; any other value selects
    GPCdownload nodes. Returns a list with one 'd.url' value per match.
    """
    # the label comes from this fixed pair, never from caller-supplied text
    label = "AWSdownload" if loc=='aws' else "GPCdownload"
    query = ("MATCH (node)-[*]->(d:" + label + ") WHERE node.guid=$guid "
             "RETURN d.url")

    neo_driver = NeoConn()
    try:
        with neo_driver.driver.session() as session:
            with session.begin_transaction() as tx:
                # read the records while the transaction is still open
                content_data = tx.run(query, guid=guid).data()
    finally:
        # a new driver is opened on every call, so release it here
        neo_driver.driver.close()

    return [download_node.get('d.url') for download_node in content_data]

In [315]:
def deleteCache(guid):
    """ Delete an Object and remove all relationships from Cache

    Returns the property dictionary the first matching node had before it
    was deleted, or None when no node matched.
    """
    neo_driver = NeoConn()
    try:
        with neo_driver.driver.session() as session:
            with session.begin_transaction() as tx:
                node = tx.run("MATCH (node) "
                      "WHERE node.guid=$guid "
                      "RETURN properties(node)",
                      guid= guid)
                # read the properties before the node is removed
                node_data = node.data()
                tx.run("MATCH (node) "
                      "WHERE node.guid=$guid "
                      "DETACH DELETE node ",
                      guid= guid)
                if node_data == []:
                    return None
                else:
                    return node_data[0].get('properties(node)')
    finally:
        # a new driver is opened on every call, so release it here
        neo_driver.driver.close()

In [509]:
def importCache(target):
    """ Send a request to GUID service, import response to cache

    From the target determine the object type: a target containing "ark:/"
    is imported as an Ark, one containing "doi:" as a Doi.

    Raises ValueError when the target contains neither marker.
    """
    response = requests.get(target)
    anvlDict = ingestAnvl(response.content.decode('utf-8'))

    # target is a full URL, so the scheme marker sits in the middle of the
    # string; test for containment rather than matching at the start
    if "ark:/" in target:
        obj = Ark(anvlDict)
    elif "doi:" in target:
        obj = Doi(anvlDict)
    else:
        raise ValueError("Cannot determine identifier type from target: " + str(target))

    obj.postNeo()

In [536]:
def ingestAnvl(anvl):
    """ Parse "name: value" lines into a dictionary

    The text is split on newlines; each line is split at the first ": ".
    Lines without ": " are ignored. When a name repeats, the last value wins.
    """
    parsed = {}
    for line in anvl.split('\n'):
        name, separator, value = str(line).partition(': ')
        if separator:
            parsed[name] = value
    return parsed

## Compact Identifiers

Can't currently be supported:
    - the response format needs to be RDF or XML

endpoint = http://identifiers.org/[collection]/[entity]

Fields (all mandatory)
- @id
- @type
- identifier
- url
- includedInDataCatalog

In [203]:
# Example compact identifier metadata; here includedInDataCatalog is a full
# embedded catalog object rather than a guid string.
exCompactId = {
  "@context": "http://schema.org",
  "@id": "https://identifiers.org/rgd/2825",
  "@type": "Dataset",
  "identifier": "https://identifiers.org/rgd/2825",
  "url": "https://rgd.mcw.edu/rgdweb/report/gene/main.html?id=2825",
  "includedInDataCatalog": {
    "@id": "https://www.ebi.ac.uk/miriam/main/datatypes/MIR:00000047",
    "@type": "DataCatalog",
    "name": "Rat Genome Database",
    "url": "http://rgd.mcw.edu/"
    }
}

In [115]:
class CompactId(CoreMetadata):
    """Metadata object for a compact identifier (identifiers.org)."""
    # the fields listed as mandatory for compact identifiers
    required_keys = set(['@id', '@type', 'identifier', 'url', 'includedInDataCatalog'])
    endpoint = "http://identifiers.org/"

    def postNeo(self):
        ''' Post to Neo database

        Creates one node holding the guid, name and url of this object.
        NOTE(review): the node is labelled dataCatalog although the object is
        a compact identifier - confirm the intended label.
        '''
        neo_driver = NeoConn()
        try:
            with neo_driver.driver.session() as session:
                # same explicit-transaction pattern as the other postNeo
                # methods; write_transaction would need a callable instead
                with session.begin_transaction() as tx:
                    tx.run("CREATE (d:dataCatalog {guid: $guid, name: $name, url: $url} )",
                           guid=self.data['@id'],
                           # 'name' is not a mandatory field, so it may be absent
                           name=self.data.get('name', None),
                           url=self.data['url']
                          )
        finally:
            # a new driver is opened on every call, so release it here
            neo_driver.driver.close()
            
            

In [676]:
# helper functions for processing messages
def escape(s):
    """ Percent-encode '%', ':', carriage return and newline in s as %XX (upper-case hex) """
    def _encode(match):
        return "%%%02X" % ord(match.group(0))

    return re.sub("[%:\r\n]", _encode, s)

def outputAnvl(anvlDict):
    ''' Encode all objects into strings, lists into strings

    Every name and value is converted with str() and escape(), written as
    "name: value", the lines are joined with newlines and the result is
    returned as UTF-8 bytes.
    '''
    lines = []
    for name, value in anvlDict.items():
        lines.append("%s: %s" % (escape(str(name)), escape(str(value) )))
    return "\n".join(lines).encode('utf-8')

In [None]:
# have to use ident