<a href="https://colab.research.google.com/github/oceanproteinportal/phase2-onboarding-resources/blob/main/OPP_PhaseII_RDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
%%capture
!pip -q install rdflib
!pip -q install datapackage

from rdflib import BNode, Graph, Literal, Namespace, RDF, URIRef, plugin
from rdflib.namespace import XSD
from rdflib.serializer import Serializer
import datapackage
import uuid

In [12]:
dp = datapackage.DataPackage('https://raw.githubusercontent.com/oceanproteinportal/phase2-onboarding-resources/main/datapackage.json', None)
if (dp.errors):
    for error in dp.errors:
        logging.error(error)
    raise Exception('Invalid data package')

# Validate the Datapackage
try:
    valid = datapackage.validate(dp.descriptor)
except exceptions.ValidationError as exception:
    for error in datapackage.exception.errors:
        logging.error(error)
    raise Exception('Invalid data package')

# Generate datasetId
datasetId = str(uuid.uuid3(uuid.NAMESPACE_DNS, dp.descriptor['name'] + '_ver.' + dp.descriptor.get('version', 'noversion')))

print(datasetId)

cf4891ae-fddc-3e2c-9dbc-a3aca5498c5f


In [24]:
# Define Namespaces
geo = Namespace("http://www.w3.org/2003/01/geo/wgs84_pos#")
opp = Namespace("http://schema.oceanproteinportal.org/v2/")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
schema = Namespace("http://schema.org/")
sosa = Namespace("http://www.w3.org/ns/sosa/")
ssn = Namespace("http://www.w3.org/ns/ssn/")
ssn_system = Namespace("http://www.w3.org/ns/ssn/systems/")
wikidata = Namespace("http://www.wikidata.org/entity/")


# Add namespaces to graph
g = Graph()
g.bind("geo", geo)
g.bind("opp", opp)
g.bind("rdf", RDF)
g.bind("rdfs", rdfs)
g.bind("schema", schema)
g.bind("sosa", sosa)
g.bind("ssn", ssn)
g.bind("ssn-system", ssn_system)
g.bind("wikidata", wikidata)
g.bind("xsd", XSD)

# Define URI space for OPP subjects
id_uri_prefix = "http://id.oceanproteinportal.org/"

dataset = URIRef(id_uri_prefix + "dataset/" + datasetId)

print(dataset)

g.add((dataset, RDF.type, schema.Dataset))

# Dataset Identifier
datasetIdentifier = BNode()
g.add((datasetIdentifier, RDF.type, opp.Identifier))
datasetIdScheme = BNode()
g.add((datasetIdScheme, RDF.type, opp.DatasetIdentfierScheme))
g.add((datasetIdentifier, opp.identifierScheme, datasetIdScheme))
g.add((datasetIdentifier, opp.identifierValue, Literal(datasetId, datatype=XSD.token)))
g.add((dataset, opp.identifier, datasetIdentifier))

g.add((dataset, schema.name, Literal(dp.descriptor['title'], datatype=XSD.string)))
g.add((dataset, schema.description, Literal(dp.descriptor['description'], datatype=XSD.string)))
g.add((dataset, schema.url, Literal(dp.descriptor['homepage'], datatype=XSD.anyURI)))
g.add((dataset, schema.version, Literal(dp.descriptor['version'], datatype=XSD.token)))
g.add((dataset, schema.alternateName, Literal(dp.descriptor['opp:shortName'], datatype=XSD.string)))

for keyword in dp.descriptor['keywords']:
  g.add((dataset, schema.keyword, Literal(keyword, datatype=XSD.token)))

http://id.oceanproteinportal.org/dataset/cf4891ae-fddc-3e2c-9dbc-a3aca5498c5f


In [38]:
for index, contributor in enumerate(dp.descriptor['contributors']):
  # Agent
  agent = BNode("agent-{idx:.2f}".format(idx=index))
  if 'uri' in contributor:
    agent = URIRef(contributor['uri'])
  # TODO: SHACL rule to infer proper Agent subclass

  g.add((agent, RDF.type, opp.Agent))
  g.add((agent, schema.name, Literal(contributor['title'], datatype=XSD.string)))
  if 'email' in contributor:
    g.add((agent, schema.email, Literal(contributor['email'], datatype=XSD.string)))
  if 'orcid' in contributor:
    orcid = BNode()
    g.add((orcid, RDF.type, opp.Identifier))
    g.add((orcid, opp.identifierScheme, wikidata.Q51044))
    g.add((orcid, opp.identifierValue, Literal(contributor['orcid'], datatype=XSD.token)))
    g.add((agent, opp.identifier, orcid))
  # Role
  if 'role' in contributor:
    role = opp.Role_Contributor
    if 'author' == contributor['role']:
      role = opp.Role_Author
    elif 'contact' == contributor['role']:
      role = opp.Role_Contact
    elif 'publisher' == contributor['role']:
      role = opp.Role_Publisher

  # AgentRole
  agent_role = BNode()
  g.add((agent_role, RDF.type, opp.AgentRole))
  g.add((agent_role, opp.isAgentRoleFor, dataset))
  g.add((agent_role, opp.performedBy, agent))
  g.add((agent_role, opp.inRole, role))
  # TODO: SHACL rule to infer opp.hasAgentRole

In [39]:
print(g.serialize(format='ttl').decode("utf-8"))

@prefix opp: <http://schema.oceanproteinportal.org/v2/> .
@prefix schema: <http://schema.org/> .
@prefix wikidata: <http://www.wikidata.org/entity/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<http://lod.bco-dmo.org/id/affiliation/191> a opp:Agent ;
    schema:email "info@bco-dmo.org"^^xsd:string ;
    schema:name "BCO-DMO"^^xsd:string .

<http://lod.bco-dmo.org/id/person/50985> a opp:Agent ;
    opp:identifier [ a opp:Identifier ;
            opp:identifierScheme wikidata:Q51044 ;
            opp:identifierValue "0000-0001-6040-9295"^^xsd:token ],
        [ a opp:Identifier ;
            opp:identifierScheme wikidata:Q51044 ;
            opp:identifierValue "0000-0001-6040-9295"^^xsd:token ],
        [ a opp:Identifier ;
            opp:identifierScheme wikidata:Q51044 ;
            opp:identifierValue "0000-0001-6040-9295"^^xsd:token ],
        [ a opp:Identifier ;
            opp:identifierScheme wikidata:Q51044 ;
            opp:identifierValue "0000-0001-6040-9295"^^xsd