In [2]:
import pandas as pds
import json
from rdflib import Graph, RDFS, RDF, OWL, URIRef, Literal
from hashlib import md5
from pandasql import sqldf
from uuid import uuid4

def pysqldf(q):
    """Run the SQL query string `q` through pandasql's `sqldf`.

    The notebook's global namespace is handed to `sqldf` as its environment
    (per pandasql, this is where table names in the query are looked up).
    Returns whatever `sqldf` returns.
    """
    namespace = globals()
    return sqldf(q, namespace)

In [4]:
# Load the "environmental_packages" worksheet of the local MIxS v5 workbook.
epdf = pds.read_excel(
    "data/mixs_v5_local.xlsx",
    sheet_name="environmental_packages",
)
# epdf.head()

## Prep/clean dataframe
* Trim spaces around column names
* Replace spaces in column names with underscore
* Make column names lower case
* Replace NA data values with ""

In [5]:
## prep column names: trim spaces -> lower case -> spaces become underscores
cleanded_columns = (
    epdf.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
)
epdf.columns = cleanded_columns  # later cells look columns up by these cleaned names

## replace NaN with the empty string (modifies epdf in place)
epdf.fillna("", inplace=True)

In [7]:
# epdf.columns

## Make dictionary mapping column names to IRIs

In [8]:
# Map every cleaned column name to an IRI: the OBO MIXS_ prefix followed by the name.
col_to_iri = {
    column_name: "http://purl.obolibrary.org/obo/MIXS_" + column_name
    for column_name in cleanded_columns
}
# col_to_iri

## Create column containing the IRI for each record; the IRI will be based on the MIXS ID column

In [9]:
## helper function
def make_mixs_record_iri(mixs_id, prefix="http://purl.obolibrary.org/obo/"):
    """Build the IRI of a MIxS record from its MIxS ID.

    Parameters
    ----------
    mixs_id
        The record's identifier. It is converted with ``str`` and stripped of
        surrounding whitespace. ``None`` and float NaN are treated as missing.
    prefix : str
        Namespace prepended to the generated local name.

    Returns
    -------
    str
        * ID contains ":"  -> ``prefix`` + ID with every ":" replaced by "_"
          (e.g. "MIXS:0000001" -> ".../MIXS_0000001").
        * any other ID     -> ``prefix + "MIXS_" + ID``.
        * missing/blank ID -> ``prefix + "MIXS_" +`` a random ``uuid4``; the
          result therefore differs on every call.
    """
    # None / NaN would otherwise be stringified to "None" / "nan", giving every
    # record without an ID the same (colliding) IRI instead of a unique one.
    is_missing = mixs_id is None or (isinstance(mixs_id, float) and mixs_id != mixs_id)
    local_id = "" if is_missing else str(mixs_id).strip()

    if not local_id:
        return prefix + "MIXS_" + str(uuid4())
    if ":" in local_id:
        return prefix + local_id.replace(":", "_")
    return prefix + "MIXS_" + local_id
    

In [12]:
## add record_iri column
# Only the mixs_id column is needed, so map the helper over that Series instead of
# running a row-wise DataFrame.apply(axis=1), which builds a Series for every row.
epdf['record_iri'] = epdf['mixs_id'].map(make_mixs_record_iri)
# epdf.head()

## Create graph of MIxS records
* add column headers as annotation properties
* add rows as instances of the class 'environmental package record'

In [23]:
# Start from an empty rdflib graph; the following cells add all triples to it.
g = Graph()

In [24]:
## declare every cleaned column header as an owl:AnnotationProperty
for column_name in cleanded_columns:
    property_iri = URIRef(col_to_iri[column_name])
    g.add((property_iri, RDF.type, OWL.AnnotationProperty))
    # rdfs:label is the column name with underscores turned back into spaces
    g.add((property_iri, RDFS.label, Literal(column_name.replace("_", " "))))
    

In [25]:
## create MIxS record class
# Keep the class in the same "http://purl.obolibrary.org/obo/" namespace as the
# column properties, the record IRIs and the ontology IRI used in this notebook.
record_class_iri = URIRef("http://purl.obolibrary.org/obo/MIXS_environmental_package_record")
g.add((record_class_iri, RDF.type, OWL.Class))
g.add((record_class_iri, RDFS.label, Literal("environmental package record")))

In [26]:
## add MIxS records to graph
# Each column's annotation-property IRI is the same for every row, so build it once.
annotation_properties = [
    (column_name, URIRef(col_to_iri[column_name])) for column_name in cleanded_columns
]

for _, record in epdf.iterrows():
    subject = URIRef(record["record_iri"])
    display_label = f"{record['package_item']} ({record['environmental_package']})"

    # type the record as an instance of the record class and give it a label
    g.add((subject, RDF.type, record_class_iri))
    g.add((subject, RDFS.label, Literal(display_label)))

    # One annotation per cleaned column. NaN cells were replaced by "" earlier,
    # so blank cells are written as empty-string literals as well.
    for column_name, property_iri in annotation_properties:
        cell_text = str(record[column_name]).strip()
        g.add((subject, property_iri, Literal(cell_text)))

In [27]:
## declare the ontology resource itself (typed as owl:Ontology)
ontology_iri = URIRef(
    "http://purl.obolibrary.org/obo/environmental-package-record-translation.owl"
)
g.add((ontology_iri, RDF.type, OWL.Ontology))

In [28]:
## write the graph to disk as RDF/XML (note: other formats, e.g. turtle, are possible)
g.serialize(
    destination="output/environmental-package-record-translation.owl",
    format="xml",
)