In [66]:
import json
from hashlib import md5
from pathlib import Path
from uuid import uuid4

import pandas as pds
from pandasql import sqldf
from rdflib import Graph, RDFS, RDF, OWL, URIRef, Literal

def pysqldf(q):
    """Run the SQL string `q` via pandasql's `sqldf`, passing this module's globals as the environment."""
    namespace = globals()
    return sqldf(q, namespace)

In [67]:
# load the "MIxS" worksheet of the local MIxS v5 workbook
mixsdf = pds.read_excel(
    "data/mixs_v5_local.xlsx",
    sheet_name="MIxS",
)
# mixsdf.head()

## Prep/clean dataframe
* Trim spaces around column names
* Replace spaces in column names with underscore
* Make column names lower case
* Replace NA data values with ""

In [68]:
## prep column names: trim whitespace, lower-case, then turn spaces into underscores
cleanded_columns = (
    mixsdf.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
)
mixsdf.columns = cleanded_columns

## replace NaN with empty strings (the frame itself is modified in place)
mixsdf.fillna(value="", inplace=True)

In [69]:
# mixsdf.columns

## Make dictionary mapping column names to IRIs

In [78]:
# map each cleaned column name to an IRI in the OBO "MIXS_" namespace
col_to_iri = {
    column: "http://purl.obolibrary.org/obo/MIXS_" + column
    for column in cleanded_columns
}
# col_to_iri

## Create column containing the iri for each record; the iri will be based on the MIXS ID column

In [79]:
## helper function
def make_mixs_record_iri(mixs_id, prefix="http://purl.obolibrary.org/obo/"):
    """Build the IRI that identifies one MIxS record.

    Args:
        mixs_id: Identifier of the record. It is converted with ``str()`` and
            stripped of surrounding whitespace. ``None``, float NaN and
            blank strings are all treated as "no identifier".
        prefix: Namespace that the local identifier is appended to.

    Returns:
        str: ``prefix`` followed by
            * the identifier with every ":" replaced by "_" when it contains a ":",
            * ``"MIXS_" + identifier`` when it does not,
            * ``"MIXS_" + <random uuid4>`` when no identifier is available
              (so the result differs on every call in that case).
    """
    # None / NaN would otherwise be stringified to "None" / "nan", giving every
    # such record the same IRI; treat them like an empty identifier instead.
    is_missing = mixs_id is None or (isinstance(mixs_id, float) and mixs_id != mixs_id)
    local_id = "" if is_missing else str(mixs_id).strip()

    if not local_id:
        return prefix + "MIXS_" + str(uuid4())
    if ":" in local_id:
        # CURIE-style id: the part before ":" already carries the namespace
        return prefix + local_id.replace(":", "_")
    return prefix + "MIXS_" + local_id
    

In [80]:
## add record_iri column
# one IRI per row, derived from that row's mixs_id value
mixsdf['record_iri'] = [
    make_mixs_record_iri(row.mixs_id) for _, row in mixsdf.iterrows()
]
# mixsdf.head(10) # note: check the 'altitude' item use uuid() to create iri

## Create graph of MIxS records
* add column headers as annotation properties
* add rows as instances of class 'MIxS record'

In [86]:
# RDF graph that the following cells fill via g.add(...) and that is serialized at the end
g = Graph()

In [87]:
## add column headers as annotations
for c in cleanded_columns:
    iri = URIRef(col_to_iri[c])
    label = c.replace("_", " ")  # readable label: underscores back to spaces
    # declare the column as an OWL annotation property and attach its label
    for triple in (
        (iri, RDF.type, OWL.AnnotationProperty),
        (iri, RDFS.label, Literal(label)),
    ):
        g.add(triple)
    

In [88]:
## create MIxS record class
# The class IRI uses the same ".../obo/MIXS_" namespace as the column and record
# IRIs built elsewhere in this notebook (the "/obo/" path segment was missing).
record_class_iri = URIRef("http://purl.obolibrary.org/obo/MIXS_mixs_record")
g.add((record_class_iri, RDF.type, OWL.Class))  # declare it as an OWL class
g.add((record_class_iri, RDFS.label, Literal("MIxS record")))  # human-readable label

In [89]:
## add MIxS records to graph
# pair every column with its annotation-property IRI once, before looping over rows
annotation_props = [(c, URIRef(col_to_iri[c])) for c in cleanded_columns]

for ix, row in mixsdf.iterrows():
    record_iri = URIRef(row.record_iri)

    # type the record as a 'MIxS record' and label it with its 'item' value
    g.add((record_iri, RDF.type, record_class_iri))
    g.add((record_iri, RDFS.label, Literal(row['item'])))

    # one annotation per column: the stripped string form of the cell value
    for c, annotation_iri in annotation_props:
        g.add((record_iri, annotation_iri, Literal(str(row[c]).strip())))

In [90]:
## declare the ontology itself in the graph
ontology_iri = URIRef(
    "http://purl.obolibrary.org/obo/MIxS-record-translation.owl"
)
ontology_triple = (ontology_iri, RDF.type, OWL.Ontology)
g.add(ontology_triple)

In [91]:
## save graph (note: different formats (e.g., turtle) are possible)
output_path = 'output/MIxS-record-translation.owl'
# make sure the target folder exists before writing; a fresh checkout may not have it
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
g.serialize(destination=output_path, format='xml')