In [1]:
import pandas as pds
import json
from rdflib import Graph, RDFS, RDF, OWL, URIRef, Literal
from hashlib import md5
from pandasql import sqldf

def pysqldf(q):
    """Run the SQL query string ``q`` through pandasql's ``sqldf``.

    The notebook's module-level namespace (``globals()``) is handed to
    ``sqldf`` as its environment -- presumably so that DataFrames defined in
    other cells can be referenced by name inside the query.
    """
    namespace = globals()
    return sqldf(q, namespace)

### Load data from classification paths file `GOLD_Ecosystem_Classification_Paths_10152019.xlsx`

In [2]:
# Read the GOLD ecosystem classification paths spreadsheet into a DataFrame.
# The path is relative to the working directory the notebook is run from.
pathsdf = pds.read_excel("data/GOLD_Ecosystem_Classification_Paths_10152019.xlsx")

## Normalize column names to lower case and trim/strip spaces

In [3]:
# Lower-case and strip every column label of the loaded frame.
# NOTE(review): assumes every column label is a string (non-string labels have no .lower()).
clean_columns = [c.lower().strip() for c in pathsdf.columns]
# clean_columns  # uncomment to inspect the normalized names

In [4]:
# Replace the frame's column labels with the normalized (lower-cased, stripped) names.
pathsdf.columns = clean_columns

### Clean data: 
* replace NaN and `(null)` values with an empty string
* make all values lowercase 
* trim spaces

In [5]:
# Missing cells become empty strings (a new frame is bound rather than
# mutating the loaded one in place).
pathsdf = pathsdf.fillna("")
# Normalize the text first (lower-case, trim surrounding whitespace) ...
pathsdf = pathsdf.applymap(lambda x: x.lower().strip())
# ... and only then blank out the "(null)" placeholder. Comparing after
# normalization also catches variants such as " (NULL) ", which would
# otherwise survive as a literal "(null)" label.
pathsdf = pathsdf.applymap(lambda x: "" if x == "(null)" else x)
pathsdf.head()

Unnamed: 0,ecosystem,ecosystem_category,ecosystem_type,ecosystem_subtype,specific_ecosystem
0,engineered,bioreactor,aerobic,unclassified,unclassified
1,engineered,bioreactor,anaerobic,unclassified,unclassified
2,engineered,bioreactor,continuous culture,marine intertidal flat sediment inoculum,wadden sea-germany
3,engineered,bioreactor,continuous culture,marine sediment inoculum,wadden sea-germany
4,engineered,bioreactor,continuous culture,unclassified,unclassified


## Build dataframe of unique label paths and their checksums
#### For example: 'environmental > aquatic > freshwater > sediment'

In [6]:
## helper functions for creating label path, hash, and iri
def make_label_path(row, include_missing=False):
    """Join the values of ``row`` into a ' > ' separated label path.

    * If every value is an empty string (or ``row`` is empty) the result is "".
    * With ``include_missing=False`` (default) empty strings are dropped, e.g.
      ['host-associated', 'plants', 'endosphere', ''] gives
      'host-associated > plants > endosphere'.
    * With ``include_missing=True`` all values are joined as they are, so an
      empty value still contributes its ' > ' separator, e.g.
      'host-associated > plants > endosphere > '.
    """
    levels = list(row)
    present = [level for level in levels if level != ""]
    if not present:
        return ""
    chosen = levels if include_missing else present
    return " > ".join(chosen)


def make_parent_label_path(label_path):
    """Return ``label_path`` without its last ' > ' separated segment.

    A path with a single segment (or an empty string) has no parent, so ""
    is returned for it.
    """
    segments = label_path.split(" > ")
    if len(segments) <= 1:
        return ""
    return " > ".join(segments[:-1])


def make_row_list(row, subset_list=[]):
    """Return the values of ``row`` as a list.

    When ``subset_list`` is non-empty, only the values whose key (as given by
    ``row.to_dict()``) appears in ``subset_list`` are kept, in row order.
    Otherwise every value of the row is returned.
    """
    if len(subset_list) == 0:
        return list(row)

    selected = []
    for key, value in row.to_dict().items():
        if key in subset_list:
            selected.append(value)
    return selected


def make_row_hash(row, subset_list=[]):
    """Hash the values of ``row`` (optionally restricted to ``subset_list``).

    Null values are treated as empty strings, the remaining values are
    concatenated without a separator, and the result is passed to
    ``make_hash``.

    NOTE(review): because no separator is used, ['ab', 'c'] and ['a', 'bc']
    produce the same hash.
    """
    values = make_row_list(row, subset_list)
    # null entries contribute nothing to the concatenated string
    joined = "".join("" if pds.isnull(value) else value for value in values)
    return make_hash(joined)


def make_parent_hash(row, subset_list=[]):
    """Hash the parent path of ``row`` (optionally restricted to ``subset_list``).

    The parent path consists of every non-empty level except the last one.
    Null values are treated as empty strings, and empty levels are discarded
    *before* the last level is removed. A row whose trailing level is empty
    (e.g. ['a', 'b', '']) therefore gets the hash of 'a' as its parent, in
    agreement with ``make_parent_label_path``. Dropping the last column
    instead would give such a row a parent hash equal to its own hash.

    Returns "" (via ``make_hash``) when the row has fewer than two non-empty
    levels, i.e. when there is no parent.
    """
    row_list = make_row_list(row, subset_list)

    levels = ["" if pds.isnull(e) else e for e in row_list]  # replace NaNs with ''
    levels = [level for level in levels if level != ""]      # ignore missing levels
    parent_levels = levels[:-1]  # the parent is everything above the last real level
    return make_hash("".join(parent_levels))


def make_hash(val):
    """Return the MD5 hex digest of ``val`` encoded as UTF-8.

    An empty ``val`` yields "" instead of the digest of the empty string.
    """
    if len(val) == 0:
        return ""
    digest = md5(val.encode('utf-8')).hexdigest()
    return str(digest)


def make_iri(val, prefix="http://purl.obolibrary.org/obo/GOLD_"):
    """Build an IRI by appending ``val`` to ``prefix``.

    Returns "" when ``val`` is None or empty, so callers can test the result
    for emptiness before using it.
    """
    if val is None or len(val) == 0:
        return ""
    return f"{prefix}{val}"


def make_annotation_dict(value_list, prefix="http://purl.obolibrary.org/obo/GOLD_"):
    """Map each value of ``value_list`` to its annotation IRI and label.

    Every value is converted to a string, lower-cased and stripped; that
    normalized text is both the dictionary key and the 'label'. The 'iri' is
    produced by ``make_iri`` from the normalized text and ``prefix``.
    """
    labels = (str(raw).lower().strip() for raw in value_list)
    return {
        label: {'iri': make_iri(str(label), prefix=prefix), 'label': label}
        for label in labels
    }


def make_json_annotation(row, annotation_dict):
    """Serialize the annotated values of ``row`` as a JSON object string.

    For each key/value of ``row.to_dict()`` the key is converted to a string,
    lower-cased and stripped. If the value is not "" and the normalized key is
    present in ``annotation_dict``, the value is stored under that entry's
    'iri'. The resulting {iri: value} mapping is returned via ``json.dumps``.
    """
    annotations = {}
    for column, value in row.to_dict().items():
        key = str(column).lower().strip()
        if value == "" or key not in annotation_dict:
            continue  # empty value or a column that is not an annotation
        annotations[annotation_dict[key]['iri']] = value
    return json.dumps(annotations)

In [7]:
# Columns kept for every ontology term.
ont_columns = ['label_path', 'parent_label_path', 'iri', 'parent_iri', 'annotation']
cols = pathsdf.columns
path_columns = list(cols)  # hashing is restricted to the path columns of this frame
annotation_dict = make_annotation_dict(cols)

# Build one frame per path depth (first column, first two columns, ...) and
# concatenate them once at the end. DataFrame.append was removed in pandas 2.0,
# and growing a frame inside a loop copies it on every iteration.
frames = []
for i in range(len(cols)):
    # unique paths truncated to the first i + 1 levels
    df = pathsdf[cols[0:i + 1]].fillna("").drop_duplicates()
    df['label_path'] = df.apply(make_label_path, axis=1)
    df['row_hash'] = df.apply(lambda row: make_row_hash(row, subset_list=path_columns), axis=1)

    df['parent_label_path'] = df.apply(lambda row: make_parent_label_path(row['label_path']), axis=1)
    df['parent_hash'] = df.apply(lambda row: make_parent_hash(row, subset_list=path_columns), axis=1)

    df['iri'] = df.apply(lambda row: make_iri(row['row_hash']), axis=1)
    df['parent_iri'] = df.apply(lambda row: make_iri(row['parent_hash']), axis=1)

    df['annotation'] = df.apply(lambda row: make_json_annotation(row, annotation_dict), axis=1)
    frames.append(df[ont_columns])

if frames:
    # original row indices are kept; identical terms produced at different
    # depths collapse to their first occurrence
    ontdf = pds.concat(frames).drop_duplicates()
else:
    ontdf = pds.DataFrame(columns=ont_columns)

In [8]:
## visually examine output
# pds.set_option('max_rows', None)
# pds.set_option('display.max_colwidth', 1000)
# print(ontdf[['label_path', 'parent_label_path']])
# print(len(ontdf))

## Use rdflib to build ontology from dataframe

In [9]:
g = Graph() # empty rdflib graph; the cells below add the ontology triples to it

### Create annotation properties for each of the columns (saved in the annotation dictionary)

In [10]:
## declare one owl:AnnotationProperty per entry of the annotation dictionary
for prop in annotation_dict.values():
    ## note: each entry is a dict with keys 'iri' and 'label'
    prop_ref = URIRef(prop['iri'])
    g.add((prop_ref, RDF.type, OWL.AnnotationProperty))
    g.add((prop_ref, RDFS.label, Literal(prop['label'])))

### Add each row from the ontology dataframe to graph

In [11]:
## one owl:Class per row of the ontology dataframe
## (tip: iterate over ontdf.head(100).itertuples() for a quick trial run)
for (ix, label_path, parent_label_path, iri, parent_iri, annotation) in ontdf.itertuples():
    if len(iri) == 0:
        ## without an IRI there is nothing to declare, and a subClassOf triple
        ## would get an empty subject, so skip the row entirely
        continue

    class_ref = URIRef(iri)

    ## add iri to graph
    g.add((class_ref, RDF.type, OWL.Class))
    g.add((class_ref, RDFS.label, Literal(label_path)))

    ## add iri annotations to graph (note: annotation is a json string of form iri:value)
    for k, v in json.loads(annotation).items():
        g.add((class_ref, URIRef(k), Literal(v)))

    ## link to the parent class; a parent identical to the class itself carries
    ## no information and is skipped
    if len(parent_iri) > 0 and parent_iri != iri:
        g.add((class_ref, RDFS.subClassOf, URIRef(parent_iri)))

In [12]:
## save graph as RDF/XML (note: different formats (e.g., turtle) are possible)
## NOTE(review): presumably the 'output/' directory must already exist -- confirm
g.serialize(destination='output/gold-classification-paths-translation.owl', format='xml')