In [15]:
import pandas as pds
import json
from rdflib import Graph, RDFS, RDF, OWL, URIRef, Literal
from hashlib import md5
from pandasql import sqldf

def pysqldf(q):
    """Run the query ``q`` through ``sqldf``, handing it this module's global namespace."""
    namespace = globals()
    return sqldf(q, namespace)

## Build dataframe containing environmental level data values

In [2]:
## Load the biosample table from the tab-separated, gzip-named export under data/.
## NOTE(review): the recorded output of this cell looks like the tail of a pandas warning
## (possibly a mixed-dtype DtypeWarning) -- confirm, and consider passing explicit dtypes.
all_data_df = pds.read_csv("data/Biosample_all.tsv.gz", sep="\t")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
## Names of the ecosystem classification columns, in hierarchy order
## (label paths are later built from prefixes of this list).
elevels = ['ECOSYSTEM', 'ECOSYSTEM_CATEGORY', 'ECOSYSTEM_TYPE', 'ECOSYSTEM_SUBTYPE', 'SPECIFIC_ECOSYSTEM']

## Take an explicit copy of the level columns: this frame is cleaned afterwards, and
## working on an independent copy keeps that cleaning from being flagged by pandas as
## "a value is trying to be set on a copy of a slice" of all_data_df.
elevelsdf = all_data_df[elevels].copy()

### Clean data:
* replace NaN with an empty string
* make all values lowercase
* trim spaces

In [4]:
## Normalise the level values: missing -> "", then lowercase and strip surrounding spaces.
## fillna is chained instead of called with inplace=True: mutating a frame that was
## selected out of another DataFrame in place is what raises SettingWithCopyWarning.
elevelsdf = elevelsdf.fillna("").applymap(lambda x: x.lower().strip())
elevelsdf.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


Unnamed: 0,ECOSYSTEM,ECOSYSTEM_CATEGORY,ECOSYSTEM_TYPE,ECOSYSTEM_SUBTYPE,SPECIFIC_ECOSYSTEM
0,host-associated,plants,phyllosphere,caulosphere,
1,engineered,food production,dairy products,,
2,engineered,food production,dairy products,,
3,engineered,food production,dairy products,,
4,engineered,food production,dairy products,,


## Build dataframe of unique label paths and their checksums
#### For example: 'environmental > aquatic > freshwater > sediment'

In [16]:
## helper functions for creating label path, hash, and iri
def make_label_path(row, include_missing=False):
    """
    Join the values of ``row`` into one " > "-separated label path.

    Returns "" when every value in ``row`` is the empty string. Otherwise:
    * include_missing=False (default): empty strings are dropped before joining, e.g.
      ['host-associated', 'plants', 'endosphere', ''] -> 'host-associated > plants > endosphere'
    * include_missing=True: every value is kept, so the same input gives
      'host-associated > plants > endosphere > '
    """
    separator = " > "
    values = list(row)
    present = [value for value in values if value != ""]

    ## nothing but empty strings: there is no path to build
    if not present:
        return ""

    chosen = values if include_missing else present
    return separator.join(chosen)

    
def make_parent_label_path(label_path):
    """
    Return ``label_path`` without its last " > "-separated segment.

    A path with a single segment (including the empty string) has no parent, so "" is returned.
    """
    separator = " > "
    segments = label_path.split(separator)
    if len(segments) <= 1:
        return ""
    return separator.join(segments[:-1])

    
def make_hash(val):
    """Return the MD5 hex digest of ``val`` encoded as UTF-8, or "" when ``val`` is empty."""
    if len(val) == 0:
        return ""
    digest = md5(val.encode('utf-8')).hexdigest()
    return str(digest)

    
def make_iri(val, prefix="http://purl.obolibrary.org/obo/GOLD_"):
    """
    Build an IRI for ``val`` by appending its hash (see make_hash) to ``prefix``.

    Returns "" when ``val`` is None or empty, so callers can test the result with len().
    """
    ## identity check for None (rather than `None != val`), and no local named `hash`
    ## that would shadow the builtin
    if val is None or len(val) == 0:
        return ""
    return f"{prefix}{make_hash(val)}"
    

def make_annotation_dict(value_list, prefix="http://purl.obolibrary.org/obo/GOLD_"):
    """
    Map each value of ``value_list`` -- stringified, lowercased and stripped -- to a dict
    with keys 'iri' (built by make_iri using ``prefix``) and 'label' (the normalised value).
    """
    labels = (str(raw).lower().strip() for raw in value_list)
    return {
        label: {'iri': make_iri(str(label), prefix=prefix), 'label': label}
        for label in labels
    }


def make_json_annotation(row, annotation_dict):
    """
    Serialise the annotated values of ``row`` as a json object of the form {iri: value}.

    Each key of ``row.to_dict()`` is stringified, lowercased and stripped; an entry is kept
    only when its value is not "" and the normalised key is present in ``annotation_dict``,
    whose 'iri' entry for that key becomes the json key.
    """
    known_columns = annotation_dict.keys()
    normalised_items = (
        (str(column).lower().strip(), value)
        for column, value in row.to_dict().items()
    )
    iri_to_value = {
        annotation_dict[column]['iri']: value
        for column, value in normalised_items
        if value != "" and column in known_columns
    }
    return json.dumps(iri_to_value)

In [17]:
ont_columns = ['label_path', 'parent_label_path', 'iri', 'parent_iri', 'annotation']
annotation_dict = make_annotation_dict(elevels)

## Build one frame of unique label paths per hierarchy depth (first level only, then the
## first two levels, ...) and combine them with a single concat at the end.
## This replaces growing ontdf with DataFrame.append inside the loop: append is deprecated
## (and removed in pandas 2.0) and re-copies the accumulated frame on every iteration.
level_frames = []
for i in range(len(elevels)):
    df = elevelsdf[elevels[0:i + 1]].fillna("").drop_duplicates()
    df['label_path'] = df.apply(make_label_path, axis=1)
    df['parent_label_path'] = df['label_path'].apply(make_parent_label_path)
    df['iri'] = df['label_path'].apply(make_iri)
    df['parent_iri'] = df['parent_label_path'].apply(make_iri)
    ## the annotation only picks up columns whose normalised name is in annotation_dict
    df['annotation'] = df.apply(lambda row: make_json_annotation(row, annotation_dict), axis=1)
    level_frames.append(df[ont_columns])

## drop_duplicates keeps the first occurrence of each row, so de-duplicating once after the
## concat yields the same rows, order and index labels as de-duplicating after every append
if level_frames:
    ontdf = pds.concat(level_frames).drop_duplicates()
else:
    ontdf = pds.DataFrame(columns=ont_columns)

In [18]:
## visually examine output
# pds.set_option('max_rows', None)
# pds.set_option('display.max_colwidth', 1000)
# print(ontdf[['label_path', 'parent_label_path']])
# print(len(ontdf))

In [19]:
## check that the number of unique iris matches the number of rows;
## the counts are still printed, and the invariant is now enforced instead of only eyeballed
n_rows = len(ontdf)
n_unique_iris = len(ontdf.iri.unique())
print("Nuumber of rows:", n_rows)
print("Number of iris: ", n_unique_iris)
assert n_rows == n_unique_iris, "every row of ontdf is expected to have its own iri"

Nuumber of rows: 833
Number of iris:  833


In [20]:
## compare unique vs. total parent iris: one parent has many children (one-to-many),
## so the unique count is expected to come out below the total
parent_iris = ontdf.parent_iri
print("Number of unique parent iris: ", len(parent_iris.unique()))
print("Number of parent iris: ", len(parent_iris))

Number of unique parent iris:  400
Number of parent iris:  833


In [21]:
## the number of non-empty label paths should match the number of non-empty iris
## (for both the paths themselves and their parents); these counts are one lower than the
## totals because one row consists only of empty strings (displayed by a later cell)
nonempty_iris = ontdf[ontdf.iri != ''].iri.unique()
nonempty_label_paths = ontdf[ontdf.label_path != ''].label_path.unique()
print("Number of non-empty iris: ", len(nonempty_iris))
print("Number of non-empty label paths: ", len(nonempty_label_paths))

nonempty_parent_iris = ontdf[ontdf.parent_iri != ''].parent_iri.unique()
nonempty_parent_label_paths = ontdf[ontdf.parent_label_path != ''].parent_label_path.unique()
print("Number of non-empty parent iris: ", len(nonempty_parent_iris))
print("Number of non-empty parent label paths: ", len(nonempty_parent_label_paths))

Number of non-empty iris:  832
Number of non-empty label paths:  832
Number of non-empty parent iris:  399
Number of non-empty parent label paths:  399


In [22]:
## only one record (the all-empty row) should have the same value for iri and parent iri
self_parented = ontdf[ontdf.iri == ontdf.parent_iri]
print("Number of matching iri/parent iri: ", len(self_parented))
self_parented

Number of matching iri/parent iri:  1


Unnamed: 0,label_path,parent_label_path,iri,parent_iri,annotation
8786,,,,,{}


In [23]:
## list the records that have no parent iri
ontdf[ontdf['parent_iri'] == '']

Unnamed: 0,label_path,parent_label_path,iri,parent_iri,annotation
0,host-associated,,http://purl.obolibrary.org/obo/GOLD_daf7fb2e82...,,"{""http://purl.obolibrary.org/obo/GOLD_a5f3791a..."
1,engineered,,http://purl.obolibrary.org/obo/GOLD_fee6b5a458...,,"{""http://purl.obolibrary.org/obo/GOLD_a5f3791a..."
7,environmental,,http://purl.obolibrary.org/obo/GOLD_3bbfdace1e...,,"{""http://purl.obolibrary.org/obo/GOLD_a5f3791a..."
8786,,,,,{}


## Use rdflib to build ontology from dataframe

In [24]:
g = Graph() # new rdflib graph; the cells below add the ontology triples to it

### Create annotation properties for each of the columns (saved in the annotation dictionary)

In [25]:
## declare one owl:AnnotationProperty, with its rdfs:label, per entry of annotation_dict
for prop in annotation_dict.values():
    ## note: each entry is a dict with keys 'iri' and 'label'
    prop_ref = URIRef(prop['iri'])
    g.add((prop_ref, RDF.type, OWL.AnnotationProperty))
    g.add((prop_ref, RDFS.label, Literal(prop['label'])))

### Add each row from the ontology dataframe to graph

In [26]:
## Emit one owl:Class per ontology row, with its label, annotation values and parent link.
## Fields are read by name so the loop does not depend on the column order of ontdf.
for row in ontdf.itertuples(index=False):
    ## a row without an iri has nothing to declare; skipping it entirely also guarantees
    ## that no subClassOf triple is ever created with an empty-IRI subject
    if len(row.iri) == 0:
        continue

    class_ref = URIRef(row.iri)
    g.add((class_ref, RDF.type, OWL.Class))
    g.add((class_ref, RDFS.label, Literal(row.label_path)))

    ## add iri annotations to graph (note: annotation is a json string of form iri:value)
    for prop_iri, value in json.loads(row.annotation).items():
        g.add((class_ref, URIRef(prop_iri), Literal(value)))

    ## rows without a parent iri get no subClassOf link
    if len(row.parent_iri) > 0:
        g.add((class_ref, RDFS.subClassOf, URIRef(row.parent_iri)))

In [27]:
## save the graph as RDF/XML (note: other formats, e.g. turtle, are possible)
g.serialize(destination='output/gold-dataset-translation.owl', format='xml')