In [123]:
import pandas as pds
from rdflib import Graph, RDFS, RDF, OWL, URIRef, Literal
from hashlib import md5

## Build dataframe containing environmental level data values

In [2]:
# Read the tab-separated biosample file into a DataFrame (path is relative to the notebook's working directory).
all_data_df = pds.read_csv("data/Biosample_all.tsv.gz", sep="\t")

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Names of the ecosystem-level columns; later cells take prefixes of this list
# (first column, first two columns, ...) to build label paths.
elevels = ['ECOSYSTEM', 'ECOSYSTEM_CATEGORY', 'ECOSYSTEM_TYPE', 'ECOSYSTEM_SUBTYPE', 'SPECIFIC_ECOSYSTEM']

# Take an explicit copy so that cleaning this frame later neither touches
# all_data_df nor triggers pandas' SettingWithCopyWarning.
elevelsdf = all_data_df[elevels].copy()

### clean data: 
* replace NaN with an empty string
* make all values lowercase 
* trim spaces

In [16]:
# Normalise every level value: missing -> "", then lowercase and trim whitespace.
# Chaining (instead of fillna(inplace=True) + applymap) keeps the cell free of
# in-place mutation of a column selection and avoids the deprecated applymap.
elevelsdf = (
    elevelsdf
    .fillna("")
    .apply(lambda col: col.str.lower().str.strip())
)
elevelsdf.head()

Unnamed: 0,ECOSYSTEM,ECOSYSTEM_CATEGORY,ECOSYSTEM_TYPE,ECOSYSTEM_SUBTYPE,SPECIFIC_ECOSYSTEM
0,host-associated,plants,phyllosphere,caulosphere,
1,engineered,food production,dairy products,,
2,engineered,food production,dairy products,,
3,engineered,food production,dairy products,,
4,engineered,food production,dairy products,,


In [7]:
# Scratch check: joining a one-element list yields that element with no separator.
", ".join(['ECOSYSTEM'])

'ECOSYSTEM'

In [83]:
# Scratch check: joining empty strings still emits the separators,
# which is why empty levels are filtered out before building a label path.
" > ".join(["", "", ""])

' >  > '

In [86]:
# Scratch check: filtering out empty strings from a list that contains only
# empty strings leaves an empty list. (list.remove("") would drop just the
# first match, so a comprehension is used to drop them all.)
parts = ["", "", ""]
parts = [part for part in parts if part != ""]
parts

[]

## Build dataframe of unique label paths and their checksums
#### For example: 'environmental > aquatic > freshwater > sediment'

In [11]:
# Scratch check: lowercasing then stripping removes case and surrounding spaces.
" FOO ".lower().strip()

'foo'

In [20]:
# Scratch check: build an MD5 object from the UTF-8 bytes of a string.
# NOTE: the name `hash` shadows the builtin hash() in the notebook namespace.
hash = md5("foo".encode('utf-8'))

In [21]:
# Hex string form of the MD5 object assigned to `hash` in the previous cell.
hash.hexdigest()

'acbd18db4cc2f85cedef654fccc4a4d8'

In [29]:
# Show each growing prefix of the level columns: first column, first two, and so on.
for depth in range(1, len(elevels) + 1):
    print(elevels[:depth])

['ECOSYSTEM']
['ECOSYSTEM', 'ECOSYSTEM_CATEGORY']
['ECOSYSTEM', 'ECOSYSTEM_CATEGORY', 'ECOSYSTEM_TYPE']
['ECOSYSTEM', 'ECOSYSTEM_CATEGORY', 'ECOSYSTEM_TYPE', 'ECOSYSTEM_SUBTYPE']
['ECOSYSTEM', 'ECOSYSTEM_CATEGORY', 'ECOSYSTEM_TYPE', 'ECOSYSTEM_SUBTYPE', 'SPECIFIC_ECOSYSTEM']


In [34]:
# Preview the distinct values of the first level column only.
# (The original loop broke out unconditionally after its first pass, leaving the
# statement after the break unreachable, so the loop is dropped.)
df = elevelsdf[elevels[0:1]]
print(df.drop_duplicates())

            ECOSYSTEM
0     host-associated
1          engineered
7       environmental
8786                 


In [106]:
## helper functions for creating label path, hash, and iri
def make_label_path(row, include_missing=False):
    """Join the values of a row into a ' > ' separated label path.

    Parameters
    ----------
    row : iterable of str
        The level values, e.g. ['host-associated', 'plants', 'endosphere', ''].
    include_missing : bool, default False
        When False, empty strings are dropped before joining
        (-> 'host-associated > plants > endosphere').
        When True, every value is joined as-is, so an empty value still
        contributes a separator (-> 'host-associated > plants > endosphere > ').

    Returns
    -------
    str
        The label path, or "" when every value in the row is an empty string
        (regardless of include_missing).
    """
    values = list(row)
    non_empty = [value for value in values if value != ""]

    # A row consisting solely of empty strings has no path at all.
    if not non_empty:
        return ""

    return " > ".join(values if include_missing else non_empty)

def make_parent_label_path(label_path):
    """Return the label path with its last ' > ' separated segment removed.

    A path with a single segment (or an empty path) has no parent, in which
    case "" is returned.
    """
    segments = label_path.split(" > ")
    if len(segments) <= 1:
        return ""
    return " > ".join(segments[:-1])
    
def make_hash(val):
    """Return the MD5 hex digest of val's UTF-8 encoding, or "" for an empty value."""
    if len(val) == 0:
        return ""
    digest = md5(val.encode('utf-8')).hexdigest()
    return str(digest)

def make_iri(val, prefix="http://purl.obolibrary.org/obo/GOLD_"):
    """Build an IRI by appending the hash of val to prefix.

    Returns "" when val is None or empty, so callers can test the result's
    length to decide whether an IRI exists.
    """
    if val is None or len(val) == 0:
        return ""
    digest = make_hash(val)
    return f"{prefix}{digest}"

In [111]:
# Build one row per unique label path, together with its parent path and the
# IRIs derived from both. Each pass uses one more level column than the last,
# so every intermediate node of the hierarchy gets its own row.
path_columns = ['label_path', 'parent_label_path', 'iri', 'parent_iri']

level_frames = []
for depth in range(1, len(elevels) + 1):
    level_df = elevelsdf[elevels[:depth]].fillna("").drop_duplicates()
    # label_path needs the whole row; the remaining columns derive from a single
    # column, so Series.map is enough (no row-wise apply needed).
    level_df['label_path'] = level_df.apply(make_label_path, axis=1)
    level_df['parent_label_path'] = level_df['label_path'].map(make_parent_label_path)
    level_df['iri'] = level_df['label_path'].map(make_iri)
    level_df['parent_iri'] = level_df['parent_label_path'].map(make_iri)
    level_frames.append(level_df[path_columns])

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0, and
# growing a frame inside a loop copies it on every pass.
if level_frames:
    ontdf = pds.concat(level_frames).drop_duplicates()
else:
    ontdf = pds.DataFrame(columns=path_columns)

In [113]:
## visually examine output
# pds.set_option('max_rows', None)
# pds.set_option('display.max_colwidth', 1000)
# print(ontdf[['label_path', 'parent_label_path']])
# print(len(ontdf))

### TODO: write code to check output
The visual output looks fine, but I would like to check it algorithmically.

## Use rdflib to build ontology from dataframe

In [125]:
# Turn every row of ontdf into an OWL class labelled with its label path and
# linked to its parent class (when it has one) via rdfs:subClassOf.
g = Graph()

# For a quick trial run, iterate over ontdf.head(100) instead.
for row in ontdf.itertuples(index=False):
    # Rows whose label path is empty carry an empty IRI; skip them entirely so
    # that no triple is ever emitted with an empty subject.
    if len(row.iri) == 0:
        continue

    cls = URIRef(row.iri)
    g.add((cls, RDF.type, OWL.Class))
    g.add((cls, RDFS.label, Literal(row.label_path)))

    # Top-level classes have an empty parent IRI and get no subClassOf triple.
    if len(row.parent_iri) > 0:
        g.add((cls, RDFS.subClassOf, URIRef(row.parent_iri)))


In [128]:
# Write the graph to disk in RDF/XML format.
# NOTE(review): assumes the output/ directory already exists — confirm before running on a fresh checkout.
g.serialize(destination='output/gold-dataset-translation.owl', format='xml')