In [1]:
import pandas as pds
import json
from rdflib import Graph, RDFS, RDF, OWL, URIRef, Literal
from hashlib import md5
from pandasql import sqldf

def pysqldf(q):
    return sqldf(q, globals())

## Build dataframe containing enviromental level data values

In [2]:
all_data_df = pds.read_csv("data/Biosample_all.tsv.gz", sep="\t")

  interactivity=interactivity, compiler=compiler, result=result)


## Normalize column names to lower case and trim/strip spaces

In [3]:
clean_columns = [c.lower().strip() for c in all_data_df.columns]
# clean_columns

In [4]:
all_data_df.columns = clean_columns

## Create subset that contains just the GOLD elevels data

In [5]:
gold_elevels = ['ecosystem', 'ecosystem_category', 'ecosystem_type', 'ecosystem_subtype', 'specific_ecosystem']
gold_elevelsdf = all_data_df[gold_elevels]

### Clean data: 
* replace with nan with empyt string
* make all values lowercase 
* trim spaces

In [6]:
gold_elevelsdf.fillna("", inplace=True)
gold_elevelsdf = gold_elevelsdf.applymap(lambda x: x.lower().strip())    
gold_elevelsdf.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


Unnamed: 0,ecosystem,ecosystem_category,ecosystem_type,ecosystem_subtype,specific_ecosystem
0,host-associated,plants,phyllosphere,caulosphere,
1,engineered,food production,dairy products,,
2,engineered,food production,dairy products,,
3,engineered,food production,dairy products,,
4,engineered,food production,dairy products,,


## Build dataframe of unique label paths and their checksums
#### For example: 'enviromental > aquatic > freshwater > sediment'

In [7]:
## helper functions for creating label path, hash, and iri
def make_label_path(row, include_missing=False):
    path_list = list(row)
    temp = [e for e in path_list if e != ""] # remove all empty strings
    if len(temp) > 0:
        if include_missing:
            ## if the value in the list an empty string (e.g., ['host-associated', 'plants', 'endosphere', ''])
            ## this will put an " > " it (e.g., host-associated > plants > endosphere >)
            return " > ".join(path_list)
        else:
            ## this only retuns a path devoid of empty string
            ## e.g, ['host-associated', 'plants', 'endosphere', ''] returns host-associated > plants > endosphere
            return " > ".join(temp)
    else:
        return ""


def make_parent_label_path(label_path):
    path_list = label_path.split(" > ")
    if len(path_list) > 1:
        temp = path_list[0:-1]
        return " > ".join(temp)
    else:
        return ""


def make_row_list(row, subset_list=[]):
    if len(subset_list) > 0:  # only get values in subset list
        row_list = [v for k,v in row.to_dict().items() 
                    if k in subset_list]
    else:
        row_list = list(row)
    return row_list


def make_row_hash(row, subset_list=[]):
    row_list = make_row_list(row, subset_list)
    
    temp = ["" if pds.isnull(e) else e for e in row_list] # replace NaNs with ''
    temp = "".join(temp)
#     print("row: ", temp)
    return make_hash(temp)


def make_parent_hash(row, subset_list=[]):
    row_list = make_row_list(row, subset_list)
    row_list = row_list[0:-1] # to get the parent, exclude last value in list

    temp = ["" if pds.isnull(e) else e for e in row_list] # replace NaNs with ''
    temp = "".join(temp)
#     print("parent: ", temp)
    return make_hash(temp)


def make_hash(val):
    if len(val) > 0:
        hash = md5(val.encode('utf-8'))
        return str(hash.hexdigest())
    else:
        return ""


def make_iri(val, prefix="http://purl.obolibrary.org/obo/GOLD_"):
    if None != val and len(val) > 0:
#         hash = make_hash(val)
        return f"{prefix}{val}"
    else:
        return ""


def make_annotation_dict(value_list, prefix="http://purl.obolibrary.org/obo/GOLD_"):
    annotation_dict = {}
    for val in value_list:
        val = str(val).lower().strip()
        annotation_dict[val] = \
            {'iri': make_iri(str(val), prefix=prefix), 'label': val}
    return annotation_dict


def make_json_annotation(row, annotation_dict):
    annotation_value_dict = {}
    annotation_dict_keys = annotation_dict.keys()
    for k,v in row.to_dict().items():
        k = str(k).lower().strip()
        if v != "" and (k in annotation_dict_keys):
            annotation_value_dict[annotation_dict[k]['iri']] = v
    return json.dumps(annotation_value_dict)

In [8]:
ontdf = pds.DataFrame(columns=['label_path', 'parent_label_path', 'iri', 'parent_iri', 'annotation'])
annotation_dict = make_annotation_dict(gold_elevels)

#gold_elevelsdf = gold_elevelsdf.head() # used for testing

for i in range(len(gold_elevels)):
    df = gold_elevelsdf[gold_elevels[0:i + 1]].fillna("").drop_duplicates()
    df['label_path'] = df.apply(lambda row: make_label_path(row), axis=1)
    df['row_hash'] = df.apply(lambda row: make_row_hash(row, subset_list=gold_elevels), axis=1)
    
    df['parent_label_path'] = df.apply(lambda row: make_parent_label_path(row['label_path']), axis=1)
    df['parent_hash'] = df.apply(lambda row: make_parent_hash(row, subset_list=gold_elevels), axis=1)
    
    df['iri'] = df.apply(lambda row: make_iri(row['row_hash']), axis=1)
    df['parent_iri'] = df.apply(lambda row: make_iri(row['parent_hash']), axis=1)
    
    df['annotation'] = df.apply(lambda row: make_json_annotation(row, annotation_dict), axis=1)
    ontdf = ontdf.append(df[['label_path', 'parent_label_path', 'iri', 'parent_iri', 'annotation']]).drop_duplicates()

In [9]:
## visually examine output
# pds.set_option('max_rows', None)
# pds.set_option('display.max_colwidth', 1000)
# print(ontdf[['label_path', 'parent_label_path']].head())
# print(ontdf[['iri', 'parent_iri']].head())
# print(ontdf[['label_path', 'parent_label_path', 'iri', 'parent_iri']].head())
# print(ontdf[['annotation']].head())
# print(len(ontdf))
# ontdf[['label_path', 'parent_label_path', 'iri', 'parent_iri']].head()

In [10]:
## check that number of unique iris matches the number of rows
print("Nuumber of rows:", len(ontdf))
print("Number of iris: ", len(ontdf.iri.unique()))

Nuumber of rows: 1073
Number of iris:  833


In [11]:
## check the number of unique vs non-unique parent iris
## there should be less unique ones; there is a one-to-many relation between parent iris and iris(i.e., parents have many children)
print("Number of unique parent iris: ", len(ontdf.parent_iri.unique()))
print("Number of parent iris: ", len(ontdf.parent_iri))

Number of unique parent iris:  480
Number of parent iris:  1073


In [12]:
## check that the number of non-empty lable paths matches the number of non-empty iris
## there should be one less (see code below which returns one row of empty strings)
print("Number of non-empty iris: ", len(ontdf[ontdf.iri != ''].iri.unique()))
print("Number of non-empty label paths: ", len(ontdf[ontdf.label_path != ''].label_path.unique()))

print("Number of non-empty parent iris: ", len(ontdf[ontdf.parent_iri != ''].parent_iri.unique()))
print("Number of non-empty parent label paths: ", len(ontdf[ontdf.parent_label_path != ''].parent_label_path.unique()))

Number of non-empty iris:  832
Number of non-empty label paths:  832
Number of non-empty parent iris:  479
Number of non-empty parent label paths:  399


In [13]:
## only only records (show above) should have the same parent iri and iri value
print("Number of matching iri/parent iri: ", len(ontdf[ontdf.iri == ontdf.parent_iri]))
# ontdf[ontdf.iri == ontdf.parent_iri]

Number of matching iri/parent iri:  241


Unnamed: 0,label_path,parent_label_path,iri,parent_iri,annotation
8786,,,,,{}
25710,host-associated > human,host-associated,http://purl.obolibrary.org/obo/GOLD_9529610ff4...,http://purl.obolibrary.org/obo/GOLD_9529610ff4...,"{""http://purl.obolibrary.org/obo/GOLD_ecosyste..."
28021,host-associated > fungi,host-associated,http://purl.obolibrary.org/obo/GOLD_dcf1ddce5f...,http://purl.obolibrary.org/obo/GOLD_dcf1ddce5f...,"{""http://purl.obolibrary.org/obo/GOLD_ecosyste..."
28573,host-associated > plants,host-associated,http://purl.obolibrary.org/obo/GOLD_00ca2ff8d8...,http://purl.obolibrary.org/obo/GOLD_00ca2ff8d8...,"{""http://purl.obolibrary.org/obo/GOLD_ecosyste..."
1,engineered > food production > dairy products,engineered > food production,http://purl.obolibrary.org/obo/GOLD_4b895df0aa...,http://purl.obolibrary.org/obo/GOLD_4b895df0aa...,"{""http://purl.obolibrary.org/obo/GOLD_ecosyste..."
54,environmental > terrestrial > rock-dwelling (s...,environmental > terrestrial,http://purl.obolibrary.org/obo/GOLD_70b53ec9dd...,http://purl.obolibrary.org/obo/GOLD_70b53ec9dd...,"{""http://purl.obolibrary.org/obo/GOLD_ecosyste..."
59,host-associated > arthropoda > ant dump,host-associated > arthropoda,http://purl.obolibrary.org/obo/GOLD_6b920c9a94...,http://purl.obolibrary.org/obo/GOLD_6b920c9a94...,"{""http://purl.obolibrary.org/obo/GOLD_ecosyste..."
68,environmental > terrestrial > deep subsurface,environmental > terrestrial,http://purl.obolibrary.org/obo/GOLD_a0a44581ba...,http://purl.obolibrary.org/obo/GOLD_a0a44581ba...,"{""http://purl.obolibrary.org/obo/GOLD_ecosyste..."
79,environmental > terrestrial > soil,environmental > terrestrial,http://purl.obolibrary.org/obo/GOLD_ca4adbfe67...,http://purl.obolibrary.org/obo/GOLD_ca4adbfe67...,"{""http://purl.obolibrary.org/obo/GOLD_ecosyste..."
122,engineered > wastewater > activated sludge,engineered > wastewater,http://purl.obolibrary.org/obo/GOLD_57fbdd9c21...,http://purl.obolibrary.org/obo/GOLD_57fbdd9c21...,"{""http://purl.obolibrary.org/obo/GOLD_ecosyste..."


In [14]:
## examine those records w/o a parent iri
ontdf[ontdf.parent_iri == '']

Unnamed: 0,label_path,parent_label_path,iri,parent_iri,annotation
0,host-associated,,http://purl.obolibrary.org/obo/GOLD_daf7fb2e82...,,"{""http://purl.obolibrary.org/obo/GOLD_ecosyste..."
1,engineered,,http://purl.obolibrary.org/obo/GOLD_fee6b5a458...,,"{""http://purl.obolibrary.org/obo/GOLD_ecosyste..."
7,environmental,,http://purl.obolibrary.org/obo/GOLD_3bbfdace1e...,,"{""http://purl.obolibrary.org/obo/GOLD_ecosyste..."
8786,,,,,{}


## Use rdflib to build ontology from dataframe

In [15]:
g = Graph() # instantiate graph

### Create annotaiton properties for each of the columns (saved in the annotation dictionary)

In [16]:
for k, v in annotation_dict.items():
    ## note: the value is a dict with keys 'iri' and 'label'
    g.add((URIRef(v['iri']), RDF.type, OWL.AnnotationProperty))
    g.add((URIRef(v['iri']), RDFS.label, Literal(v['label'])))

### Add each row from the ontology dataframe to graph

In [17]:
for (ix, label_path, parent_label_path, iri, parent_iri, annotation) in ontdf.itertuples(): # ontdf.head(100).itertuples():
    if len(iri) > 0:
        ## add iri to graph
        g.add((URIRef(iri), RDF.type, OWL.Class))
        g.add((URIRef(iri), RDFS.label, Literal(label_path)))
        
        ## add iri annotations to graph (note: annotation is a json string of form iri:value)
        ann = json.loads(annotation)
        for k, v in ann.items():
            g.add((URIRef(iri), URIRef(k), Literal(v)))
    
    if len(parent_iri) > 0:
        g.add((URIRef(iri), RDFS.subClassOf, URIRef(parent_iri))) # add parent iri to graph

In [18]:
## save graph (note: different formatats (e.g., turtle) are possible)
g.serialize(destination='output/gold-dataset-translation.owl', format='xml')