In [1]:
import json
from hashlib import md5
from pathlib import Path

import pandas as pds
from pandasql import sqldf
from rdflib import Graph, RDFS, RDF, OWL, URIRef, Literal

def pysqldf(q):
    """Run the SQL query string `q` through pandasql's `sqldf`.

    This module's global namespace is passed to `sqldf` as its environment.
    """
    namespace = globals()
    return sqldf(q, namespace)

## Build dataframe containing FICUS project data

In [2]:
# Read the FICUS proposals metadata spreadsheet into a DataFrame.
ficusdf = pds.read_excel("data/30_FICUS_Proposals_Metadata_4_Emiley_Chris_11082019.xlsx")

## Normalize column names to lower case and trim/strip spaces

In [3]:
# Lower-case every column header and strip leading/trailing whitespace.
clean_columns = [header.lower().strip() for header in ficusdf.columns]

In [4]:
# Replace the DataFrame's headers with their normalized forms.
ficusdf.columns = clean_columns

## Specify GOLD elevels and ID fields

In [5]:
# GOLD elevel columns; their values are hashed to form each row's class IRI.
gold_elevels = [
    'ecosystem',
    'ecosystem_category',
    'ecosystem_type',
    'ecosystem_subtype',
    'specific_ecosystem',
]

# ID columns; their values are joined to form each row's IRI.
id_fields = [
    'gold_id',
    'ecosystem_path_id',
    'biosample_id',
    'organism_id',
    'analysis_project_id',
    'submission_id',
    'img_taxon_id',
]

## Define helper functions

In [6]:
def make_row_list(row, subset_list=[]):
    """Return the values of `row` as a list of strings.

    With a non-empty `subset_list`, only values whose column name appears in
    it are returned (in the row's own column order), each lower-cased and
    stripped. With an empty `subset_list`, every value is returned converted
    with `str()` and otherwise untouched.
    """
    if len(subset_list) == 0:
        return [str(item) for item in list(row)]

    selected = []
    for column, value in row.to_dict().items():
        if column in subset_list:  # only get values in subset list
            selected.append(str(value).lower().strip())
    return selected


def make_ficus_row_iri(row, id_field_list, prefix="http://purl.obolibrary.org/obo/GOLD_"):
    """Build the IRI that identifies one data row.

    The row values selected by `id_field_list` (as returned by
    `make_row_list`) are joined with underscores and passed to `make_iri`
    together with `prefix`.
    """
    id_values = make_row_list(row, id_field_list)
    local_name = "_".join(id_values)
    return make_iri(local_name, prefix)


def make_iri(val, prefix="http://purl.obolibrary.org/obo/GOLD_"):
    """Return `prefix` immediately followed by `val`.

    Args:
        val: local part of the IRI; must support `len()` unless it is None.
        prefix: text placed in front of `val`.

    Returns:
        The concatenated string, or "" when `val` is None or has length 0.
    """
    # Identity test: unlike `None != val`, it cannot be influenced by a
    # custom __eq__/__ne__ defined on `val`.
    if val is None or len(val) == 0:
        return ""
    return f"{prefix}{val}"


def make_class_iri(row, gold_elevel_list, prefix="http://purl.obolibrary.org/obo/GOLD_"):
    """Build the IRI of the class a row belongs to.

    The local part is the hash `make_row_hash` computes over the columns named
    in `gold_elevel_list`; `make_iri` prepends `prefix` to it.
    """
    return make_iri(make_row_hash(row, gold_elevel_list), prefix)


def make_row_hash(row, subset_list=None):
    """Return the hash of a row's concatenated values.

    Args:
        row: a dataframe row (must support `to_dict()` and iteration).
        subset_list: column names to include. When given and non-empty, only
            those columns are used (in the row's own column order) and each
            value is lower-cased and stripped, the same normalization
            `make_row_list` applies. Otherwise every value is used, converted
            with `str()` only.

    Returns:
        Whatever `make_hash` returns for the concatenated text.

    Missing values (as reported by `pds.isnull`) contribute an empty string.
    The null test runs on the raw cell values: testing after `str()`
    conversion can never match, because NaN has by then become the text "nan".
    """
    use_subset = subset_list is not None and len(subset_list) > 0
    if use_subset:
        raw_values = [v for k, v in row.to_dict().items() if k in subset_list]
    else:
        raw_values = list(row)

    parts = []
    for value in raw_values:
        if pds.isnull(value):
            parts.append("")  # replace NaNs with ''
        elif use_subset:
            parts.append(str(value).lower().strip())
        else:
            parts.append(str(value))
    return make_hash("".join(parts))


def make_hash(val):
    """Return the hexadecimal MD5 digest of `val` encoded as UTF-8.

    An empty `val` yields "" instead of a digest.
    """
    if len(val) == 0:
        return ""
    digest = md5(val.encode('utf-8'))
    return str(digest.hexdigest())


def make_annotation_dict(value_list, prefix="http://purl.obolibrary.org/obo/GOLD_"):
    """Map each value to a dict holding its IRI and label.

    Every entry of `value_list` is converted to a string, lower-cased and
    stripped; that text is both the key and the 'label', and `make_iri` turns
    it into the 'iri' using `prefix`.
    """
    labels = (str(raw).lower().strip() for raw in value_list)
    return {
        label: {'iri': make_iri(label, prefix=prefix), 'label': label}
        for label in labels
    }

## Make dict to map column headers to IRIs

In [7]:
# Map each normalized column name to its annotation IRI and label.
annotation_dict = make_annotation_dict(clean_columns)

In [8]:
# Display the mapping for inspection.
annotation_dict

{'gold_id': {'iri': 'http://purl.obolibrary.org/obo/GOLD_gold_id',
  'label': 'gold_id'},
 'is_public': {'iri': 'http://purl.obolibrary.org/obo/GOLD_is_public',
  'label': 'is_public'},
 'its_proposal_id': {'iri': 'http://purl.obolibrary.org/obo/GOLD_its_proposal_id',
  'label': 'its_proposal_id'},
 'its_spid': {'iri': 'http://purl.obolibrary.org/obo/GOLD_its_spid',
  'label': 'its_spid'},
 'pi_name': {'iri': 'http://purl.obolibrary.org/obo/GOLD_pi_name',
  'label': 'pi_name'},
 'pi_email': {'iri': 'http://purl.obolibrary.org/obo/GOLD_pi_email',
  'label': 'pi_email'},
 'sequencing_strategy': {'iri': 'http://purl.obolibrary.org/obo/GOLD_sequencing_strategy',
  'label': 'sequencing_strategy'},
 'project_name': {'iri': 'http://purl.obolibrary.org/obo/GOLD_project_name',
  'label': 'project_name'},
 'biosample_id': {'iri': 'http://purl.obolibrary.org/obo/GOLD_biosample_id',
  'label': 'biosample_id'},
 'biosample_name': {'iri': 'http://purl.obolibrary.org/obo/GOLD_biosample_name',
  'labe

## Create IRIs to identify each row of data and class to which the row belongs

In [9]:
# Row IRI: derived from the row's ID-field values.
ficusdf['row_iri'] = ficusdf.apply(make_ficus_row_iri, axis=1, args=(id_fields,))
# Class IRI: derived from a hash of the row's GOLD elevel values.
ficusdf['class_iri'] = ficusdf.apply(make_class_iri, axis=1, args=(gold_elevels,))

In [10]:
## examine output
# Widen column display so the full class IRI is shown rather than truncated.
pds.set_option('display.max_colwidth', 1000)
# Show the GOLD elevel columns next to the class IRI derived from them.
# (Only a cell's last expression is displayed, so a separate
# `ficusdf.class_iri.head()` before this line would show nothing.)
ficusdf[gold_elevels + ['class_iri']].head()

Unnamed: 0,ecosystem,ecosystem_category,ecosystem_type,ecosystem_subtype,specific_ecosystem,class_iri
0,Host-associated,Microbial,Bacteria,Unclassified,Unclassified,http://purl.obolibrary.org/obo/GOLD_8933c51848ce54db789cba16bf6707ea
1,Host-associated,Microbial,Bacteria,Unclassified,Unclassified,http://purl.obolibrary.org/obo/GOLD_8933c51848ce54db789cba16bf6707ea
2,Host-associated,Microbial,Bacteria,Unclassified,Unclassified,http://purl.obolibrary.org/obo/GOLD_8933c51848ce54db789cba16bf6707ea
3,Host-associated,Microbial,Bacteria,Unclassified,Unclassified,http://purl.obolibrary.org/obo/GOLD_8933c51848ce54db789cba16bf6707ea
4,Host-associated,Microbial,Bacteria,Unclassified,Unclassified,http://purl.obolibrary.org/obo/GOLD_8933c51848ce54db789cba16bf6707ea


## Use rdflib to build ontology from dataframe
Each row in the dataframe is an instance of the class defined by the GOLD elevels

In [11]:
# Graph that the following cells fill with annotation properties and row individuals.
g = Graph() # instantiate graph

### Create annotation properties for each of the columns (saved in the annotation dictionary)

In [12]:
# Each annotation is a dict with keys 'iri' and 'label'.
for annotation in annotation_dict.values():
    property_iri = URIRef(annotation['iri'])
    g.add((property_iri, RDF.type, OWL.AnnotationProperty))
    g.add((property_iri, RDFS.label, Literal(annotation['label'])))

### Add each row from the dataframe to the graph

In [15]:
for _, row in ficusdf.iterrows():
    if len(row['row_iri']) == 0:
        continue  # row has no IRI, so there is no subject to attach triples to

    row_iri = URIRef(row['row_iri'])
    g.add((row_iri, RDF.type, OWL.NamedIndividual))  # add instance iri to graph

    if len(row['class_iri']) > 0:
        class_iri = URIRef(row['class_iri'])
        g.add((row_iri, RDF.type, class_iri))  # add type the row instantiates

    # field values in spreadsheet as annotation values
    for column, annotation in annotation_dict.items():
        field_val = row[column]
        if pds.isnull(field_val):
            # An empty cell means "no value"; do not assert a NaN literal for it.
            continue
        g.add((row_iri, URIRef(annotation['iri']), Literal(field_val)))

In [16]:
## save graph (note: different formats (e.g., turtle) are possible)
OUTPUT_FILE = 'output/FICUS-projects-translation.owl'
# Create the output directory if it is missing so the write does not fail on it.
Path(OUTPUT_FILE).parent.mkdir(parents=True, exist_ok=True)
g.serialize(destination=OUTPUT_FILE, format='xml')