In [1]:
import pandas as pds
from lib import data_operations as dop
from rdflib import Graph, ConjunctiveGraph, RDFS, RDF, OWL, URIRef, Literal
from pandasql import sqldf

def pysqldf(q):
    return sqldf(q, globals())

# Read ENVO into graph
#### The ENVO graph will be used to find labels associated with ENVO IRIs.
Note: up to date envo.owl file needs to be in the data directory

In [2]:
envo = Graph()
envo.parse("data/envo.owl")

<Graph identifier=N1a7962f6427e48fa8865f59b445b8665 (<class 'rdflib.graph.Graph'>)>

In [3]:
mapping_df = dop.make_dataframe("data/GOLD-Paths-to-MIxS-ENVO-triad-mapping.tsv")
mapping_df = mapping_df.fillna("")

In [4]:
mapping_df.head() # peek at data

Unnamed: 0,ebi_biomes,environmental_feature,environmental_material,biome,quality,gold_path_>_delim
0,Environmental,,,http://purl.obolibrary.org/obo/ENVO_01000254,,root > Environmental
1,Air,,http://purl.obolibrary.org/obo/ENVO_00002005,,,root > Environmental > Air
2,Indoor Air,http://purl.obolibrary.org/obo/ENVO_00000073,,,,root > Environmental > Air > Indoor Air
3,Outdoor Air,http://purl.obolibrary.org/obo/ENVO_01000811,,,,root > Environmental > Air > Outdoor Air
4,Aquatic,,,http://purl.obolibrary.org/obo/ENVO_00002030,,root > Environmental > Aquatic


## Create lables associated with ENVO enviromental feature, material, biome

In [5]:
## for each iri get the envo rdfs:label associated with it
mapping_df["feature_label"] = \
    mapping_df.environmental_feature.map(lambda x: "" if "" == x else str(envo.label(URIRef(x))))

mapping_df["material_label"] = \
    mapping_df.environmental_material.map(lambda x: "" if "" == x else str(envo.label(URIRef(x))))

mapping_df["biome_label"] = \
    mapping_df.biome.map(lambda x: "" if "" == x else str(envo.label(URIRef(x))))

In [6]:
mapping_df.head() # peek at data

Unnamed: 0,ebi_biomes,environmental_feature,environmental_material,biome,quality,gold_path_>_delim,feature_label,material_label,biome_label
0,Environmental,,,http://purl.obolibrary.org/obo/ENVO_01000254,,root > Environmental,,,environmental system
1,Air,,http://purl.obolibrary.org/obo/ENVO_00002005,,,root > Environmental > Air,,air,
2,Indoor Air,http://purl.obolibrary.org/obo/ENVO_00000073,,,,root > Environmental > Air > Indoor Air,building,,
3,Outdoor Air,http://purl.obolibrary.org/obo/ENVO_01000811,,,,root > Environmental > Air > Outdoor Air,nitrogen-oxygen planetary atmosphere,,
4,Aquatic,,,http://purl.obolibrary.org/obo/ENVO_00002030,,root > Environmental > Aquatic,,,aquatic biome


# Transform mapping spreadsheet into form that can be combined with GOLD path spreadsheet

## Change ENVO and FOODON IRIs to CURIs

In [70]:
## collaps full iri to shortened form
def make_curi(iri):
    if 'obo/ENVO_' in iri:
        return "ENVO:" + iri.split("obo/ENVO_")[-1]
    if 'obo/FOODON_' in iri:
        return "FOODON:" + iri.split("obo/FOODON_")[-1]
    return ""

    
mapping_df.environmental_feature = \
    mapping_df.environmental_feature.map(lambda x: make_curi(x))

mapping_df.environmental_material = \
    mapping_df.environmental_material.map(lambda x: make_curi(x))

mapping_df.biome = \
    mapping_df.biome.map(lambda x: make_curi(x))

NameError: name 'x' is not defined

In [None]:
mapping_df.head() # peek at data

## Split gold_path_>_delim column in GOLD path levels
#### ecosystem, ecosystem_category, ecosystem_type, ecosystem_subtype, specific_ecosystem

#### Add gold levels to dataframe; default values to empty string

In [None]:
gold_levels = ["ecosystem", "ecosystem_category", "ecosystem_type", "ecosystem_subtype", "specific_ecosystem"]
for level in gold_levels:
    mapping_df[level] = ""

In [None]:
mapping_df.head()

#### For each GOLD level add the corresponding values for the delimited path string

In [None]:
def parse_gold_path_value(gold_path_string, column, column_list, delimiter, start_index, end_index):
    # elevels = ["ecosystem", "ecosystem_category", "ecosystem_type", "ecosystem_subtype", "specific_ecosystem"]
    
    buffer_list = ['' for item in column_list] # create a buffer list of empty strings
    path_list = gold_path_string.split(delimiter) + buffer_list # split based on delim and buffer with list of empty strings
    path_list = [item.strip() for item in path_list] # strip white space from items
    path_list = path_list[start_index:end_index] # subset path list to parts that align with gold paths
    
    ## return the value parsed out of the path string whose index in the path list matches the index of the column
    column_index = column_list.index(column)
    path_value = path_list[column_index]
    return path_value

In [None]:
## add values to dataframe columns; this could have done in loop, but doing each allows for easier debugging
mapping_df.ecosystem = \
    mapping_df['gold_path_>_delim'].map(lambda x: parse_gold_path_value(x, 'ecosystem', gold_levels, ' > ', 1, 6))

mapping_df.ecosystem_category = \
    mapping_df['gold_path_>_delim'].map(lambda x: parse_gold_path_value(x, 'ecosystem_category', gold_levels, ' > ', 1, 6))

mapping_df.ecosystem_type = \
    mapping_df['gold_path_>_delim'].map(lambda x: parse_gold_path_value(x, 'ecosystem_type', gold_levels, ' > ', 1, 6))

mapping_df.ecosystem_subtype = \
    mapping_df['gold_path_>_delim'].map(lambda x: parse_gold_path_value(x, 'ecosystem_subtype', gold_levels, ' > ', 1, 6))

mapping_df.specific_ecosystem = \
    mapping_df['gold_path_>_delim'].map(lambda x: parse_gold_path_value(x, 'specific_ecosystem', gold_levels, ' > ', 1, 6))

In [None]:
mapping_df.head()

## Check to see if all the ebi_biomes values are the same as the leafs of the GOLD paths
#### Result: All ebi_biomes matched leaf values. So, we can drop the ebi_biomes column.

In [None]:
check_df = mapping_df[['ebi_biomes', 'gold_path_>_delim']]

In [None]:
## helper function
def get_gold_path_string_leaf_value(path_string):
    path_list = path_string.split(' > ')
    path_list = [item.strip() for item in path_list]
    return path_list[-1]

In [None]:
# create column for leaf values
check_df['leaf'] = ""
check_df['leaf'] = check_df['gold_path_>_delim'].map(lambda x: get_gold_path_string_leaf_value(x))

In [None]:
check_df.head() # peek at data

In [None]:
## iterate checking for non-equal values
for (idx, biome, leaf) in check_df[['ebi_biomes', 'leaf']].itertuples():
    if biome.strip() != leaf: print("'" + biome + "'", "!=", "'" + leaf + "'")
        
## NONE FOUND!

## Create final dataframe for output
* Add ecosystem_path_id colum
* Save and reorder needed columns
* Rename columns

In [None]:
mapping_df['ecosystem_path_id'] = ''

In [None]:
mapping_out_df = mapping_df[['ecosystem_path_id', 'ecosystem', 'ecosystem_category', 'ecosystem_type', 'ecosystem_subtype', 'specific_ecosystem',
                     'biome_label', 'biome', 'material_label', 'environmental_material', 'feature_label', 'environmental_feature']]

In [None]:
mapping_out_df.rename(columns={'biome': 'biome_iri', 'environmental_material': 'material_iri', 'environmental_feature': 'feature_iri'}, inplace=True)

In [None]:
mapping_out_df.head()

# Transform GOLD path spreadseet into form to combine with mapping spreadsheet

In [71]:
paths_df = dop.make_dataframe("data/GOLDs5levelEcosystemClassificationPaths.xlsx", file_type="excel")
paths_df.ecosystem_path_id = paths_df.ecosystem_path_id.astype(str) # path ids need to be strings

In [72]:
paths_df.head()

Unnamed: 0,ecosystem_path_id,ecosystem,ecosystem_category,ecosystem_type,ecosystem_subtype,specific_ecosystem
0,4845,Engineered,Artificial ecosystem,Mud microcosm,Unclassified,Unclassified
1,4536,Engineered,Bioreactor,Aerobic,Unclassified,Unclassified
2,4912,Engineered,Bioreactor,Anaerobic,Food waste,Unclassified
3,4914,Engineered,Bioreactor,Anaerobic,Manure,Unclassified
4,4442,Engineered,Bioreactor,Anaerobic,Unclassified,Unclassified


## Check for duplicate five tuples

In [73]:
subset_df = paths_df[['ecosystem', 'ecosystem_category', 'ecosystem_type', 'ecosystem_subtype', 'specific_ecosystem']]

In [74]:
q = """
select
    ecosystem, ecosystem_category, ecosystem_type, ecosystem_subtype, specific_ecosystem, count(*) as total
from
    subset_df
group by
    ecosystem, ecosystem_category, ecosystem_type, ecosystem_subtype, specific_ecosystem
having
    count(*) > 1
"""
dups_df = sqldf(q)
dups_df

Unnamed: 0,ecosystem,ecosystem_category,ecosystem_type,ecosystem_subtype,specific_ecosystem,total
0,Environmental,Aquatic,Marine,Neritic zone,Sediment,2


In [75]:
#### get rows of the duplicates

In [76]:
q = """
select 
    *
from 
    paths_df
where
    ecosystem = 'Environmental'
    and ecosystem_category = 'Aquatic'
    and ecosystem_type = 'Marine'
    and ecosystem_subtype = 'Neritic zone'
    and specific_ecosystem = 'Sediment'
"""
sqldf(q)

Unnamed: 0,ecosystem_path_id,ecosystem,ecosystem_category,ecosystem_type,ecosystem_subtype,specific_ecosystem
0,4035,Environmental,Aquatic,Marine,Neritic zone,Sediment
1,4048,Environmental,Aquatic,Marine,Neritic zone,Sediment


#### update ecosystem_path_id to be a concat of both and remove duplicate row

In [77]:
paths_df.loc[paths_df['ecosystem_path_id'] == '4035', 'ecosystem_path_id'] = '4035|4048'

In [78]:
paths_df.drop(paths_df.loc[paths_df['ecosystem_path_id'] == '4048'].index, inplace=True)

#### verify duplicate has been dropped

In [79]:
q = """
select 
    *
from 
    paths_df
where
    ecosystem = 'Environmental'
    and ecosystem_category = 'Aquatic'
    and ecosystem_type = 'Marine'
    and ecosystem_subtype = 'Neritic zone'
    and specific_ecosystem = 'Sediment'
"""
sqldf(q)

Unnamed: 0,ecosystem_path_id,ecosystem,ecosystem_category,ecosystem_type,ecosystem_subtype,specific_ecosystem
0,4035|4048,Environmental,Aquatic,Marine,Neritic zone,Sediment


## Add biome-material-feature columns to paths dataframe

In [80]:
paths_df['biome_label'] = ''
paths_df['biome_iri'] = ''
paths_df['material_label'] = ''
paths_df['material_iri'] = ''
paths_df['feature_label'] = ''
paths_df['feature_iri'] = ''

In [81]:
paths_df.head() # peek at data

Unnamed: 0,ecosystem_path_id,ecosystem,ecosystem_category,ecosystem_type,ecosystem_subtype,specific_ecosystem,biome_label,biome_iri,material_label,material_iri,feature_label,feature_iri
0,4845,Engineered,Artificial ecosystem,Mud microcosm,Unclassified,Unclassified,,,,,,
1,4536,Engineered,Bioreactor,Aerobic,Unclassified,Unclassified,,,,,,
2,4912,Engineered,Bioreactor,Anaerobic,Food waste,Unclassified,,,,,,
3,4914,Engineered,Bioreactor,Anaerobic,Manure,Unclassified,,,,,,
4,4442,Engineered,Bioreactor,Anaerobic,Unclassified,Unclassified,,,,,,


#### verify column header of each dataframe are the same

In [82]:
list(mapping_out_df.columns) == list(paths_df.columns)

True

# Combine mapping and GOLD spreadsheets

In [83]:
concat_df = pds.concat([mapping_out_df, paths_df])

In [84]:
concat_df.head() # peek at data

Unnamed: 0,ecosystem_path_id,ecosystem,ecosystem_category,ecosystem_type,ecosystem_subtype,specific_ecosystem,biome_label,biome_iri,material_label,material_iri,feature_label,feature_iri
0,,Environmental,,,,,environmental system,ENVO:01000254,,,,
1,,Environmental,Air,,,,,,air,ENVO:00002005,,
2,,Environmental,Air,Indoor Air,,,,,,,building,ENVO:00000073
3,,Environmental,Air,Outdoor Air,,,,,,,nitrogen-oxygen planetary atmosphere,ENVO:01000811
4,,Environmental,Aquatic,,,,aquatic biome,ENVO:00002030,,,,


#### verify concatenated dataframe as some length as other two combined

In [85]:
len(concat_df) == len(mapping_out_df) + len(paths_df)

True

In [86]:
concat_df.to_csv("output/GOLD-Paths-to-MIxS-ENVO-triad-mapping.tsv", sep="\t", index=False)