In [19]:
from cProfile import label

from oaklib import get_adapter
from oaklib.datamodels.vocabulary import IS_A, PART_OF  # todo be careful with part of assumptions!

import pprint
import csv
import pandas as pd

In [2]:
# OAK adapter selector; handed to get_adapter() to open the ontology used by every later cell.
adapter_string = "sqlite:obo:envo"

In [3]:
# CURIE prefixes (the text before the first colon) that a target must carry to be kept.
acceptable_prefixes = {"ENVO"}

In [4]:
# Search terms for the root classes; the IS_A descendants of every match seed the target set.
target_labels = ["material entity"]


In [5]:
# Search terms whose matches are removed from the targets along with their IS_A descendants only.
is_a_only_exclusion_labels = ["astronomical body"]

In [6]:
# Label fragments: a remaining target is dropped when its label contains any of these.
# The check is a plain, case-sensitive substring test, so short fragments match inside
# longer words too (e.g. "sea" also matches "research").
is_a_label_fragment_exclusion = [
    "UNEP-WCMC", "agricult", "anthropogenic", "astronomical", "biosphere",
    "coast", "construct", "facility", "feature", "from",
    "geographic", "geologic", "landform", "marine", "ocean",
    "raised", "sea", "system", "volcanic",
]

In [7]:
# Search terms whose matches are removed from the targets together with everything
# reachable beneath them over IS_A or PART_OF. Notes sit on the line above the entry
# they describe.
is_a_part_of_exclusion_labels = [
    "anatomical entity environment",
    "anatomical entity",
    "animal habitation",
    "archipelago",
    "biome",
    "body of liquid",
    "building",
    "channel of a watercourse",
    "chemical entity",
    "container of an intermittent water body",
    "cryoform",
    "educational facility",
    "environmental material",
    # includes ecosystems
    "environmental system",
    "environmental zone",
    "fiat part of an astronomical object",
    # todo gets rid of plume but also lakes, stream etc.
    "fluid astronomical body part",
    "healthcare facility",
    "high-elevation mountain",
    "hospital unit facility",
    "layer",
    "manufactured product",
    "marine environmental zone",
    "marine layer",
    # todo eliminates glacier
    "mass of compounded environmental materials",
    "mass of environmental material",
    "mass of fluid",
    # includes clouds!
    "meteor",
    "mid-elevation mountain",
    "mineral deposit",
    # esp for rain
    "object aggregate",
    "organ",
    "particle",
    "piece of plastic",
    "political entity",
    # todo for photosphere, which should have been removed by mass of environmental material
    "star",
]

# todo: this list loses rhizosphere (an environmental system) and steppe (part of part of a biome)

# todo: are we really saying that we aren't interested in soil near a lake?


In [8]:
# Open the ontology adapter named by adapter_string; every search, descendant walk
# and label lookup in the following cells goes through this object.
adapter = get_adapter(adapter_string)

In [9]:
# Working set of candidate class CURIEs. It is filled by the accumulation cell and
# then pruned IN PLACE by the depletion and filter cells, so re-run from this cell
# (not from the middle) to get a clean result.
targets = set()

In [10]:
# Seed `targets`: for each target label, find the matching classes and add every
# class reachable from them over IS_A.
for target_label in target_labels:
    for matched_class in adapter.basic_search(target_label):
        targets.update(adapter.descendants(matched_class, predicates=[IS_A]))

print(len(targets))

5132


In [11]:
# Write the initial targets and their labels to the TSV file "sels_targets.tsv".
# Rows are sorted by CURIE so the file is the same from run to run (set iteration
# order is not stable), and the encoding is pinned instead of left to the platform
# default.
with open("sels_targets.tsv", "w", encoding="utf-8") as f:
    for t in sorted(targets):
        # label() may return None for a class without a label; write an empty cell then
        f.write(t + "\t" + (adapter.label(t) or "") + "\n")

In [12]:
# Deplete the targets: resolve every exclusion label to its matching classes and
# remove all of their descendants from `targets`, logging each removal.

depletion_log = []


def _deplete_targets(exclusion_labels, predicates):
    """Remove descendants of the classes matching `exclusion_labels` from `targets`.

    Each label is resolved with `adapter.basic_search`, and the descendants of
    every hit are followed over `predicates`. Each descendant that is still in
    `targets` is removed and recorded in `depletion_log` as
    {"removed": <label of the removed class>, "reason": <exclusion label>}.
    """
    for exclusion_label in exclusion_labels:
        for class_id in adapter.basic_search(exclusion_label):
            for curie in adapter.descendants(class_id, predicates=predicates):
                if curie in targets:
                    depletion_log.append(
                        {
                            "removed": adapter.label(curie),
                            "reason": exclusion_label,
                        }
                    )
                    targets.remove(curie)


# todo be careful with part of assumptions!
_deplete_targets(is_a_part_of_exclusion_labels, [IS_A, PART_OF])

# these labels are only followed over IS_A
_deplete_targets(is_a_only_exclusion_labels, [IS_A])

print(len(targets))

759


In [13]:
# Write the depletion log to the TSV file "sels_depletion.tsv" with csv.DictWriter.
# delimiter="\t" makes the content match the .tsv name (DictWriter writes commas by
# default); newline="" hands line-ending control to the csv module, as its
# documentation requires for file objects.
with open("sels_depletion.tsv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["removed", "reason"], delimiter="\t")
    writer.writeheader()
    writer.writerows(depletion_log)


In [14]:
# # write the remaining targets and their labels to a tsv file 
# #
# with open("sels_targets_depletion_remaining.tsv", "w") as f:
#     for t in targets:
#         f.write(t + "\t" + adapter.label(t) + "\n")

In [15]:
# Drop every CURIE whose prefix (the text before the first colon) is not acceptable.
# The rejects are collected into a list first so the set is not mutated while it is
# being read.
targets.difference_update(
    [curie for curie in targets if curie.partition(":")[0] not in acceptable_prefixes]
)
print(len(targets))

475


In [16]:
# Remove targets whose label contains any of the exclusion fragments.
# The label is looked up once per target instead of once per fragment, and any()
# stops at the first matching fragment, so each target is removed at most once.
for t in list(targets):
    # label() may return None for a class without a label; nothing can match then
    label_text = adapter.label(t) or ""
    if any(fragment in label_text for fragment in is_a_label_fragment_exclusion):
        targets.remove(t)
print(len(targets))

386


In [17]:
# Write the remaining targets and their labels to the TSV file
# "sels_targets_remaining.tsv". Rows are sorted by CURIE so the file is the same
# from run to run (set iteration order is not stable), and the encoding is pinned
# instead of left to the platform default.
with open("sels_targets_remaining.tsv", "w", encoding="utf-8") as f:
    for t in sorted(targets):
        # label() may return None for a class without a label; write an empty cell then
        f.write(t + "\t" + (adapter.label(t) or "") + "\n")

In [21]:
# One {"id", "label"} record per remaining target, ready to be loaded into a DataFrame.
labelled_targets = [
    {"id": curie, "label": adapter.label(curie)} for curie in targets
]

In [22]:
# Tabulate the records: one row per remaining target, with columns "id" and "label".
labeled_frame = pd.DataFrame(labelled_targets)

In [24]:
# Show the frame of remaining targets (id, label) as the cell's rich output.
labeled_frame

Unnamed: 0,id,label
0,ENVO:01000518,diapir
1,ENVO:00000403,shield volcano
2,ENVO:01001111,fossil worm burrow
3,ENVO:00010624,botanical garden
4,ENVO:01000731,igneous extrusion
...,...,...
381,ENVO:01000863,overflow structure
382,ENVO:00000332,doline
383,ENVO:00000417,machair
384,ENVO:00000324,outflow cave


In [25]:
# Called without arguments, to_html() returns the table markup as a string, so this
# cell's output is the raw, escaped HTML text of the whole frame.
# NOTE(review): if the goal is a rendered table or an .html file, consider passing a
# path to to_html() or wrapping the string for display instead — confirm the intent.
labeled_frame.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>id</th>\n      <th>label</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>ENVO:01000518</td>\n      <td>diapir</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>ENVO:00000403</td>\n      <td>shield volcano</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>ENVO:01001111</td>\n      <td>fossil worm burrow</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>ENVO:00010624</td>\n      <td>botanical garden</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>ENVO:01000731</td>\n      <td>igneous extrusion</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>ENVO:00000302</td>\n      <td>rift valley</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>ENVO:03500001</td>\n      <td>playground</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>ENVO:00000100</td>\n      <td>valley</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <