In [1]:
from oaklib import get_adapter
from oaklib.datamodels.vocabulary import IS_A, PART_OF

import csv
import pandas as pd

In [2]:
adapter_string = "sqlite:obo:envo"

In [3]:
acceptable_prefixes = {
    "ENVO"
}

In [4]:
target_labels = [
    "material entity",
]


In [5]:
is_a_only_exclusion_labels = [
    "astronomical body"
]

In [6]:
is_a_label_fragment_exclusion = [
    "UNEP-WCMC",
    "agricult",
    "anthropogenic",
    "astronomical",
    "biosphere",
    "coast",
    "construct",
    "facility",
    "feature",
    "from",
    "geographic",
    "geologic",
    "landform",
    "marine",
    "ocean",
    "raised",
    "sea",
    "system",
    "volcanic",
]

In [7]:
is_a_part_of_exclusion_labels = [
    "anatomical entity environment",
    "anatomical entity",
    "animal habitation",
    "archipelago",
    "biome",
    "body of liquid",
    "building",
    "channel of a watercourse",
    "chemical entity",
    "container of an intermittent water body",
    "cryoform",
    "educational facility",
    "environmental material",
    "environmental system",  # includes ecosystems
    "environmental zone",
    "fiat part of an astronomical object",
    "fluid astronomical body part",  # todo gets rid of plume but also lakes, stream etc.
    "healthcare facility",
    "high-elevation mountain",
    "hospital unit facility",
    "layer",
    "manufactured product",
    "marine environmental zone",
    "marine layer",
    "mass of compounded environmental materials",  # todo eliminates glacier
    "mass of environmental material",
    "mass of fluid",
    "meteor",  # includes clouds!
    "mid-elevation mountain",
    "mineral deposit",
    "object aggregate",  # esp for rain
    "organ",
    "particle",
    "piece of plastic",
    "political entity",
    "star",  # todo for photosphere, which should have been removed by mass of environmental material
]

# todo lost rhizosphere (an environmental system) ... steppe (part of part of a biome) ...

# todo are we really saying that we aren't interested in soil near a lake? 


In [8]:
keep_base_only = [
    'artificial',
    'basin'
    'beach',
    'bridge',
    'campground',
    'cave',
    'channel',
    'cliff',
    'cone',
    'conveyor',
    'crater',
    'cut',
    'dam',
    'dune',
    'fault',
    'field',
    'flood',
    'garden',
    'hill',
    'karst',
    'landfill',
    'market',
    'mine',
    'moraine',
    'mountain',
    'natural',
    'network',
    'oil',
    'park',
    'pipeline',
    'pit',
    'plain',
    'plane',
    'planetary',
    'pond',
    'pond',
    'prison',
    'private',
    'public',
    'pyroclastic',
    'range',
    'reactor',
    'refinery',
    'ridge'
    'rift',
    'road',
    'rock',
    'site',
    'slope',
    'tailings',
    'transport',
    'tunnel',
    'valley',
    'volcano',
    'water',
    'zone',
]


In [9]:
force_back_in = [
    'agricultural ecosystem',
    'agricultural field',
    # 'allotment garden',
    'alluvial plain',
    'alluvial fan',
    # 'bare soil surface layer',
    'dune',
    # 'elevated landform',
    'forested area',
    'frozen land',
    'garden',
    'grassland area',
    'meadow ecosystem',
    'mountain',
    # 'paddy field',
    'pasture',
    'peatland',
    'pond',
    'roadside',
    # 'rubber plantation',
    'savanna',
    'steppe',
    'swamp ecosystem',
    # 'technosphere',
    # 'troposhere',
]

# for the sake of "soils whose differentia can be expressed in the broad or local scale slots

# run this after discover_excludable_soils.ipynb
# and before post_google_sheets_soil_env_medium.ipynb

# see discover_excludable_soils.tsv with relation_is_reasonable filtered to true
# then remove blanks from sole_reasonable_other and sole_soil

In [10]:
force_back_in.extend([
    'active permafrost layer',
    'fen',
    'fjord',
    'glacier',
    'marsh',
    'peatland',
    'prairie',
    'rhizosphere',
    'river',
    'spring',
    'stream',
    'woodland area',
])

# because they had 2+ upvotes

In [11]:
force_back_in.extend([
    'arable land',
    'biochar',
    'flood plain',
    'forest ecosystem',
    'meadow ecosystem',
    'mine drainage',
    'oil spill',
    'pasture',
    'permafrost',
    'ranch',
    'thermokarst',
    'wetland ecosystem', 
])

# from gold terms soil exploration

In [12]:
force_out = [
    'cut',
    'field',
    'flattened elevation',
    'market',
    'mount',
    'peak',
    'trough',
    'yard',
]

# due to 3+ down votes

In [13]:
force_out.extend([
    'airport',
    'bike path',
    'biota',
    'bowling alley',
    'clinic',
    'fairground',
    'fish hatchery',
    'footbridge',
    'fossil worm burrow',
    'galaxy',
    'globular star cluster',
    'handrail',
    'hangar',
    'human nursery',
    'limestone pavement',
    'manufactured plastisphere',
    'military training area',
    'nebula',
    'patio',
    'paved parking lot',
    'petting zoo',
    'place of worship',
    'plastisphere',
    'poultry hatchery',
    'processing line',
    'protoplanetary disk',
    'railway',
    'shipwreck',
    'technosphere',
    'unexploded-ordnance dump',
    'young plant',

])

# MAM judgement

In [14]:
# model_with_local_scale_file = "post_google_sheets_soil_env_medium_relations_no_subclasses_useful_classes_curated.tsv"

In [15]:
output_file = "post_google_sheets_soil_env_local_scale.tsv"

In [16]:
adapter = get_adapter(adapter_string)

In [17]:
targets = set()

In [18]:
# accumulate subclasses of the target classes (by label)
# 
for current_label in target_labels:
    for class_id in adapter.basic_search(current_label):
        descendant = adapter.descendants(class_id, predicates=[IS_A])
        for d in descendant:
            targets.add(d)

print(len(targets))

5132


In [19]:
# write the targets and their labels to a tsv file "sels_targets.txt"
#
with open("sels_targets.tsv", "w") as f:
    for t in targets:
        f.write(t + "\t" + adapter.label(t) + "\n")

In [20]:
# deplete subclasses of the exclusion classes (by label)
# 

depletion_log = []

for current_label in is_a_part_of_exclusion_labels:
    for class_id in adapter.basic_search(current_label):
        descendant = adapter.descendants(class_id,
                                         predicates=[IS_A, PART_OF])  # todo be careful with part of assumptions!
        for d in descendant:
            if d in targets:
                depletion_line = {
                    "removed": adapter.label(d),
                    "reason": current_label
                }
                # print(log_line)
                depletion_log.append(depletion_line)
                targets.remove(d)

for current_label in is_a_only_exclusion_labels:
    for class_id in adapter.basic_search(current_label):
        descendant = adapter.descendants(class_id, predicates=[IS_A])  # todo be careful with part of assumptions!
        for d in descendant:
            if d in targets:
                depletion_line = {
                    "removed": adapter.label(d),
                    "reason": current_label
                }
                # print(log_line)
                depletion_log.append(depletion_line)
                targets.remove(d)

# for current_label in keep_base_only:
targets_dupe = targets.copy()
for t in targets_dupe:
    label = adapter.label(t)
    label_words = label.split(" ")
    for kbo in keep_base_only:
        if kbo in label_words and len(label_words) > 1:
            depletion_line = {
                "removed": label,
                "reason": f"{t}'s label {label} contains but is not limited to {kbo}"
            }
            depletion_log.append(depletion_line)
            if t in targets:
                targets.remove(t)

print(len(targets))

553


In [21]:
# # write the lines in log to a TSV file "sels_depletion.tsv"" with csv's dictwriter
# #
# with open("sels_depletion.tsv", "w") as f:
#     writer = csv.DictWriter(f, fieldnames=["removed", "reason"])
#     writer.writeheader()
#     for line in depletion_log:
#         writer.writerow(line)


In [22]:
# remove curies from the targets if their prefix, when splitting on colon, isn't in the acceptable prefixes
for t in list(targets):
    if t.split(":")[0] not in acceptable_prefixes:
        targets.remove(t)
print(len(targets))

270


In [23]:
# remove curies from the targets if their label contains any of the exclusion fragments
for t in list(targets):
    for fragment in is_a_label_fragment_exclusion:
        if fragment in adapter.label(t):
            if t in targets:
                targets.remove(t)
print(len(targets))

208


In [24]:
for fbi in force_back_in: 
    # add tracking of fbis that don't map to any terms, fbis that already are in targets...
    fbi_count = 0
    for class_id in adapter.basic_search(fbi):
        fbi_count += 1
        if class_id not in targets:
            targets.add(class_id)
        else:
            print(f"{fbi} already in targets")
    if fbi_count == 0:
        print(f"Can't find {fbi} in adapter")

dune already in targets
garden already in targets
mountain already in targets
peatland already in targets
meadow ecosystem already in targets
pasture already in targets


In [25]:
for fo in force_out:
    for class_id in adapter.basic_search(fo):
        if class_id in targets:
            targets.remove(class_id)
        else:
            print(f"{fo} not in targets")
            
# these won't show up in the previously closed sels_depletion.tsv

In [26]:
labelled_targets = []
for t in targets:
    labelled_targets.append(
        {
            "id": t,
            "label": adapter.label(t)
        }
    )

In [27]:
export_frame = pd.DataFrame(labelled_targets)

export_frame.to_csv(output_file, sep="\t", index=False)