In [1]:
from oaklib import get_adapter
from oaklib.datamodels.vocabulary import IS_A, PART_OF

import csv
import pandas as pd

In [2]:
adapter_string = 'sqlite:obo:envo'

In [3]:
acceptable_prefixes = {
    'ENVO'
}

In [4]:
target_labels = [
    'material entity',
]


In [5]:
exclusions_by_label_over_is_a = [
    'astronomical body'
]

In [6]:
exclude_by_superstring_over_is_a = [
    'UNEP-WCMC',
    'agricult',
    'anthropogenic',
    'astronomical',
    'biosphere',
    'coast',
    'construct',
    'facility',
    'feature',
    'from',
    'geographic',
    'geologic',
    'landform',
    'marine',
    'ocean',
    'pingo',
    'raised',
    'salt',
    'sea',
    'system',
    'volcanic',
]

In [7]:
exclude_by_label_over_is_a_and_parts = [
    'anatomical entity environment',
    'anatomical entity',
    'animal habitation',
    'archipelago',
    'biome',
    'body of liquid',
    'building',
    'channel of a watercourse',
    'chemical entity',
    'container of an intermittent water body',
    'cryoform',
    'educational facility',
    'environmental material',
    'environmental system',  # includes ecosystems
    'environmental zone',
    'fiat part of an astronomical object',
    'fluid astronomical body part',  # todo gets rid of plume but also lakes, stream etc.
    'hatchery',
    'healthcare facility',
    'high-elevation mountain',
    'hospital unit facility',
    'intrusion',
    'karst',
    'layer',
    'manufactured product',
    'marine environmental zone',
    'marine layer',
    'mass of compounded environmental materials',  # todo eliminates glacier
    'mass of environmental material',
    'mass of fluid',
    'meteor',  # includes clouds!
    'mid-elevation mountain',
    'mineral deposit',
    'object aggregate',  # esp for rain
    'organ',
    'particle',
    'pedosphere',
    'penplain',
    'piece of plastic',
    'polder',
    'pole',
    'political entity',
    'protoplanetary disk',
    'salt mass',
    'sandur',
    'seamount',
    'see cliff',
    'star',  # todo for photosphere, which should have been removed by mass of environmental material
    'thermokarst',
    'volcanic feature',
    'whole plant',
]

# todo lost rhizosphere (an environmental system) ... steppe (part of part of a biome) ...

# todo are we really saying that we aren't interested in soil near a lake? 


In [8]:
keep_base_only = [
    'artificial',
    'basin'
    'beach',
    'bridge',
    'campground',
    'cave',
    'channel',
    'cliff',
    'cone',
    'conveyor',
    'crater',
    'cut',
    'dam',
    'dune',
    'fault',
    'field',
    'flood',
    'garden',
    'hill',
    'landfill',
    'market',
    'mine',
    'moraine',
    'mountain',
    'natural',
    'network',
    'oil',
    'park',
    'pipeline',
    'pit',
    'plain',
    'plane',
    'planetary',
    'pond',
    'pond',
    'prison',
    'private',
    'public',
    'pyroclastic',
    'range',
    'reactor',
    'refinery',
    'ridge'
    'rift',
    'road',
    'rock',
    'site',
    'slope',
    'tailings',
    'terrace',
    'transport',
    'tunnel',
    'valley',
    'volcano',
    'water',
    'zone',
]


In [9]:
voting_sheet = 'https://docs.google.com/spreadsheets/d/1--lz0fJP7LPpf8tJED0ij7W5LIzgYGCC/export?format=tsv&gid=154155133'

In [10]:
voting_frame = pd.read_csv(voting_sheet, sep='\t')

In [11]:
force_back_out_max_vote_sum = -2
force_back_out_min_iaa = 0.4

In [12]:
force_back_out_frame = voting_frame[
    (voting_frame['vote_sum'] <= force_back_out_max_vote_sum) & 
    (voting_frame['IAA_score'] >= force_back_out_min_iaa)
]

In [13]:
force_back_out = force_back_out_frame['label'].tolist()

In [14]:
force_back_in_min_vote_sum = 2
force_back_in_min_iaa = 0.4

In [15]:
force_back_in_frame = voting_frame[
    (voting_frame['vote_sum'] >= force_back_in_min_vote_sum) & 
    (voting_frame['IAA_score'] >= force_back_in_min_iaa)
]

In [16]:
force_back_in = force_back_in_frame['label'].tolist()

In [17]:
gold_inferences_sheet = '../gold/gold-soils-by-semsql-wide.tsv'

In [18]:
gold_inferences_frame = pd.read_csv(gold_inferences_sheet, sep='\t')

In [19]:
gold_non_local_inferences = set(pd.concat([gold_inferences_frame['env_broad_label'], gold_inferences_frame['env_local_label']]))


In [20]:
gold_substring_exclusion_list = ['biome', 'environmental system']

In [21]:
gold_add_back_in = {s for s in gold_non_local_inferences if not any(sub in s for sub in gold_substring_exclusion_list)}


In [22]:
force_back_in.extend(gold_add_back_in)

In [23]:
output_file = 'post_google_sheets_soil_env_local_scale.tsv'

In [24]:
adapter = get_adapter(adapter_string)

In [25]:
targets = set()

In [26]:
# accumulate subclasses of the target classes (by label)
# 
for current_label in target_labels:
    for class_id in adapter.basic_search(current_label):
        descendant = adapter.descendants(class_id, predicates=[IS_A])
        for d in descendant:
            targets.add(d)

print(len(targets))

5132


In [27]:
# write the targets
#
with open('post_google_sheets_soil_env_local_scale_initial.tsv', 'w') as f:
    for t in targets:
        f.write(t + '\t' + adapter.label(t) + '\n')

In [28]:
# deplete subclasses of the exclusion classes (by label)
# 

depletion_log = []

for current_label in exclude_by_label_over_is_a_and_parts:
    for class_id in adapter.basic_search(current_label):
        descendant = adapter.descendants(class_id,
                                         predicates=[IS_A, PART_OF])  # todo be careful with part of assumptions!
        for d in descendant:
            if d in targets:
                depletion_line = {
                    'removed': adapter.label(d),
                    'reason': current_label
                }
                # print(log_line)
                depletion_log.append(depletion_line)
                targets.remove(d)

for current_label in exclusions_by_label_over_is_a:
    for class_id in adapter.basic_search(current_label):
        descendant = adapter.descendants(class_id, predicates=[IS_A])  # todo be careful with part of assumptions!
        for d in descendant:
            if d in targets:
                depletion_line = {
                    'removed': adapter.label(d),
                    'reason': current_label
                }
                # print(log_line)
                depletion_log.append(depletion_line)
                targets.remove(d)

# for current_label in keep_base_only:
targets_dupe = targets.copy()
for t in targets_dupe:
    label = adapter.label(t)
    label_words = label.split(' ')
    for kbo in keep_base_only:
        if kbo in label_words and len(label_words) > 1:
            depletion_line = {
                'removed': label,
                'reason': f"{t}'s label {label} contains but is not limited to {kbo}"
            }
            depletion_log.append(depletion_line)
            if t in targets:
                targets.remove(t)

print(len(targets))

505


In [29]:
# # write the lines in log to a TSV file 'sels_depletion.tsv'' with csv's dictwriter
# #
# with open('sels_depletion.tsv', 'w') as f:
#     writer = csv.DictWriter(f, fieldnames=['removed', 'reason'])
#     writer.writeheader()
#     for line in depletion_log:
#         writer.writerow(line)


In [30]:
# remove curies from the targets if their prefix, when splitting on colon, isn't in the acceptable prefixes
for t in list(targets):
    if t.split(':')[0] not in acceptable_prefixes:
        targets.remove(t)
print(len(targets))

229


In [31]:
# remove curies from the targets if their label contains any of the exclusion fragments
for t in list(targets):
    for fragment in exclude_by_superstring_over_is_a:
        if fragment in adapter.label(t):
            if t in targets:
                targets.remove(t)
print(len(targets))

169


In [32]:
for fbi in force_back_in: 
    # add tracking of fbis that don't map to any terms, fbis that already are in targets...
    fbi_count = 0
    for class_id in adapter.basic_search(fbi):
        fbi_count += 1
        if class_id not in targets:
            print(f"Adding {fbi} to targets")
            targets.add(class_id)
        else:
            print(f'{fbi} already in targets')
    if fbi_count == 0:
        print(f"Can't find {fbi} in adapter")

Adding river to targets
Adding stream to targets
Adding spring to targets
Adding marsh to targets
Adding fjord to targets
Adding peatland to targets
cave already in targets
mine already in targets
mountain already in targets
hill already in targets
beach already in targets
caldera already in targets
Adding glacier to targets
dune already in targets
isthmus already in targets
Adding karst to targets
plateau already in targets
Adding fen to targets
volcano already in targets
Adding prairie to targets
Adding steppe to targets
drainage basin already in targets
crater already in targets
landfill already in targets
park already in targets
Adding rhizosphere to targets
channel already in targets
Adding active permafrost layer to targets
Adding flood plain to targets
river already in targets
Adding farm to targets
Adding meadow ecosystem to targets
Adding forest ecosystem to targets
desert already in targets
Adding scrubland area to targets
Adding soil biocrust to targets
garden already in tar

In [33]:
for fbo in force_back_out:
    for class_id in adapter.basic_search(fbo):
        if class_id in targets:
            print(f"Removing {fbo} from targets")
            targets.remove(class_id)
        else:
            print(f'{fbo} not in targets')
            
# these won't show up in the previously closed sels_depletion.tsv

fertilizer not in targets
geographic feature not in targets
bedding-plane cave not in targets
anthropogenic geographic feature not in targets
administrative region not in targets
first-order administrative region not in targets
second-order administrative region not in targets
third-order administrative region not in targets
fourth-order administrative region not in targets
national geopolitical entity not in targets
hydrographic feature not in targets
ocean not in targets
sea not in targets
saline lake not in targets
watercourse not in targets
irrigation canal not in targets
lagoon not in targets
volcanic crater lake not in targets
saline marsh not in targets
underground stream not in targets
man-made tunnel not in targets
aquaduct not in targets
building not in targets
Removing agricultural ecosystem from targets
mountain range not in targets
hill range not in targets
sea beach not in targets
volcanic feature not in targets
desert area not in targets
heath not in targets
obsolete cul

In [34]:
labelled_targets = []
for t in targets:
    labelled_targets.append(
        {
            'id': t,
            'label': adapter.label(t)
        }
    )

In [35]:
export_frame = pd.DataFrame(labelled_targets)

export_frame.to_csv(output_file, sep='\t', index=False)