In [1]:
from oaklib import get_adapter
from oaklib.datamodels.vocabulary import IS_A

import pandas as pd 

The environmental triad squad voted on which classes to include in the soil env_medium value set. The outcome was to include all subclasses of the soil class, except for enriched soils.

In [2]:
# Opt in to pandas' future behaviour of not silently downcasting object results
# (relevant to the fillna(False) on 'relation_is_reasonable' later in this notebook).
pd.set_option('future.no_silent_downcasting', True)

In [3]:
# OAK adapter selector; handed to get_adapter() further down to open the ontology.
adapter_string = "sqlite:obo:envo"

In [4]:
# Labels of the classes whose is-a descendants are accumulated into the value set.
target_labels = ["soil"]

In [5]:
# Labels of the classes whose is-a descendants are removed from the value set afterwards.
exclusion_labels = ["enriched soil"]

In [6]:
# Path of the tab-separated (id, label) file written by the last cell.
output_file = "post_google_sheets_soil_env_medium.tsv"

We also exclude soils that are differentiated by a relationship with something that's in the env_local_scale value set, like 'orchard soil' and 'orchard'. See discover_excludable_soils.ipynb.

In [7]:
# Tab-separated table of candidate soil exclusions, read into excludable_soils_frame below.
# NOTE(review): presumably a hand-curated export from discover_excludable_soils.ipynb — confirm.
excludable_soils_filename = "discover_excludable_soils_curated.tsv"

In [8]:
# Load the table of candidate exclusions (tab-separated).
excludable_soils_frame = pd.read_csv(excludable_soils_filename, sep="\t")

# Missing 'relation_is_reasonable' values are treated as False ("not reasonable").
relation_flags = excludable_soils_frame['relation_is_reasonable']
excludable_soils_frame['relation_is_reasonable'] = relation_flags.fillna(False)

In [9]:
# Keep a row when its relation is flagged reasonable, or when the override column
# explicitly says 'exclude soil'.
relation_flagged_reasonable = excludable_soils_frame['relation_is_reasonable']
override_says_exclude = excludable_soils_frame['override'] == 'exclude soil'
reasonable_frame = excludable_soils_frame[relation_flagged_reasonable | override_says_exclude]


In [10]:
# Drop rows whose override is 'keep soil', then rows missing either half of the
# (sole_reasonable_other, sole_soil) pairing.
override_is_not_keep = reasonable_frame['override'] != 'keep soil'
reasonable_frame = (
    reasonable_frame[override_is_not_keep]
    .dropna(subset=['sole_reasonable_other', 'sole_soil'])
)

In [11]:
# Tab-separated env_local_scale value set for soil, read into soil_env_local_scale_frame below.
soil_env_local_scale_filename = "../env_local_scale/post_google_sheets_soil_env_local_scale.tsv"

In [12]:
# Load the env_local_scale value set; its 'label' column is matched against
# 'sole_reasonable_other' in the next step.
soil_env_local_scale_frame = pd.read_csv(soil_env_local_scale_filename, sep="\t")

In [13]:
# A soil row justifies exclusion when its non-soil partner is already a label in the
# env_local_scale value set, or when the override column says 'exclude soil'.
partner_in_local_scale = reasonable_frame['sole_reasonable_other'].isin(
    soil_env_local_scale_frame['label']
)
override_forces_exclusion = reasonable_frame['override'] == 'exclude soil'
soils_with_local_or_override_exclusion_justification = reasonable_frame[
    partner_in_local_scale | override_forces_exclusion
]

The `sole_soil` column in this dataframe shows the soil subclasses that can be excluded, either because a related class has been included in the soil env_local_scale value set or because the row carries an `exclude soil` override.

In [14]:
# Display the rows that justify excluding a soil class (rendered as this cell's output).
soils_with_local_or_override_exclusion_justification

Unnamed: 0,s,s_is_soil,s_is_reasonable_other,slab,p,plab,p_lab_or_id,relation_is_reasonable,o,o_is_soil,o_is_reasonable_other,olab,slab_olab_cosine,sole_reasonable_other,sole_soil,override
53,ENVO:00002260,True,False,dune soil,BFO:0000050,part of,part of,True,ENVO:00000170,False,True,dune,0.666667,dune,dune soil,
59,ENVO:00002263,True,False,garden soil,BFO:0000050,part of,part of,True,ENVO:00000011,False,True,garden,0.738549,garden,garden soil,
60,ENVO:00002263,True,False,garden soil,RO:0001025,located in,located in,True,ENVO:00000011,False,True,garden,0.738549,garden,garden soil,
108,ENVO:00005743,True,False,roadside soil,RO:0001025,located in,located in,True,ENVO:01000447,False,True,roadside,0.784465,roadside,roadside soil,
126,ENVO:00005755,True,False,field soil,BFO:0000050,part of,part of,True,ENVO:00000114,False,True,agricultural field,0.596285,agricultural field,field soil,
156,ENVO:00005772,True,False,orchard soil,BFO:0000050,part of,part of,True,ENVO:00000115,False,True,orchard,0.763763,orchard,orchard soil,exclude soil
159,ENVO:00005773,True,False,pasture soil,RO:0001025,located in,located in,True,ENVO:00000266,False,True,pasture,0.763763,pasture,pasture soil,
168,ENVO:00005777,True,False,steppe soil,RO:0001025,located in,located in,True,ENVO:00000262,False,True,steppe,0.738549,steppe,steppe soil,
209,ENVO:02000138,True,False,mangrove biome soil,RO:0001025,located in,located in,True,ENVO:01000181,False,False,mangrove biome,0.858395,mangrove biome,mangrove biome soil,exclude soil
227,ENVO:00000044,False,True,peatland,RO:0002473,composed primarily of,composed primarily of,True,ENVO:00005774,True,False,peat soil,0.589256,peatland,peat soil,


In [15]:
# Distinct soil labels to leave out of the value set; used for membership tests
# while accumulating descendants.
excluded_soil_labels = soils_with_local_or_override_exclusion_justification['sole_soil']
soils_reiterating_broad_or_local = set(excluded_soil_labels.unique())


In [16]:
# Open the ontology named by adapter_string; used below for search, descendant and label lookups.
adapter = get_adapter(adapter_string)

In [17]:
# Accumulator for the class ids that will make up the value set.
targets = set()

Accumulate subclasses of the target classes, by label. (Doing it by CURIE would be faster and require less code.)

In [18]:
# For every class found by searching a target label, walk its is-a descendants and
# collect each one unless its label is in the set of soils to leave out.
for target_label in target_labels:
    for root_id in adapter.basic_search(target_label):
        for descendant_id in adapter.descendants(root_id, predicates=[IS_A]):
            descendant_label = adapter.label(descendant_id)
            if descendant_label in soils_reiterating_broad_or_local:
                # Report the skip so the exclusions are visible in the cell output.
                print(f"Excluding {descendant_id} {descendant_label} because its differentia can be expressed in terms of the broad or local scale")
                continue
            targets.add(descendant_id)

print(len(targets))

Excluding ENVO:00005743 roadside soil because its differentia can be expressed in terms of the broad or local scale
Excluding ENVO:00002263 garden soil because its differentia can be expressed in terms of the broad or local scale
Excluding ENVO:00005773 pasture soil because its differentia can be expressed in terms of the broad or local scale
Excluding ENVO:00005772 orchard soil because its differentia can be expressed in terms of the broad or local scale
Excluding ENVO:00002260 dune soil because its differentia can be expressed in terms of the broad or local scale
Excluding ENVO:02000138 mangrove biome soil because its differentia can be expressed in terms of the broad or local scale
Excluding ENVO:00005774 peat soil because its differentia can be expressed in terms of the broad or local scale
Excluding ENVO:00005755 field soil because its differentia can be expressed in terms of the broad or local scale
Excluding ENVO:00005780 greenhouse soil because its differentia can be expressed 

Remove subclasses of the exclusion classes, again by label.

In [19]:

# Remove every is-a descendant of each exclusion class from the accumulated targets.
for current_label in exclusion_labels:
    for class_id in adapter.basic_search(current_label):
        # difference_update (rather than remove) tolerates ids that are not in `targets`:
        # a descendant may already have been left out by the label filter when
        # accumulating, or may be reached again through a second search hit. With
        # set.remove either case would raise KeyError and abort the cell.
        targets.difference_update(adapter.descendants(class_id, predicates=[IS_A]))

print(len(targets))

85


In [20]:
# Pair each remaining class id with its label. Iterating in sorted id order makes the
# exported file reproducible: plain set iteration order for strings can differ
# between kernel sessions, which would reshuffle the rows on every run.
labelled_targets = [
    {"id": target_id, "label": adapter.label(target_id)}
    for target_id in sorted(targets)
]

# Pin the columns so the header is written even if no targets remain.
export_frame = pd.DataFrame(labelled_targets, columns=["id", "label"])

# Write the value set as a tab-separated file without the pandas index.
export_frame.to_csv(output_file, sep="\t", index=False)
