In [1]:
from oaklib import get_adapter
from oaklib.datamodels.vocabulary import IS_A
import csv # don't really need csv if we are importing pandas

import pandas as pd 

In [2]:
# Opt in to pandas' future behaviour in which operations such as fillna() no
# longer silently downcast object-dtype results. This notebook calls
# fillna(False) on 'relation_is_reasonable' right after loading the curated TSV.
pd.set_option('future.no_silent_downcasting', True)

In [3]:
# Selector string handed to get_adapter() further down to open the ontology.
# NOTE(review): presumably resolves to ENVO through OAK's SQLite backend - confirm against the OAK selector docs.
adapter_string = "sqlite:obo:envo"

In [4]:
# Search terms for the accumulation loop: every class returned by
# adapter.basic_search() for one of these, plus its is-a descendants, is a
# candidate for `targets`.
target_labels = ["soil"]

In [5]:
# Search terms for the depletion loop: every class returned by
# adapter.basic_search() for one of these, plus its is-a descendants, is removed
# from `targets`.
exclusion_labels = ["enriched soil"]

In [6]:
# Destination of the tab-separated id/label table written by the final cell
# (path is relative to the notebook's working directory).
output_file = "post_google_sheets_soil_env_medium.tsv"

In [7]:
# Tab-separated input loaded into `excludable_soils_frame`. The notebook reads
# its 'relation_is_reasonable', 'override', 'sole_reasonable_other' and
# 'sole_soil' columns.
excludable_soils_filename = "../discover_excludable_soils_curated.tsv"

In [8]:
# Load the soil-relation table from its TSV.
excludable_soils_frame = pd.read_csv(excludable_soils_filename, sep="\t")

# A blank 'relation_is_reasonable' cell is treated as False so the column can
# be used directly as a row mask in the filtering cells.
relation_flags = excludable_soils_frame['relation_is_reasonable'].fillna(False)
excludable_soils_frame['relation_is_reasonable'] = relation_flags

In [9]:
# Keep a row when its relation was marked reasonable, or when its override
# column explicitly asks for the soil to be excluded.
relation_marked_reasonable = excludable_soils_frame['relation_is_reasonable']
override_requests_exclusion = excludable_soils_frame['override'] == 'exclude soil'
reasonable_frame = excludable_soils_frame[
    relation_marked_reasonable | override_requests_exclusion
]


In [10]:
# Drop rows whose override says the soil must be kept, then drop rows missing
# either of the two columns the exclusion logic reads downstream.
reasonable_frame = (
    reasonable_frame
    .loc[lambda frame: frame['override'] != 'keep soil']
    .dropna(subset=['sole_reasonable_other', 'sole_soil'])
)

In [11]:
# Tab-separated input loaded into `soil_env_local_scale_frame`. Only its
# 'label' column is read by this notebook.
soil_env_local_scale_filename = "../env_local_scale/post_google_sheets_soil_env_local_scale.tsv"

In [12]:
# Load the env_local_scale table. Its 'label' values are matched against
# 'sole_reasonable_other' when deciding which soils to exclude.
soil_env_local_scale_frame = pd.read_csv(soil_env_local_scale_filename, sep="\t")

In [13]:
# A row justifies excluding its soil when the related non-soil term is one of
# the env_local_scale labels, or when the override column forces the exclusion.
other_term_is_local_scale = reasonable_frame['sole_reasonable_other'].isin(
    soil_env_local_scale_frame['label']
)
exclusion_forced_by_override = reasonable_frame['override'] == 'exclude soil'
soils_with_local_or_override_exclusion_justification = reasonable_frame[
    other_term_is_local_scale | exclusion_forced_by_override
]

In [14]:
# Distinct 'sole_soil' values from the justified rows, held as a set so the
# accumulation loop can test class labels for membership.
soils_reiterating_broad_or_local = set(soils_with_local_or_override_exclusion_justification['sole_soil'].unique())


In [15]:
# Open the ontology named by `adapter_string`. Every later search, descendant
# walk and label lookup goes through this adapter.
adapter = get_adapter(adapter_string)

In [16]:
# Class identifiers selected for export. Filled by the accumulation loop and
# trimmed by the exclusion loop. Re-run this cell before re-running those loops
# to start from an empty set.
targets = set()

In [17]:
# Gather the is-a descendants of every class found for each target label.
# A descendant whose label appears in `soils_reiterating_broad_or_local` is
# reported together with its justification rows and left out of `targets`.
for target_label in target_labels:
    for matched_class_id in adapter.basic_search(target_label):
        for term_id in adapter.descendants(matched_class_id, predicates=[IS_A]):
            # Look the label up once; it is needed for the test and both prints.
            term_label = adapter.label(term_id)
            if term_label in soils_reiterating_broad_or_local:
                print(
                    f"Excluding {term_id} {term_label} because its differentia can be expressed in terms of the broad or local scale")
                justification_rows = soils_with_local_or_override_exclusion_justification[
                    soils_with_local_or_override_exclusion_justification['sole_soil'] == term_label]
                print(justification_rows)
                continue
            targets.add(term_id)

print(len(targets))

Excluding ENVO:00002263 garden soil because its differentia can be expressed in terms of the broad or local scale
                s s_is_soil s_is_reasonable_other         slab            p  \
59  ENVO:00002263      True                 False  garden soil  BFO:0000050   
60  ENVO:00002263      True                 False  garden soil   RO:0001025   

          plab p_lab_or_id relation_is_reasonable              o o_is_soil  \
59     part of     part of                   True  ENVO:00000011     False   
60  located in  located in                   True  ENVO:00000011     False   

   o_is_reasonable_other    olab  slab_olab_cosine sole_reasonable_other  \
59                  True  garden          0.738549                garden   
60                  True  garden          0.738549                garden   

      sole_soil override  
59  garden soil      NaN  
60  garden soil      NaN  
Excluding ENVO:00005773 pasture soil because its differentia can be expressed in terms of the broad or 

In [18]:
# Remove from `targets` the is-a descendants of every class found for each
# exclusion label.
#
# difference_update() is used instead of remove() because a descendant may
# legitimately be absent from `targets`: it may have been filtered out by the
# accumulation loop, may not sit under any target class, or may already have
# been removed by an earlier run of this cell. remove() raises KeyError in all
# of those cases; difference_update() silently ignores missing members, which
# also makes the cell safe to re-run.
for current_label in exclusion_labels:
    for class_id in adapter.basic_search(current_label):
        targets.difference_update(adapter.descendants(class_id, predicates=[IS_A]))

print(len(targets))

86


In [19]:
# Build one {"id", "label"} record per selected class and write them as TSV.
#
# `targets` is a set of identifier strings, whose iteration order can differ
# between kernel sessions. Iterating in sorted order keeps the exported file
# stable from run to run, so diffs show real content changes only.
labelled_targets = [
    {"id": target_id, "label": adapter.label(target_id)}
    for target_id in sorted(targets)
]

# Naming the columns explicitly keeps the header row present even when
# `targets` is empty (a DataFrame built from an empty list has no columns).
export_frame = pd.DataFrame(labelled_targets, columns=["id", "label"])

export_frame.to_csv(output_file, sep="\t", index=False)
