In [1]:
from oaklib import get_adapter
from oaklib.datamodels.vocabulary import IS_A
import csv # don't really need csv if we are importing pandas

import pandas as pd 

In [2]:
# Selector string handed to get_adapter() further down to build the ontology adapter.
# NOTE(review): presumably resolves to a SQLite build of the ENVO ontology —
# confirm against the OAK selector documentation.
adapter_string = "sqlite:obo:envo"

In [3]:
# Search terms for the classes to include: each one is passed to
# adapter.basic_search() and the is-a descendants of every hit are collected
# into `targets`.
target_labels = ["soil"]

In [4]:
# Search terms for the classes to exclude: the is-a descendants of every
# adapter.basic_search() hit are taken back out of `targets`.
exclusion_labels = ["enriched soil"]

In [5]:
# Destination of the tab-separated export (columns "id" and "label") written
# by the last cell of this notebook.
output_file = "post_google_sheets_soil_env_medium.tsv"

In [6]:
# Tab-separated input file that is loaded into excludable_soils_frame.
excludable_soils_filename = "discover_excludable_soils.tsv"

In [7]:
# Load the excludable-soils table. Later cells read its
# 'relation_is_reasonable', 'sole_reasonable_other' and 'sole_soil' columns.
excludable_soils_frame = pd.read_csv(excludable_soils_filename, sep="\t")

In [8]:
# Keep only the rows selected by the 'relation_is_reasonable' column, then
# drop every remaining row that has a null/NA value in any column.
reasonable_frame = excludable_soils_frame[
    excludable_soils_frame['relation_is_reasonable']
].dropna()

In [9]:
# Tab-separated input file that is loaded into soil_env_local_scale_frame.
soil_env_local_scale_filename = "post_google_sheets_soil_env_local_scale.tsv"

In [10]:
# Load the env_local_scale table; only its 'label' column is used below,
# as the set of values to match 'sole_reasonable_other' against.
soil_env_local_scale_frame = pd.read_csv(soil_env_local_scale_filename, sep="\t")

In [11]:
# Rows whose 'sole_reasonable_other' value appears among the labels of the
# env_local_scale table.
has_final_local_constraint = reasonable_frame['sole_reasonable_other'].isin(
    soil_env_local_scale_frame['label']
)
soils_with_final_local_constraints = reasonable_frame[has_final_local_constraint]

In [12]:
# Distinct 'sole_soil' values of the matched rows, as a set for fast
# membership tests against class labels in the accumulation loop.
unique_sole_soils = soils_with_final_local_constraints['sole_soil'].unique()
soils_reiterating_broad_or_local = set(unique_sole_soils)


In [13]:
# Ontology adapter used for every search, descendant and label lookup below.
adapter = get_adapter(adapter_string)

In [14]:
# Accumulator for the class ids that end up in the export; it is filled by
# the accumulation cell and then pruned by the exclusion cell.
targets = set()

In [15]:
# Accumulate the is-a descendants of every class matched by a target label,
# leaving out the classes whose label is listed in
# soils_reiterating_broad_or_local.
for current_label in target_labels:
    for class_id in adapter.basic_search(current_label):
        descendants = adapter.descendants(class_id, predicates=[IS_A])
        for d in descendants:
            # Look the label up once: it is needed for both the membership
            # test and the skip message.
            d_label = adapter.label(d)
            if d_label not in soils_reiterating_broad_or_local:
                targets.add(d)
            else:
                print(f"Skipping {d} {d_label} because its differentia can be expressed in terms of the broad or local scale")

print(len(targets))

Skipping ENVO:00005743 roadside soil because its differentia can be expressed in terms of the braod or local scale
Skipping ENVO:00002263 garden soil because its differentia can be expressed in terms of the braod or local scale
Skipping ENVO:00005758 alluvial swamp soil because its differentia can be expressed in terms of the braod or local scale
Skipping ENVO:00005773 pasture soil because its differentia can be expressed in terms of the braod or local scale
Skipping ENVO:00002260 dune soil because its differentia can be expressed in terms of the braod or local scale
Skipping ENVO:00005746 savanna soil because its differentia can be expressed in terms of the braod or local scale
Skipping ENVO:00005774 peat soil because its differentia can be expressed in terms of the braod or local scale
Skipping ENVO:00002259 agricultural soil because its differentia can be expressed in terms of the braod or local scale
Skipping ENVO:00005755 field soil because its differentia can be expressed in term

In [16]:
# Deplete the is-a descendants of every class matched by an exclusion label.
for current_label in exclusion_labels:
    for class_id in adapter.basic_search(current_label):
        # difference_update() ignores ids that are not in `targets`, whereas
        # remove() raises KeyError for them. An id can legitimately be absent:
        # it may have been skipped during accumulation, or already removed by
        # an earlier run of this cell.
        targets.difference_update(
            adapter.descendants(class_id, predicates=[IS_A])
        )

print(len(targets))

80


In [17]:
# Build one {"id", "label"} record per remaining target class. The ids are
# sorted first because set iteration order for strings is not stable between
# interpreter runs, which would otherwise reshuffle the rows of the export.
# NOTE(review): assumes the ids in `targets` are plain strings (CURIEs) — confirm.
labelled_targets = [
    {"id": t, "label": adapter.label(t)}
    for t in sorted(targets)
]

# Pin the columns so the header is still written when `targets` is empty.
export_frame = pd.DataFrame(labelled_targets, columns=["id", "label"])

export_frame.to_csv(output_file, sep="\t", index=False)
