This notebook creates a dataset of iDigBio records using the following algorithm:

```
parameter: target_total_family_count
parameter: target_family_record_count

sorted_phyla <- sort phyla by *ascending* family count

num_phyla_remaining <- size(phyla)
for each phylum in sorted_phyla {
    target_family_count <- floor(target_total_family_count / num_phyla_remaining)
    family_record_counts <- count records for each family
    families <- get families under phylum with sufficient number of records

    family_bins <- place families into bins of similar record counts
    sorted_family_bins <- sort bins by *ascending* family count

    num_bins_remaining <- size(family_bins)
    num_families_remaining <- target_family_count
    for each bin in sorted_family_bins {
        target_bin_family_count <- floor(num_families_remaining / num_bins_remaining)
        sorted_families <- sort families in bin by *descending* record count
        
        family_count <- 0
        for family in sorted_families while family_count < target_bin_family_count {
            family_records <- get diverse sample of records for family
            if size(family_records) > target_family_record_count {
                save family_records
                increment family_count
            }
        }

        num_families_remaining <- num_families_remaining - family_count
        decrement num_bins_remaining
    }

    decrement num_phyla_remaining
}
```

Algorithm for sampling a diverse set of records for each family:
* Get the list of unique (species, county) pairs recorded for the family
* Collect one record for each pair

In [None]:
# Resolve the run parameters. Inside a Snakemake workflow they come from the injected
# `snakemake` object; otherwise fall back to hard-coded values for interactive/test runs.
if "snakemake" not in globals():
    records_path = "test.jsonl"
    kingdom = "plantae"
    phyla = {
        "tracheophyta",
        "bryophyta",
        "marchantiophyta",
        "rhodophyta",
        "chlorophyta",
        "charophyta",
        "anthocerotophyta",
    }
    min_records_per_family = 25
    # NOTE: only defined on this (non-Snakemake) path
    target_records_per_family = 120
    max_records_per_family = 200
    families_per_kingdom = 168
    required_fields = [
        "kingdom",
        "family",
        "genus",
        "specificepithet",
        "country",
        "stateprovince",
        "county",
    ]
else:
    print("Parameters:", dict(snakemake.params))
    records_path = snakemake.output[0]
    kingdom = snakemake.params.kingdom
    phyla = snakemake.params.phyla
    min_records_per_family = snakemake.params.min_records_per_family
    max_records_per_family = snakemake.params.max_records_per_family
    families_per_kingdom = snakemake.params.families_per_kingdom
    required_fields = snakemake.params.required_fields
    

In [None]:
# Report the overall record budget implied by the per-family bounds
print(f"Records per kingdom: between {min_records_per_family * families_per_kingdom} and {max_records_per_family * families_per_kingdom}")
print("Families per kingdom:", families_per_kingdom)

# Even split of the family budget across the configured phyla, truncated to a whole number.
# Raises ZeroDivisionError if `phyla` is empty.
phyla_in_kingdom = len(phyla)
families_per_phylum = int(families_per_kingdom / phyla_in_kingdom)
print("Number of phyla:", phyla_in_kingdom)
print("Families per phylum:", families_per_phylum)

In [None]:
import json
import requests
import math
import numpy as np
from attr import dataclass

# Base URL of the search API used by every request in this notebook
api = "https://beta-search.idigbio.org/v2"

# Query fragment requiring each configured field to exist on a record
required_fields_rq = dict(
    (field, {"type": "exists"}) for field in required_fields
)

# Baseline query merged into every request: required fields plus species-rank records only
default_rq = {**required_fields_rq, "taxonrank": "species"}

def get_idigbio_summary(rq, top_fields, count, rf=required_fields, verbose=False):
    """Request a "top values" summary of records and return the first field's breakdown.

    Args:
        rq: Record query; merged over ``default_rq``, so its keys win on conflict.
        top_fields: Ordered list of fields to summarise by. The response is indexed
            by ``top_fields[0]`` before being returned.
        count: Forwarded as the request's ``count`` value.
        rf: Accepted for compatibility; currently not used by this function.
        verbose: Accepted for compatibility; currently not used by this function.

    Returns:
        The value stored under ``top_fields[0]`` in the decoded JSON response.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        KeyError: If the decoded response has no ``top_fields[0]`` entry.
    """
    response = requests.post(f"{api}/summary/top/records", json=dict(
        rq=default_rq | rq,
        top_fields=top_fields,
        count=count
    ))
    # Fail here on an HTTP error instead of later with an unrelated-looking
    # KeyError / JSON decoding error on the error payload.
    response.raise_for_status()

    return response.json()[top_fields[0]]

# limit=5000 is the max allowed by the APIs
def get_idigbio_records(rq, limit=5000, gt_uuid=None, api=api):
    """Search records and return the decoded JSON response.

    Args:
        rq: Record query; merged over ``default_rq``, so its keys win on conflict.
        limit: Maximum number of records requested.
        gt_uuid: When given, only records whose ``uuid`` is greater than this value are
            requested and results are sorted by ``uuid`` ascending.
        api: Base URL of the search API.

    Returns:
        The decoded JSON body of the search response.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    data = {
        "rq": default_rq | rq,
        "limit": limit,
    }

    # NOTE(review): the uuid sort is only requested when gt_uuid is given, so a first page
    # fetched with gt_uuid=None comes back in the service's default order. Confirm that any
    # caller paginating with gt_uuid accounts for this.
    if gt_uuid is not None:
        data["rq"] |= {"uuid": {"type": "range", "gt": gt_uuid}}
        data |= {
            "sort": [{ "uuid": "asc" }]
        }

    response = requests.post(f"{api}/search/records", json=data)
    # Fail on an HTTP error rather than returning (or failing to decode) an error payload
    response.raise_for_status()

    return response.json()

In [None]:
from typing import NamedTuple


class SpeciesCounty(NamedTuple):
    """A (species name, county name) pair that compares by cleaned-up names.

    Two pairs are equal when the first two whitespace-separated words of their species
    names match and their county names match after dropping the words "co", "co." and
    "county". The stored ``species`` and ``county`` values are left exactly as given.
    """
    species: str
    county: str

    def _get_simplified_species_name(self):
        # Keep only the first two words (drops trailing author/year tokens)
        return " ".join(self.species.split()[:2])

    def _get_simplified_county_name(self):
        # Drop "county"-style suffix words so spelling variants compare equal
        return " ".join([s for s in self.county.split() if s not in {"co", "co.", "county"}])

    def _normalized_key(self):
        """Return the (species, county) tuple of cleaned names used for equality and hashing."""
        return (self._get_simplified_species_name(), self._get_simplified_county_name())

    def __hash__(self):
        """
        Use clean county names to test equivalence, but preserve the original uncleaned names to
        maintain record retrievability
        """
        return hash(self._normalized_key())

    def __eq__(self, value: object) -> bool:
        # Compare the cleaned names themselves rather than their hashes, so that a hash
        # collision can never make two different pairs look equal.
        return isinstance(value, self.__class__) and self._normalized_key() == value._normalized_key()

    def __ne__(self, value: object) -> bool:
        # tuple defines its own __ne__ on the raw field values; without this override
        # `a == b` and `a != b` could both be True for spelling variants of the same pair.
        return not self.__eq__(value)


# Sanity check: spelling variants of the same species/county collapse into one set member
assert 1 == len({
    SpeciesCounty("big bug", "alachua"),
    SpeciesCounty("big bug l.", "alachua co."),
    SpeciesCounty("big bug l. 1788", "alachua county"),
})

In [None]:
def get_family_species_county_pairs(kingdom, family):
    """List the distinct (species, county) pairs reported for one family.

    Requests a summary of preserved-specimen, species-rank records for ``family`` in
    ``kingdom``, broken down by phylum, then scientific name, then county, with
    ``max_records_per_family`` passed as the summary ``count``. Every
    scientific-name/county combination found under any phylum becomes a
    ``SpeciesCounty``; duplicates (by ``SpeciesCounty`` equality) are dropped.

    Returns:
        A list of unique ``SpeciesCounty`` pairs, in set iteration order.
    """
    summary_by_phylum = get_idigbio_summary(
        rq=dict(
            kingdom=kingdom,
            family=family,
            basisofrecord="preservedspecimen",
            taxonrank="species",
        ),
        top_fields=["phylum", "scientificname", "county"],
        count=max_records_per_family,
        verbose=False,
    )

    distinct_pairs = set()
    for phylum_data in summary_by_phylum.values():
        for species_name, species_data in phylum_data["scientificname"].items():
            for county_name in species_data["county"].keys():
                distinct_pairs.add(SpeciesCounty(species_name, county_name))

    return list(distinct_pairs)


# Smoke test: fetch the pairs for one family (issues a live request) and show a single
# pair as the cell output. Raises StopIteration if no pairs come back.
next(iter(get_family_species_county_pairs("plantae", "stereodontaceae")))

In [None]:
def get_record_for_species_county(kingdom: str, species_county: SpeciesCounty) -> dict:
    """Fetch a single record matching the pair's species and county within ``kingdom``.

    The pair's stored (uncleaned) ``species`` and ``county`` strings are used as the
    query values, and only one record is requested.

    Returns:
        The first entry of the search response's ``"items"``.

    Raises:
        IndexError: If ``"items"`` is empty (assuming it is a list).
    """
    query = dict(
        kingdom=kingdom,
        scientificname=species_county.species,
        county=species_county.county,
    )
    search_result = get_idigbio_records(rq=query, limit=1)
    return search_result["items"][0]


def get_species_county_records(kingdom, sc_pairs, count):
    """Lazily yield one record for each of up to ``count`` randomly chosen pairs.

    Pairs are picked without replacement by taking the first ``count`` positions of a
    random permutation of ``sc_pairs``; one record is fetched per picked pair, in that
    random order.
    """
    sampled_positions = np.random.permutation(len(sc_pairs))[:count]
    for position in sampled_positions:
        yield get_record_for_species_county(kingdom, sc_pairs[position])


# Example use: fetch one randomly chosen record for a family and show its top-level keys
sc_pairs = get_family_species_county_pairs("plantae", "stereodontaceae")
print(next(get_species_county_records("plantae", sc_pairs, 1)).keys())

In [None]:
# Per-phylum summary of families for the configured kingdom and phyla
phylum_summaries = get_idigbio_summary(
    rq=dict(kingdom=kingdom, phylum=list(phyla)),
    top_fields=["phylum", "family"],
    count=families_per_kingdom,
    verbose=False,
)

# A family may be listed under several phyla. Keep, for each family, the phylum under
# which it has the most records (the first phylum seen wins ties), together with that
# record count.
all_family_phylum_assignments = dict()
family_record_counts_by_phylum = dict()

for phylum, data in phylum_summaries.items():
    family_counts = {
        family: summary["itemCount"] for family, summary in data["family"].items()
    }

    for family, count in family_counts.items():
        # Only a strictly larger count replaces an earlier assignment
        if count <= family_record_counts_by_phylum.get(family, 0):
            continue
        all_family_phylum_assignments[family] = phylum
        family_record_counts_by_phylum[family] = count

In [None]:
def get_family_candidates(phylum, phylum_family_counts):
    """Select the families of ``phylum`` that are eligible for sampling.

    A family is kept when its lower-cased name is not a known placeholder, it is
    assigned to ``phylum`` in ``all_family_phylum_assignments``, and its ``"itemCount"``
    is at least ``min_records_per_family``.

    Args:
        phylum: Name of the phylum being processed.
        phylum_family_counts: Mapping of family name to a summary dict that carries an
            ``"itemCount"`` entry.

    Returns:
        Mapping of eligible family name to its ``"itemCount"``.
    """
    placeholder_names = {"", "\"\"", "unknown", "unplaced county"}

    candidates = {}
    for name, summary in phylum_family_counts.items():
        if name.lower() in placeholder_names:
            continue
        if all_family_phylum_assignments[name] != phylum:
            continue
        if summary["itemCount"] < min_records_per_family:
            continue
        candidates[name] = summary["itemCount"]

    return candidates


# 1. For each phylum, collect its candidate families and count them
phylum_family_candidates = {
    phylum: get_family_candidates(phylum, data["family"])
    for phylum, data in phylum_summaries.items()
}
phylum_family_counts = {
    phylum: len(candidates)
    for phylum, candidates in phylum_family_candidates.items()
}

# 2. Order phyla by ascending family count, breaking ties by phylum name
phylum_family_counts = dict(
    sorted(phylum_family_counts.items(), key=lambda item: item[::-1])
)
phylum_family_counts

In [None]:
def gen_family_bins_with_ascending_size(family_sc_counts, num_bins):
    """Group families into equal-width bins by count and yield the non-empty bins.

    The range from ``min(count) - 1`` to ``max(count) + 1`` is split into ``num_bins``
    equal-width intervals. Bins are visited from the lowest counts to the highest, and
    bins containing no family are skipped.

    Args:
        family_sc_counts: Mapping of family name to its count.
        num_bins: Number of equal-width intervals to split the count range into.

    Yields:
        One ``{family name (str): count (int)}`` dict per non-empty bin. Nothing is
        yielded when ``family_sc_counts`` is empty.
    """
    if len(family_sc_counts) == 0:
        return

    # Built once up front; the per-bin loop below only applies boolean masks to them
    family_names = np.array(list(family_sc_counts.keys()))
    sc_counts = np.array(list(family_sc_counts.values()))

    # Pad the range by one on each side so every count falls strictly inside the edges
    bin_edges = np.linspace(sc_counts.min() - 1, sc_counts.max() + 1, num_bins + 1)

    # np.digitize is 1-based for values inside the edges; shift to 0-based bin indices
    family_bin_assignments: np.ndarray = np.digitize(sc_counts, bin_edges) - 1

    for bin_index in range(num_bins):
        in_bin = family_bin_assignments == bin_index
        if in_bin.any():
            # Convert numpy scalars back to plain Python str/int
            yield dict(zip(
                map(str, family_names[in_bin]),
                map(int, sc_counts[in_bin]),
            ))

In [None]:
def iter_families_for_phylum(sc_counts_by_family, target_family_count):
    """Yield up to ``target_family_count`` family names, spread across count bins.

    Families are grouped into bins of similar count by
    ``gen_family_bins_with_ascending_size`` (using ``target_family_count`` bins). The
    non-empty bins are processed from fewest to most families; each bin may contribute
    at most ``floor(families still needed / bins still to process)`` families, chosen by
    descending count (ties broken by descending name). Quota a small bin cannot use
    therefore carries over to the bins processed after it.

    Args:
        sc_counts_by_family: Mapping of family name to its count.
        target_family_count: Maximum number of families to yield in total.

    Yields:
        Family names; never more than ``target_family_count`` of them.
    """
    num_bins = target_family_count

    # Materialise the non-empty bins so the quota is divided by the number of bins that
    # can actually contribute. Dividing by the nominal bin count while enumerating only
    # non-empty bins pins every bin's quota at one family.
    family_bins = sorted(
        gen_family_bins_with_ascending_size(sc_counts_by_family, num_bins),
        key=len,
    )
    num_families_remaining_for_phylum = target_family_count

    for j, family_bin_sc_counts in enumerate(family_bins):
        num_bins_remaining = len(family_bins) - j
        max_num_families_per_bucket = num_families_remaining_for_phylum // num_bins_remaining

        # Nothing left to hand out to this bin
        if max_num_families_per_bucket <= 0:
            continue

        # Sort families descending by [(species, county)] count
        ranked_families = sorted(
            family_bin_sc_counts.items(),
            key=lambda kv: (kv[1], kv[0]),
            reverse=True,
        )
        bin_families = [name for name, _ in ranked_families[:max_num_families_per_bucket]]
        yield from bin_families

        num_families_remaining_for_phylum -= len(bin_families)

In [None]:
# Family budget still to be spent across the phyla not yet processed
num_families_remaining = families_per_kingdom

with open(records_path, "w") as f:
    # phylum_family_counts is ordered by ascending candidate-family count, so quota that
    # a small phylum cannot use is carried over to the phyla processed after it.
    for i, phylum in enumerate(phylum_family_counts):
        # Divide by the phyla actually being iterated; using the number of configured
        # phyla would under-allocate whenever the summary returned fewer of them.
        num_phyla_remaining = len(phylum_family_counts) - i
        target_family_count = math.floor(num_families_remaining / num_phyla_remaining)

        print(f"Phylum {i+1:<2}: {phylum} (max {target_family_count} familiies)")

        # One summary request per candidate family
        species_counties_by_family = {
            family: get_family_species_county_pairs(kingdom, family)
            for family in phylum_family_candidates[phylum]
        }

        # Filter out families with too few records
        sc_counts_by_family = {
            family: len(scs)
            for family, scs in species_counties_by_family.items()
            if len(scs) >= min_records_per_family
        }

        phylum_family_count = 0
        for family in iter_families_for_phylum(sc_counts_by_family, target_family_count):
            print(f"- {family}")
            phylum_family_count += 1

            # Write each sampled record as one JSON document per line
            for record in get_species_county_records(
                    kingdom, 
                    species_counties_by_family[family],
                    max_records_per_family
                ):
                json.dump(record, f)
                f.write("\n")
        
        num_families_remaining -= phylum_family_count
        print(f"  {phylum_family_count} families sampled")