In [None]:
# Notebook configuration.
# When a `snakemake` object is present in the notebook's globals, every setting
# is read from it; otherwise the hard-coded defaults below are used so the
# notebook can also be run on its own.
if "snakemake" in globals():
    print(dict(snakemake.params))
    records_path = snakemake.output[0]
    kingdom = snakemake.params.kingdom
    phyla = snakemake.params.phyla
    records_per_family = snakemake.params.records_per_family
    families_per_kingdom = snakemake.params.families_per_kingdom
else:
    # The download cell opens `records_path` for writing, so it needs a value
    # here too; without it a standalone run fails with a NameError at the end.
    records_path = "records.jsonl"
    kingdom = "animalia"
    phyla = {"tardigrada", "xenacoelomorpha"}
    records_per_family = 120
    families_per_kingdom = 7

In [None]:
# Total number of records to collect for the kingdom: every selected family
# contributes exactly `records_per_family` records.
records_per_kingdom = records_per_family * families_per_kingdom
print("Records per kingdom:", records_per_kingdom)
print("Families per kingdom:", families_per_kingdom)

# Average number of families each phylum would contribute if the families were
# split perfectly evenly. This is a float and can be fractional; the later
# visible cells compute the actual integer allocation separately.
phyla_in_kingdom = len(phyla)
families_per_phylum = (records_per_kingdom / phyla_in_kingdom) / records_per_family
print("Number of phyla:", phyla_in_kingdom)
print("Families per phylum:", families_per_phylum)

Records per kingdom: 840
Families per kingdom: 7
Families per phylum: 3.5


In [12]:
import json
import math
from urllib.parse import quote, urlencode

import numpy as np
import requests

# Root URL of the iDigBio search API used by every query in this notebook.
api = "https://beta-search.idigbio.org/v2"

def _make_summary_query(rq, top_fields, count):
    """
    Build the URL for a "top records" summary request against the module-level `api`.

    Each argument is serialized as JSON and sent as the query parameter of the
    same name (`rq`, `top_fields`, `count`).

    The JSON is written with compact separators and then percent-encoded.
    Only structural whitespace is removed, so string values that contain
    spaces (or characters such as `&` and `#`) reach the server intact.
    """
    query_string = urlencode(
        {
            "rq": json.dumps(rq, separators=(",", ":")),
            "top_fields": json.dumps(top_fields, separators=(",", ":")),
            "count": json.dumps(count),
        },
        quote_via=quote,
    )
    return f"{api}/summary/top/records/?{query_string}"

def get_idigbio_summary(rq, top_fields, count, verbose=False, timeout=120):
    """
    Run a "top records" summary query and return the part of the response
    stored under the first entry of `top_fields`.

    Parameters:
        rq: record query, passed through to the URL builder.
        top_fields: list of field names to summarize by; the first one is also
            the key read from the JSON response.
        count: passed through as the `count` query parameter.
        verbose: when true, print the URL before sending the request.
        timeout: seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within `timeout`.
    """
    url = _make_summary_query(rq, top_fields, count)
    if verbose:
        print(f"Sending GET to {url}")

    response = requests.get(url, timeout=timeout)
    # Fail on the HTTP error itself rather than on a confusing KeyError or
    # JSON decoding error further down.
    response.raise_for_status()
    return response.json()[top_fields[0]]

def _make_records_query(rq, limit, api):
    """
    Build the URL for a record search against the API rooted at `api`.

    `rq` and `limit` are serialized as JSON and sent as the query parameters of
    the same name.

    The JSON is written with compact separators and then percent-encoded.
    Only structural whitespace is removed, so string values that contain
    spaces (or characters such as `&` and `#`) reach the server intact.
    """
    # NOTE: no `sort` parameter or UUID range filter is sent. Sorting by UUID
    # does not work on the beta API, so records come back in the server's
    # default order.
    query_string = urlencode(
        {
            "rq": json.dumps(rq, separators=(",", ":")),
            "limit": json.dumps(limit),
        },
        quote_via=quote,
    )
    return f"{api}/search/records/?{query_string}"

# limit=5000 is the max allowed by the API
def get_idigbio_records(rq, limit=5000, verbose=False, api=api, timeout=120):
    """
    Run a record search and return the list stored under "items" in the response.

    Parameters:
        rq: record query, passed through to the URL builder.
        limit: maximum number of records to request.
        verbose: when true, print the URL before sending the request.
        api: root URL of the search API; defaults to the module-level `api`
            as it was when this function was defined.
        timeout: seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within `timeout`.
    """
    url = _make_records_query(rq, limit, api)
    if verbose:
        print(f"Sending GET to {url}")

    response = requests.get(url, timeout=timeout)
    # Fail on the HTTP error itself rather than on a confusing KeyError or
    # JSON decoding error further down.
    response.raise_for_status()
    return response.json()["items"]

## Select families to sample from

In [13]:
def redistribute_group_counts(group_sizes, desired_total_size):
    """
    Shrink group sizes so that together they add up to at most `desired_total_size`.

    Groups are visited from smallest to largest (ties broken by key). Each one
    receives either its own size, rounded down, or an equal share of what is
    still unallocated, rounded down, whichever is smaller. Whatever a small
    group cannot use is therefore passed on to the larger groups after it.

    Returns a new dict ordered smallest group first; the input is not modified.
    """
    smallest_first = sorted(group_sizes.items(), key=lambda item: (item[1], item[0]))

    allocation = {}
    allocated_so_far = 0
    groups_left = len(smallest_first)
    for name, size in smallest_first:
        # Equal split of the unallocated amount among the groups not yet served
        fair_share = math.floor((desired_total_size - allocated_so_far) / groups_left)
        granted = min(math.floor(size), fair_share)

        allocation[name] = granted
        allocated_so_far += granted
        groups_left -= 1

    return allocation

In [14]:
# Ask the API for record counts broken down by phylum and then by family,
# restricted to the configured kingdom/phyla and to records that have a family.
# Later cells read the result as summary[phylum]["family"][family]["itemCount"].
# NOTE(review): `count=1000` is forwarded as the `count` query parameter —
# presumably the maximum number of values returned per field; confirm against
# the API docs if a phylum could have more families than that.
summary = get_idigbio_summary(
    rq={"kingdom": kingdom, "phylum": list(phyla), "family": {"type": "exists"}},
    top_fields=["phylum", "family"],
    count=1000,
    verbose=False
)

In [15]:
# A family name can show up under several phyla. Give each family to the phylum
# where it has the most records (the first phylum seen wins a tie), and remember
# that record count.
family_phylum_assignments = {}
family_record_counts = {}

for phylum, data in summary.items():
    for family, family_info in data["family"].items():
        count = family_info["itemCount"]
        # Keep the existing assignment unless this phylum has strictly more records
        if count <= family_record_counts.get(family, 0):
            continue
        family_phylum_assignments[family] = phylum
        family_record_counts[family] = count

In [16]:
# Lowercased family names that must never be selected
bad_family_names = {""}

def get_family_candidates(phylum, phylum_family_counts):
    """
    Return {family: record count} for the families of `phylum` that can be sampled.

    A family qualifies when its lowercased name is not in `bad_family_names`,
    it has at least `records_per_family` records, and `family_phylum_assignments`
    assigns it to this phylum.
    """
    candidates = {}
    for family, info in phylum_family_counts.items():
        if family.lower() in bad_family_names:
            continue

        record_count = info["itemCount"]
        if record_count < records_per_family:
            continue

        # Checked last: only families that passed the size filter are looked up
        if family_phylum_assignments[family] != phylum:
            continue

        candidates[family] = record_count
    return candidates

# 1. For each phylum
# - Get family candidates ({family: record count} of the families eligible for sampling)
# - Count family candidates
phylum_family_candidates = {phylum: get_family_candidates(phylum, data["family"]) for phylum, data in summary.items()}
phylum_family_counts = {phylum: len(family_counts) for phylum, family_counts in phylum_family_candidates.items()}

# 2. Sort phyla by family count, ascending (ties broken by phylum name)
phylum_family_counts = dict(sorted(phylum_family_counts.items(), key=lambda kv: (kv[1], kv[0])))

# 3. Allocate family counts to phyla: spread `families_per_kingdom` as evenly as possible.
#    Shares are rounded down, so any remainder goes to the phyla with the most candidate families.
balanced_phylum_family_counts = redistribute_group_counts(phylum_family_counts, families_per_kingdom)
# Display the allocation as the cell output
balanced_phylum_family_counts

{'xenacoelomorpha': 3, 'tardigrada': 4}

In [17]:
def select_families_in_phylum(phylum, num_families_to_choose):
    """
    Pick up to `num_families_to_choose` candidate families of `phylum`, spread
    across the range of family sizes.

    The candidates' record counts are split into `num_families_to_choose`
    equal-width bins. The number of families to take is balanced across the
    bins with `redistribute_group_counts`. Within a bin, families are taken in
    the iteration order of `phylum_family_candidates[phylum]`.

    Returns a set of family names. The set is empty when no families are
    requested or the phylum has no candidates.
    """
    candidate_record_counts = phylum_family_candidates[phylum]
    counts = list(candidate_record_counts.values())

    # Nothing to choose, or nothing to choose from: min()/max() below would
    # raise on an empty list, and zero bins cannot form a histogram.
    if num_families_to_choose <= 0 or not counts:
        return set()

    # Bin families by record count. The edges are padded by 1 on each side so
    # every count falls strictly inside the outer edges.
    num_bins = num_families_to_choose
    bin_edges = np.linspace(min(counts) - 1, max(counts) + 1, num_bins + 1)
    bin_totals, _ = np.histogram(counts, bin_edges)

    # Decide how many families to take from each bin
    balanced_bin_totals_map = redistribute_group_counts(dict(enumerate(bin_totals)), num_families_to_choose)
    balanced_bin_totals = [balanced_bin_totals_map[i] for i in range(num_bins)]

    # Select families for each bin
    family_bin_assignments = np.digitize(counts, bin_edges) - 1
    family_names = np.array(list(candidate_record_counts))

    selected_families = set()
    for bin_index in range(num_bins):
        families_in_bin = family_names[family_bin_assignments == bin_index]
        # Convert numpy strings back to plain `str`
        selected_families.update(map(str, families_in_bin[:balanced_bin_totals[bin_index]]))

    return selected_families

In [20]:
# Choose the families for every configured phylum
phylum_families = {
    phylum_name: select_families_in_phylum(phylum_name, balanced_phylum_family_counts[phylum_name])
    for phylum_name in phyla
}

# Make sure we got the right counts: per phylum first...
selected_family_counts_by_phylum = {
    phylum_name: len(chosen) for phylum_name, chosen in phylum_families.items()
}
assert selected_family_counts_by_phylum == balanced_phylum_family_counts

# ...then overall, counting each family name once
all_selected_families = {
    family_name for chosen in phylum_families.values() for family_name in chosen
}
total_num_selected_families = len(all_selected_families)
assert total_num_selected_families == families_per_kingdom

phylum_families

{'xenacoelomorpha': {'mecynostomidae', 'proporidae', 'xenoturbellidae'},
 'tardigrada': {'echiniscidae', 'hypsibiidae', 'macrobiotidae', 'milnesiidae'}}

## Download records for each family

In [None]:
from tqdm.notebook import tqdm
import random

# Dedicated, seeded generator: the same API responses always yield the same
# sample, and the global `random` state is left untouched.
sampling_rng = random.Random(0)

# Write one JSON document per line: `records_per_family` records for every selected family.
with open(records_path, "w") as f:
    # Iterate in sorted order. The selected families are stored in sets, whose
    # iteration order can change between runs and would otherwise change both
    # the output order and which random draws each family receives.
    for phylum, families in tqdm(sorted(phylum_families.items(), key=lambda kv: kv[0])):
        for family in tqdm(sorted(families)):
            # TODO: random sampling should be done by the API, but sorting by UUID (the best way I know of to shuffle) gives an error on the beta API
            available_records = get_idigbio_records(
                rq={"kingdom": kingdom, "phylum": phylum, "family": family}
            )

            # Report a short response clearly, instead of the generic
            # "sample larger than population" error from sample().
            if len(available_records) < records_per_family:
                raise ValueError(
                    f"Only {len(available_records)} records returned for family {family!r} "
                    f"in phylum {phylum!r}; {records_per_family} are required"
                )

            records = sampling_rng.sample(available_records, records_per_family)

            for record in records:
                json.dump(record, f)
                f.write("\n")

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]