In [1]:
import json
import requests as rq
import os
import time

In [2]:
# Output locations, all relative to the notebook's working directory.
base_dir = "./"
raw_dir = os.path.join(base_dir, "raw")
processed_dir = os.path.join(base_dir, "processed")
results_dir = os.path.join(base_dir, "results")

# makedirs(exist_ok=True) is idempotent (safe on re-run), creates missing
# parents if base_dir is ever changed to a nested path, and avoids the
# check-then-create race of os.path.exists() + os.mkdir().
for d in (base_dir, raw_dir, processed_dir, results_dir):
    os.makedirs(d, exist_ok=True)

In [2]:
# Fetch iDigBio's "top" summary nested kingdom -> phylum -> family.
# The field list is sent through `params` so requests handles the URL
# encoding of the JSON value, and the status is checked so an HTTP error is
# reported as such instead of as a JSON/KeyError failure further down.
response = rq.get(
    "https://search.idigbio.org/v2/summary/top/records/",
    params={"top_fields": json.dumps(["kingdom", "phylum", "family"])},
    timeout=120,
)
response.raise_for_status()
kingdoms_phyla_families = response.json()["kingdom"]

# Keep only the three kingdoms this notebook samples from.
animals_plants_fungi = {
    k: v
    for k, v in kingdoms_phyla_families.items()
    if k in ("animalia", "plantae", "fungi")
}
animals_plants_fungi.keys()

dict_keys(['animalia', 'plantae', 'fungi'])

In [14]:
# Inspect the phylum keys reported under the "animalia" kingdom.
animals_plants_fungi["animalia"]["phylum"].keys()

dict_keys(['arthropoda', 'chordata', 'mollusca', 'annelida', 'cnidaria', 'echinodermata', 'brachiopoda', 'porifera', 'bryozoa', 'arthropoda von siebold, 1848'])

In [3]:
def kpfc_gen(max_small_groups):
    """Yield one ``{"kingdom", "phylum", "family", "count"}`` dict per family.

    Walks the module-level ``animals_plants_fungi`` summary in its stored
    order. For every phylum that is not on the exclusion list, the first
    ``max_small_groups`` families are taken and each one that is not itself
    excluded is yielded with its ``itemCount``.

    Note that the truncation happens *before* the family exclusion check, so
    an excluded family among the first ``max_small_groups`` entries still
    uses up a slot and the phylum then yields fewer families.
    """
    excluded_names = frozenset((
        # Phyla
        "arthropoda von siebold, 1848",
        "flowering plants",
        "lichens",
        "amoebozoa",
        "fungi incertae sedis",
        "fungi",
        "glomeromycota",
        # Families
        "family unknown",
    ))

    for kingdom, kingdom_info in animals_plants_fungi.items():
        for phylum, phylum_info in kingdom_info["phylum"].items():
            if phylum in excluded_names:
                continue
            leading_families = list(phylum_info["family"].items())[:max_small_groups]
            for family, family_info in leading_families:
                if family in excluded_names:
                    continue
                yield dict(
                    kingdom=kingdom,
                    phylum=phylum,
                    family=family,
                    count=family_info["itemCount"],
                )

# next(kpfc_gen(max_small_groups=1))

In [7]:
# Preview the rows the generator produces with at most 3 families per phylum.
for row_number, taxon_row in enumerate(kpfc_gen(max_small_groups=3)):
    print(row_number, taxon_row)

0 {'kingdom': 'animalia', 'phylum': 'arthropoda', 'family': 'formicidae', 'count': 1477824}
1 {'kingdom': 'animalia', 'phylum': 'arthropoda', 'family': 'apidae', 'count': 1338885}
2 {'kingdom': 'animalia', 'phylum': 'arthropoda', 'family': 'carabidae', 'count': 813649}
3 {'kingdom': 'animalia', 'phylum': 'chordata', 'family': 'cricetidae', 'count': 1285131}
4 {'kingdom': 'animalia', 'phylum': 'chordata', 'family': 'plethodontidae', 'count': 688542}
5 {'kingdom': 'animalia', 'phylum': 'chordata', 'family': 'cyprinidae', 'count': 679658}
6 {'kingdom': 'animalia', 'phylum': 'mollusca', 'family': 'unionidae', 'count': 299394}
7 {'kingdom': 'animalia', 'phylum': 'mollusca', 'family': 'muricidae', 'count': 217286}
8 {'kingdom': 'animalia', 'phylum': 'mollusca', 'family': 'veneridae', 'count': 182111}
9 {'kingdom': 'animalia', 'phylum': 'annelida', 'family': 'spionidae', 'count': 41428}
10 {'kingdom': 'animalia', 'phylum': 'annelida', 'family': 'syllidae', 'count': 38088}
11 {'kingdom': 'anim

In [73]:
def make_family_records_query(kingdom: str, phylum: str, family: str):
    """Build the JSON body for an iDigBio record search on one family.

    The ``rq`` filter matches the given kingdom/phylum/family and adds a
    ``{"type": "exists"}`` condition for country, stateprovince, county,
    genus and specificepithet. A ``uuid`` range condition plus ascending
    ``uuid`` sort is included so a caller can page by raising the ``gt``
    bound. The returned dict starts at ``limit`` 5000 and ``offset`` 0; every
    call returns freshly built nested dicts.
    """
    record_filter = {"kingdom": kingdom, "phylum": phylum, "family": family}

    # Each required field gets its own dict so later mutation cannot alias.
    for required_field in ("country", "stateprovince", "county", "genus", "specificepithet"):
        record_filter[required_field] = {"type": "exists"}

    record_filter["uuid"] = {
        "type": "range",
        "gt": "00000000-0000-0000-0000-000000000000",
        "lte": "ffffffff-ffff-ffff-ffff-ffffffffffff",
    }

    return {
        "rq": record_filter,
        "sort": [{"uuid": "asc"}],
        "limit": 5000,
        "offset": 0,
    }

In [74]:
def process_record(record: dict) -> str:
    """Serialize a record to JSON, keeping only ``indexTerms`` and ``data``.

    Raises ``KeyError`` if either key is missing from ``record``.
    """
    kept_fields = {field: record[field] for field in ("indexTerms", "data")}
    return json.dumps(kept_fields)

In [80]:
def download_records(query, out_dir, file_prefix, numRecords, max_retries=None):
    """Download up to ``numRecords`` records for ``query`` as paged JSONL files.

    Pages are requested from the iDigBio record search endpoint and written to
    ``<out_dir>/<file_prefix>-NNN.jsonl`` (NNN = 1-based, zero-padded page
    number), one ``process_record`` line per record. Paging is keyset-based:
    after each page the ``rq.uuid.gt`` bound is moved to the last uuid seen.

    Args:
        query: Search body containing ``limit`` and ``rq.uuid.gt``. It is
            mutated in place (``limit`` is clamped, ``gt`` is advanced).
        out_dir: Existing directory that receives the page files.
        file_prefix: File name prefix for every page file.
        numRecords: Maximum total number of records to fetch.
        max_retries: Consecutive failed attempts (non-200 status or
            unparsable body) tolerated before giving up. ``None`` (default)
            retries forever.

    Raises:
        RuntimeError: if ``max_retries`` is set and exceeded.
    """
    page = 1
    consecutive_failures = 0
    while True:
        # Never ask for more than is still needed.
        query["limit"] = min(query["limit"], numRecords)
        response = rq.post("https://search.idigbio.org/v2/search/records/", json=query)

        # Parse BEFORE opening the output file so that a bad response does
        # not leave an empty page file behind.
        response_data = None
        if response.status_code == 200:
            try:
                response_data = response.json(strict=False)
            except ValueError:
                response_data = None

        if response_data is None:
            consecutive_failures += 1
            if max_retries is not None and consecutive_failures > max_retries:
                raise RuntimeError(
                    f"giving up on {file_prefix} page {page} after "
                    f"{consecutive_failures} failed attempts "
                    f"(last HTTP status {response.status_code})"
                )
            time.sleep(2)
            continue
        consecutive_failures = 0

        records = response_data["items"]
        page_path = os.path.join(out_dir, f"{file_prefix}-{str(page).zfill(3)}.jsonl")
        with open(page_path, "w") as outfile:
            for record in records:
                outfile.write(f"{process_record(record)}\n")
        page += 1
        numRecords -= len(records)

        # Stop on the last page, once enough records were fetched, or when
        # the server returned nothing (there is no uuid to advance past).
        if not records or response_data["itemCount"] <= query["limit"] or numRecords <= 0:
            break

        # Start the next page just after the last record received.
        query["rq"]["uuid"]["gt"] = records[-1]["uuid"]

In [81]:
# Kingdom, phylum, family counts, max 3 families per phylum
kpfcs = list(kpfc_gen(max_small_groups=3))

records_per_family = 1000

# TODO: print request URLs to file
n = len(kpfcs)
for i, kpfc in enumerate(kpfcs):
    # Look fields up by key rather than relying on the dict's value order.
    kingdom, phylum, family, familyCount = (
        kpfc[key] for key in ("kingdom", "phylum", "family", "count")
    )
    query = make_family_records_query(kingdom, phylum, family)
    print(f"{i+1}/{n}\t", kingdom, phylum, family)
    download_records(
        query,
        raw_dir,
        f"{kingdom}-{phylum}-{family}",
        records_per_family,
    )

1/71	 animalia arthropoda formicidae
2/71	 animalia arthropoda apidae
3/71	 animalia arthropoda carabidae
4/71	 animalia chordata cricetidae
5/71	 animalia chordata plethodontidae
6/71	 animalia chordata cyprinidae
7/71	 animalia mollusca unionidae
8/71	 animalia mollusca muricidae
9/71	 animalia mollusca veneridae
10/71	 animalia annelida spionidae
11/71	 animalia annelida syllidae
12/71	 animalia annelida nereididae
13/71	 animalia cnidaria acroporidae
14/71	 animalia cnidaria faviidae
15/71	 animalia cnidaria plexauridae
16/71	 animalia echinodermata amphiuridae
17/71	 animalia echinodermata ophiuridae
18/71	 animalia echinodermata holothuriidae
19/71	 animalia brachiopoda spirigerellidae
20/71	 animalia brachiopoda spiriferidae
21/71	 animalia brachiopoda rugosochonetidae
22/71	 animalia porifera chalinidae
23/71	 animalia porifera halichondriidae
24/71	 animalia porifera microcionidae
25/71	 animalia bryozoa fenestellidae
26/71	 animalia bryozoa rhabdomesidae
27/71	 animalia bryoz

In [4]:
# Flatten the nested kingdom -> phylum -> family summary into a TSV with one
# row per family, repeating the kingdom and phylum totals on each row.
tsv_header = ("kingdom", "phylum", "family", "kingdomCount", "phylumCount", "familyCount")
with open(os.path.join(processed_dir, "taxon-counts.tsv"), "w") as f:
    print(*tsv_header, sep="\t", file=f)
    for kingdom, kdata in animals_plants_fungi.items():
        for phylum, pdata in kdata["phylum"].items():
            for family, fdata in pdata["family"].items():
                print(
                    kingdom,
                    phylum,
                    family,
                    kdata["itemCount"],
                    pdata["itemCount"],
                    fdata["itemCount"],
                    sep="\t",
                    file=f,
                )