In [8]:
import json
import requests as rq
import os
import time
import pandas as pd
import numpy as np
import requests as rq
import regex as re

In [22]:
# Root directory for generated output; the cells below cache their record-count TSV files under it.
RESULTS = "../results"

In [20]:
# Load the state/county table, rename "state" to "stateprovince", and tag
# every row with a constant country value.
raw_us = (
    pd.read_csv("../resources/us-maps/us-state-counties.tsv", sep="\t")
    .rename(columns={"state": "stateprovince"})
)
raw_us["country"] = "United States"

# Show the first row as a quick sanity check of the resulting columns.
raw_us.iloc[0]

stateprovince           Alabama
county           Autauga County
lat                     32.5322
lon                    -86.6464
country           United States
Name: 0, dtype: object

In [14]:
def trim_county_name(county):
    """Drop a trailing designator word from a county-style name.

    The last whitespace-separated word is compared, after removing non-word
    characters and lower-casing, against "co", "county", "mun", "par" and
    "prov". On a match the remaining words are returned joined by single
    spaces; otherwise ``county`` is returned unchanged. A name made of a
    single word (or no words) is never trimmed.
    """
    words = county.split()
    if len(words) <= 1:
        return county

    # Normalise the final word so that e.g. "Co." and "COUNTY" both match.
    last_word = re.sub(r'[^\w]', '', words[-1]).lower()
    if last_word not in ("co", "county", "mun", "par", "prov"):
        return county

    return " ".join(words[:-1])

In [15]:
def make_record_count_query(record):
    """Build the search payload used to count records matching ``record``.

    Every field of ``record`` is converted to a string and used as a search
    term under ``"rq"``. A ``"county"`` field, when present, is replaced by a
    ``{"type": "prefix", "value": ...}`` term holding the trimmed county name
    (presumably a prefix match in the search API - confirm against its docs).
    The payload asks for a single item at offset 0.
    """
    terms = record.astype(str).to_dict()

    if "county" in terms:
        trimmed = trim_county_name(terms["county"])
        terms["county"] = {"type": "prefix", "value": trimmed}

    query = {"rq": terms, "limit": 1, "offset": 0}
    return query

In [16]:
def get_record_count(record, timeout=30):
    """Return the ``itemCount`` the iDigBio search API reports for ``record``.

    Parameters
    ----------
    record : pandas.Series
        Field/value pairs to search on; turned into a query by
        ``make_record_count_query``.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests``. Without one a stalled
        connection would block forever, which is costly here because this
        function is applied once per row.

    Returns
    -------
    The value of the ``"itemCount"`` key of the JSON response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within ``timeout``.
    """
    query = make_record_count_query(record)
    response = rq.post(
        "http://search.idigbio.org/v2/search/records/",
        json=query,
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of a confusing KeyError/JSON error
    # when the service rejects the query or is unavailable.
    response.raise_for_status()
    return response.json()["itemCount"]

In [33]:
def get_record_counts(in_df, group_by, out_file_path):
    """Count matching records for each distinct ``group_by`` combination.

    If ``out_file_path`` already exists it is treated as a cache and simply
    read back. Otherwise the distinct combinations of the ``group_by`` columns
    of ``in_df`` are looked up one by one with ``get_record_count``, the
    result is written to ``out_file_path`` as a tab-separated file, and the
    frame is returned.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Source frame containing at least the ``group_by`` columns. It is not
        modified.
    group_by : list of str
        Columns whose distinct value combinations are counted.
    out_file_path : str
        Path of the TSV cache file.

    Returns
    -------
    pandas.DataFrame
        The ``group_by`` columns plus a ``"record count"`` column.
    """
    if os.path.exists(out_file_path):
        return pd.read_csv(filepath_or_buffer=out_file_path, sep="\t")

    # Make sure the cache file can be written *before* doing the slow remote
    # lookups, so a missing directory does not throw the work away at the end.
    out_dir = os.path.dirname(out_file_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Work on an explicit copy so the caller's frame is never touched.
    df = in_df[group_by].copy()
    if "county" in df.columns:
        df["county"] = df["county"].map(trim_county_name)

    # Deduplicate after trimming: names that differ only by their suffix
    # (e.g. "Lake County" / "Lake Co.") collapse to one row and one lookup.
    # drop_duplicates returns a new frame, so the result must be assigned.
    df = df.drop_duplicates().copy()

    df["record count"] = df.apply(get_record_count, axis=1)
    df.to_csv(out_file_path, sep="\t", index=False)
    return df

In [34]:
# Record counts for every distinct country / state / county combination.
counties = get_record_counts(
    in_df=raw_us,
    group_by=["country", "stateprovince", "county"],
    out_file_path=f"{RESULTS}/us-maps/input/record-counts-by-county.tsv",
)

In [36]:
# Record counts for every distinct country / state combination.
stateprovinces = get_record_counts(
    in_df=raw_us,
    group_by=["country", "stateprovince"],
    out_file_path=f"{RESULTS}/us-maps/input/record-counts-by-stateprovince.tsv",
)

In [53]:
# Four example taxa, each held as its own one-row frame (indexed 0..3) so the
# individual names stay available, then stacked into a single frame `df`.
sugarmaple = pd.DataFrame(
    {
        "kingdom": "plantae",
        "phylum": "tracheophyta",
        "family": "sapindaceae",
        "genus": "acer",
        "specificepithet": "saccharum",
    },
    index=[0],
)

leadplant = pd.DataFrame(
    {
        "kingdom": "plantae",
        "phylum": "tracheophyta",
        "family": "fabaceae",
        "genus": "amorpha",
        "specificepithet": "canescens",
    },
    index=[1],
)

armadillo = pd.DataFrame(
    {
        "kingdom": "animalia",
        "phylum": "chordata",
        "family": "dasypodidae",
        "genus": "dasypus",
        "specificepithet": "novemcinctus",
    },
    index=[2],
)

woodpecker = pd.DataFrame(
    {
        "kingdom": "animalia",
        "phylum": "chordata",
        "family": "picidae",
        "genus": "leuconotopicus",
        "specificepithet": "albolarvatus",
    },
    index=[3],
)

df = pd.concat([sugarmaple, leadplant, armadillo, woodpecker], axis=0)

# Record counts at each taxonomic level, from most to least specific. Each
# call groups on a progressively shorter prefix of the taxonomy columns.
species = get_record_counts(
    in_df=df,
    group_by=["kingdom", "phylum", "family", "genus", "specificepithet"],
    out_file_path=f"{RESULTS}/us-maps/input/record-counts-by-species.tsv",
)

genera = get_record_counts(
    in_df=df,
    group_by=["kingdom", "phylum", "family", "genus"],
    out_file_path=f"{RESULTS}/us-maps/input/record-counts-by-genus.tsv",
)

families = get_record_counts(
    in_df=df,
    group_by=["kingdom", "phylum", "family"],
    out_file_path=f"{RESULTS}/us-maps/input/record-counts-by-family.tsv",
)

phyla = get_record_counts(
    in_df=df,
    group_by=["kingdom", "phylum"],
    out_file_path=f"{RESULTS}/us-maps/input/record-counts-by-phylum.tsv",
)

# Display the combined taxa frame as the cell output.
df

Unnamed: 0,kingdom,phylum,family,genus,specificepithet
0,plantae,tracheophyta,sapindaceae,acer,saccharum
1,plantae,tracheophyta,fabaceae,amorpha,canescens
2,animalia,chordata,dasypodidae,dasypus,novemcinctus
3,animalia,chordata,picidae,leuconotopicus,albolarvatus
