In [2]:
import requests as rq
import pandas as pd
import regex as re

In [6]:
# Load the tab-separated presence table; this unfiltered frame is what
# save_record_counts() reads from.
presence_with_fungi = pd.read_csv(
    "../results/idigbio-sample/input/presence.tsv",
    sep="\t",
)
# Keep only the rows whose kingdom is not "fungi".
presence = presence_with_fungi.loc[presence_with_fungi["kingdom"].ne("fungi")]
presence

Unnamed: 0,kingdom,phylum,family,genus,specificepithet,country,stateprovince,county
300,plantae,dicotyledon,asteraceae,Acmella,ciliata,Brasil,Acre,Cruzeiro Do Sul
301,plantae,dicotyledon,asteraceae,Conyza,bonariensis,Brasil,São Paulo,Ubatuba
302,plantae,dicotyledon,asteraceae,Vernonia,simplex,Brasil,Mato Grosso,Diamantino
303,plantae,dicotyledon,asteraceae,Acmella,ciliata,Brasil,Amazonas,Uarini
304,plantae,dicotyledon,asteraceae,Vernonia,megapotamica,Argentina,Corrientes,Capital
...,...,...,...,...,...,...,...,...
15956,plantae,rhodophyta,rhodomelaceae,Palisada,perforata,Mexico,Baja California,Los Cabos
15957,plantae,rhodophyta,rhodomelaceae,Vertebrata,lanosa,Denmark,North Sea,Ns
15958,plantae,rhodophyta,rhodomelaceae,Choreocolax,polysiphoniae,United States,Maine,York
15959,plantae,rhodophyta,rhodomelaceae,Chondria,capillaris,France,Bretagne,Finistère


In [38]:
def trim_county_name(county):
    """Drop a trailing administrative-unit word from a county name.

    If the name has more than one whitespace-separated word and the last
    word, once non-word characters are removed and it is lower-cased, is one
    of "co", "county", "mun", "par" or "prov", that last word is removed and
    the remaining words are re-joined with single spaces. Otherwise the
    input is returned untouched.

    Parameters
    ----------
    county : str or any
        County name. Non-string values (e.g. NaN or None for a missing
        county) are returned unchanged instead of raising.

    Returns
    -------
    The trimmed name, or `county` itself when nothing was trimmed.
    """
    # Missing values arrive as float NaN / None when this is mapped over a
    # pandas column; they have no .split(), so pass them through as-is.
    if not isinstance(county, str):
        return county

    parts = county.split()
    # len(parts) > 1 ensures a name consisting solely of a suffix word
    # (e.g. "County") is not reduced to an empty string.
    if len(parts) > 1 and re.sub(r'[^\w]', '', parts[-1]).lower() in ("co", "county", "mun", "par", "prov"):
        return " ".join(parts[:-1])
    return county

def make_record_count_query(record):
    """Build the JSON body of a record-count search for one row.

    Every field of `record` is converted to a string and used as a search
    term under the "rq" key. When a "county" field is present it is replaced
    by a prefix-type term whose value is the county name after
    trim_county_name(). The query asks for a single item at offset 0.

    NOTE(review): astype(str) turns a missing value (NaN) into the literal
    string "nan", which is then sent as a search term like any other value.

    Parameters
    ----------
    record : pandas.Series
        One row; its index labels become the search field names.

    Returns
    -------
    dict
        A dict with the keys "rq", "limit" and "offset".
    """
    terms = record.astype(str).to_dict()

    if "county" in terms:
        county_prefix = trim_county_name(terms["county"])
        terms["county"] = {"type": "prefix", "value": county_prefix}

    query = {"rq": terms, "limit": 1, "offset": 0}
    return query

def get_record_count(record, timeout=30):
    """Return the "itemCount" the search endpoint reports for one row.

    Builds the query with make_record_count_query() and POSTs it as JSON to
    the records search endpoint.

    Parameters
    ----------
    record : pandas.Series
        Row whose fields are used as search terms.
    timeout : float or tuple, optional
        Timeout in seconds handed to requests. Without one a stalled
        connection would block forever, and this function is invoked once
        per row by save_record_counts().

    Returns
    -------
    The value stored under "itemCount" in the JSON response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within `timeout`.
    """
    query = make_record_count_query(record)
    response = rq.post(
        "http://search.idigbio.org/v2/search/records/",
        json=query,
        timeout=timeout,
    )
    # Fail with the HTTP status rather than a confusing KeyError / JSON
    # decode error further down when the server returns an error page.
    response.raise_for_status()
    return response.json()["itemCount"]

def save_record_counts(group_by, path):
    """Fetch a record count for every distinct `group_by` combination and save it.

    Takes the distinct combinations of the `group_by` columns from the
    module-level `presence_with_fungi` frame, looks up a count for each one
    with get_record_count() (one HTTP request per row), stores it in a
    "record count" column and writes the table to `path` as tab-separated
    text without the index.

    Parameters
    ----------
    group_by : list of str
        Column names of `presence_with_fungi` that define a group.
    path : str
        Destination of the TSV file.
    """
    # .copy() so the column assignments below act on an independent frame
    # and never on a view of presence_with_fungi.
    df = presence_with_fungi[group_by].drop_duplicates().copy()

    # Only groupings that include the county have a county to normalise;
    # touching the column unconditionally raised KeyError for the others.
    if "county" in df.columns:
        # na_action="ignore" leaves missing counties as they are.
        df["county"] = df["county"].map(trim_county_name, na_action="ignore")
        # Trimming can make previously distinct rows identical; keep the
        # de-duplicated result so each group is queried and written once.
        df = df.drop_duplicates()

    df["record count"] = df.apply(axis=1, func=get_record_count)
    df.to_csv(path, sep="\t", index=False)

In [64]:
# One output file per grouping level, written in this order:
# geographic levels first, then taxonomic levels from species up to phylum.
for group_by, path in [
    (["country", "stateprovince", "county"],
     "../results/idigbio-sample/input/record-counts-by-county.tsv"),
    (["country", "stateprovince"],
     "../results/idigbio-sample/input/record-counts-by-stateprovince.tsv"),
    (["kingdom", "phylum", "family", "genus", "specificepithet"],
     "../results/idigbio-sample/input/record-counts-by-species.tsv"),
    (["kingdom", "phylum", "family", "genus"],
     "../results/idigbio-sample/input/record-counts-by-genus.tsv"),
    (["kingdom", "phylum", "family"],
     "../results/idigbio-sample/input/record-counts-by-family.tsv"),
    (["kingdom", "phylum"],
     "../results/idigbio-sample/input/record-counts-by-phylum.tsv"),
]:
    save_record_counts(group_by, path)