In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import geopandas as gpd
from collections import Counter
from typing import NamedTuple

In [2]:
def get_continental_counties(path_to_state_counties_tsv, path_to_counties_shp, path_to_states_shp):
    state_name_map = gpd.read_file(path_to_states_shp)[["STATEFP", "NAME"]].set_index("STATEFP")["NAME"]

    df = pd.read_csv(path_to_state_counties_tsv, sep="\t")[["state", "county"]]
    df = df.set_index(df["state"] + ", " + df["county"])

    gdf = gpd.read_file(path_to_counties_shp)[["STATEFP", "NAMELSAD", "geometry"]]
    gdf = gdf.set_index(gdf["STATEFP"].apply(lambda x: state_name_map[x]) + ", " + gdf["NAMELSAD"])

    return gdf.join(df, how="right")[["geometry"]]

In [3]:
us_df = get_continental_counties(\
    "../resources/us-maps/us-state-counties.tsv",
    "../resources/us-maps/tl_2022_us_county/tl_2022_us_county.shp",
    "../resources/us-maps/tl_2022_us_state/tl_2022_us_state.shp")
us_df.head()

Unnamed: 0,geometry
"Alabama, Autauga County","POLYGON ((-86.58826 32.36775, -86.58834 32.367..."
"Alabama, Baldwin County","POLYGON ((-87.97692 31.08658, -87.97688 31.087..."
"Alabama, Barbour County","POLYGON ((-85.41585 31.68164, -85.41619 31.677..."
"Alabama, Bibb County","POLYGON ((-86.87657 33.01891, -86.87657 33.018..."
"Alabama, Blount County","POLYGON ((-86.56421 33.80194, -86.56556 33.801..."


In [4]:
from sklearn.ensemble import HistGradientBoostingClassifier

def train_conf_model(train_df, features):
    return HistGradientBoostingClassifier(
        loss='log_loss',
        learning_rate=0.01,
        min_samples_leaf=10,
        max_iter=200,
        monotonic_cst=np.ones_like(features, dtype=int)
    ).fit(train_df[features], train_df["correct"])

In [12]:
import regex as re
pattern = re.compile("^um[0-9]+_.+")

df = sugarmaple
features = [c for c in df.columns if pattern.match(c) is not None]

print("Num uncertainty measures:", len(features))
print("\n".join(features))

Num uncertainty measures: 0



In [7]:
from mpl_toolkits.axes_grid1 import make_axes_locatable

def plot_scores(dfs: list, name, score_field="score", dpi=150):
    if type(dfs) != list:
        dfs = [dfs]
    
    f, ax = plt.subplots(1,1, figsize=(8,6), sharex=True, sharey=True, dpi=dpi)
    plt.title(f"Occurrence map of {name}")
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="3%",pad=0,alpha=0.5)
    for df in dfs:
        df.plot(score_field, ax=ax, alpha=0.5, cmap='Reds', edgecolor='k', legend=True, cax=cax, linewidth=0.1)
    plt.show()

In [36]:
class InputFiles(NamedTuple):
    taxonomy_scores: str
    taxonomy: str
    record_counts_by_phylum: str
    record_counts_by_family: str
    record_counts_by_genus: str
    record_counts_by_species: str
    record_counts_by_stateprovince: str
    record_counts_by_county: str

class OutputFiles(NamedTuple):
    results_with_ums: str

class Env(NamedTuple):
    input_files: InputFiles
    output_files: OutputFiles
    num_phrasings: int
    phrasings: list[str]
    query_fields: list[str]
    seed: int

if "snakemake" in globals():
    env = Env(
        input_files=snakemake.input,
        output_files=snakemake.output,
        num_phrasings=len(snakemake.params.phrasings),
        phrasings=snakemake.params.phrasings,
        query_fields=snakemake.params.query_fields,
        seed=snakemake.params.seed
    )
else: # Fill in parameters manually for testing outside of snakemake
    import os
    ROOT = os.path.expanduser("~/biodiversity-llms")
    # LLM = "gpt-3.5-turbo-0125"
    LLM = "gpt-4-1106-preview"

    RESULTS = "../results"
    env = Env(
        input_files = InputFiles(
            f"{RESULTS}/us-maps/{LLM}/taxonomy/summary.tsv",
            f"{RESULTS}/us-maps/input/taxa-genus.tsv",
            f"{RESULTS}/us-maps/input/record-counts-by-phylum.tsv",
            f"{RESULTS}/us-maps/input/record-counts-by-family.tsv",
            f"{RESULTS}/us-maps/input/record-counts-by-genus.tsv",
            f"{RESULTS}/us-maps/input/record-counts-by-species.tsv",
            f"{RESULTS}/us-maps/input/record-counts-by-stateprovince.tsv",
            f"{RESULTS}/us-maps/input/record-counts-by-county.tsv",
        ),
        output_files = OutputFiles(
            results_with_ums=f"{RESULTS}/us-maps/{LLM}/results-with-ums.tsv"
        ),
        num_phrasings=6,
        phrasings=[
            "Can species {genus} {specificepithet} be found in {county}, {stateprovince}, {country}?",
            "Is it possible to encounter species {genus} {specificepithet} in {county}, {stateprovince}, {country}?",
            "Is there a presence of species {genus} {specificepithet} within {county}, {stateprovince}, {country}?",
            "Does {county}, {stateprovince}, {country} harbor species {genus} {specificepithet}?",
            "Is species {genus} {specificepithet} present in {county}, {stateprovince}, {country}?",
            "Can one observe species {genus} {specificepithet} in {county}, {stateprovince}, {country}?"
        ],
        query_fields=[
            "kingdom",
            "phylum",
            "family",
            "genus",
            "specificepithet",
            "country",
            "stateprovince",
            "county"
        ],
        seed=69847
    )


In [27]:
def count_item(values, item):
    counts = dict(Counter(values).most_common())
    return counts[item] if item in counts else 0

def make_id(df):
    return df.apply(lambda r: hash("".join([str(v) for v in r.values]).lower()), axis=1)

def get_results(responses):
    df = pd.read_csv(responses, sep="\t")
    
    df["genus"] = df["genus"].str.lower()
    df = df[df["family"] != df["genus"]]

    df[["class", "order"]] = df.apply(axis=1, func=lambda r: kpfg_class_order.loc[r["kingdom"], r["phylum"], r["family"], r["genus"]])

    df["phrasing"] = df["question number"].astype(int) % env.num_phrasings
    df["query id"] = make_id(df[env.query_fields])

    df["response id"] = make_id(df[["query id", "phrasing"]])
    df = df.groupby("response id").head(1) # Drop responses for repeated questions

    df["scores"] = df["responses"].apply(lambda r: count_item(r.lower().split(), "yes"))
    df["yesnos"] = df["responses"].apply(lambda r: count_item(r.lower().split(), "yes") + count_item(r.lower().split(), "no"))
    df["abstains"] = 10 - df["yesnos"]

    df["prediction"] = df["scores"].apply(lambda x: -1 if x == 0 else 1)

    return df

kpfg_class_order = pd.read_csv(open(env.input_files.taxonomy, "r"), sep="\t")\
    .set_index(["kingdom", "phylum", "family", "taxon"])[["class", "order"]]

UNUSED_FIELDS = ["query"]

responses_path = "../results/us-maps/gpt-4-1106-preview/occurrence/responses.tsv"
res = get_results(responses_path)\
    .drop(columns=UNUSED_FIELDS)

print(f"{len(res) / env.num_phrasings:,.0f} records")
print(f"{len(res):,.0f} queries (#records x #phrasings)")
res.head(1)

12,436 records
74,616 queries (#records x #phrasings)


Unnamed: 0,country,stateprovince,county,kingdom,phylum,family,genus,specificepithet,responses,input token count,...,question number,class,order,phrasing,query id,response id,scores,yesnos,abstains,prediction
0,United States,Alabama,Autauga County,plantae,tracheophyta,sapindaceae,acer,saccharum,No No No No No No No No No No,31,...,0,magnoliopsida,sapindales,0,-1877092219711042595,9006511588438991320,0,10,0,-1


In [28]:
phrasing_avg_pred = res.groupby("query id")["prediction"].mean()
phrasing_var_score = res.groupby("query id")["scores"].var()

tax_data = pd.read_csv(open(env.input_files.taxonomy_scores, "r"), sep="\t")\
    .set_index(["subject_rank", "taxon"])

tax_scores = tax_data["rank exact match mean"]
tax_garbage_counts = tax_data["garbage responses"]

average_tax_garbage_counts = tax_garbage_counts.groupby("subject_rank").mean()
average_tax_scores = tax_scores.groupby("subject_rank").mean()

def get_record_counts(path, fields):
    df = pd.read_csv(open(path, "r"), sep="\t")
    if "genus" in df.columns:
        df["genus"] = df["genus"].str.lower()
    return df.groupby(fields)["record count"].first()

record_counts_by_phylum = get_record_counts(env.input_files.record_counts_by_phylum, ["kingdom", "phylum"])
record_counts_by_family = get_record_counts(env.input_files.record_counts_by_family, ["kingdom", "phylum", "family"])
record_counts_by_genus = get_record_counts(env.input_files.record_counts_by_genus, ["kingdom", "phylum", "family", "genus"])
record_counts_by_species = get_record_counts(env.input_files.record_counts_by_species, ["kingdom", "phylum", "family", "genus", "specificepithet"])
record_counts_by_stateprovince = get_record_counts(env.input_files.record_counts_by_stateprovince, ["country", "stateprovince"])
record_counts_by_county = get_record_counts(env.input_files.record_counts_by_county, ["country", "stateprovince", "county"])

In [29]:
import regex as re
def trim_county_name(county):
    parts = county.split()
    if len(parts) > 1 and re.sub(r'[^\w]', '', parts[-1]).lower() in ("co", "county", "mun", "par", "prov"):
        return " ".join(parts[:-1])
    else:
        return county

In [None]:
def get_ums(instance):
    # TODO: condition um3 on prediction?
    num_responses = 10 - instance["abstains"]
    global r
    r = instance

    # Higher values = more certainty
    return pd.Series({
        # Scores
        "um1_total_score": instance["scores"] if instance["prediction"] == 1 else num_responses - instance["scores"],
        "um1_percent_score": (instance["scores"] if instance["prediction"] == 1 else num_responses - instance["scores"]) / max(1, num_responses),

        # Abstains
        "um2_abstains": num_responses,

        # Sensitivity to phrasing
        "um4_phrasing_agreement": phrasing_avg_pred[instance["query id"]] * instance["prediction"],
        "um4_phrasing_score_var": phrasing_var_score[instance["query id"]] * -1,

        # iDigBio record counts by taxonomic ranks
        # "um5_record_count_by_kingdom": taxon_record_counts["kingdomCount"],
        "um5_record_count_by_phylum": record_counts_by_phylum.loc[instance["kingdom"], instance["phylum"]],
        "um5_record_count_by_family": record_counts_by_family.loc[instance["kingdom"], instance["phylum"], instance["family"]],
        "um5_record_count_by_genus": record_counts_by_genus.loc[instance["kingdom"], instance["phylum"], instance["family"], instance["genus"]],
        "um5_record_count_by_species": record_counts_by_species.loc[instance["kingdom"], instance["phylum"], instance["family"], instance["genus"], instance["specificepithet"]],
        "um5_record_count_by_stateprovince": record_counts_by_stateprovince.loc[instance["country"], instance["stateprovince"]],
        "um5_record_count_by_county": record_counts_by_county.loc[instance["country"], instance["stateprovince"], trim_county_name(instance["county"])],

        # Accuracy on taxonomy questions
        "um6_taxqa_accuracy_by_phylum": tax_scores["phylum"].get(instance["phylum"], average_tax_scores["phylum"]),
        "um6_taxqa_accuracy_by_class": tax_scores["class"].get(instance["class"], average_tax_scores["class"]),
        "um6_taxqa_accuracy_by_order": tax_scores["order"].get(instance["order"], average_tax_scores["order"]),
        "um6_taxqa_accuracy_by_family": tax_scores["family"].get(instance["family"], average_tax_scores["family"]),
        "um6_taxqa_accuracy_by_genus": tax_scores["genus"].get(instance["genus"], average_tax_scores["genus"]),

        # Number of yes-no responses to taxonomy questions
        "um7_taxqa_responses_by_phylum": -tax_garbage_counts["phylum"].get(instance["phylum"], average_tax_garbage_counts["phylum"]) / 10,
        "um7_taxqa_responses_by_class": -tax_garbage_counts["class"].get(instance["class"], average_tax_garbage_counts["class"]) / 20,
        "um7_taxqa_responses_by_order": -tax_garbage_counts["order"].get(instance["order"], average_tax_garbage_counts["order"]) / 30,
        "um7_taxqa_responses_by_family": -tax_garbage_counts["family"].get(instance["family"], average_tax_garbage_counts["family"]) / 40,
        "um7_taxqa_responses_by_genus": -tax_garbage_counts["genus"].get(instance["genus"], average_tax_garbage_counts["genus"]) / 50,
    })

df = res[res["phrasing"] == 0]
full_df = pd.concat([df.apply(lambda row: get_ums(row), axis=1), df], axis=1)
full_df.to_csv(env.output_files.results_with_ums, sep="\t")